/*
 * Loongson LASX optimized h264qpel
 *
 * Copyright (c) 2020 Loongson Technology Corporation Limited
 * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "h264qpel_lasx.h"
#include "libavutil/loongarch/loongson_intrinsics.h"
#include "libavutil/attributes.h"

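/* Byte-shuffle masks used (as mask0/mask1/mask2) by AVC_HORZ_FILTER_SH below.
 * Each mask pairs the two pixels that share a coefficient in the 6-tap luma
 * filter: mask0 the outer +1 taps (in[x-2], in[x+3]), mask1 the -5 taps
 * (in[x-1], in[x+2]), mask2 the +20 taps (in[x], in[x+1]). Every 16-byte
 * pattern is duplicated so both 128-bit lanes of an LASX register use the
 * same shuffle. */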
static const uint8_t luma_mask_arr[16 * 6] __attribute__((aligned(0x40))) = {
    /* 8 width cases */
    0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12,
    0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12,
    1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11,
    1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11,
    2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
    2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
};

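/* One row of the H.264 horizontal 6-tap luma filter (1, -5, 20, 20, -5, 1),
 * kept at 16-bit precision; rounding and the >> 5 shift happen later at the
 * call site. Roughly the scalar form, for each output pixel x:
 *     out[x] = (in[x-2] + in[x+3]) - 5 * (in[x-1] + in[x+2])
 *            + 20 * (in[x] + in[x+1]);
 * minus5b and plus20b are expected to hold the -5 and +20 byte coefficients. */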
#define AVC_HORZ_FILTER_SH(in0, in1, mask0, mask1, mask2)  \
( {                                                        \
    __m256i out0_m;                                        \
    __m256i tmp0_m;                                        \
                                                           \
    tmp0_m = __lasx_xvshuf_b(in1, in0, mask0);             \
    out0_m = __lasx_xvhaddw_h_b(tmp0_m, tmp0_m);           \
    tmp0_m = __lasx_xvshuf_b(in1, in0, mask1);             \
    out0_m = __lasx_xvdp2add_h_b(out0_m, minus5b, tmp0_m); \
    tmp0_m = __lasx_xvshuf_b(in1, in0, mask2);             \
    out0_m = __lasx_xvdp2add_h_b(out0_m, plus20b, tmp0_m); \
                                                           \
    out0_m;                                                \
} )

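/* Three-term dot-product accumulation used for the vertical 6-tap filter:
 * in0/in1/in2 hold vertically interleaved row pairs and coeff0/coeff1/coeff2
 * the packed coefficient pairs {1, -5}, {20, 20}, {-5, 1} built from
 * filt_const0..2, giving the full (1, -5, 20, 20, -5, 1) tap set. */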
#define AVC_DOT_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2)  \
( {                                                            \
    __m256i out0_m;                                            \
                                                               \
    out0_m = __lasx_xvdp2_h_b(in0, coeff0);                    \
    DUP2_ARG3(__lasx_xvdp2add_h_b, out0_m, in1, coeff1, out0_m,\
              in2, coeff2, out0_m, out0_m);                    \
                                                               \
    out0_m;                                                    \
} )

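/* 16x16 luma interpolation for the diagonal quarter-pel positions: the
 * horizontal and vertical 6-tap half-pel results are computed, averaged with
 * rounding, and the result is then averaged with the existing dst contents
 * (the "and_aver_dst" flavour). Four rows are produced per loop iteration,
 * two rows per 256-bit register. */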
static av_always_inline
void avc_luma_hv_qrt_and_aver_dst_16x16_lasx(uint8_t *src_x,
                                             uint8_t *src_y,
                                             uint8_t *dst, ptrdiff_t stride)
{
    const int16_t filt_const0 = 0xfb01;
    const int16_t filt_const1 = 0x1414;
    const int16_t filt_const2 = 0x1fb;
    uint32_t loop_cnt;
    ptrdiff_t stride_2x = stride << 1;
    ptrdiff_t stride_3x = stride_2x + stride;
    ptrdiff_t stride_4x = stride << 2;
    __m256i tmp0, tmp1;
    __m256i src_hz0, src_hz1, src_hz2, src_hz3, mask0, mask1, mask2;
    __m256i src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
    __m256i src_vt7, src_vt8;
    __m256i src_vt10_h, src_vt21_h, src_vt32_h, src_vt43_h, src_vt54_h;
    __m256i src_vt65_h, src_vt76_h, src_vt87_h, filt0, filt1, filt2;
    __m256i hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2;
    __m256i vt_out3, out0, out1, out2, out3;
    __m256i minus5b = __lasx_xvldi(0xFB);
    __m256i plus20b = __lasx_xvldi(20);

    filt0 = __lasx_xvreplgr2vr_h(filt_const0);
    filt1 = __lasx_xvreplgr2vr_h(filt_const1);
    filt2 = __lasx_xvreplgr2vr_h(filt_const2);

    mask0 = __lasx_xvld(luma_mask_arr, 0);
    DUP2_ARG2(__lasx_xvld, luma_mask_arr, 32, luma_mask_arr, 64, mask1, mask2);
    src_vt0 = __lasx_xvld(src_y, 0);
    DUP4_ARG2(__lasx_xvldx, src_y, stride, src_y, stride_2x, src_y, stride_3x,
              src_y, stride_4x, src_vt1, src_vt2, src_vt3, src_vt4);
    src_y += stride_4x;

    src_vt0 = __lasx_xvxori_b(src_vt0, 128);
    DUP4_ARG2(__lasx_xvxori_b, src_vt1, 128, src_vt2, 128, src_vt3, 128,
              src_vt4, 128, src_vt1, src_vt2, src_vt3, src_vt4);

    for (loop_cnt = 4; loop_cnt--;) {
        src_hz0 = __lasx_xvld(src_x, 0);
        DUP2_ARG2(__lasx_xvldx, src_x, stride, src_x, stride_2x,
                  src_hz1, src_hz2);
        src_hz3 = __lasx_xvldx(src_x, stride_3x);
        src_x  += stride_4x;
        src_hz0 = __lasx_xvpermi_d(src_hz0, 0x94);
        src_hz1 = __lasx_xvpermi_d(src_hz1, 0x94);
        src_hz2 = __lasx_xvpermi_d(src_hz2, 0x94);
        src_hz3 = __lasx_xvpermi_d(src_hz3, 0x94);
        DUP4_ARG2(__lasx_xvxori_b, src_hz0, 128, src_hz1, 128, src_hz2, 128,
                  src_hz3, 128, src_hz0, src_hz1, src_hz2, src_hz3);

        hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
        hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
        hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
        hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);
        hz_out0 = __lasx_xvssrarni_b_h(hz_out1, hz_out0, 5);
        hz_out2 = __lasx_xvssrarni_b_h(hz_out3, hz_out2, 5);

        DUP4_ARG2(__lasx_xvldx, src_y, stride, src_y, stride_2x,
                  src_y, stride_3x, src_y, stride_4x,
                  src_vt5, src_vt6, src_vt7, src_vt8);
        src_y += stride_4x;

        DUP4_ARG2(__lasx_xvxori_b, src_vt5, 128, src_vt6, 128, src_vt7, 128,
                  src_vt8, 128, src_vt5, src_vt6, src_vt7, src_vt8);

        DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_vt4, 0x02, src_vt1, src_vt5,
                  0x02, src_vt2, src_vt6, 0x02, src_vt3, src_vt7, 0x02,
                  src_vt0, src_vt1, src_vt2, src_vt3);
        src_vt87_h = __lasx_xvpermi_q(src_vt4, src_vt8, 0x02);
        DUP4_ARG2(__lasx_xvilvh_b, src_vt1, src_vt0, src_vt2, src_vt1,
                  src_vt3, src_vt2, src_vt87_h, src_vt3,
                  src_hz0, src_hz1, src_hz2, src_hz3);
        DUP4_ARG2(__lasx_xvilvl_b, src_vt1, src_vt0, src_vt2, src_vt1,
                  src_vt3, src_vt2, src_vt87_h, src_vt3,
                  src_vt0, src_vt1, src_vt2, src_vt3);
        DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_hz0, 0x02, src_vt1, src_hz1,
                  0x02, src_vt2, src_hz2, 0x02, src_vt3, src_hz3, 0x02,
                  src_vt10_h, src_vt21_h, src_vt32_h, src_vt43_h);
        DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_hz0, 0x13, src_vt1, src_hz1,
                  0x13, src_vt2, src_hz2, 0x13, src_vt3, src_hz3, 0x13,
                  src_vt54_h, src_vt65_h, src_vt76_h, src_vt87_h);
        vt_out0 = AVC_DOT_SH3_SH(src_vt10_h, src_vt32_h, src_vt54_h, filt0,
                                 filt1, filt2);
        vt_out1 = AVC_DOT_SH3_SH(src_vt21_h, src_vt43_h, src_vt65_h, filt0,
                                 filt1, filt2);
        vt_out2 = AVC_DOT_SH3_SH(src_vt32_h, src_vt54_h, src_vt76_h, filt0,
                                 filt1, filt2);
        vt_out3 = AVC_DOT_SH3_SH(src_vt43_h, src_vt65_h, src_vt87_h, filt0,
                                 filt1, filt2);
        vt_out0 = __lasx_xvssrarni_b_h(vt_out1, vt_out0, 5);
        vt_out2 = __lasx_xvssrarni_b_h(vt_out3, vt_out2, 5);

        DUP2_ARG2(__lasx_xvaddwl_h_b, hz_out0, vt_out0, hz_out2, vt_out2,
                  out0, out2);
        DUP2_ARG2(__lasx_xvaddwh_h_b, hz_out0, vt_out0, hz_out2, vt_out2,
                  out1, out3);
        tmp0 = __lasx_xvssrarni_b_h(out1, out0, 1);
        tmp1 = __lasx_xvssrarni_b_h(out3, out2, 1);

        DUP2_ARG2(__lasx_xvxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
        out0 = __lasx_xvld(dst, 0);
        DUP2_ARG2(__lasx_xvldx, dst, stride, dst, stride_2x, out1, out2);
        out3 = __lasx_xvldx(dst, stride_3x);
        out0 = __lasx_xvpermi_q(out0, out2, 0x02);
        out1 = __lasx_xvpermi_q(out1, out3, 0x02);
        out2 = __lasx_xvilvl_d(out1, out0);
        out3 = __lasx_xvilvh_d(out1, out0);
        out0 = __lasx_xvpermi_q(out2, out3, 0x02);
        out1 = __lasx_xvpermi_q(out2, out3, 0x13);
        tmp0 = __lasx_xvavgr_bu(out0, tmp0);
        tmp1 = __lasx_xvavgr_bu(out1, tmp1);

        __lasx_xvstelm_d(tmp0, dst, 0, 0);
        __lasx_xvstelm_d(tmp0, dst + stride, 0, 1);
        __lasx_xvstelm_d(tmp1, dst + stride_2x, 0, 0);
        __lasx_xvstelm_d(tmp1, dst + stride_3x, 0, 1);

        __lasx_xvstelm_d(tmp0, dst, 8, 2);
        __lasx_xvstelm_d(tmp0, dst + stride, 8, 3);
        __lasx_xvstelm_d(tmp1, dst + stride_2x, 8, 2);
        __lasx_xvstelm_d(tmp1, dst + stride_3x, 8, 3);

        dst    += stride_4x;
        src_vt0 = src_vt4;
        src_vt1 = src_vt5;
        src_vt2 = src_vt6;
        src_vt3 = src_vt7;
        src_vt4 = src_vt8;
    }
}

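/* Same interpolation as above, but the result is stored directly instead of
 * being averaged with dst (the "put" flavour of the 16x16 hv quarter-pel). */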
static av_always_inline void
avc_luma_hv_qrt_16x16_lasx(uint8_t *src_x, uint8_t *src_y,
                           uint8_t *dst, ptrdiff_t stride)
{
    const int16_t filt_const0 = 0xfb01;
    const int16_t filt_const1 = 0x1414;
    const int16_t filt_const2 = 0x1fb;
    uint32_t loop_cnt;
    ptrdiff_t stride_2x = stride << 1;
    ptrdiff_t stride_3x = stride_2x + stride;
    ptrdiff_t stride_4x = stride << 2;
    __m256i tmp0, tmp1;
    __m256i src_hz0, src_hz1, src_hz2, src_hz3, mask0, mask1, mask2;
    __m256i src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
    __m256i src_vt7, src_vt8;
    __m256i src_vt10_h, src_vt21_h, src_vt32_h, src_vt43_h, src_vt54_h;
    __m256i src_vt65_h, src_vt76_h, src_vt87_h, filt0, filt1, filt2;
    __m256i hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2;
    __m256i vt_out3, out0, out1, out2, out3;
    __m256i minus5b = __lasx_xvldi(0xFB);
    __m256i plus20b = __lasx_xvldi(20);

    filt0 = __lasx_xvreplgr2vr_h(filt_const0);
    filt1 = __lasx_xvreplgr2vr_h(filt_const1);
    filt2 = __lasx_xvreplgr2vr_h(filt_const2);

    mask0 = __lasx_xvld(luma_mask_arr, 0);
    DUP2_ARG2(__lasx_xvld, luma_mask_arr, 32, luma_mask_arr, 64, mask1, mask2);
    src_vt0 = __lasx_xvld(src_y, 0);
    DUP4_ARG2(__lasx_xvldx, src_y, stride, src_y, stride_2x, src_y, stride_3x,
              src_y, stride_4x, src_vt1, src_vt2, src_vt3, src_vt4);
    src_y += stride_4x;

    src_vt0 = __lasx_xvxori_b(src_vt0, 128);
    DUP4_ARG2(__lasx_xvxori_b, src_vt1, 128, src_vt2, 128, src_vt3, 128,
              src_vt4, 128, src_vt1, src_vt2, src_vt3, src_vt4);

    for (loop_cnt = 4; loop_cnt--;) {
        src_hz0 = __lasx_xvld(src_x, 0);
        DUP2_ARG2(__lasx_xvldx, src_x, stride, src_x, stride_2x,
                  src_hz1, src_hz2);
        src_hz3 = __lasx_xvldx(src_x, stride_3x);
        src_x  += stride_4x;
        src_hz0 = __lasx_xvpermi_d(src_hz0, 0x94);
        src_hz1 = __lasx_xvpermi_d(src_hz1, 0x94);
        src_hz2 = __lasx_xvpermi_d(src_hz2, 0x94);
        src_hz3 = __lasx_xvpermi_d(src_hz3, 0x94);
        DUP4_ARG2(__lasx_xvxori_b, src_hz0, 128, src_hz1, 128, src_hz2, 128,
                  src_hz3, 128, src_hz0, src_hz1, src_hz2, src_hz3);

        hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
        hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
        hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
        hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);
        hz_out0 = __lasx_xvssrarni_b_h(hz_out1, hz_out0, 5);
        hz_out2 = __lasx_xvssrarni_b_h(hz_out3, hz_out2, 5);

        DUP4_ARG2(__lasx_xvldx, src_y, stride, src_y, stride_2x,
                  src_y, stride_3x, src_y, stride_4x,
                  src_vt5, src_vt6, src_vt7, src_vt8);
        src_y += stride_4x;

        DUP4_ARG2(__lasx_xvxori_b, src_vt5, 128, src_vt6, 128, src_vt7, 128,
                  src_vt8, 128, src_vt5, src_vt6, src_vt7, src_vt8);
        DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_vt4, 0x02, src_vt1, src_vt5,
                  0x02, src_vt2, src_vt6, 0x02, src_vt3, src_vt7, 0x02,
                  src_vt0, src_vt1, src_vt2, src_vt3);
        src_vt87_h = __lasx_xvpermi_q(src_vt4, src_vt8, 0x02);
        DUP4_ARG2(__lasx_xvilvh_b, src_vt1, src_vt0, src_vt2, src_vt1,
                  src_vt3, src_vt2, src_vt87_h, src_vt3,
                  src_hz0, src_hz1, src_hz2, src_hz3);
        DUP4_ARG2(__lasx_xvilvl_b, src_vt1, src_vt0, src_vt2, src_vt1,
                  src_vt3, src_vt2, src_vt87_h, src_vt3,
                  src_vt0, src_vt1, src_vt2, src_vt3);
        DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_hz0, 0x02, src_vt1,
                  src_hz1, 0x02, src_vt2, src_hz2, 0x02, src_vt3, src_hz3,
                  0x02, src_vt10_h, src_vt21_h, src_vt32_h, src_vt43_h);
        DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_hz0, 0x13, src_vt1,
                  src_hz1, 0x13, src_vt2, src_hz2, 0x13, src_vt3, src_hz3,
                  0x13, src_vt54_h, src_vt65_h, src_vt76_h, src_vt87_h);

        vt_out0 = AVC_DOT_SH3_SH(src_vt10_h, src_vt32_h, src_vt54_h,
                                 filt0, filt1, filt2);
        vt_out1 = AVC_DOT_SH3_SH(src_vt21_h, src_vt43_h, src_vt65_h,
                                 filt0, filt1, filt2);
        vt_out2 = AVC_DOT_SH3_SH(src_vt32_h, src_vt54_h, src_vt76_h,
                                 filt0, filt1, filt2);
        vt_out3 = AVC_DOT_SH3_SH(src_vt43_h, src_vt65_h, src_vt87_h,
                                 filt0, filt1, filt2);
        vt_out0 = __lasx_xvssrarni_b_h(vt_out1, vt_out0, 5);
        vt_out2 = __lasx_xvssrarni_b_h(vt_out3, vt_out2, 5);

        DUP2_ARG2(__lasx_xvaddwl_h_b, hz_out0, vt_out0, hz_out2, vt_out2,
                  out0, out2);
        DUP2_ARG2(__lasx_xvaddwh_h_b, hz_out0, vt_out0, hz_out2, vt_out2,
                  out1, out3);
        tmp0 = __lasx_xvssrarni_b_h(out1, out0, 1);
        tmp1 = __lasx_xvssrarni_b_h(out3, out2, 1);

        DUP2_ARG2(__lasx_xvxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
        __lasx_xvstelm_d(tmp0, dst, 0, 0);
        __lasx_xvstelm_d(tmp0, dst + stride, 0, 1);
        __lasx_xvstelm_d(tmp1, dst + stride_2x, 0, 0);
        __lasx_xvstelm_d(tmp1, dst + stride_3x, 0, 1);

        __lasx_xvstelm_d(tmp0, dst, 8, 2);
        __lasx_xvstelm_d(tmp0, dst + stride, 8, 3);
        __lasx_xvstelm_d(tmp1, dst + stride_2x, 8, 2);
        __lasx_xvstelm_d(tmp1, dst + stride_3x, 8, 3);

        dst    += stride_4x;
        src_vt0 = src_vt4;
        src_vt1 = src_vt5;
        src_vt2 = src_vt6;
        src_vt3 = src_vt7;
        src_vt4 = src_vt8;
    }
}

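/* An 8x8 block copy done with plain 64-bit GPR loads and stores; roughly the
 * scalar equivalent (illustrative only) of:
 *     for (int i = 0; i < 8; i++)
 *         memcpy(dst + i * stride, src + i * stride, 8);
 */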
/* put_pixels8_8_inline_asm: dst = src */
static av_always_inline void
put_pixels8_8_inline_asm(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
{
    uint64_t tmp[8];
    ptrdiff_t stride_2, stride_3, stride_4;
    __asm__ volatile (
    "slli.d     %[stride_2],     %[stride],   1           \n\t"
    "add.d      %[stride_3],     %[stride_2], %[stride]   \n\t"
    "slli.d     %[stride_4],     %[stride_2], 1           \n\t"
    "ld.d       %[tmp0],         %[src],      0x0         \n\t"
    "ldx.d      %[tmp1],         %[src],      %[stride]   \n\t"
    "ldx.d      %[tmp2],         %[src],      %[stride_2] \n\t"
    "ldx.d      %[tmp3],         %[src],      %[stride_3] \n\t"
    "add.d      %[src],          %[src],      %[stride_4] \n\t"
    "ld.d       %[tmp4],         %[src],      0x0         \n\t"
    "ldx.d      %[tmp5],         %[src],      %[stride]   \n\t"
    "ldx.d      %[tmp6],         %[src],      %[stride_2] \n\t"
    "ldx.d      %[tmp7],         %[src],      %[stride_3] \n\t"

    "st.d       %[tmp0],         %[dst],      0x0         \n\t"
    "stx.d      %[tmp1],         %[dst],      %[stride]   \n\t"
    "stx.d      %[tmp2],         %[dst],      %[stride_2] \n\t"
    "stx.d      %[tmp3],         %[dst],      %[stride_3] \n\t"
    "add.d      %[dst],          %[dst],      %[stride_4] \n\t"
    "st.d       %[tmp4],         %[dst],      0x0         \n\t"
    "stx.d      %[tmp5],         %[dst],      %[stride]   \n\t"
    "stx.d      %[tmp6],         %[dst],      %[stride_2] \n\t"
    "stx.d      %[tmp7],         %[dst],      %[stride_3] \n\t"
    : [tmp0]"=&r"(tmp[0]),        [tmp1]"=&r"(tmp[1]),
      [tmp2]"=&r"(tmp[2]),        [tmp3]"=&r"(tmp[3]),
      [tmp4]"=&r"(tmp[4]),        [tmp5]"=&r"(tmp[5]),
      [tmp6]"=&r"(tmp[6]),        [tmp7]"=&r"(tmp[7]),
      [stride_2]"=&r"(stride_2),  [stride_3]"=&r"(stride_3),
      [stride_4]"=&r"(stride_4),
      [dst]"+&r"(dst),            [src]"+&r"(src)
    : [stride]"r"(stride)
    : "memory"
    );
}

/* avg_pixels8_8_lsx   : dst = avg(src, dst)
 * put_pixels8_l2_8_lsx: dst = avg(src, half), half stride is 8.
 * avg_pixels8_l2_8_lsx: dst = avg(avg(src, half), dst), half stride is 8. */
static av_always_inline void
avg_pixels8_8_lsx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
{
    uint8_t *tmp = dst;
    ptrdiff_t stride_2, stride_3, stride_4;
    __asm__ volatile (
    /* h0~h7 */
    "slli.d     %[stride_2],     %[stride],   1           \n\t"
    "add.d      %[stride_3],     %[stride_2], %[stride]   \n\t"
    "slli.d     %[stride_4],     %[stride_2], 1           \n\t"
    "vld        $vr0,            %[src],      0           \n\t"
    "vldx       $vr1,            %[src],      %[stride]   \n\t"
    "vldx       $vr2,            %[src],      %[stride_2] \n\t"
    "vldx       $vr3,            %[src],      %[stride_3] \n\t"
    "add.d      %[src],          %[src],      %[stride_4] \n\t"
    "vld        $vr4,            %[src],      0           \n\t"
    "vldx       $vr5,            %[src],      %[stride]   \n\t"
    "vldx       $vr6,            %[src],      %[stride_2] \n\t"
    "vldx       $vr7,            %[src],      %[stride_3] \n\t"

    "vld        $vr8,            %[tmp],      0           \n\t"
    "vldx       $vr9,            %[tmp],      %[stride]   \n\t"
    "vldx       $vr10,           %[tmp],      %[stride_2] \n\t"
    "vldx       $vr11,           %[tmp],      %[stride_3] \n\t"
    "add.d      %[tmp],          %[tmp],      %[stride_4] \n\t"
    "vld        $vr12,           %[tmp],      0           \n\t"
    "vldx       $vr13,           %[tmp],      %[stride]   \n\t"
    "vldx       $vr14,           %[tmp],      %[stride_2] \n\t"
    "vldx       $vr15,           %[tmp],      %[stride_3] \n\t"

    "vavgr.bu    $vr0,           $vr8,        $vr0        \n\t"
    "vavgr.bu    $vr1,           $vr9,        $vr1        \n\t"
    "vavgr.bu    $vr2,           $vr10,       $vr2        \n\t"
    "vavgr.bu    $vr3,           $vr11,       $vr3        \n\t"
    "vavgr.bu    $vr4,           $vr12,       $vr4        \n\t"
    "vavgr.bu    $vr5,           $vr13,       $vr5        \n\t"
    "vavgr.bu    $vr6,           $vr14,       $vr6        \n\t"
    "vavgr.bu    $vr7,           $vr15,       $vr7        \n\t"

    "vstelm.d    $vr0,           %[dst],      0,  0       \n\t"
    "add.d       %[dst],         %[dst],      %[stride]   \n\t"
    "vstelm.d    $vr1,           %[dst],      0,  0       \n\t"
    "add.d       %[dst],         %[dst],      %[stride]   \n\t"
    "vstelm.d    $vr2,           %[dst],      0,  0       \n\t"
    "add.d       %[dst],         %[dst],      %[stride]   \n\t"
    "vstelm.d    $vr3,           %[dst],      0,  0       \n\t"
    "add.d       %[dst],         %[dst],      %[stride]   \n\t"
    "vstelm.d    $vr4,           %[dst],      0,  0       \n\t"
    "add.d       %[dst],         %[dst],      %[stride]   \n\t"
    "vstelm.d    $vr5,           %[dst],      0,  0       \n\t"
    "add.d       %[dst],         %[dst],      %[stride]   \n\t"
    "vstelm.d    $vr6,           %[dst],      0,  0       \n\t"
    "add.d       %[dst],         %[dst],      %[stride]   \n\t"
    "vstelm.d    $vr7,           %[dst],      0,  0       \n\t"
    : [dst]"+&r"(dst), [tmp]"+&r"(tmp), [src]"+&r"(src),
      [stride_2]"=&r"(stride_2),  [stride_3]"=&r"(stride_3),
      [stride_4]"=&r"(stride_4)
    : [stride]"r"(stride)
    : "memory"
    );
}

/* avg_pixels8_8_lsx   : dst = avg(src, dst)
 * put_pixels8_l2_8_lsx: dst = avg(src, half), half stride is 8.
 * avg_pixels8_l2_8_lsx: dst = avg(avg(src, half), dst), half stride is 8. */
static av_always_inline void
put_pixels8_l2_8_lsx(uint8_t *dst, const uint8_t *src, const uint8_t *half,
                     ptrdiff_t dstStride, ptrdiff_t srcStride)
{
    ptrdiff_t stride_2, stride_3, stride_4;
    __asm__ volatile (
    /* h0~h7 */
    "slli.d     %[stride_2],     %[srcStride],   1            \n\t"
    "add.d      %[stride_3],     %[stride_2],    %[srcStride] \n\t"
    "slli.d     %[stride_4],     %[stride_2],    1            \n\t"
    "vld        $vr0,            %[src],         0            \n\t"
    "vldx       $vr1,            %[src],         %[srcStride] \n\t"
    "vldx       $vr2,            %[src],         %[stride_2]  \n\t"
    "vldx       $vr3,            %[src],         %[stride_3]  \n\t"
    "add.d      %[src],          %[src],         %[stride_4]  \n\t"
    "vld        $vr4,            %[src],         0            \n\t"
    "vldx       $vr5,            %[src],         %[srcStride] \n\t"
    "vldx       $vr6,            %[src],         %[stride_2]  \n\t"
    "vldx       $vr7,            %[src],         %[stride_3]  \n\t"

    "vld        $vr8,            %[half],        0x00         \n\t"
    "vld        $vr9,            %[half],        0x08         \n\t"
    "vld        $vr10,           %[half],        0x10         \n\t"
    "vld        $vr11,           %[half],        0x18         \n\t"
    "vld        $vr12,           %[half],        0x20         \n\t"
    "vld        $vr13,           %[half],        0x28         \n\t"
    "vld        $vr14,           %[half],        0x30         \n\t"
    "vld        $vr15,           %[half],        0x38         \n\t"

    "vavgr.bu   $vr0,            $vr8,           $vr0         \n\t"
    "vavgr.bu   $vr1,            $vr9,           $vr1         \n\t"
    "vavgr.bu   $vr2,            $vr10,          $vr2         \n\t"
    "vavgr.bu   $vr3,            $vr11,          $vr3         \n\t"
    "vavgr.bu   $vr4,            $vr12,          $vr4         \n\t"
    "vavgr.bu   $vr5,            $vr13,          $vr5         \n\t"
    "vavgr.bu   $vr6,            $vr14,          $vr6         \n\t"
    "vavgr.bu   $vr7,            $vr15,          $vr7         \n\t"

    "vstelm.d   $vr0,            %[dst],         0,  0        \n\t"
    "add.d      %[dst],          %[dst],         %[dstStride] \n\t"
    "vstelm.d   $vr1,            %[dst],         0,  0        \n\t"
    "add.d      %[dst],          %[dst],         %[dstStride] \n\t"
    "vstelm.d   $vr2,            %[dst],         0,  0        \n\t"
    "add.d      %[dst],          %[dst],         %[dstStride] \n\t"
    "vstelm.d   $vr3,            %[dst],         0,  0        \n\t"
    "add.d      %[dst],          %[dst],         %[dstStride] \n\t"
    "vstelm.d   $vr4,            %[dst],         0,  0        \n\t"
    "add.d      %[dst],          %[dst],         %[dstStride] \n\t"
    "vstelm.d   $vr5,            %[dst],         0,  0        \n\t"
    "add.d      %[dst],          %[dst],         %[dstStride] \n\t"
    "vstelm.d   $vr6,            %[dst],         0,  0        \n\t"
    "add.d      %[dst],          %[dst],         %[dstStride] \n\t"
    "vstelm.d   $vr7,            %[dst],         0,  0        \n\t"
    : [dst]"+&r"(dst), [half]"+&r"(half), [src]"+&r"(src),
      [stride_2]"=&r"(stride_2),  [stride_3]"=&r"(stride_3),
      [stride_4]"=&r"(stride_4)
    : [srcStride]"r"(srcStride), [dstStride]"r"(dstStride)
    : "memory"
    );
}

/* avg_pixels8_8_lsx   : dst = avg(src, dst)
 * put_pixels8_l2_8_lsx: dst = avg(src, half), half stride is 8.
 * avg_pixels8_l2_8_lsx: dst = avg(avg(src, half), dst), half stride is 8. */
static av_always_inline void
avg_pixels8_l2_8_lsx(uint8_t *dst, const uint8_t *src, const uint8_t *half,
                     ptrdiff_t dstStride, ptrdiff_t srcStride)
{
    uint8_t *tmp = dst;
    ptrdiff_t stride_2, stride_3, stride_4;
    __asm__ volatile (
    /* h0~h7 */
    "slli.d     %[stride_2],     %[srcStride],   1            \n\t"
    "add.d      %[stride_3],     %[stride_2],    %[srcStride] \n\t"
    "slli.d     %[stride_4],     %[stride_2],    1            \n\t"
    "vld        $vr0,            %[src],         0            \n\t"
    "vldx       $vr1,            %[src],         %[srcStride] \n\t"
    "vldx       $vr2,            %[src],         %[stride_2]  \n\t"
    "vldx       $vr3,            %[src],         %[stride_3]  \n\t"
    "add.d      %[src],          %[src],         %[stride_4]  \n\t"
    "vld        $vr4,            %[src],         0            \n\t"
    "vldx       $vr5,            %[src],         %[srcStride] \n\t"
    "vldx       $vr6,            %[src],         %[stride_2]  \n\t"
    "vldx       $vr7,            %[src],         %[stride_3]  \n\t"

    "vld        $vr8,            %[half],        0x00         \n\t"
    "vld        $vr9,            %[half],        0x08         \n\t"
    "vld        $vr10,           %[half],        0x10         \n\t"
    "vld        $vr11,           %[half],        0x18         \n\t"
    "vld        $vr12,           %[half],        0x20         \n\t"
    "vld        $vr13,           %[half],        0x28         \n\t"
    "vld        $vr14,           %[half],        0x30         \n\t"
    "vld        $vr15,           %[half],        0x38         \n\t"

    "vavgr.bu    $vr0,           $vr8,           $vr0         \n\t"
    "vavgr.bu    $vr1,           $vr9,           $vr1         \n\t"
    "vavgr.bu    $vr2,           $vr10,          $vr2         \n\t"
    "vavgr.bu    $vr3,           $vr11,          $vr3         \n\t"
    "vavgr.bu    $vr4,           $vr12,          $vr4         \n\t"
    "vavgr.bu    $vr5,           $vr13,          $vr5         \n\t"
    "vavgr.bu    $vr6,           $vr14,          $vr6         \n\t"
    "vavgr.bu    $vr7,           $vr15,          $vr7         \n\t"

    "slli.d     %[stride_2],     %[dstStride],   1            \n\t"
    "add.d      %[stride_3],     %[stride_2],    %[dstStride] \n\t"
    "slli.d     %[stride_4],     %[stride_2],    1            \n\t"
    "vld        $vr8,            %[tmp],         0            \n\t"
    "vldx       $vr9,            %[tmp],         %[dstStride] \n\t"
    "vldx       $vr10,           %[tmp],         %[stride_2]  \n\t"
    "vldx       $vr11,           %[tmp],         %[stride_3]  \n\t"
    "add.d      %[tmp],          %[tmp],         %[stride_4]  \n\t"
    "vld        $vr12,           %[tmp],         0            \n\t"
    "vldx       $vr13,           %[tmp],         %[dstStride] \n\t"
    "vldx       $vr14,           %[tmp],         %[stride_2]  \n\t"
    "vldx       $vr15,           %[tmp],         %[stride_3]  \n\t"

    "vavgr.bu    $vr0,           $vr8,           $vr0         \n\t"
    "vavgr.bu    $vr1,           $vr9,           $vr1         \n\t"
    "vavgr.bu    $vr2,           $vr10,          $vr2         \n\t"
    "vavgr.bu    $vr3,           $vr11,          $vr3         \n\t"
    "vavgr.bu    $vr4,           $vr12,          $vr4         \n\t"
    "vavgr.bu    $vr5,           $vr13,          $vr5         \n\t"
    "vavgr.bu    $vr6,           $vr14,          $vr6         \n\t"
    "vavgr.bu    $vr7,           $vr15,          $vr7         \n\t"

    "vstelm.d    $vr0,           %[dst],         0,  0        \n\t"
    "add.d       %[dst],         %[dst],         %[dstStride] \n\t"
    "vstelm.d    $vr1,           %[dst],         0,  0        \n\t"
    "add.d       %[dst],         %[dst],         %[dstStride] \n\t"
    "vstelm.d    $vr2,           %[dst],         0,  0        \n\t"
    "add.d       %[dst],         %[dst],         %[dstStride] \n\t"
    "vstelm.d    $vr3,           %[dst],         0,  0        \n\t"
    "add.d       %[dst],         %[dst],         %[dstStride] \n\t"
    "vstelm.d    $vr4,           %[dst],         0,  0        \n\t"
    "add.d       %[dst],         %[dst],         %[dstStride] \n\t"
    "vstelm.d    $vr5,           %[dst],         0,  0        \n\t"
    "add.d       %[dst],         %[dst],         %[dstStride] \n\t"
    "vstelm.d    $vr6,           %[dst],         0,  0        \n\t"
    "add.d       %[dst],         %[dst],         %[dstStride] \n\t"
    "vstelm.d    $vr7,           %[dst],         0,  0        \n\t"
    : [dst]"+&r"(dst), [tmp]"+&r"(tmp), [half]"+&r"(half),
      [src]"+&r"(src), [stride_2]"=&r"(stride_2),
      [stride_3]"=&r"(stride_3), [stride_4]"=&r"(stride_4)
    : [dstStride]"r"(dstStride), [srcStride]"r"(srcStride)
    : "memory"
    );
}

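/* A straight 16x16 block copy: one 128-bit LSX vld/vst per row, sixteen rows
 * handled in two groups of eight. */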
/* put_pixels16_8_lsx: dst = src */
static av_always_inline void
put_pixels16_8_lsx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
{
    ptrdiff_t stride_2, stride_3, stride_4;
    __asm__ volatile (
    "slli.d     %[stride_2],     %[stride],      1            \n\t"
    "add.d      %[stride_3],     %[stride_2],    %[stride]    \n\t"
    "slli.d     %[stride_4],     %[stride_2],    1            \n\t"
    "vld        $vr0,            %[src],         0            \n\t"
    "vldx       $vr1,            %[src],         %[stride]    \n\t"
    "vldx       $vr2,            %[src],         %[stride_2]  \n\t"
    "vldx       $vr3,            %[src],         %[stride_3]  \n\t"
    "add.d      %[src],          %[src],         %[stride_4]  \n\t"
    "vld        $vr4,            %[src],         0            \n\t"
    "vldx       $vr5,            %[src],         %[stride]    \n\t"
    "vldx       $vr6,            %[src],         %[stride_2]  \n\t"
    "vldx       $vr7,            %[src],         %[stride_3]  \n\t"
    "add.d      %[src],          %[src],         %[stride_4]  \n\t"

    "vst        $vr0,            %[dst],         0            \n\t"
    "vstx       $vr1,            %[dst],         %[stride]    \n\t"
    "vstx       $vr2,            %[dst],         %[stride_2]  \n\t"
    "vstx       $vr3,            %[dst],         %[stride_3]  \n\t"
    "add.d      %[dst],          %[dst],         %[stride_4]  \n\t"
    "vst        $vr4,            %[dst],         0            \n\t"
    "vstx       $vr5,            %[dst],         %[stride]    \n\t"
    "vstx       $vr6,            %[dst],         %[stride_2]  \n\t"
    "vstx       $vr7,            %[dst],         %[stride_3]  \n\t"
    "add.d      %[dst],          %[dst],         %[stride_4]  \n\t"

    "vld        $vr0,            %[src],         0            \n\t"
    "vldx       $vr1,            %[src],         %[stride]    \n\t"
    "vldx       $vr2,            %[src],         %[stride_2]  \n\t"
    "vldx       $vr3,            %[src],         %[stride_3]  \n\t"
    "add.d      %[src],          %[src],         %[stride_4]  \n\t"
    "vld        $vr4,            %[src],         0            \n\t"
    "vldx       $vr5,            %[src],         %[stride]    \n\t"
    "vldx       $vr6,            %[src],         %[stride_2]  \n\t"
    "vldx       $vr7,            %[src],         %[stride_3]  \n\t"

    "vst        $vr0,            %[dst],         0            \n\t"
    "vstx       $vr1,            %[dst],         %[stride]    \n\t"
    "vstx       $vr2,            %[dst],         %[stride_2]  \n\t"
    "vstx       $vr3,            %[dst],         %[stride_3]  \n\t"
    "add.d      %[dst],          %[dst],         %[stride_4]  \n\t"
    "vst        $vr4,            %[dst],         0            \n\t"
    "vstx       $vr5,            %[dst],         %[stride]    \n\t"
    "vstx       $vr6,            %[dst],         %[stride_2]  \n\t"
    "vstx       $vr7,            %[dst],         %[stride_3]  \n\t"
    : [dst]"+&r"(dst),            [src]"+&r"(src),
      [stride_2]"=&r"(stride_2),  [stride_3]"=&r"(stride_3),
      [stride_4]"=&r"(stride_4)
    : [stride]"r"(stride)
    : "memory"
    );
}

/* avg_pixels16_8_lsx   : dst = avg(src, dst)
 * put_pixels16_l2_8_lsx: dst = avg(src, half), half stride is 16.
 * avg_pixels16_l2_8_lsx: dst = avg(avg(src, half), dst), half stride is 16. */
static av_always_inline void
avg_pixels16_8_lsx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
{
    uint8_t *tmp = dst;
    ptrdiff_t stride_2, stride_3, stride_4;
    __asm__ volatile (
    /* h0~h7 */
    "slli.d     %[stride_2],     %[stride],      1            \n\t"
    "add.d      %[stride_3],     %[stride_2],    %[stride]    \n\t"
    "slli.d     %[stride_4],     %[stride_2],    1            \n\t"
    "vld        $vr0,            %[src],         0            \n\t"
    "vldx       $vr1,            %[src],         %[stride]    \n\t"
    "vldx       $vr2,            %[src],         %[stride_2]  \n\t"
    "vldx       $vr3,            %[src],         %[stride_3]  \n\t"
    "add.d      %[src],          %[src],         %[stride_4]  \n\t"
    "vld        $vr4,            %[src],         0            \n\t"
    "vldx       $vr5,            %[src],         %[stride]    \n\t"
    "vldx       $vr6,            %[src],         %[stride_2]  \n\t"
    "vldx       $vr7,            %[src],         %[stride_3]  \n\t"
    "add.d      %[src],          %[src],         %[stride_4]  \n\t"

    "vld        $vr8,            %[tmp],         0            \n\t"
    "vldx       $vr9,            %[tmp],         %[stride]    \n\t"
    "vldx       $vr10,           %[tmp],         %[stride_2]  \n\t"
    "vldx       $vr11,           %[tmp],         %[stride_3]  \n\t"
    "add.d      %[tmp],          %[tmp],         %[stride_4]  \n\t"
    "vld        $vr12,           %[tmp],         0            \n\t"
    "vldx       $vr13,           %[tmp],         %[stride]    \n\t"
    "vldx       $vr14,           %[tmp],         %[stride_2]  \n\t"
    "vldx       $vr15,           %[tmp],         %[stride_3]  \n\t"
    "add.d      %[tmp],          %[tmp],         %[stride_4]  \n\t"

    "vavgr.bu   $vr0,            $vr8,           $vr0         \n\t"
    "vavgr.bu   $vr1,            $vr9,           $vr1         \n\t"
    "vavgr.bu   $vr2,            $vr10,          $vr2         \n\t"
    "vavgr.bu   $vr3,            $vr11,          $vr3         \n\t"
    "vavgr.bu   $vr4,            $vr12,          $vr4         \n\t"
    "vavgr.bu   $vr5,            $vr13,          $vr5         \n\t"
    "vavgr.bu   $vr6,            $vr14,          $vr6         \n\t"
    "vavgr.bu   $vr7,            $vr15,          $vr7         \n\t"

    "vst        $vr0,            %[dst],         0            \n\t"
    "vstx       $vr1,            %[dst],         %[stride]    \n\t"
    "vstx       $vr2,            %[dst],         %[stride_2]  \n\t"
    "vstx       $vr3,            %[dst],         %[stride_3]  \n\t"
    "add.d      %[dst],          %[dst],         %[stride_4]  \n\t"
    "vst        $vr4,            %[dst],         0            \n\t"
    "vstx       $vr5,            %[dst],         %[stride]    \n\t"
    "vstx       $vr6,            %[dst],         %[stride_2]  \n\t"
    "vstx       $vr7,            %[dst],         %[stride_3]  \n\t"
    "add.d      %[dst],          %[dst],         %[stride_4]  \n\t"

    /* h8~h15 */
    "vld        $vr0,            %[src],         0            \n\t"
    "vldx       $vr1,            %[src],         %[stride]    \n\t"
    "vldx       $vr2,            %[src],         %[stride_2]  \n\t"
    "vldx       $vr3,            %[src],         %[stride_3]  \n\t"
    "add.d      %[src],          %[src],         %[stride_4]  \n\t"
    "vld        $vr4,            %[src],         0            \n\t"
    "vldx       $vr5,            %[src],         %[stride]    \n\t"
    "vldx       $vr6,            %[src],         %[stride_2]  \n\t"
    "vldx       $vr7,            %[src],         %[stride_3]  \n\t"

    "vld        $vr8,            %[tmp],         0            \n\t"
    "vldx       $vr9,            %[tmp],         %[stride]    \n\t"
    "vldx       $vr10,           %[tmp],         %[stride_2]  \n\t"
    "vldx       $vr11,           %[tmp],         %[stride_3]  \n\t"
    "add.d      %[tmp],          %[tmp],         %[stride_4]  \n\t"
    "vld        $vr12,           %[tmp],         0            \n\t"
    "vldx       $vr13,           %[tmp],         %[stride]    \n\t"
    "vldx       $vr14,           %[tmp],         %[stride_2]  \n\t"
    "vldx       $vr15,           %[tmp],         %[stride_3]  \n\t"

    "vavgr.bu    $vr0,           $vr8,           $vr0         \n\t"
    "vavgr.bu    $vr1,           $vr9,           $vr1         \n\t"
    "vavgr.bu    $vr2,           $vr10,          $vr2         \n\t"
    "vavgr.bu    $vr3,           $vr11,          $vr3         \n\t"
    "vavgr.bu    $vr4,           $vr12,          $vr4         \n\t"
    "vavgr.bu    $vr5,           $vr13,          $vr5         \n\t"
    "vavgr.bu    $vr6,           $vr14,          $vr6         \n\t"
    "vavgr.bu    $vr7,           $vr15,          $vr7         \n\t"

    "vst        $vr0,            %[dst],         0            \n\t"
    "vstx       $vr1,            %[dst],         %[stride]    \n\t"
    "vstx       $vr2,            %[dst],         %[stride_2]  \n\t"
    "vstx       $vr3,            %[dst],         %[stride_3]  \n\t"
    "add.d      %[dst],          %[dst],         %[stride_4]  \n\t"
    "vst        $vr4,            %[dst],         0            \n\t"
    "vstx       $vr5,            %[dst],         %[stride]    \n\t"
    "vstx       $vr6,            %[dst],         %[stride_2]  \n\t"
    "vstx       $vr7,            %[dst],         %[stride_3]  \n\t"
    : [dst]"+&r"(dst), [tmp]"+&r"(tmp), [src]"+&r"(src),
      [stride_2]"=&r"(stride_2),  [stride_3]"=&r"(stride_3),
      [stride_4]"=&r"(stride_4)
    : [stride]"r"(stride)
    : "memory"
    );
}

/* avg_pixels16_8_lsx   : dst = avg(src, dst)
 * put_pixels16_l2_8_lsx: dst = avg(src, half), half stride is 16.
 * avg_pixels16_l2_8_lsx: dst = avg(avg(src, half), dst), half stride is 16. */
static av_always_inline void
put_pixels16_l2_8_lsx(uint8_t *dst, const uint8_t *src, uint8_t *half,
                      ptrdiff_t dstStride, ptrdiff_t srcStride)
{
    ptrdiff_t stride_2, stride_3, stride_4;
    ptrdiff_t dstride_2, dstride_3, dstride_4;
    __asm__ volatile (
    "slli.d     %[stride_2],     %[srcStride],   1            \n\t"
    "add.d      %[stride_3],     %[stride_2],    %[srcStride] \n\t"
    "slli.d     %[stride_4],     %[stride_2],    1            \n\t"
    "slli.d     %[dstride_2],    %[dstStride],   1            \n\t"
    "add.d      %[dstride_3],    %[dstride_2],   %[dstStride] \n\t"
    "slli.d     %[dstride_4],    %[dstride_2],   1            \n\t"
    /* h0~h7 */
    "vld        $vr0,            %[src],         0            \n\t"
    "vldx       $vr1,            %[src],         %[srcStride] \n\t"
    "vldx       $vr2,            %[src],         %[stride_2]  \n\t"
    "vldx       $vr3,            %[src],         %[stride_3]  \n\t"
    "add.d      %[src],          %[src],         %[stride_4]  \n\t"
    "vld        $vr4,            %[src],         0            \n\t"
    "vldx       $vr5,            %[src],         %[srcStride] \n\t"
    "vldx       $vr6,            %[src],         %[stride_2]  \n\t"
    "vldx       $vr7,            %[src],         %[stride_3]  \n\t"
    "add.d      %[src],          %[src],         %[stride_4]  \n\t"

    "vld        $vr8,            %[half],        0x00         \n\t"
    "vld        $vr9,            %[half],        0x10         \n\t"
    "vld        $vr10,           %[half],        0x20         \n\t"
    "vld        $vr11,           %[half],        0x30         \n\t"
    "vld        $vr12,           %[half],        0x40         \n\t"
    "vld        $vr13,           %[half],        0x50         \n\t"
    "vld        $vr14,           %[half],        0x60         \n\t"
    "vld        $vr15,           %[half],        0x70         \n\t"

    "vavgr.bu   $vr0,            $vr8,           $vr0         \n\t"
    "vavgr.bu   $vr1,            $vr9,           $vr1         \n\t"
    "vavgr.bu   $vr2,            $vr10,          $vr2         \n\t"
    "vavgr.bu   $vr3,            $vr11,          $vr3         \n\t"
    "vavgr.bu   $vr4,            $vr12,          $vr4         \n\t"
    "vavgr.bu   $vr5,            $vr13,          $vr5         \n\t"
    "vavgr.bu   $vr6,            $vr14,          $vr6         \n\t"
    "vavgr.bu   $vr7,            $vr15,          $vr7         \n\t"

    "vst        $vr0,            %[dst],         0            \n\t"
    "vstx       $vr1,            %[dst],         %[dstStride] \n\t"
    "vstx       $vr2,            %[dst],         %[dstride_2] \n\t"
    "vstx       $vr3,            %[dst],         %[dstride_3] \n\t"
    "add.d      %[dst],          %[dst],         %[dstride_4] \n\t"
    "vst        $vr4,            %[dst],         0            \n\t"
    "vstx       $vr5,            %[dst],         %[dstStride] \n\t"
    "vstx       $vr6,            %[dst],         %[dstride_2] \n\t"
    "vstx       $vr7,            %[dst],         %[dstride_3] \n\t"
    "add.d      %[dst],          %[dst],         %[dstride_4] \n\t"

    /* h8~h15 */
    "vld        $vr0,            %[src],         0            \n\t"
    "vldx       $vr1,            %[src],         %[srcStride] \n\t"
    "vldx       $vr2,            %[src],         %[stride_2]  \n\t"
    "vldx       $vr3,            %[src],         %[stride_3]  \n\t"
    "add.d      %[src],          %[src],         %[stride_4]  \n\t"
    "vld        $vr4,            %[src],         0            \n\t"
    "vldx       $vr5,            %[src],         %[srcStride] \n\t"
    "vldx       $vr6,            %[src],         %[stride_2]  \n\t"
    "vldx       $vr7,            %[src],         %[stride_3]  \n\t"

    "vld        $vr8,            %[half],        0x80         \n\t"
    "vld        $vr9,            %[half],        0x90         \n\t"
    "vld        $vr10,           %[half],        0xa0         \n\t"
    "vld        $vr11,           %[half],        0xb0         \n\t"
    "vld        $vr12,           %[half],        0xc0         \n\t"
    "vld        $vr13,           %[half],        0xd0         \n\t"
    "vld        $vr14,           %[half],        0xe0         \n\t"
    "vld        $vr15,           %[half],        0xf0         \n\t"

    "vavgr.bu   $vr0,            $vr8,           $vr0         \n\t"
    "vavgr.bu   $vr1,            $vr9,           $vr1         \n\t"
    "vavgr.bu   $vr2,            $vr10,          $vr2         \n\t"
    "vavgr.bu   $vr3,            $vr11,          $vr3         \n\t"
    "vavgr.bu   $vr4,            $vr12,          $vr4         \n\t"
    "vavgr.bu   $vr5,            $vr13,          $vr5         \n\t"
    "vavgr.bu   $vr6,            $vr14,          $vr6         \n\t"
    "vavgr.bu   $vr7,            $vr15,          $vr7         \n\t"

    "vst        $vr0,            %[dst],         0            \n\t"
    "vstx       $vr1,            %[dst],         %[dstStride] \n\t"
    "vstx       $vr2,            %[dst],         %[dstride_2] \n\t"
    "vstx       $vr3,            %[dst],         %[dstride_3] \n\t"
    "add.d      %[dst],          %[dst],         %[dstride_4] \n\t"
    "vst        $vr4,            %[dst],         0            \n\t"
    "vstx       $vr5,            %[dst],         %[dstStride] \n\t"
    "vstx       $vr6,            %[dst],         %[dstride_2] \n\t"
    "vstx       $vr7,            %[dst],         %[dstride_3] \n\t"
    : [dst]"+&r"(dst), [half]"+&r"(half), [src]"+&r"(src),
      [stride_2]"=&r"(stride_2),  [stride_3]"=&r"(stride_3),
      [stride_4]"=&r"(stride_4),  [dstride_2]"=&r"(dstride_2),
      [dstride_3]"=&r"(dstride_3), [dstride_4]"=&r"(dstride_4)
    : [dstStride]"r"(dstStride), [srcStride]"r"(srcStride)
    : "memory"
    );
}

/* avg_pixels16_8_lsx   : dst = avg(src, dst)
 * put_pixels16_l2_8_lsx: dst = avg(src, half), half stride is 16.
 * avg_pixels16_l2_8_lsx: dst = avg(avg(src, half), dst), half stride is 16. */
static av_always_inline void
avg_pixels16_l2_8_lsx(uint8_t *dst, const uint8_t *src, uint8_t *half,
                      ptrdiff_t dstStride, ptrdiff_t srcStride)
{
    uint8_t *tmp = dst;
    ptrdiff_t stride_2, stride_3, stride_4;
    ptrdiff_t dstride_2, dstride_3, dstride_4;
    __asm__ volatile (
    "slli.d     %[stride_2],     %[srcStride],   1            \n\t"
    "add.d      %[stride_3],     %[stride_2],    %[srcStride] \n\t"
    "slli.d     %[stride_4],     %[stride_2],    1            \n\t"
    "slli.d     %[dstride_2],    %[dstStride],   1            \n\t"
    "add.d      %[dstride_3],    %[dstride_2],   %[dstStride] \n\t"
    "slli.d     %[dstride_4],    %[dstride_2],   1            \n\t"
    /* h0~h7 */
    "vld        $vr0,            %[src],         0            \n\t"
    "vldx       $vr1,            %[src],         %[srcStride] \n\t"
    "vldx       $vr2,            %[src],         %[stride_2]  \n\t"
    "vldx       $vr3,            %[src],         %[stride_3]  \n\t"
    "add.d      %[src],          %[src],         %[stride_4]  \n\t"
    "vld        $vr4,            %[src],         0            \n\t"
    "vldx       $vr5,            %[src],         %[srcStride] \n\t"
    "vldx       $vr6,            %[src],         %[stride_2]  \n\t"
    "vldx       $vr7,            %[src],         %[stride_3]  \n\t"
    "add.d      %[src],          %[src],         %[stride_4]  \n\t"

    "vld        $vr8,            %[half],        0x00         \n\t"
    "vld        $vr9,            %[half],        0x10         \n\t"
    "vld        $vr10,           %[half],        0x20         \n\t"
    "vld        $vr11,           %[half],        0x30         \n\t"
    "vld        $vr12,           %[half],        0x40         \n\t"
    "vld        $vr13,           %[half],        0x50         \n\t"
    "vld        $vr14,           %[half],        0x60         \n\t"
    "vld        $vr15,           %[half],        0x70         \n\t"

    "vavgr.bu   $vr0,            $vr8,           $vr0         \n\t"
    "vavgr.bu   $vr1,            $vr9,           $vr1         \n\t"
    "vavgr.bu   $vr2,            $vr10,          $vr2         \n\t"
    "vavgr.bu   $vr3,            $vr11,          $vr3         \n\t"
    "vavgr.bu   $vr4,            $vr12,          $vr4         \n\t"
    "vavgr.bu   $vr5,            $vr13,          $vr5         \n\t"
    "vavgr.bu   $vr6,            $vr14,          $vr6         \n\t"
    "vavgr.bu   $vr7,            $vr15,          $vr7         \n\t"

    "vld        $vr8,            %[tmp],         0            \n\t"
    "vldx       $vr9,            %[tmp],         %[dstStride] \n\t"
    "vldx       $vr10,           %[tmp],         %[dstride_2] \n\t"
    "vldx       $vr11,           %[tmp],         %[dstride_3] \n\t"
    "add.d      %[tmp],          %[tmp],         %[dstride_4] \n\t"
    "vld        $vr12,           %[tmp],         0            \n\t"
    "vldx       $vr13,           %[tmp],         %[dstStride] \n\t"
    "vldx       $vr14,           %[tmp],         %[dstride_2] \n\t"
    "vldx       $vr15,           %[tmp],         %[dstride_3] \n\t"
    "add.d      %[tmp],          %[tmp],         %[dstride_4] \n\t"

    "vavgr.bu    $vr0,           $vr8,           $vr0         \n\t"
    "vavgr.bu    $vr1,           $vr9,           $vr1         \n\t"
    "vavgr.bu    $vr2,           $vr10,          $vr2         \n\t"
    "vavgr.bu    $vr3,           $vr11,          $vr3         \n\t"
    "vavgr.bu    $vr4,           $vr12,          $vr4         \n\t"
    "vavgr.bu    $vr5,           $vr13,          $vr5         \n\t"
    "vavgr.bu    $vr6,           $vr14,          $vr6         \n\t"
    "vavgr.bu    $vr7,           $vr15,          $vr7         \n\t"

    "vst        $vr0,            %[dst],         0            \n\t"
    "vstx       $vr1,            %[dst],         %[dstStride] \n\t"
    "vstx       $vr2,            %[dst],         %[dstride_2] \n\t"
    "vstx       $vr3,            %[dst],         %[dstride_3] \n\t"
    "add.d      %[dst],          %[dst],         %[dstride_4] \n\t"
    "vst        $vr4,            %[dst],         0            \n\t"
    "vstx       $vr5,            %[dst],         %[dstStride] \n\t"
    "vstx       $vr6,            %[dst],         %[dstride_2] \n\t"
    "vstx       $vr7,            %[dst],         %[dstride_3] \n\t"
    "add.d      %[dst],          %[dst],         %[dstride_4] \n\t"

    /* h8~h15 */
    "vld        $vr0,            %[src],         0            \n\t"
    "vldx       $vr1,            %[src],         %[srcStride] \n\t"
    "vldx       $vr2,            %[src],         %[stride_2]  \n\t"
    "vldx       $vr3,            %[src],         %[stride_3]  \n\t"
    "add.d      %[src],          %[src],         %[stride_4]  \n\t"
    "vld        $vr4,            %[src],         0            \n\t"
    "vldx       $vr5,            %[src],         %[srcStride] \n\t"
    "vldx       $vr6,            %[src],         %[stride_2]  \n\t"
    "vldx       $vr7,            %[src],         %[stride_3]  \n\t"

    "vld        $vr8,            %[half],        0x80         \n\t"
    "vld        $vr9,            %[half],        0x90         \n\t"
    "vld        $vr10,           %[half],        0xa0         \n\t"
    "vld        $vr11,           %[half],        0xb0         \n\t"
    "vld        $vr12,           %[half],        0xc0         \n\t"
    "vld        $vr13,           %[half],        0xd0         \n\t"
    "vld        $vr14,           %[half],        0xe0         \n\t"
    "vld        $vr15,           %[half],        0xf0         \n\t"

    "vavgr.bu    $vr0,           $vr8,           $vr0         \n\t"
    "vavgr.bu    $vr1,           $vr9,           $vr1         \n\t"
    "vavgr.bu    $vr2,           $vr10,          $vr2         \n\t"
    "vavgr.bu    $vr3,           $vr11,          $vr3         \n\t"
    "vavgr.bu    $vr4,           $vr12,          $vr4         \n\t"
    "vavgr.bu    $vr5,           $vr13,          $vr5         \n\t"
    "vavgr.bu    $vr6,           $vr14,          $vr6         \n\t"
    "vavgr.bu    $vr7,           $vr15,          $vr7         \n\t"

    "vld        $vr8,            %[tmp],         0            \n\t"
    "vldx       $vr9,            %[tmp],         %[dstStride] \n\t"
    "vldx       $vr10,           %[tmp],         %[dstride_2] \n\t"
    "vldx       $vr11,           %[tmp],         %[dstride_3] \n\t"
    "add.d      %[tmp],          %[tmp],         %[dstride_4] \n\t"
    "vld        $vr12,           %[tmp],         0            \n\t"
    "vldx       $vr13,           %[tmp],         %[dstStride] \n\t"
    "vldx       $vr14,           %[tmp],         %[dstride_2] \n\t"
    "vldx       $vr15,           %[tmp],         %[dstride_3] \n\t"

    "vavgr.bu    $vr0,           $vr8,           $vr0         \n\t"
    "vavgr.bu    $vr1,           $vr9,           $vr1         \n\t"
    "vavgr.bu    $vr2,           $vr10,          $vr2         \n\t"
    "vavgr.bu    $vr3,           $vr11,          $vr3         \n\t"
    "vavgr.bu    $vr4,           $vr12,          $vr4         \n\t"
    "vavgr.bu    $vr5,           $vr13,          $vr5         \n\t"
    "vavgr.bu    $vr6,           $vr14,          $vr6         \n\t"
    "vavgr.bu    $vr7,           $vr15,          $vr7         \n\t"

    "vst        $vr0,            %[dst],         0            \n\t"
    "vstx       $vr1,            %[dst],         %[dstStride] \n\t"
    "vstx       $vr2,            %[dst],         %[dstride_2] \n\t"
    "vstx       $vr3,            %[dst],         %[dstride_3] \n\t"
    "add.d      %[dst],          %[dst],         %[dstride_4] \n\t"
    "vst        $vr4,            %[dst],         0            \n\t"
    "vstx       $vr5,            %[dst],         %[dstStride] \n\t"
    "vstx       $vr6,            %[dst],         %[dstride_2] \n\t"
    "vstx       $vr7,            %[dst],         %[dstride_3] \n\t"
    : [dst]"+&r"(dst), [tmp]"+&r"(tmp), [half]"+&r"(half), [src]"+&r"(src),
      [stride_2]"=&r"(stride_2),  [stride_3]"=&r"(stride_3),
      [stride_4]"=&r"(stride_4),  [dstride_2]"=&r"(dstride_2),
      [dstride_3]"=&r"(dstride_3), [dstride_4]"=&r"(dstride_4)
    : [dstStride]"r"(dstStride), [srcStride]"r"(srcStride)
    : "memory"
    );
}

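/* 8-wide horizontal lowpass: the (1, -5, 20, 20, -5, 1) filter with rounding,
 * i.e. roughly
 *     out[x] = clip_u8(((s[x-2] + s[x+3]) - 5 * (s[x-1] + s[x+2])
 *                      + 20 * (s[x] + s[x+1]) + 16) >> 5);
 * per output pixel. Each invocation consumes two source rows (one per
 * 128-bit lane) and advances src by two lines. */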
980#define QPEL8_H_LOWPASS(out_v)                                               \
981    src00 = __lasx_xvld(src, - 2);                                           \
982    src += srcStride;                                                        \
983    src10 = __lasx_xvld(src, - 2);                                           \
984    src += srcStride;                                                        \
985    src00 = __lasx_xvpermi_q(src00, src10, 0x02);                            \
986    src01 = __lasx_xvshuf_b(src00, src00, (__m256i)mask1);                   \
987    src02 = __lasx_xvshuf_b(src00, src00, (__m256i)mask2);                   \
988    src03 = __lasx_xvshuf_b(src00, src00, (__m256i)mask3);                   \
989    src04 = __lasx_xvshuf_b(src00, src00, (__m256i)mask4);                   \
990    src05 = __lasx_xvshuf_b(src00, src00, (__m256i)mask5);                   \
991    DUP2_ARG2(__lasx_xvaddwl_h_bu, src02, src03, src01, src04, src02, src01);\
992    src00 = __lasx_xvaddwl_h_bu(src00, src05);                               \
993    src02 = __lasx_xvmul_h(src02, h_20);                                     \
994    src01 = __lasx_xvmul_h(src01, h_5);                                      \
995    src02 = __lasx_xvssub_h(src02, src01);                                   \
996    src02 = __lasx_xvsadd_h(src02, src00);                                   \
997    src02 = __lasx_xvsadd_h(src02, h_16);                                    \
998    out_v = __lasx_xvssrani_bu_h(src02, src02, 5);                           \
999
static av_always_inline void
put_h264_qpel8_h_lowpass_lasx(uint8_t *dst, const uint8_t *src, int dstStride,
                              int srcStride)
{
    int dstStride_2x = dstStride << 1;
    __m256i src00, src01, src02, src03, src04, src05, src10;
    __m256i out0, out1, out2, out3;
    __m256i h_20 = __lasx_xvldi(0x414);
    __m256i h_5  = __lasx_xvldi(0x405);
    __m256i h_16 = __lasx_xvldi(0x410);
    __m256i mask1 = {0x0807060504030201, 0x0, 0x0807060504030201, 0x0};
    __m256i mask2 = {0x0908070605040302, 0x0, 0x0908070605040302, 0x0};
    __m256i mask3 = {0x0a09080706050403, 0x0, 0x0a09080706050403, 0x0};
    __m256i mask4 = {0x0b0a090807060504, 0x0, 0x0b0a090807060504, 0x0};
    __m256i mask5 = {0x0c0b0a0908070605, 0x0, 0x0c0b0a0908070605, 0x0};

    QPEL8_H_LOWPASS(out0)
    QPEL8_H_LOWPASS(out1)
    QPEL8_H_LOWPASS(out2)
    QPEL8_H_LOWPASS(out3)
    __lasx_xvstelm_d(out0, dst, 0, 0);
    __lasx_xvstelm_d(out0, dst + dstStride, 0, 2);
    dst += dstStride_2x;
    __lasx_xvstelm_d(out1, dst, 0, 0);
    __lasx_xvstelm_d(out1, dst + dstStride, 0, 2);
    dst += dstStride_2x;
    __lasx_xvstelm_d(out2, dst, 0, 0);
    __lasx_xvstelm_d(out2, dst + dstStride, 0, 2);
    dst += dstStride_2x;
    __lasx_xvstelm_d(out3, dst, 0, 0);
    __lasx_xvstelm_d(out3, dst + dstStride, 0, 2);
}

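/* QPEL8_V_LOWPASS applies the same 6-tap filter vertically: rows N and N+1
 * of the seven consecutive input rows are paired into one 256-bit register
 * with xvpermi_q, so each invocation yields two filtered 8-pixel rows,
 * rounded with +16 and narrowed by >> 5 into tmp2. */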
#define QPEL8_V_LOWPASS(src0, src1, src2, src3, src4, src5, src6,       \
                        tmp0, tmp1, tmp2, tmp3, tmp4, tmp5)             \
{                                                                       \
    tmp0 = __lasx_xvpermi_q(src0, src1, 0x02);                          \
    tmp1 = __lasx_xvpermi_q(src1, src2, 0x02);                          \
    tmp2 = __lasx_xvpermi_q(src2, src3, 0x02);                          \
    tmp3 = __lasx_xvpermi_q(src3, src4, 0x02);                          \
    tmp4 = __lasx_xvpermi_q(src4, src5, 0x02);                          \
    tmp5 = __lasx_xvpermi_q(src5, src6, 0x02);                          \
    DUP2_ARG2(__lasx_xvaddwl_h_bu, tmp2, tmp3, tmp1, tmp4, tmp2, tmp1); \
    tmp0 = __lasx_xvaddwl_h_bu(tmp0, tmp5);                             \
    tmp2 = __lasx_xvmul_h(tmp2, h_20);                                  \
    tmp1 = __lasx_xvmul_h(tmp1, h_5);                                   \
    tmp2 = __lasx_xvssub_h(tmp2, tmp1);                                 \
    tmp2 = __lasx_xvsadd_h(tmp2, tmp0);                                 \
    tmp2 = __lasx_xvsadd_h(tmp2, h_16);                                 \
    tmp2 = __lasx_xvssrani_bu_h(tmp2, tmp2, 5);                         \
}

static av_always_inline void
put_h264_qpel8_v_lowpass_lasx(uint8_t *dst, uint8_t *src, int dstStride,
                              int srcStride)
{
    int srcStride_2x = srcStride << 1;
    int dstStride_2x = dstStride << 1;
    int srcStride_4x = srcStride << 2;
    int srcStride_3x = srcStride_2x + srcStride;
    __m256i src00, src01, src02, src03, src04, src05, src06;
    __m256i src07, src08, src09, src10, src11, src12;
    __m256i tmp00, tmp01, tmp02, tmp03, tmp04, tmp05;
    __m256i h_20 = __lasx_xvldi(0x414);
    __m256i h_5  = __lasx_xvldi(0x405);
    __m256i h_16 = __lasx_xvldi(0x410);

    DUP2_ARG2(__lasx_xvld, src - srcStride_2x, 0, src - srcStride, 0,
              src00, src01);
    src02 = __lasx_xvld(src, 0);
    DUP4_ARG2(__lasx_xvldx, src, srcStride, src, srcStride_2x, src,
              srcStride_3x, src, srcStride_4x, src03, src04, src05, src06);
    src += srcStride_4x;
    DUP4_ARG2(__lasx_xvldx, src, srcStride, src, srcStride_2x, src,
              srcStride_3x, src, srcStride_4x, src07, src08, src09, src10);
    src += srcStride_4x;
    DUP2_ARG2(__lasx_xvldx, src, srcStride, src, srcStride_2x, src11, src12);

    QPEL8_V_LOWPASS(src00, src01, src02, src03, src04, src05, src06,
                    tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
    __lasx_xvstelm_d(tmp02, dst, 0, 0);
    __lasx_xvstelm_d(tmp02, dst + dstStride, 0, 2);
    dst += dstStride_2x;
    QPEL8_V_LOWPASS(src02, src03, src04, src05, src06, src07, src08,
                    tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
    __lasx_xvstelm_d(tmp02, dst, 0, 0);
    __lasx_xvstelm_d(tmp02, dst + dstStride, 0, 2);
    dst += dstStride_2x;
    QPEL8_V_LOWPASS(src04, src05, src06, src07, src08, src09, src10,
                    tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
    __lasx_xvstelm_d(tmp02, dst, 0, 0);
    __lasx_xvstelm_d(tmp02, dst + dstStride, 0, 2);
    dst += dstStride_2x;
    QPEL8_V_LOWPASS(src06, src07, src08, src09, src10, src11, src12,
                    tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
    __lasx_xvstelm_d(tmp02, dst, 0, 0);
    __lasx_xvstelm_d(tmp02, dst + dstStride, 0, 2);
}

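/* The avg_* lowpass variants below compute the same filtered block and then
 * merge it with the existing destination via __lasx_xvavgr_bu (rounded
 * unsigned byte average), which is the "avg" motion-compensation semantic. */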
static av_always_inline void
avg_h264_qpel8_v_lowpass_lasx(uint8_t *dst, uint8_t *src, int dstStride,
                              int srcStride)
{
    int srcStride_2x = srcStride << 1;
    int srcStride_4x = srcStride << 2;
    int dstStride_2x = dstStride << 1;
    int dstStride_4x = dstStride << 2;
    int srcStride_3x = srcStride_2x + srcStride;
    int dstStride_3x = dstStride_2x + dstStride;
    __m256i src00, src01, src02, src03, src04, src05, src06;
    __m256i src07, src08, src09, src10, src11, src12, tmp00;
    __m256i tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07, tmp08, tmp09;
    __m256i h_20 = __lasx_xvldi(0x414);
    __m256i h_5  = __lasx_xvldi(0x405);
    __m256i h_16 = __lasx_xvldi(0x410);

    DUP2_ARG2(__lasx_xvld, src - srcStride_2x, 0, src - srcStride, 0,
              src00, src01);
    src02 = __lasx_xvld(src, 0);
    DUP4_ARG2(__lasx_xvldx, src, srcStride, src, srcStride_2x, src,
              srcStride_3x, src, srcStride_4x, src03, src04, src05, src06);
    src += srcStride_4x;
    DUP4_ARG2(__lasx_xvldx, src, srcStride, src, srcStride_2x, src,
              srcStride_3x, src, srcStride_4x, src07, src08, src09, src10);
    src += srcStride_4x;
    DUP2_ARG2(__lasx_xvldx, src, srcStride, src, srcStride_2x, src11, src12);

    tmp06 = __lasx_xvld(dst, 0);
    DUP4_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x,
              dst, dstStride_3x, dst, dstStride_4x,
              tmp07, tmp02, tmp03, tmp04);
    dst += dstStride_4x;
    DUP2_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x,
              tmp05, tmp00);
    tmp01 = __lasx_xvldx(dst, dstStride_3x);
    dst -= dstStride_4x;

    tmp06 = __lasx_xvpermi_q(tmp06, tmp07, 0x02);
    tmp07 = __lasx_xvpermi_q(tmp02, tmp03, 0x02);
    tmp08 = __lasx_xvpermi_q(tmp04, tmp05, 0x02);
    tmp09 = __lasx_xvpermi_q(tmp00, tmp01, 0x02);

    QPEL8_V_LOWPASS(src00, src01, src02, src03, src04, src05, src06,
                    tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
    tmp06 = __lasx_xvavgr_bu(tmp06, tmp02);
    __lasx_xvstelm_d(tmp06, dst, 0, 0);
    __lasx_xvstelm_d(tmp06, dst + dstStride, 0, 2);
    dst += dstStride_2x;
    QPEL8_V_LOWPASS(src02, src03, src04, src05, src06, src07, src08,
                    tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
    tmp07 = __lasx_xvavgr_bu(tmp07, tmp02);
    __lasx_xvstelm_d(tmp07, dst, 0, 0);
    __lasx_xvstelm_d(tmp07, dst + dstStride, 0, 2);
    dst += dstStride_2x;
    QPEL8_V_LOWPASS(src04, src05, src06, src07, src08, src09, src10,
                    tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
    tmp08 = __lasx_xvavgr_bu(tmp08, tmp02);
    __lasx_xvstelm_d(tmp08, dst, 0, 0);
    __lasx_xvstelm_d(tmp08, dst + dstStride, 0, 2);
    dst += dstStride_2x;
    QPEL8_V_LOWPASS(src06, src07, src08, src09, src10, src11, src12,
                    tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
    tmp09 = __lasx_xvavgr_bu(tmp09, tmp02);
    __lasx_xvstelm_d(tmp09, dst, 0, 0);
    __lasx_xvstelm_d(tmp09, dst + dstStride, 0, 2);
}

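/* The hv (centre) positions are filtered in two passes: QPEL8_HV_LOWPASS_H
 * runs the 6-tap filter horizontally and keeps the unrounded 16-bit sums,
 * then QPEL8_HV_LOWPASS_V filters those sums vertically in 32-bit lanes and
 * rounds the final value with (v + 512) >> 10 before saturating back to
 * unsigned 8 bit. */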
#define QPEL8_HV_LOWPASS_H(tmp)                                              \
{                                                                            \
    src00 = __lasx_xvld(src, -2);                                            \
    src += srcStride;                                                        \
    src10 = __lasx_xvld(src, -2);                                            \
    src += srcStride;                                                        \
    src00 = __lasx_xvpermi_q(src00, src10, 0x02);                            \
    src01 = __lasx_xvshuf_b(src00, src00, (__m256i)mask1);                   \
    src02 = __lasx_xvshuf_b(src00, src00, (__m256i)mask2);                   \
    src03 = __lasx_xvshuf_b(src00, src00, (__m256i)mask3);                   \
    src04 = __lasx_xvshuf_b(src00, src00, (__m256i)mask4);                   \
    src05 = __lasx_xvshuf_b(src00, src00, (__m256i)mask5);                   \
    DUP2_ARG2(__lasx_xvaddwl_h_bu, src02, src03, src01, src04, src02, src01);\
    src00 = __lasx_xvaddwl_h_bu(src00, src05);                               \
    src02 = __lasx_xvmul_h(src02, h_20);                                     \
    src01 = __lasx_xvmul_h(src01, h_5);                                      \
    src02 = __lasx_xvssub_h(src02, src01);                                   \
    tmp  = __lasx_xvsadd_h(src02, src00);                                    \
}

#define QPEL8_HV_LOWPASS_V(src0, src1, src2, src3,                       \
                           src4, src5, temp0, temp1,                     \
                           temp2, temp3, temp4, temp5,                   \
                           out)                                          \
{                                                                        \
    DUP2_ARG2(__lasx_xvaddwl_w_h, src2, src3, src1, src4, temp0, temp2); \
    DUP2_ARG2(__lasx_xvaddwh_w_h, src2, src3, src1, src4, temp1, temp3); \
    temp4 = __lasx_xvaddwl_w_h(src0, src5);                              \
    temp5 = __lasx_xvaddwh_w_h(src0, src5);                              \
    temp0 = __lasx_xvmul_w(temp0, w_20);                                 \
    temp1 = __lasx_xvmul_w(temp1, w_20);                                 \
    temp2 = __lasx_xvmul_w(temp2, w_5);                                  \
    temp3 = __lasx_xvmul_w(temp3, w_5);                                  \
    temp0 = __lasx_xvssub_w(temp0, temp2);                               \
    temp1 = __lasx_xvssub_w(temp1, temp3);                               \
    temp0 = __lasx_xvsadd_w(temp0, temp4);                               \
    temp1 = __lasx_xvsadd_w(temp1, temp5);                               \
    temp0 = __lasx_xvsadd_w(temp0, w_512);                               \
    temp1 = __lasx_xvsadd_w(temp1, w_512);                               \
    temp0 = __lasx_xvssrani_hu_w(temp0, temp0, 10);                      \
    temp1 = __lasx_xvssrani_hu_w(temp1, temp1, 10);                      \
    temp0 = __lasx_xvpackev_d(temp1, temp0);                             \
    out   = __lasx_xvssrani_bu_h(temp0, temp0, 0);                       \
}

static av_always_inline void
put_h264_qpel8_hv_lowpass_lasx(uint8_t *dst, const uint8_t *src,
                               ptrdiff_t dstStride, ptrdiff_t srcStride)
{
    __m256i src00, src01, src02, src03, src04, src05, src10;
    __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
    __m256i tmp7, tmp8, tmp9, tmp10, tmp11, tmp12;
    __m256i h_20 = __lasx_xvldi(0x414);
    __m256i h_5  = __lasx_xvldi(0x405);
    __m256i w_20 = __lasx_xvldi(0x814);
    __m256i w_5  = __lasx_xvldi(0x805);
    __m256i w_512 = {512};
    __m256i mask1 = {0x0807060504030201, 0x0, 0x0807060504030201, 0x0};
    __m256i mask2 = {0x0908070605040302, 0x0, 0x0908070605040302, 0x0};
    __m256i mask3 = {0x0a09080706050403, 0x0, 0x0a09080706050403, 0x0};
    __m256i mask4 = {0x0b0a090807060504, 0x0, 0x0b0a090807060504, 0x0};
    __m256i mask5 = {0x0c0b0a0908070605, 0x0, 0x0c0b0a0908070605, 0x0};

    w_512 = __lasx_xvreplve0_w(w_512);

    src -= srcStride << 1;
    QPEL8_HV_LOWPASS_H(tmp0)
    QPEL8_HV_LOWPASS_H(tmp2)
    QPEL8_HV_LOWPASS_H(tmp4)
    QPEL8_HV_LOWPASS_H(tmp6)
    QPEL8_HV_LOWPASS_H(tmp8)
    QPEL8_HV_LOWPASS_H(tmp10)
    QPEL8_HV_LOWPASS_H(tmp12)
    tmp11 = __lasx_xvpermi_q(tmp12, tmp10, 0x21);
    tmp9  = __lasx_xvpermi_q(tmp10, tmp8,  0x21);
    tmp7  = __lasx_xvpermi_q(tmp8,  tmp6,  0x21);
    tmp5  = __lasx_xvpermi_q(tmp6,  tmp4,  0x21);
    tmp3  = __lasx_xvpermi_q(tmp4,  tmp2,  0x21);
    tmp1  = __lasx_xvpermi_q(tmp2,  tmp0,  0x21);

    QPEL8_HV_LOWPASS_V(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, src00, src01,
                       src02, src03, src04, src05, tmp0)
    QPEL8_HV_LOWPASS_V(tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src00, src01,
                       src02, src03, src04, src05, tmp2)
    QPEL8_HV_LOWPASS_V(tmp4, tmp5, tmp6, tmp7, tmp8, tmp9, src00, src01,
                       src02, src03, src04, src05, tmp4)
    QPEL8_HV_LOWPASS_V(tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, src00, src01,
                       src02, src03, src04, src05, tmp6)
    __lasx_xvstelm_d(tmp0, dst, 0, 0);
    dst += dstStride;
    __lasx_xvstelm_d(tmp0, dst, 0, 2);
    dst += dstStride;
    __lasx_xvstelm_d(tmp2, dst, 0, 0);
    dst += dstStride;
    __lasx_xvstelm_d(tmp2, dst, 0, 2);
    dst += dstStride;
    __lasx_xvstelm_d(tmp4, dst, 0, 0);
    dst += dstStride;
    __lasx_xvstelm_d(tmp4, dst, 0, 2);
    dst += dstStride;
    __lasx_xvstelm_d(tmp6, dst, 0, 0);
    dst += dstStride;
    __lasx_xvstelm_d(tmp6, dst, 0, 2);
}

static av_always_inline void
avg_h264_qpel8_h_lowpass_lasx(uint8_t *dst, const uint8_t *src, int dstStride,
                              int srcStride)
{
    int dstStride_2x = dstStride << 1;
    int dstStride_4x = dstStride << 2;
    int dstStride_3x = dstStride_2x + dstStride;
    __m256i src00, src01, src02, src03, src04, src05, src10;
    __m256i dst00, dst01, dst0, dst1, dst2, dst3;
    __m256i out0, out1, out2, out3;
    __m256i h_20 = __lasx_xvldi(0x414);
    __m256i h_5  = __lasx_xvldi(0x405);
    __m256i h_16 = __lasx_xvldi(0x410);
    __m256i mask1 = {0x0807060504030201, 0x0, 0x0807060504030201, 0x0};
    __m256i mask2 = {0x0908070605040302, 0x0, 0x0908070605040302, 0x0};
    __m256i mask3 = {0x0a09080706050403, 0x0, 0x0a09080706050403, 0x0};
    __m256i mask4 = {0x0b0a090807060504, 0x0, 0x0b0a090807060504, 0x0};
    __m256i mask5 = {0x0c0b0a0908070605, 0x0, 0x0c0b0a0908070605, 0x0};

    QPEL8_H_LOWPASS(out0)
    QPEL8_H_LOWPASS(out1)
    QPEL8_H_LOWPASS(out2)
    QPEL8_H_LOWPASS(out3)
    src00 = __lasx_xvld(dst, 0);
    DUP4_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x, dst,
              dstStride_3x, dst, dstStride_4x, src01, src02, src03, src04);
    dst += dstStride_4x;
    DUP2_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x, src05, dst00);
    dst01 = __lasx_xvldx(dst, dstStride_3x);
    dst -= dstStride_4x;
    dst0 = __lasx_xvpermi_q(src00, src01, 0x02);
    dst1 = __lasx_xvpermi_q(src02, src03, 0x02);
    dst2 = __lasx_xvpermi_q(src04, src05, 0x02);
    dst3 = __lasx_xvpermi_q(dst00, dst01, 0x02);
    dst0 = __lasx_xvavgr_bu(dst0, out0);
    dst1 = __lasx_xvavgr_bu(dst1, out1);
    dst2 = __lasx_xvavgr_bu(dst2, out2);
    dst3 = __lasx_xvavgr_bu(dst3, out3);
    __lasx_xvstelm_d(dst0, dst, 0, 0);
    __lasx_xvstelm_d(dst0, dst + dstStride, 0, 2);
    __lasx_xvstelm_d(dst1, dst + dstStride_2x, 0, 0);
    __lasx_xvstelm_d(dst1, dst + dstStride_3x, 0, 2);
    dst += dstStride_4x;
    __lasx_xvstelm_d(dst2, dst, 0, 0);
    __lasx_xvstelm_d(dst2, dst + dstStride, 0, 2);
    __lasx_xvstelm_d(dst3, dst + dstStride_2x, 0, 0);
    __lasx_xvstelm_d(dst3, dst + dstStride_3x, 0, 2);
}

static av_always_inline void
avg_h264_qpel8_hv_lowpass_lasx(uint8_t *dst, const uint8_t *src,
                               ptrdiff_t dstStride, ptrdiff_t srcStride)
{
    __m256i src00, src01, src02, src03, src04, src05, src10;
    __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
    __m256i tmp7, tmp8, tmp9, tmp10, tmp11, tmp12;
    __m256i h_20 = __lasx_xvldi(0x414);
    __m256i h_5  = __lasx_xvldi(0x405);
    __m256i w_20 = __lasx_xvldi(0x814);
    __m256i w_5  = __lasx_xvldi(0x805);
    __m256i w_512 = {512};
    __m256i mask1 = {0x0807060504030201, 0x0, 0x0807060504030201, 0x0};
    __m256i mask2 = {0x0908070605040302, 0x0, 0x0908070605040302, 0x0};
    __m256i mask3 = {0x0a09080706050403, 0x0, 0x0a09080706050403, 0x0};
    __m256i mask4 = {0x0b0a090807060504, 0x0, 0x0b0a090807060504, 0x0};
    __m256i mask5 = {0x0c0b0a0908070605, 0x0, 0x0c0b0a0908070605, 0x0};
    ptrdiff_t dstStride_2x = dstStride << 1;
    ptrdiff_t dstStride_4x = dstStride << 2;
    ptrdiff_t dstStride_3x = dstStride_2x + dstStride;

    w_512 = __lasx_xvreplve0_w(w_512);

    src -= srcStride << 1;
    QPEL8_HV_LOWPASS_H(tmp0)
    QPEL8_HV_LOWPASS_H(tmp2)
    QPEL8_HV_LOWPASS_H(tmp4)
    QPEL8_HV_LOWPASS_H(tmp6)
    QPEL8_HV_LOWPASS_H(tmp8)
    QPEL8_HV_LOWPASS_H(tmp10)
    QPEL8_HV_LOWPASS_H(tmp12)
    tmp11 = __lasx_xvpermi_q(tmp12, tmp10, 0x21);
    tmp9  = __lasx_xvpermi_q(tmp10, tmp8,  0x21);
    tmp7  = __lasx_xvpermi_q(tmp8,  tmp6,  0x21);
    tmp5  = __lasx_xvpermi_q(tmp6,  tmp4,  0x21);
    tmp3  = __lasx_xvpermi_q(tmp4,  tmp2,  0x21);
    tmp1  = __lasx_xvpermi_q(tmp2,  tmp0,  0x21);

    QPEL8_HV_LOWPASS_V(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, src00, src01,
                       src02, src03, src04, src05, tmp0)
    QPEL8_HV_LOWPASS_V(tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src00, src01,
                       src02, src03, src04, src05, tmp2)
    QPEL8_HV_LOWPASS_V(tmp4, tmp5, tmp6, tmp7, tmp8, tmp9, src00, src01,
                       src02, src03, src04, src05, tmp4)
    QPEL8_HV_LOWPASS_V(tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, src00, src01,
                       src02, src03, src04, src05, tmp6)

    src00 = __lasx_xvld(dst, 0);
    DUP4_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x, dst,
              dstStride_3x, dst, dstStride_4x, src01, src02, src03, src04);
    dst += dstStride_4x;
    DUP2_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x, src05, tmp8);
    tmp9 = __lasx_xvldx(dst, dstStride_3x);
    dst -= dstStride_4x;
    tmp1 = __lasx_xvpermi_q(src00, src01, 0x02);
    tmp3 = __lasx_xvpermi_q(src02, src03, 0x02);
    tmp5 = __lasx_xvpermi_q(src04, src05, 0x02);
    tmp7 = __lasx_xvpermi_q(tmp8,  tmp9,  0x02);
    tmp0 = __lasx_xvavgr_bu(tmp0, tmp1);
    tmp2 = __lasx_xvavgr_bu(tmp2, tmp3);
    tmp4 = __lasx_xvavgr_bu(tmp4, tmp5);
    tmp6 = __lasx_xvavgr_bu(tmp6, tmp7);
    __lasx_xvstelm_d(tmp0, dst, 0, 0);
    dst += dstStride;
    __lasx_xvstelm_d(tmp0, dst, 0, 2);
    dst += dstStride;
    __lasx_xvstelm_d(tmp2, dst, 0, 0);
    dst += dstStride;
    __lasx_xvstelm_d(tmp2, dst, 0, 2);
    dst += dstStride;
    __lasx_xvstelm_d(tmp4, dst, 0, 0);
    dst += dstStride;
    __lasx_xvstelm_d(tmp4, dst, 0, 2);
    dst += dstStride;
    __lasx_xvstelm_d(tmp6, dst, 0, 0);
    dst += dstStride;
    __lasx_xvstelm_d(tmp6, dst, 0, 2);
}

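/* The 16-wide helpers below assemble a 16x16 block from four 8x8 quadrants:
 * the left and right halves at dst and dst + 8, then the same pair again
 * eight rows further down. */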
static av_always_inline void
put_h264_qpel16_h_lowpass_lasx(uint8_t *dst, const uint8_t *src,
                               int dstStride, int srcStride)
{
    put_h264_qpel8_h_lowpass_lasx(dst, src, dstStride, srcStride);
    put_h264_qpel8_h_lowpass_lasx(dst+8, src+8, dstStride, srcStride);
    src += srcStride << 3;
    dst += dstStride << 3;
    put_h264_qpel8_h_lowpass_lasx(dst, src, dstStride, srcStride);
    put_h264_qpel8_h_lowpass_lasx(dst+8, src+8, dstStride, srcStride);
}

static av_always_inline void
avg_h264_qpel16_h_lowpass_lasx(uint8_t *dst, const uint8_t *src,
                               int dstStride, int srcStride)
{
    avg_h264_qpel8_h_lowpass_lasx(dst, src, dstStride, srcStride);
    avg_h264_qpel8_h_lowpass_lasx(dst+8, src+8, dstStride, srcStride);
    src += srcStride << 3;
    dst += dstStride << 3;
    avg_h264_qpel8_h_lowpass_lasx(dst, src, dstStride, srcStride);
    avg_h264_qpel8_h_lowpass_lasx(dst+8, src+8, dstStride, srcStride);
}

static void put_h264_qpel16_v_lowpass_lasx(uint8_t *dst, const uint8_t *src,
                                           int dstStride, int srcStride)
{
    put_h264_qpel8_v_lowpass_lasx(dst, (uint8_t*)src, dstStride, srcStride);
    put_h264_qpel8_v_lowpass_lasx(dst+8, (uint8_t*)src+8, dstStride, srcStride);
    src += 8*srcStride;
    dst += 8*dstStride;
    put_h264_qpel8_v_lowpass_lasx(dst, (uint8_t*)src, dstStride, srcStride);
    put_h264_qpel8_v_lowpass_lasx(dst+8, (uint8_t*)src+8, dstStride, srcStride);
}

static void avg_h264_qpel16_v_lowpass_lasx(uint8_t *dst, const uint8_t *src,
                                           int dstStride, int srcStride)
{
    avg_h264_qpel8_v_lowpass_lasx(dst, (uint8_t*)src, dstStride, srcStride);
    avg_h264_qpel8_v_lowpass_lasx(dst+8, (uint8_t*)src+8, dstStride, srcStride);
    src += 8*srcStride;
    dst += 8*dstStride;
    avg_h264_qpel8_v_lowpass_lasx(dst, (uint8_t*)src, dstStride, srcStride);
    avg_h264_qpel8_v_lowpass_lasx(dst+8, (uint8_t*)src+8, dstStride, srcStride);
}

static void put_h264_qpel16_hv_lowpass_lasx(uint8_t *dst, const uint8_t *src,
                                     ptrdiff_t dstStride, ptrdiff_t srcStride)
{
    put_h264_qpel8_hv_lowpass_lasx(dst, src, dstStride, srcStride);
    put_h264_qpel8_hv_lowpass_lasx(dst + 8, src + 8, dstStride, srcStride);
    src += srcStride << 3;
    dst += dstStride << 3;
    put_h264_qpel8_hv_lowpass_lasx(dst, src, dstStride, srcStride);
    put_h264_qpel8_hv_lowpass_lasx(dst + 8, src + 8, dstStride, srcStride);
}

static void avg_h264_qpel16_hv_lowpass_lasx(uint8_t *dst, const uint8_t *src,
                                     ptrdiff_t dstStride, ptrdiff_t srcStride)
{
    avg_h264_qpel8_hv_lowpass_lasx(dst, src, dstStride, srcStride);
    avg_h264_qpel8_hv_lowpass_lasx(dst + 8, src + 8, dstStride, srcStride);
    src += srcStride << 3;
    dst += dstStride << 3;
    avg_h264_qpel8_hv_lowpass_lasx(dst, src, dstStride, srcStride);
    avg_h264_qpel8_hv_lowpass_lasx(dst + 8, src + 8, dstStride, srcStride);
}

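/* Exported quarter-pel MC entry points.  In ff_*_h264_qpel{8,16}_mcXY_lasx
 * the digits X and Y are the horizontal and vertical offsets in quarter
 * pixels: mc00 is a plain copy, mc20/mc02/mc22 are pure half-pel filters,
 * and the other positions either average two intermediate predictions with
 * the put/avg_pixels*_l2_8_lsx helpers or, for the 16x16 diagonals, call the
 * avc_luma_hv_qrt* routines directly. */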
void ff_put_h264_qpel8_mc00_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    /* The MMI optimization uses the function ff_put_pixels8_8_mmi,
     * which is implemented in hpeldsp_mmi.c. */
    put_pixels8_8_inline_asm(dst, src, stride);
}

void ff_put_h264_qpel8_mc10_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint8_t half[64];

    put_h264_qpel8_h_lowpass_lasx(half, src, 8, stride);
    /* in qpel8, the stride of half and the height of the block are both 8 */
    put_pixels8_l2_8_lsx(dst, src, half, stride, stride);
}

void ff_put_h264_qpel8_mc20_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    put_h264_qpel8_h_lowpass_lasx(dst, src, stride, stride);
}

void ff_put_h264_qpel8_mc30_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint8_t half[64];

    put_h264_qpel8_h_lowpass_lasx(half, src, 8, stride);
    put_pixels8_l2_8_lsx(dst, src+1, half, stride, stride);
}

void ff_put_h264_qpel8_mc01_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint8_t half[64];

    put_h264_qpel8_v_lowpass_lasx(half, (uint8_t*)src, 8, stride);
    put_pixels8_l2_8_lsx(dst, src, half, stride, stride);
}

void ff_put_h264_qpel8_mc11_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint8_t halfH[64];
    uint8_t halfV[64];

    put_h264_qpel8_h_lowpass_lasx(halfH, src, 8, stride);
    put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t*)src, 8, stride);
    put_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
}

void ff_put_h264_qpel8_mc21_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint8_t temp[128];
    uint8_t *const halfH  = temp;
    uint8_t *const halfHV = temp + 64;

    put_h264_qpel8_h_lowpass_lasx(halfH, src, 8, stride);
    put_h264_qpel8_hv_lowpass_lasx(halfHV, src, 8, stride);
    put_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
}

void ff_put_h264_qpel8_mc31_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint8_t halfH[64];
    uint8_t halfV[64];

    put_h264_qpel8_h_lowpass_lasx(halfH, src, 8, stride);
    put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t*)src + 1, 8, stride);
    put_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
}

void ff_put_h264_qpel8_mc02_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    put_h264_qpel8_v_lowpass_lasx(dst, (uint8_t*)src, stride, stride);
}

void ff_put_h264_qpel8_mc12_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint8_t temp[128];
    uint8_t *const halfHV = temp;
    uint8_t *const halfH  = temp + 64;

    put_h264_qpel8_hv_lowpass_lasx(halfHV, src, 8, stride);
    put_h264_qpel8_v_lowpass_lasx(halfH, (uint8_t*)src, 8, stride);
    put_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
}

void ff_put_h264_qpel8_mc22_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    put_h264_qpel8_hv_lowpass_lasx(dst, src, stride, stride);
}

void ff_put_h264_qpel8_mc32_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint8_t temp[128];
    uint8_t *const halfHV = temp;
    uint8_t *const halfH  = temp + 64;

    put_h264_qpel8_hv_lowpass_lasx(halfHV, src, 8, stride);
    put_h264_qpel8_v_lowpass_lasx(halfH, (uint8_t*)src + 1, 8, stride);
    put_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
}

void ff_put_h264_qpel8_mc03_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint8_t half[64];

    put_h264_qpel8_v_lowpass_lasx(half, (uint8_t*)src, 8, stride);
    put_pixels8_l2_8_lsx(dst, src + stride, half, stride, stride);
}

void ff_put_h264_qpel8_mc13_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint8_t halfH[64];
    uint8_t halfV[64];

    put_h264_qpel8_h_lowpass_lasx(halfH, src + stride, 8, stride);
    put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t*)src, 8, stride);
    put_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
}

void ff_put_h264_qpel8_mc23_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint8_t temp[128];
    uint8_t *const halfH  = temp;
    uint8_t *const halfHV = temp + 64;

    put_h264_qpel8_h_lowpass_lasx(halfH, src + stride, 8, stride);
    put_h264_qpel8_hv_lowpass_lasx(halfHV, src, 8, stride);
    put_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
}

void ff_put_h264_qpel8_mc33_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint8_t halfH[64];
    uint8_t halfV[64];

    put_h264_qpel8_h_lowpass_lasx(halfH, src + stride, 8, stride);
    put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t*)src + 1, 8, stride);
    put_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
}

void ff_avg_h264_qpel8_mc00_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    /* The MMI optimization uses the function ff_avg_pixels8_8_mmi,
     * which is implemented in hpeldsp_mmi.c. */
    avg_pixels8_8_lsx(dst, src, stride);
}

void ff_avg_h264_qpel8_mc10_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint8_t half[64];

    put_h264_qpel8_h_lowpass_lasx(half, src, 8, stride);
    avg_pixels8_l2_8_lsx(dst, src, half, stride, stride);
}

void ff_avg_h264_qpel8_mc20_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avg_h264_qpel8_h_lowpass_lasx(dst, src, stride, stride);
}

void ff_avg_h264_qpel8_mc30_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint8_t half[64];

    put_h264_qpel8_h_lowpass_lasx(half, src, 8, stride);
    avg_pixels8_l2_8_lsx(dst, src+1, half, stride, stride);
}

void ff_avg_h264_qpel8_mc11_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint8_t halfH[64];
    uint8_t halfV[64];

    put_h264_qpel8_h_lowpass_lasx(halfH, src, 8, stride);
    put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t*)src, 8, stride);
    avg_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
}

void ff_avg_h264_qpel8_mc21_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint8_t temp[128];
    uint8_t *const halfH  = temp;
    uint8_t *const halfHV = temp + 64;

    put_h264_qpel8_h_lowpass_lasx(halfH, src, 8, stride);
    put_h264_qpel8_hv_lowpass_lasx(halfHV, src, 8, stride);
    avg_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
}

void ff_avg_h264_qpel8_mc31_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint8_t halfH[64];
    uint8_t halfV[64];

    put_h264_qpel8_h_lowpass_lasx(halfH, src, 8, stride);
    put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t*)src + 1, 8, stride);
    avg_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
}

void ff_avg_h264_qpel8_mc02_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avg_h264_qpel8_v_lowpass_lasx(dst, (uint8_t*)src, stride, stride);
}

void ff_avg_h264_qpel8_mc12_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint8_t temp[128];
    uint8_t *const halfHV = temp;
    uint8_t *const halfH  = temp + 64;

    put_h264_qpel8_hv_lowpass_lasx(halfHV, src, 8, stride);
    put_h264_qpel8_v_lowpass_lasx(halfH, (uint8_t*)src, 8, stride);
    avg_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
}

void ff_avg_h264_qpel8_mc22_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avg_h264_qpel8_hv_lowpass_lasx(dst, src, stride, stride);
}

void ff_avg_h264_qpel8_mc32_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint8_t temp[128];
    uint8_t *const halfHV = temp;
    uint8_t *const halfH  = temp + 64;

    put_h264_qpel8_hv_lowpass_lasx(halfHV, src, 8, stride);
    put_h264_qpel8_v_lowpass_lasx(halfH, (uint8_t*)src + 1, 8, stride);
    avg_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
}

void ff_avg_h264_qpel8_mc13_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint8_t halfH[64];
    uint8_t halfV[64];

    put_h264_qpel8_h_lowpass_lasx(halfH, src + stride, 8, stride);
    put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t*)src, 8, stride);
    avg_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
}

void ff_avg_h264_qpel8_mc23_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint8_t temp[128];
    uint8_t *const halfH  = temp;
    uint8_t *const halfHV = temp + 64;

    put_h264_qpel8_h_lowpass_lasx(halfH, src + stride, 8, stride);
    put_h264_qpel8_hv_lowpass_lasx(halfHV, src, 8, stride);
    avg_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
}

void ff_avg_h264_qpel8_mc33_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint8_t halfH[64];
    uint8_t halfV[64];

    put_h264_qpel8_h_lowpass_lasx(halfH, src + stride, 8, stride);
    put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t*)src + 1, 8, stride);
    avg_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
}

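/* The 16x16 MC functions follow the same pattern as the 8x8 set, with the
 * intermediate buffers grown to 16x16 bytes (half[256], temp[512]) and the
 * filtering done by the 16-wide lowpass wrappers above. */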
void ff_put_h264_qpel16_mc00_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    /* The MMI optimization uses the function ff_put_pixels16_8_mmi,
     * which is implemented in hpeldsp_mmi.c. */
    put_pixels16_8_lsx(dst, src, stride);
}

void ff_put_h264_qpel16_mc10_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    uint8_t half[256];

    put_h264_qpel16_h_lowpass_lasx(half, src, 16, stride);
    put_pixels16_l2_8_lsx(dst, src, half, stride, stride);
}

void ff_put_h264_qpel16_mc20_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    put_h264_qpel16_h_lowpass_lasx(dst, src, stride, stride);
}

void ff_put_h264_qpel16_mc30_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    uint8_t half[256];

    put_h264_qpel16_h_lowpass_lasx(half, src, 16, stride);
    put_pixels16_l2_8_lsx(dst, src+1, half, stride, stride);
}

void ff_put_h264_qpel16_mc01_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    uint8_t half[256];

    put_h264_qpel16_v_lowpass_lasx(half, src, 16, stride);
    put_pixels16_l2_8_lsx(dst, src, half, stride, stride);
}

void ff_put_h264_qpel16_mc11_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    avc_luma_hv_qrt_16x16_lasx((uint8_t*)src - 2, (uint8_t*)src - (stride * 2),
                               dst, stride);
}

void ff_put_h264_qpel16_mc21_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    uint8_t temp[512];
    uint8_t *const halfH  = temp;
    uint8_t *const halfHV = temp + 256;

    put_h264_qpel16_h_lowpass_lasx(halfH, src, 16, stride);
    put_h264_qpel16_hv_lowpass_lasx(halfHV, src, 16, stride);
    put_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
}

void ff_put_h264_qpel16_mc31_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    avc_luma_hv_qrt_16x16_lasx((uint8_t*)src - 2, (uint8_t*)src - (stride * 2) + 1,
                               dst, stride);
}

void ff_put_h264_qpel16_mc02_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    put_h264_qpel16_v_lowpass_lasx(dst, src, stride, stride);
}

void ff_put_h264_qpel16_mc12_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    uint8_t temp[512];
    uint8_t *const halfHV = temp;
    uint8_t *const halfH  = temp + 256;

    put_h264_qpel16_hv_lowpass_lasx(halfHV, src, 16, stride);
    put_h264_qpel16_v_lowpass_lasx(halfH, src, 16, stride);
    put_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
}

void ff_put_h264_qpel16_mc22_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    put_h264_qpel16_hv_lowpass_lasx(dst, src, stride, stride);
}

void ff_put_h264_qpel16_mc32_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    uint8_t temp[512];
    uint8_t *const halfHV = temp;
    uint8_t *const halfH  = temp + 256;

    put_h264_qpel16_hv_lowpass_lasx(halfHV, src, 16, stride);
    put_h264_qpel16_v_lowpass_lasx(halfH, src + 1, 16, stride);
    put_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
}

void ff_put_h264_qpel16_mc03_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    uint8_t half[256];

    put_h264_qpel16_v_lowpass_lasx(half, src, 16, stride);
    put_pixels16_l2_8_lsx(dst, src+stride, half, stride, stride);
}

void ff_put_h264_qpel16_mc13_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    avc_luma_hv_qrt_16x16_lasx((uint8_t*)src + stride - 2, (uint8_t*)src - (stride * 2),
                               dst, stride);
}

void ff_put_h264_qpel16_mc23_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    uint8_t temp[512];
    uint8_t *const halfH  = temp;
    uint8_t *const halfHV = temp + 256;

    put_h264_qpel16_h_lowpass_lasx(halfH, src + stride, 16, stride);
    put_h264_qpel16_hv_lowpass_lasx(halfHV, src, 16, stride);
    put_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
}

void ff_put_h264_qpel16_mc33_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    avc_luma_hv_qrt_16x16_lasx((uint8_t*)src + stride - 2,
                               (uint8_t*)src - (stride * 2) + 1, dst, stride);
}

void ff_avg_h264_qpel16_mc00_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    /* The MMI optimization uses the function ff_avg_pixels16_8_mmi,
     * which is implemented in hpeldsp_mmi.c. */
    avg_pixels16_8_lsx(dst, src, stride);
}

void ff_avg_h264_qpel16_mc10_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    uint8_t half[256];

    put_h264_qpel16_h_lowpass_lasx(half, src, 16, stride);
    avg_pixels16_l2_8_lsx(dst, src, half, stride, stride);
}

void ff_avg_h264_qpel16_mc20_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    avg_h264_qpel16_h_lowpass_lasx(dst, src, stride, stride);
}

void ff_avg_h264_qpel16_mc30_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    uint8_t half[256];

    put_h264_qpel16_h_lowpass_lasx(half, src, 16, stride);
    avg_pixels16_l2_8_lsx(dst, src+1, half, stride, stride);
}

void ff_avg_h264_qpel16_mc01_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    uint8_t half[256];

    put_h264_qpel16_v_lowpass_lasx(half, src, 16, stride);
    avg_pixels16_l2_8_lsx(dst, src, half, stride, stride);
}

void ff_avg_h264_qpel16_mc11_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    avc_luma_hv_qrt_and_aver_dst_16x16_lasx((uint8_t*)src - 2,
                                            (uint8_t*)src - (stride * 2),
                                            dst, stride);
}

void ff_avg_h264_qpel16_mc21_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    uint8_t temp[512];
    uint8_t *const halfH  = temp;
    uint8_t *const halfHV = temp + 256;

    put_h264_qpel16_h_lowpass_lasx(halfH, src, 16, stride);
    put_h264_qpel16_hv_lowpass_lasx(halfHV, src, 16, stride);
    avg_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
}

void ff_avg_h264_qpel16_mc31_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    avc_luma_hv_qrt_and_aver_dst_16x16_lasx((uint8_t*)src - 2,
                                            (uint8_t*)src - (stride * 2) + 1,
                                            dst, stride);
}

void ff_avg_h264_qpel16_mc02_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    avg_h264_qpel16_v_lowpass_lasx(dst, src, stride, stride);
}

void ff_avg_h264_qpel16_mc12_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    uint8_t temp[512];
    uint8_t *const halfHV = temp;
    uint8_t *const halfH  = temp + 256;

    put_h264_qpel16_hv_lowpass_lasx(halfHV, src, 16, stride);
    put_h264_qpel16_v_lowpass_lasx(halfH, src, 16, stride);
    avg_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
}

void ff_avg_h264_qpel16_mc22_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    avg_h264_qpel16_hv_lowpass_lasx(dst, src, stride, stride);
}

void ff_avg_h264_qpel16_mc32_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    uint8_t temp[512];
    uint8_t *const halfHV = temp;
    uint8_t *const halfH  = temp + 256;

    put_h264_qpel16_hv_lowpass_lasx(halfHV, src, 16, stride);
    put_h264_qpel16_v_lowpass_lasx(halfH, src + 1, 16, stride);
    avg_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
}

void ff_avg_h264_qpel16_mc03_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    uint8_t half[256];

    put_h264_qpel16_v_lowpass_lasx(half, src, 16, stride);
    avg_pixels16_l2_8_lsx(dst, src + stride, half, stride, stride);
}

void ff_avg_h264_qpel16_mc13_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    avc_luma_hv_qrt_and_aver_dst_16x16_lasx((uint8_t*)src + stride - 2,
                                            (uint8_t*)src - (stride * 2),
                                            dst, stride);
}

void ff_avg_h264_qpel16_mc23_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    uint8_t temp[512];
    uint8_t *const halfH  = temp;
    uint8_t *const halfHV = temp + 256;

    put_h264_qpel16_h_lowpass_lasx(halfH, src + stride, 16, stride);
    put_h264_qpel16_hv_lowpass_lasx(halfHV, src, 16, stride);
    avg_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
}

void ff_avg_h264_qpel16_mc33_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    avc_luma_hv_qrt_and_aver_dst_16x16_lasx((uint8_t*)src + stride - 2,
                                            (uint8_t*)src - (stride * 2) + 1,
                                            dst, stride);
}
