/*
 * Loongson LASX optimized h264qpel
 *
 * Copyright (c) 2020 Loongson Technology Corporation Limited
 * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "h264qpel_lasx.h"
#include "libavutil/loongarch/loongson_intrinsics.h"
#include "libavutil/attributes.h"

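/* Shuffle indices for the horizontal 6-tap filter: each 16-byte row pairs
 * the two samples that share a coefficient, i.e. (x, x+5) for the outer
 * taps, (x+1, x+4) for the -5 taps and (x+2, x+3) for the +20 taps, with
 * every row duplicated so it fills both 128-bit lanes of a LASX register. */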
static const uint8_t luma_mask_arr[16 * 6] __attribute__((aligned(0x40))) = {
    /* 8 width cases */
    0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12,
    0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12,
    1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11,
    1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11,
    2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
    2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
};

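/* Horizontal 6-tap filter on (signed, bias-removed) byte input: the
 * (x, x+5) pair is summed with weight 1, the (x+1, x+4) pair accumulated
 * with weight -5 and the (x+2, x+3) pair with weight +20; the caller does
 * the rounding shift by 5. */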
#define AVC_HORZ_FILTER_SH(in0, in1, mask0, mask1, mask2)  \
( {                                                        \
    __m256i out0_m;                                        \
    __m256i tmp0_m;                                        \
                                                           \
    tmp0_m = __lasx_xvshuf_b(in1, in0, mask0);             \
    out0_m = __lasx_xvhaddw_h_b(tmp0_m, tmp0_m);           \
    tmp0_m = __lasx_xvshuf_b(in1, in0, mask1);             \
    out0_m = __lasx_xvdp2add_h_b(out0_m, minus5b, tmp0_m); \
    tmp0_m = __lasx_xvshuf_b(in1, in0, mask2);             \
    out0_m = __lasx_xvdp2add_h_b(out0_m, plus20b, tmp0_m); \
                                                           \
    out0_m;                                                \
} )

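/* Vertical 6-tap filter: in0..in2 hold interleaved row pairs and
 * coeff0..coeff2 the packed tap pairs, so the three dot products add up to
 * the full 6-tap sum as signed halfwords (rounded and narrowed by the
 * caller). */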
#define AVC_DOT_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2)  \
( {                                                            \
    __m256i out0_m;                                            \
                                                               \
    out0_m = __lasx_xvdp2_h_b(in0, coeff0);                    \
    DUP2_ARG3(__lasx_xvdp2add_h_b, out0_m, in1, coeff1, out0_m,\
              in2, coeff2, out0_m, out0_m);                    \
                                                               \
    out0_m;                                                    \
} )

static av_always_inline
void avc_luma_hv_qrt_and_aver_dst_16x16_lasx(uint8_t *src_x,
                                             uint8_t *src_y,
                                             uint8_t *dst, ptrdiff_t stride)
{
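    /* H.264 6-tap luma coefficients (1, -5, 20, 20, -5, 1) packed as signed
     * byte pairs for the dot-product instructions:
     * 0xfb01 -> {1, -5}, 0x1414 -> {20, 20}, 0x01fb -> {-5, 1}. */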
    const int16_t filt_const0 = 0xfb01;
    const int16_t filt_const1 = 0x1414;
    const int16_t filt_const2 = 0x1fb;
    uint32_t loop_cnt;
    ptrdiff_t stride_2x = stride << 1;
    ptrdiff_t stride_3x = stride_2x + stride;
    ptrdiff_t stride_4x = stride << 2;
    __m256i tmp0, tmp1;
    __m256i src_hz0, src_hz1, src_hz2, src_hz3, mask0, mask1, mask2;
    __m256i src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
    __m256i src_vt7, src_vt8;
    __m256i src_vt10_h, src_vt21_h, src_vt32_h, src_vt43_h, src_vt54_h;
    __m256i src_vt65_h, src_vt76_h, src_vt87_h, filt0, filt1, filt2;
    __m256i hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2;
    __m256i vt_out3, out0, out1, out2, out3;
    __m256i minus5b = __lasx_xvldi(0xFB);
    __m256i plus20b = __lasx_xvldi(20);

    filt0 = __lasx_xvreplgr2vr_h(filt_const0);
    filt1 = __lasx_xvreplgr2vr_h(filt_const1);
    filt2 = __lasx_xvreplgr2vr_h(filt_const2);

    mask0 = __lasx_xvld(luma_mask_arr, 0);
    DUP2_ARG2(__lasx_xvld, luma_mask_arr, 32, luma_mask_arr, 64, mask1, mask2);
    src_vt0 = __lasx_xvld(src_y, 0);
    DUP4_ARG2(__lasx_xvldx, src_y, stride, src_y, stride_2x, src_y, stride_3x,
              src_y, stride_4x, src_vt1, src_vt2, src_vt3, src_vt4);
    src_y += stride_4x;

    src_vt0 = __lasx_xvxori_b(src_vt0, 128);
    DUP4_ARG2(__lasx_xvxori_b, src_vt1, 128, src_vt2, 128, src_vt3, 128,
              src_vt4, 128, src_vt1, src_vt2, src_vt3, src_vt4);

    for (loop_cnt = 4; loop_cnt--;) {
        src_hz0 = __lasx_xvld(src_x, 0);
        DUP2_ARG2(__lasx_xvldx, src_x, stride, src_x, stride_2x,
                  src_hz1, src_hz2);
        src_hz3 = __lasx_xvldx(src_x, stride_3x);
        src_x  += stride_4x;
        src_hz0 = __lasx_xvpermi_d(src_hz0, 0x94);
        src_hz1 = __lasx_xvpermi_d(src_hz1, 0x94);
        src_hz2 = __lasx_xvpermi_d(src_hz2, 0x94);
        src_hz3 = __lasx_xvpermi_d(src_hz3, 0x94);
        DUP4_ARG2(__lasx_xvxori_b, src_hz0, 128, src_hz1, 128, src_hz2, 128,
                  src_hz3, 128, src_hz0, src_hz1, src_hz2, src_hz3);

        hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
        hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
        hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
        hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);
        hz_out0 = __lasx_xvssrarni_b_h(hz_out1, hz_out0, 5);
        hz_out2 = __lasx_xvssrarni_b_h(hz_out3, hz_out2, 5);

        DUP4_ARG2(__lasx_xvldx, src_y, stride, src_y, stride_2x,
                  src_y, stride_3x, src_y, stride_4x,
                  src_vt5, src_vt6, src_vt7, src_vt8);
        src_y += stride_4x;

        DUP4_ARG2(__lasx_xvxori_b, src_vt5, 128, src_vt6, 128, src_vt7, 128,
                  src_vt8, 128, src_vt5, src_vt6, src_vt7, src_vt8);

        DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_vt4, 0x02, src_vt1, src_vt5,
                  0x02, src_vt2, src_vt6, 0x02, src_vt3, src_vt7, 0x02,
                  src_vt0, src_vt1, src_vt2, src_vt3);
        src_vt87_h = __lasx_xvpermi_q(src_vt4, src_vt8, 0x02);
        DUP4_ARG2(__lasx_xvilvh_b, src_vt1, src_vt0, src_vt2, src_vt1,
                  src_vt3, src_vt2, src_vt87_h, src_vt3,
                  src_hz0, src_hz1, src_hz2, src_hz3);
        DUP4_ARG2(__lasx_xvilvl_b, src_vt1, src_vt0, src_vt2, src_vt1,
                  src_vt3, src_vt2, src_vt87_h, src_vt3,
                  src_vt0, src_vt1, src_vt2, src_vt3);
        DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_hz0, 0x02, src_vt1, src_hz1,
                  0x02, src_vt2, src_hz2, 0x02, src_vt3, src_hz3, 0x02,
                  src_vt10_h, src_vt21_h, src_vt32_h, src_vt43_h);
        DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_hz0, 0x13, src_vt1, src_hz1,
                  0x13, src_vt2, src_hz2, 0x13, src_vt3, src_hz3, 0x13,
                  src_vt54_h, src_vt65_h, src_vt76_h, src_vt87_h);
        vt_out0 = AVC_DOT_SH3_SH(src_vt10_h, src_vt32_h, src_vt54_h, filt0,
                                 filt1, filt2);
        vt_out1 = AVC_DOT_SH3_SH(src_vt21_h, src_vt43_h, src_vt65_h, filt0,
                                 filt1, filt2);
        vt_out2 = AVC_DOT_SH3_SH(src_vt32_h, src_vt54_h, src_vt76_h, filt0,
                                 filt1, filt2);
        vt_out3 = AVC_DOT_SH3_SH(src_vt43_h, src_vt65_h, src_vt87_h, filt0,
                                 filt1, filt2);
        vt_out0 = __lasx_xvssrarni_b_h(vt_out1, vt_out0, 5);
        vt_out2 = __lasx_xvssrarni_b_h(vt_out3, vt_out2, 5);

        DUP2_ARG2(__lasx_xvaddwl_h_b, hz_out0, vt_out0, hz_out2, vt_out2,
                  out0, out2);
        DUP2_ARG2(__lasx_xvaddwh_h_b, hz_out0, vt_out0, hz_out2, vt_out2,
                  out1, out3);
        tmp0 = __lasx_xvssrarni_b_h(out1, out0, 1);
        tmp1 = __lasx_xvssrarni_b_h(out3, out2, 1);

        DUP2_ARG2(__lasx_xvxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
        out0 = __lasx_xvld(dst, 0);
        DUP2_ARG2(__lasx_xvldx, dst, stride, dst, stride_2x, out1, out2);
        out3 = __lasx_xvldx(dst, stride_3x);
        out0 = __lasx_xvpermi_q(out0, out2, 0x02);
        out1 = __lasx_xvpermi_q(out1, out3, 0x02);
        out2 = __lasx_xvilvl_d(out1, out0);
        out3 = __lasx_xvilvh_d(out1, out0);
        out0 = __lasx_xvpermi_q(out2, out3, 0x02);
        out1 = __lasx_xvpermi_q(out2, out3, 0x13);
        tmp0 = __lasx_xvavgr_bu(out0, tmp0);
        tmp1 = __lasx_xvavgr_bu(out1, tmp1);

        __lasx_xvstelm_d(tmp0, dst, 0, 0);
        __lasx_xvstelm_d(tmp0, dst + stride, 0, 1);
        __lasx_xvstelm_d(tmp1, dst + stride_2x, 0, 0);
        __lasx_xvstelm_d(tmp1, dst + stride_3x, 0, 1);

        __lasx_xvstelm_d(tmp0, dst, 8, 2);
        __lasx_xvstelm_d(tmp0, dst + stride, 8, 3);
        __lasx_xvstelm_d(tmp1, dst + stride_2x, 8, 2);
        __lasx_xvstelm_d(tmp1, dst + stride_3x, 8, 3);

        dst    += stride_4x;
        src_vt0 = src_vt4;
        src_vt1 = src_vt5;
        src_vt2 = src_vt6;
        src_vt3 = src_vt7;
        src_vt4 = src_vt8;
    }
}

static av_always_inline void
avc_luma_hv_qrt_16x16_lasx(uint8_t *src_x, uint8_t *src_y,
                           uint8_t *dst, ptrdiff_t stride)
{
    const int16_t filt_const0 = 0xfb01;
    const int16_t filt_const1 = 0x1414;
    const int16_t filt_const2 = 0x1fb;
    uint32_t loop_cnt;
    ptrdiff_t stride_2x = stride << 1;
    ptrdiff_t stride_3x = stride_2x + stride;
    ptrdiff_t stride_4x = stride << 2;
    __m256i tmp0, tmp1;
    __m256i src_hz0, src_hz1, src_hz2, src_hz3, mask0, mask1, mask2;
    __m256i src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
    __m256i src_vt7, src_vt8;
    __m256i src_vt10_h, src_vt21_h, src_vt32_h, src_vt43_h, src_vt54_h;
    __m256i src_vt65_h, src_vt76_h, src_vt87_h, filt0, filt1, filt2;
    __m256i hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2;
    __m256i vt_out3, out0, out1, out2, out3;
    __m256i minus5b = __lasx_xvldi(0xFB);
    __m256i plus20b = __lasx_xvldi(20);

    filt0 = __lasx_xvreplgr2vr_h(filt_const0);
    filt1 = __lasx_xvreplgr2vr_h(filt_const1);
    filt2 = __lasx_xvreplgr2vr_h(filt_const2);

    mask0 = __lasx_xvld(luma_mask_arr, 0);
    DUP2_ARG2(__lasx_xvld, luma_mask_arr, 32, luma_mask_arr, 64, mask1, mask2);
    src_vt0 = __lasx_xvld(src_y, 0);
    DUP4_ARG2(__lasx_xvldx, src_y, stride, src_y, stride_2x, src_y, stride_3x,
              src_y, stride_4x, src_vt1, src_vt2, src_vt3, src_vt4);
    src_y += stride_4x;

    src_vt0 = __lasx_xvxori_b(src_vt0, 128);
    DUP4_ARG2(__lasx_xvxori_b, src_vt1, 128, src_vt2, 128, src_vt3, 128,
              src_vt4, 128, src_vt1, src_vt2, src_vt3, src_vt4);

    for (loop_cnt = 4; loop_cnt--;) {
        src_hz0 = __lasx_xvld(src_x, 0);
        DUP2_ARG2(__lasx_xvldx, src_x, stride, src_x, stride_2x,
                  src_hz1, src_hz2);
        src_hz3 = __lasx_xvldx(src_x, stride_3x);
        src_x  += stride_4x;
        src_hz0 = __lasx_xvpermi_d(src_hz0, 0x94);
        src_hz1 = __lasx_xvpermi_d(src_hz1, 0x94);
        src_hz2 = __lasx_xvpermi_d(src_hz2, 0x94);
        src_hz3 = __lasx_xvpermi_d(src_hz3, 0x94);
        DUP4_ARG2(__lasx_xvxori_b, src_hz0, 128, src_hz1, 128, src_hz2, 128,
                  src_hz3, 128, src_hz0, src_hz1, src_hz2, src_hz3);

        hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
        hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
        hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
        hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);
        hz_out0 = __lasx_xvssrarni_b_h(hz_out1, hz_out0, 5);
        hz_out2 = __lasx_xvssrarni_b_h(hz_out3, hz_out2, 5);

        DUP4_ARG2(__lasx_xvldx, src_y, stride, src_y, stride_2x,
                  src_y, stride_3x, src_y, stride_4x,
                  src_vt5, src_vt6, src_vt7, src_vt8);
        src_y += stride_4x;

        DUP4_ARG2(__lasx_xvxori_b, src_vt5, 128, src_vt6, 128, src_vt7, 128,
                  src_vt8, 128, src_vt5, src_vt6, src_vt7, src_vt8);
        DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_vt4, 0x02, src_vt1, src_vt5,
                  0x02, src_vt2, src_vt6, 0x02, src_vt3, src_vt7, 0x02,
                  src_vt0, src_vt1, src_vt2, src_vt3);
        src_vt87_h = __lasx_xvpermi_q(src_vt4, src_vt8, 0x02);
        DUP4_ARG2(__lasx_xvilvh_b, src_vt1, src_vt0, src_vt2, src_vt1,
                  src_vt3, src_vt2, src_vt87_h, src_vt3,
                  src_hz0, src_hz1, src_hz2, src_hz3);
        DUP4_ARG2(__lasx_xvilvl_b, src_vt1, src_vt0, src_vt2, src_vt1,
                  src_vt3, src_vt2, src_vt87_h, src_vt3,
                  src_vt0, src_vt1, src_vt2, src_vt3);
        DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_hz0, 0x02, src_vt1,
                  src_hz1, 0x02, src_vt2, src_hz2, 0x02, src_vt3, src_hz3,
                  0x02, src_vt10_h, src_vt21_h, src_vt32_h, src_vt43_h);
        DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_hz0, 0x13, src_vt1,
                  src_hz1, 0x13, src_vt2, src_hz2, 0x13, src_vt3, src_hz3,
                  0x13, src_vt54_h, src_vt65_h, src_vt76_h, src_vt87_h);

        vt_out0 = AVC_DOT_SH3_SH(src_vt10_h, src_vt32_h, src_vt54_h,
                                 filt0, filt1, filt2);
        vt_out1 = AVC_DOT_SH3_SH(src_vt21_h, src_vt43_h, src_vt65_h,
                                 filt0, filt1, filt2);
        vt_out2 = AVC_DOT_SH3_SH(src_vt32_h, src_vt54_h, src_vt76_h,
                                 filt0, filt1, filt2);
        vt_out3 = AVC_DOT_SH3_SH(src_vt43_h, src_vt65_h, src_vt87_h,
                                 filt0, filt1, filt2);
        vt_out0 = __lasx_xvssrarni_b_h(vt_out1, vt_out0, 5);
        vt_out2 = __lasx_xvssrarni_b_h(vt_out3, vt_out2, 5);

        DUP2_ARG2(__lasx_xvaddwl_h_b, hz_out0, vt_out0, hz_out2, vt_out2,
                  out0, out2);
        DUP2_ARG2(__lasx_xvaddwh_h_b, hz_out0, vt_out0, hz_out2, vt_out2,
                  out1, out3);
        tmp0 = __lasx_xvssrarni_b_h(out1, out0, 1);
        tmp1 = __lasx_xvssrarni_b_h(out3, out2, 1);

        DUP2_ARG2(__lasx_xvxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
        __lasx_xvstelm_d(tmp0, dst, 0, 0);
        __lasx_xvstelm_d(tmp0, dst + stride, 0, 1);
        __lasx_xvstelm_d(tmp1, dst + stride_2x, 0, 0);
        __lasx_xvstelm_d(tmp1, dst + stride_3x, 0, 1);

        __lasx_xvstelm_d(tmp0, dst, 8, 2);
        __lasx_xvstelm_d(tmp0, dst + stride, 8, 3);
        __lasx_xvstelm_d(tmp1, dst + stride_2x, 8, 2);
        __lasx_xvstelm_d(tmp1, dst + stride_3x, 8, 3);

        dst    += stride_4x;
        src_vt0 = src_vt4;
        src_vt1 = src_vt5;
        src_vt2 = src_vt6;
        src_vt3 = src_vt7;
        src_vt4 = src_vt8;
    }
}

/* put_pixels8_8_inline_asm: dst = src */
static av_always_inline void
put_pixels8_8_inline_asm(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
{
    uint64_t tmp[8];
    ptrdiff_t stride_2, stride_3, stride_4;
    __asm__ volatile (
    "slli.d     %[stride_2],     %[stride],   1           \n\t"
    "add.d      %[stride_3],     %[stride_2], %[stride]   \n\t"
    "slli.d     %[stride_4],     %[stride_2], 1           \n\t"
    "ld.d       %[tmp0],         %[src],      0x0         \n\t"
    "ldx.d      %[tmp1],         %[src],      %[stride]   \n\t"
    "ldx.d      %[tmp2],         %[src],      %[stride_2] \n\t"
    "ldx.d      %[tmp3],         %[src],      %[stride_3] \n\t"
    "add.d      %[src],          %[src],      %[stride_4] \n\t"
    "ld.d       %[tmp4],         %[src],      0x0         \n\t"
    "ldx.d      %[tmp5],         %[src],      %[stride]   \n\t"
    "ldx.d      %[tmp6],         %[src],      %[stride_2] \n\t"
    "ldx.d      %[tmp7],         %[src],      %[stride_3] \n\t"

    "st.d       %[tmp0],         %[dst],      0x0         \n\t"
    "stx.d      %[tmp1],         %[dst],      %[stride]   \n\t"
    "stx.d      %[tmp2],         %[dst],      %[stride_2] \n\t"
    "stx.d      %[tmp3],         %[dst],      %[stride_3] \n\t"
    "add.d      %[dst],          %[dst],      %[stride_4] \n\t"
    "st.d       %[tmp4],         %[dst],      0x0         \n\t"
    "stx.d      %[tmp5],         %[dst],      %[stride]   \n\t"
    "stx.d      %[tmp6],         %[dst],      %[stride_2] \n\t"
    "stx.d      %[tmp7],         %[dst],      %[stride_3] \n\t"
    : [tmp0]"=&r"(tmp[0]),        [tmp1]"=&r"(tmp[1]),
      [tmp2]"=&r"(tmp[2]),        [tmp3]"=&r"(tmp[3]),
      [tmp4]"=&r"(tmp[4]),        [tmp5]"=&r"(tmp[5]),
      [tmp6]"=&r"(tmp[6]),        [tmp7]"=&r"(tmp[7]),
      [stride_2]"=&r"(stride_2),  [stride_3]"=&r"(stride_3),
      [stride_4]"=&r"(stride_4),
      [dst]"+&r"(dst),            [src]"+&r"(src)
    : [stride]"r"(stride)
    : "memory"
    );
}

/* avg_pixels8_8_lsx   : dst = avg(src, dst)
 * put_pixels8_l2_8_lsx: dst = avg(src, half), half stride is 8.
 * avg_pixels8_l2_8_lsx: dst = avg(avg(src, half), dst), half stride is 8. */
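/* Scalar model of the three variants above (reference sketch only), with
 * rnd_avg(a, b) = (a + b + 1) >> 1 matching vavgr.bu, applied to 8 bytes
 * per row over 8 rows:
 *     avg_pixels8_8:    dst[x] = rnd_avg(dst[x], src[x]);
 *     put_pixels8_l2_8: dst[x] = rnd_avg(src[x], half[x]);
 *     avg_pixels8_l2_8: dst[x] = rnd_avg(dst[x], rnd_avg(src[x], half[x])); */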
static av_always_inline void
avg_pixels8_8_lsx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
{
    uint8_t *tmp = dst;
    ptrdiff_t stride_2, stride_3, stride_4;
    __asm__ volatile (
    /* h0~h7 */
    "slli.d     %[stride_2],     %[stride],   1           \n\t"
    "add.d      %[stride_3],     %[stride_2], %[stride]   \n\t"
    "slli.d     %[stride_4],     %[stride_2], 1           \n\t"
    "vld        $vr0,            %[src],      0           \n\t"
    "vldx       $vr1,            %[src],      %[stride]   \n\t"
    "vldx       $vr2,            %[src],      %[stride_2] \n\t"
    "vldx       $vr3,            %[src],      %[stride_3] \n\t"
    "add.d      %[src],          %[src],      %[stride_4] \n\t"
    "vld        $vr4,            %[src],      0           \n\t"
    "vldx       $vr5,            %[src],      %[stride]   \n\t"
    "vldx       $vr6,            %[src],      %[stride_2] \n\t"
    "vldx       $vr7,            %[src],      %[stride_3] \n\t"

    "vld        $vr8,            %[tmp],      0           \n\t"
    "vldx       $vr9,            %[tmp],      %[stride]   \n\t"
    "vldx       $vr10,           %[tmp],      %[stride_2] \n\t"
    "vldx       $vr11,           %[tmp],      %[stride_3] \n\t"
    "add.d      %[tmp],          %[tmp],      %[stride_4] \n\t"
    "vld        $vr12,           %[tmp],      0           \n\t"
    "vldx       $vr13,           %[tmp],      %[stride]   \n\t"
    "vldx       $vr14,           %[tmp],      %[stride_2] \n\t"
    "vldx       $vr15,           %[tmp],      %[stride_3] \n\t"

    "vavgr.bu    $vr0,           $vr8,        $vr0        \n\t"
    "vavgr.bu    $vr1,           $vr9,        $vr1        \n\t"
    "vavgr.bu    $vr2,           $vr10,       $vr2        \n\t"
    "vavgr.bu    $vr3,           $vr11,       $vr3        \n\t"
    "vavgr.bu    $vr4,           $vr12,       $vr4        \n\t"
    "vavgr.bu    $vr5,           $vr13,       $vr5        \n\t"
    "vavgr.bu    $vr6,           $vr14,       $vr6        \n\t"
    "vavgr.bu    $vr7,           $vr15,       $vr7        \n\t"

    "vstelm.d    $vr0,           %[dst],      0,  0       \n\t"
    "add.d       %[dst],         %[dst],      %[stride]   \n\t"
    "vstelm.d    $vr1,           %[dst],      0,  0       \n\t"
    "add.d       %[dst],         %[dst],      %[stride]   \n\t"
    "vstelm.d    $vr2,           %[dst],      0,  0       \n\t"
    "add.d       %[dst],         %[dst],      %[stride]   \n\t"
    "vstelm.d    $vr3,           %[dst],      0,  0       \n\t"
    "add.d       %[dst],         %[dst],      %[stride]   \n\t"
    "vstelm.d    $vr4,           %[dst],      0,  0       \n\t"
    "add.d       %[dst],         %[dst],      %[stride]   \n\t"
    "vstelm.d    $vr5,           %[dst],      0,  0       \n\t"
    "add.d       %[dst],         %[dst],      %[stride]   \n\t"
    "vstelm.d    $vr6,           %[dst],      0,  0       \n\t"
    "add.d       %[dst],         %[dst],      %[stride]   \n\t"
    "vstelm.d    $vr7,           %[dst],      0,  0       \n\t"
    : [dst]"+&r"(dst), [tmp]"+&r"(tmp), [src]"+&r"(src),
      [stride_2]"=&r"(stride_2),  [stride_3]"=&r"(stride_3),
      [stride_4]"=&r"(stride_4)
    : [stride]"r"(stride)
    : "memory"
    );
}

/* avg_pixels8_8_lsx   : dst = avg(src, dst)
 * put_pixels8_l2_8_lsx: dst = avg(src, half), half stride is 8.
 * avg_pixels8_l2_8_lsx: dst = avg(avg(src, half), dst), half stride is 8. */
static av_always_inline void
put_pixels8_l2_8_lsx(uint8_t *dst, const uint8_t *src, const uint8_t *half,
                     ptrdiff_t dstStride, ptrdiff_t srcStride)
{
    ptrdiff_t stride_2, stride_3, stride_4;
    __asm__ volatile (
    /* h0~h7 */
    "slli.d     %[stride_2],     %[srcStride],   1            \n\t"
    "add.d      %[stride_3],     %[stride_2],    %[srcStride] \n\t"
    "slli.d     %[stride_4],     %[stride_2],    1            \n\t"
    "vld        $vr0,            %[src],         0            \n\t"
    "vldx       $vr1,            %[src],         %[srcStride] \n\t"
    "vldx       $vr2,            %[src],         %[stride_2]  \n\t"
    "vldx       $vr3,            %[src],         %[stride_3]  \n\t"
    "add.d      %[src],          %[src],         %[stride_4]  \n\t"
    "vld        $vr4,            %[src],         0            \n\t"
    "vldx       $vr5,            %[src],         %[srcStride] \n\t"
    "vldx       $vr6,            %[src],         %[stride_2]  \n\t"
    "vldx       $vr7,            %[src],         %[stride_3]  \n\t"

    "vld        $vr8,            %[half],        0x00         \n\t"
    "vld        $vr9,            %[half],        0x08         \n\t"
    "vld        $vr10,           %[half],        0x10         \n\t"
    "vld        $vr11,           %[half],        0x18         \n\t"
    "vld        $vr12,           %[half],        0x20         \n\t"
    "vld        $vr13,           %[half],        0x28         \n\t"
    "vld        $vr14,           %[half],        0x30         \n\t"
    "vld        $vr15,           %[half],        0x38         \n\t"

    "vavgr.bu   $vr0,            $vr8,           $vr0         \n\t"
    "vavgr.bu   $vr1,            $vr9,           $vr1         \n\t"
    "vavgr.bu   $vr2,            $vr10,          $vr2         \n\t"
    "vavgr.bu   $vr3,            $vr11,          $vr3         \n\t"
    "vavgr.bu   $vr4,            $vr12,          $vr4         \n\t"
    "vavgr.bu   $vr5,            $vr13,          $vr5         \n\t"
    "vavgr.bu   $vr6,            $vr14,          $vr6         \n\t"
    "vavgr.bu   $vr7,            $vr15,          $vr7         \n\t"

    "vstelm.d   $vr0,            %[dst],         0,  0        \n\t"
    "add.d      %[dst],          %[dst],         %[dstStride] \n\t"
    "vstelm.d   $vr1,            %[dst],         0,  0        \n\t"
    "add.d      %[dst],          %[dst],         %[dstStride] \n\t"
    "vstelm.d   $vr2,            %[dst],         0,  0        \n\t"
    "add.d      %[dst],          %[dst],         %[dstStride] \n\t"
    "vstelm.d   $vr3,            %[dst],         0,  0        \n\t"
    "add.d      %[dst],          %[dst],         %[dstStride] \n\t"
    "vstelm.d   $vr4,            %[dst],         0,  0        \n\t"
    "add.d      %[dst],          %[dst],         %[dstStride] \n\t"
    "vstelm.d   $vr5,            %[dst],         0,  0        \n\t"
    "add.d      %[dst],          %[dst],         %[dstStride] \n\t"
    "vstelm.d   $vr6,            %[dst],         0,  0        \n\t"
    "add.d      %[dst],          %[dst],         %[dstStride] \n\t"
    "vstelm.d   $vr7,            %[dst],         0,  0        \n\t"
    : [dst]"+&r"(dst), [half]"+&r"(half), [src]"+&r"(src),
      [stride_2]"=&r"(stride_2),  [stride_3]"=&r"(stride_3),
      [stride_4]"=&r"(stride_4)
    : [srcStride]"r"(srcStride), [dstStride]"r"(dstStride)
    : "memory"
    );
}

/* avg_pixels8_8_lsx   : dst = avg(src, dst)
 * put_pixels8_l2_8_lsx: dst = avg(src, half), half stride is 8.
 * avg_pixels8_l2_8_lsx: dst = avg(avg(src, half), dst), half stride is 8. */
static av_always_inline void
avg_pixels8_l2_8_lsx(uint8_t *dst, const uint8_t *src, const uint8_t *half,
                     ptrdiff_t dstStride, ptrdiff_t srcStride)
{
    uint8_t *tmp = dst;
    ptrdiff_t stride_2, stride_3, stride_4;
    __asm__ volatile (
    /* h0~h7 */
    "slli.d     %[stride_2],     %[srcStride],   1            \n\t"
    "add.d      %[stride_3],     %[stride_2],    %[srcStride] \n\t"
    "slli.d     %[stride_4],     %[stride_2],    1            \n\t"
    "vld        $vr0,            %[src],         0            \n\t"
    "vldx       $vr1,            %[src],         %[srcStride] \n\t"
    "vldx       $vr2,            %[src],         %[stride_2]  \n\t"
    "vldx       $vr3,            %[src],         %[stride_3]  \n\t"
    "add.d      %[src],          %[src],         %[stride_4]  \n\t"
    "vld        $vr4,            %[src],         0            \n\t"
    "vldx       $vr5,            %[src],         %[srcStride] \n\t"
    "vldx       $vr6,            %[src],         %[stride_2]  \n\t"
    "vldx       $vr7,            %[src],         %[stride_3]  \n\t"

    "vld        $vr8,            %[half],        0x00         \n\t"
    "vld        $vr9,            %[half],        0x08         \n\t"
    "vld        $vr10,           %[half],        0x10         \n\t"
    "vld        $vr11,           %[half],        0x18         \n\t"
    "vld        $vr12,           %[half],        0x20         \n\t"
    "vld        $vr13,           %[half],        0x28         \n\t"
    "vld        $vr14,           %[half],        0x30         \n\t"
    "vld        $vr15,           %[half],        0x38         \n\t"

    "vavgr.bu    $vr0,           $vr8,           $vr0         \n\t"
    "vavgr.bu    $vr1,           $vr9,           $vr1         \n\t"
    "vavgr.bu    $vr2,           $vr10,          $vr2         \n\t"
    "vavgr.bu    $vr3,           $vr11,          $vr3         \n\t"
    "vavgr.bu    $vr4,           $vr12,          $vr4         \n\t"
    "vavgr.bu    $vr5,           $vr13,          $vr5         \n\t"
    "vavgr.bu    $vr6,           $vr14,          $vr6         \n\t"
    "vavgr.bu    $vr7,           $vr15,          $vr7         \n\t"

    "slli.d     %[stride_2],     %[dstStride],   1            \n\t"
    "add.d      %[stride_3],     %[stride_2],    %[dstStride] \n\t"
    "slli.d     %[stride_4],     %[stride_2],    1            \n\t"
    "vld        $vr8,            %[tmp],         0            \n\t"
    "vldx       $vr9,            %[tmp],         %[dstStride] \n\t"
    "vldx       $vr10,           %[tmp],         %[stride_2]  \n\t"
    "vldx       $vr11,           %[tmp],         %[stride_3]  \n\t"
    "add.d      %[tmp],          %[tmp],         %[stride_4]  \n\t"
    "vld        $vr12,           %[tmp],         0            \n\t"
    "vldx       $vr13,           %[tmp],         %[dstStride] \n\t"
    "vldx       $vr14,           %[tmp],         %[stride_2]  \n\t"
    "vldx       $vr15,           %[tmp],         %[stride_3]  \n\t"

    "vavgr.bu    $vr0,           $vr8,           $vr0         \n\t"
    "vavgr.bu    $vr1,           $vr9,           $vr1         \n\t"
    "vavgr.bu    $vr2,           $vr10,          $vr2         \n\t"
    "vavgr.bu    $vr3,           $vr11,          $vr3         \n\t"
    "vavgr.bu    $vr4,           $vr12,          $vr4         \n\t"
    "vavgr.bu    $vr5,           $vr13,          $vr5         \n\t"
    "vavgr.bu    $vr6,           $vr14,          $vr6         \n\t"
    "vavgr.bu    $vr7,           $vr15,          $vr7         \n\t"

    "vstelm.d    $vr0,           %[dst],         0,  0        \n\t"
    "add.d       %[dst],         %[dst],         %[dstStride] \n\t"
    "vstelm.d    $vr1,           %[dst],         0,  0        \n\t"
    "add.d       %[dst],         %[dst],         %[dstStride] \n\t"
    "vstelm.d    $vr2,           %[dst],         0,  0        \n\t"
    "add.d       %[dst],         %[dst],         %[dstStride] \n\t"
    "vstelm.d    $vr3,           %[dst],         0,  0        \n\t"
    "add.d       %[dst],         %[dst],         %[dstStride] \n\t"
    "vstelm.d    $vr4,           %[dst],         0,  0        \n\t"
    "add.d       %[dst],         %[dst],         %[dstStride] \n\t"
    "vstelm.d    $vr5,           %[dst],         0,  0        \n\t"
    "add.d       %[dst],         %[dst],         %[dstStride] \n\t"
    "vstelm.d    $vr6,           %[dst],         0,  0        \n\t"
    "add.d       %[dst],         %[dst],         %[dstStride] \n\t"
    "vstelm.d    $vr7,           %[dst],         0,  0        \n\t"
    : [dst]"+&r"(dst), [tmp]"+&r"(tmp), [half]"+&r"(half),
      [src]"+&r"(src), [stride_2]"=&r"(stride_2),
      [stride_3]"=&r"(stride_3), [stride_4]"=&r"(stride_4)
    : [dstStride]"r"(dstStride), [srcStride]"r"(srcStride)
    : "memory"
    );
}

/* put_pixels16_8_lsx: dst = src */
static av_always_inline void
put_pixels16_8_lsx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
{
    ptrdiff_t stride_2, stride_3, stride_4;
    __asm__ volatile (
    "slli.d     %[stride_2],     %[stride],      1            \n\t"
    "add.d      %[stride_3],     %[stride_2],    %[stride]    \n\t"
    "slli.d     %[stride_4],     %[stride_2],    1            \n\t"
    "vld        $vr0,            %[src],         0            \n\t"
    "vldx       $vr1,            %[src],         %[stride]    \n\t"
    "vldx       $vr2,            %[src],         %[stride_2]  \n\t"
    "vldx       $vr3,            %[src],         %[stride_3]  \n\t"
    "add.d      %[src],          %[src],         %[stride_4]  \n\t"
    "vld        $vr4,            %[src],         0            \n\t"
    "vldx       $vr5,            %[src],         %[stride]    \n\t"
    "vldx       $vr6,            %[src],         %[stride_2]  \n\t"
    "vldx       $vr7,            %[src],         %[stride_3]  \n\t"
    "add.d      %[src],          %[src],         %[stride_4]  \n\t"

    "vst        $vr0,            %[dst],         0            \n\t"
    "vstx       $vr1,            %[dst],         %[stride]    \n\t"
    "vstx       $vr2,            %[dst],         %[stride_2]  \n\t"
    "vstx       $vr3,            %[dst],         %[stride_3]  \n\t"
    "add.d      %[dst],          %[dst],         %[stride_4]  \n\t"
    "vst        $vr4,            %[dst],         0            \n\t"
    "vstx       $vr5,            %[dst],         %[stride]    \n\t"
    "vstx       $vr6,            %[dst],         %[stride_2]  \n\t"
    "vstx       $vr7,            %[dst],         %[stride_3]  \n\t"
    "add.d      %[dst],          %[dst],         %[stride_4]  \n\t"

    "vld        $vr0,            %[src],         0            \n\t"
    "vldx       $vr1,            %[src],         %[stride]    \n\t"
    "vldx       $vr2,            %[src],         %[stride_2]  \n\t"
    "vldx       $vr3,            %[src],         %[stride_3]  \n\t"
    "add.d      %[src],          %[src],         %[stride_4]  \n\t"
    "vld        $vr4,            %[src],         0            \n\t"
    "vldx       $vr5,            %[src],         %[stride]    \n\t"
    "vldx       $vr6,            %[src],         %[stride_2]  \n\t"
    "vldx       $vr7,            %[src],         %[stride_3]  \n\t"

    "vst        $vr0,            %[dst],         0            \n\t"
    "vstx       $vr1,            %[dst],         %[stride]    \n\t"
    "vstx       $vr2,            %[dst],         %[stride_2]  \n\t"
    "vstx       $vr3,            %[dst],         %[stride_3]  \n\t"
    "add.d      %[dst],          %[dst],         %[stride_4]  \n\t"
    "vst        $vr4,            %[dst],         0            \n\t"
    "vstx       $vr5,            %[dst],         %[stride]    \n\t"
    "vstx       $vr6,            %[dst],         %[stride_2]  \n\t"
    "vstx       $vr7,            %[dst],         %[stride_3]  \n\t"
    : [dst]"+&r"(dst),            [src]"+&r"(src),
      [stride_2]"=&r"(stride_2),  [stride_3]"=&r"(stride_3),
      [stride_4]"=&r"(stride_4)
    : [stride]"r"(stride)
    : "memory"
    );
}

/* avg_pixels16_8_lsx   : dst = avg(src, dst)
 * put_pixels16_l2_8_lsx: dst = avg(src, half), half stride is 16.
 * avg_pixels16_l2_8_lsx: dst = avg(avg(src, half), dst), half stride is 16. */
static av_always_inline void
avg_pixels16_8_lsx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
{
    uint8_t *tmp = dst;
    ptrdiff_t stride_2, stride_3, stride_4;
    __asm__ volatile (
    /* h0~h7 */
    "slli.d     %[stride_2],     %[stride],      1            \n\t"
    "add.d      %[stride_3],     %[stride_2],    %[stride]    \n\t"
    "slli.d     %[stride_4],     %[stride_2],    1            \n\t"
    "vld        $vr0,            %[src],         0            \n\t"
    "vldx       $vr1,            %[src],         %[stride]    \n\t"
    "vldx       $vr2,            %[src],         %[stride_2]  \n\t"
    "vldx       $vr3,            %[src],         %[stride_3]  \n\t"
    "add.d      %[src],          %[src],         %[stride_4]  \n\t"
    "vld        $vr4,            %[src],         0            \n\t"
    "vldx       $vr5,            %[src],         %[stride]    \n\t"
    "vldx       $vr6,            %[src],         %[stride_2]  \n\t"
    "vldx       $vr7,            %[src],         %[stride_3]  \n\t"
    "add.d      %[src],          %[src],         %[stride_4]  \n\t"

    "vld        $vr8,            %[tmp],         0            \n\t"
    "vldx       $vr9,            %[tmp],         %[stride]    \n\t"
    "vldx       $vr10,           %[tmp],         %[stride_2]  \n\t"
    "vldx       $vr11,           %[tmp],         %[stride_3]  \n\t"
    "add.d      %[tmp],          %[tmp],         %[stride_4]  \n\t"
    "vld        $vr12,           %[tmp],         0            \n\t"
    "vldx       $vr13,           %[tmp],         %[stride]    \n\t"
    "vldx       $vr14,           %[tmp],         %[stride_2]  \n\t"
    "vldx       $vr15,           %[tmp],         %[stride_3]  \n\t"
    "add.d      %[tmp],          %[tmp],         %[stride_4]  \n\t"

    "vavgr.bu   $vr0,            $vr8,           $vr0         \n\t"
    "vavgr.bu   $vr1,            $vr9,           $vr1         \n\t"
    "vavgr.bu   $vr2,            $vr10,          $vr2         \n\t"
    "vavgr.bu   $vr3,            $vr11,          $vr3         \n\t"
    "vavgr.bu   $vr4,            $vr12,          $vr4         \n\t"
    "vavgr.bu   $vr5,            $vr13,          $vr5         \n\t"
    "vavgr.bu   $vr6,            $vr14,          $vr6         \n\t"
    "vavgr.bu   $vr7,            $vr15,          $vr7         \n\t"

    "vst        $vr0,            %[dst],         0            \n\t"
    "vstx       $vr1,            %[dst],         %[stride]    \n\t"
    "vstx       $vr2,            %[dst],         %[stride_2]  \n\t"
    "vstx       $vr3,            %[dst],         %[stride_3]  \n\t"
    "add.d      %[dst],          %[dst],         %[stride_4]  \n\t"
    "vst        $vr4,            %[dst],         0            \n\t"
    "vstx       $vr5,            %[dst],         %[stride]    \n\t"
    "vstx       $vr6,            %[dst],         %[stride_2]  \n\t"
    "vstx       $vr7,            %[dst],         %[stride_3]  \n\t"
    "add.d      %[dst],          %[dst],         %[stride_4]  \n\t"

    /* h8~h15 */
    "vld        $vr0,            %[src],         0            \n\t"
    "vldx       $vr1,            %[src],         %[stride]    \n\t"
    "vldx       $vr2,            %[src],         %[stride_2]  \n\t"
    "vldx       $vr3,            %[src],         %[stride_3]  \n\t"
    "add.d      %[src],          %[src],         %[stride_4]  \n\t"
    "vld        $vr4,            %[src],         0            \n\t"
    "vldx       $vr5,            %[src],         %[stride]    \n\t"
    "vldx       $vr6,            %[src],         %[stride_2]  \n\t"
    "vldx       $vr7,            %[src],         %[stride_3]  \n\t"

    "vld        $vr8,            %[tmp],         0            \n\t"
    "vldx       $vr9,            %[tmp],         %[stride]    \n\t"
    "vldx       $vr10,           %[tmp],         %[stride_2]  \n\t"
    "vldx       $vr11,           %[tmp],         %[stride_3]  \n\t"
    "add.d      %[tmp],          %[tmp],         %[stride_4]  \n\t"
    "vld        $vr12,           %[tmp],         0            \n\t"
    "vldx       $vr13,           %[tmp],         %[stride]    \n\t"
    "vldx       $vr14,           %[tmp],         %[stride_2]  \n\t"
    "vldx       $vr15,           %[tmp],         %[stride_3]  \n\t"

    "vavgr.bu    $vr0,           $vr8,           $vr0         \n\t"
    "vavgr.bu    $vr1,           $vr9,           $vr1         \n\t"
    "vavgr.bu    $vr2,           $vr10,          $vr2         \n\t"
    "vavgr.bu    $vr3,           $vr11,          $vr3         \n\t"
    "vavgr.bu    $vr4,           $vr12,          $vr4         \n\t"
    "vavgr.bu    $vr5,           $vr13,          $vr5         \n\t"
    "vavgr.bu    $vr6,           $vr14,          $vr6         \n\t"
    "vavgr.bu    $vr7,           $vr15,          $vr7         \n\t"

    "vst        $vr0,            %[dst],         0            \n\t"
    "vstx       $vr1,            %[dst],         %[stride]    \n\t"
    "vstx       $vr2,            %[dst],         %[stride_2]  \n\t"
    "vstx       $vr3,            %[dst],         %[stride_3]  \n\t"
    "add.d      %[dst],          %[dst],         %[stride_4]  \n\t"
    "vst        $vr4,            %[dst],         0            \n\t"
    "vstx       $vr5,            %[dst],         %[stride]    \n\t"
    "vstx       $vr6,            %[dst],         %[stride_2]  \n\t"
    "vstx       $vr7,            %[dst],         %[stride_3]  \n\t"
    : [dst]"+&r"(dst), [tmp]"+&r"(tmp), [src]"+&r"(src),
      [stride_2]"=&r"(stride_2),  [stride_3]"=&r"(stride_3),
      [stride_4]"=&r"(stride_4)
    : [stride]"r"(stride)
    : "memory"
    );
}

/* avg_pixels16_8_lsx   : dst = avg(src, dst)
 * put_pixels16_l2_8_lsx: dst = avg(src, half), half stride is 16.
 * avg_pixels16_l2_8_lsx: dst = avg(avg(src, half), dst), half stride is 16. */
static av_always_inline void
put_pixels16_l2_8_lsx(uint8_t *dst, const uint8_t *src, uint8_t *half,
                      ptrdiff_t dstStride, ptrdiff_t srcStride)
{
    ptrdiff_t stride_2, stride_3, stride_4;
    ptrdiff_t dstride_2, dstride_3, dstride_4;
    __asm__ volatile (
    "slli.d     %[stride_2],     %[srcStride],   1            \n\t"
    "add.d      %[stride_3],     %[stride_2],    %[srcStride] \n\t"
    "slli.d     %[stride_4],     %[stride_2],    1            \n\t"
    "slli.d     %[dstride_2],    %[dstStride],   1            \n\t"
    "add.d      %[dstride_3],    %[dstride_2],   %[dstStride] \n\t"
    "slli.d     %[dstride_4],    %[dstride_2],   1            \n\t"
    /* h0~h7 */
    "vld        $vr0,            %[src],         0            \n\t"
    "vldx       $vr1,            %[src],         %[srcStride] \n\t"
    "vldx       $vr2,            %[src],         %[stride_2]  \n\t"
    "vldx       $vr3,            %[src],         %[stride_3]  \n\t"
    "add.d      %[src],          %[src],         %[stride_4]  \n\t"
    "vld        $vr4,            %[src],         0            \n\t"
    "vldx       $vr5,            %[src],         %[srcStride] \n\t"
    "vldx       $vr6,            %[src],         %[stride_2]  \n\t"
    "vldx       $vr7,            %[src],         %[stride_3]  \n\t"
    "add.d      %[src],          %[src],         %[stride_4]  \n\t"

    "vld        $vr8,            %[half],        0x00         \n\t"
    "vld        $vr9,            %[half],        0x10         \n\t"
    "vld        $vr10,           %[half],        0x20         \n\t"
    "vld        $vr11,           %[half],        0x30         \n\t"
    "vld        $vr12,           %[half],        0x40         \n\t"
    "vld        $vr13,           %[half],        0x50         \n\t"
    "vld        $vr14,           %[half],        0x60         \n\t"
    "vld        $vr15,           %[half],        0x70         \n\t"

    "vavgr.bu   $vr0,            $vr8,           $vr0         \n\t"
    "vavgr.bu   $vr1,            $vr9,           $vr1         \n\t"
    "vavgr.bu   $vr2,            $vr10,          $vr2         \n\t"
    "vavgr.bu   $vr3,            $vr11,          $vr3         \n\t"
    "vavgr.bu   $vr4,            $vr12,          $vr4         \n\t"
    "vavgr.bu   $vr5,            $vr13,          $vr5         \n\t"
    "vavgr.bu   $vr6,            $vr14,          $vr6         \n\t"
    "vavgr.bu   $vr7,            $vr15,          $vr7         \n\t"

    "vst        $vr0,            %[dst],         0            \n\t"
    "vstx       $vr1,            %[dst],         %[dstStride] \n\t"
    "vstx       $vr2,            %[dst],         %[dstride_2] \n\t"
    "vstx       $vr3,            %[dst],         %[dstride_3] \n\t"
    "add.d      %[dst],          %[dst],         %[dstride_4] \n\t"
    "vst        $vr4,            %[dst],         0            \n\t"
    "vstx       $vr5,            %[dst],         %[dstStride] \n\t"
    "vstx       $vr6,            %[dst],         %[dstride_2] \n\t"
    "vstx       $vr7,            %[dst],         %[dstride_3] \n\t"
    "add.d      %[dst],          %[dst],         %[dstride_4] \n\t"

    /* h8~h15 */
    "vld        $vr0,            %[src],         0            \n\t"
    "vldx       $vr1,            %[src],         %[srcStride] \n\t"
    "vldx       $vr2,            %[src],         %[stride_2]  \n\t"
    "vldx       $vr3,            %[src],         %[stride_3]  \n\t"
    "add.d      %[src],          %[src],         %[stride_4]  \n\t"
    "vld        $vr4,            %[src],         0            \n\t"
    "vldx       $vr5,            %[src],         %[srcStride] \n\t"
    "vldx       $vr6,            %[src],         %[stride_2]  \n\t"
    "vldx       $vr7,            %[src],         %[stride_3]  \n\t"

    "vld        $vr8,            %[half],        0x80         \n\t"
    "vld        $vr9,            %[half],        0x90         \n\t"
    "vld        $vr10,           %[half],        0xa0         \n\t"
    "vld        $vr11,           %[half],        0xb0         \n\t"
    "vld        $vr12,           %[half],        0xc0         \n\t"
    "vld        $vr13,           %[half],        0xd0         \n\t"
    "vld        $vr14,           %[half],        0xe0         \n\t"
    "vld        $vr15,           %[half],        0xf0         \n\t"

    "vavgr.bu   $vr0,            $vr8,           $vr0         \n\t"
    "vavgr.bu   $vr1,            $vr9,           $vr1         \n\t"
    "vavgr.bu   $vr2,            $vr10,          $vr2         \n\t"
    "vavgr.bu   $vr3,            $vr11,          $vr3         \n\t"
    "vavgr.bu   $vr4,            $vr12,          $vr4         \n\t"
    "vavgr.bu   $vr5,            $vr13,          $vr5         \n\t"
    "vavgr.bu   $vr6,            $vr14,          $vr6         \n\t"
    "vavgr.bu   $vr7,            $vr15,          $vr7         \n\t"

    "vst        $vr0,            %[dst],         0            \n\t"
    "vstx       $vr1,            %[dst],         %[dstStride] \n\t"
    "vstx       $vr2,            %[dst],         %[dstride_2] \n\t"
    "vstx       $vr3,            %[dst],         %[dstride_3] \n\t"
    "add.d      %[dst],          %[dst],         %[dstride_4] \n\t"
    "vst        $vr4,            %[dst],         0            \n\t"
    "vstx       $vr5,            %[dst],         %[dstStride] \n\t"
    "vstx       $vr6,            %[dst],         %[dstride_2] \n\t"
    "vstx       $vr7,            %[dst],         %[dstride_3] \n\t"
    : [dst]"+&r"(dst), [half]"+&r"(half), [src]"+&r"(src),
      [stride_2]"=&r"(stride_2),  [stride_3]"=&r"(stride_3),
      [stride_4]"=&r"(stride_4),  [dstride_2]"=&r"(dstride_2),
      [dstride_3]"=&r"(dstride_3), [dstride_4]"=&r"(dstride_4)
    : [dstStride]"r"(dstStride), [srcStride]"r"(srcStride)
    : "memory"
    );
}

/* avg_pixels16_8_lsx   : dst = avg(src, dst)
 * put_pixels16_l2_8_lsx: dst = avg(src, half), half stride is 16.
 * avg_pixels16_l2_8_lsx: dst = avg(avg(src, half), dst), half stride is 16. */
839 static av_always_inline void
avg_pixels16_l2_8_lsx(uint8_t *dst, const uint8_t *src, uint8_t *half, ptrdiff_t dstStride, ptrdiff_t srcStride)840 avg_pixels16_l2_8_lsx(uint8_t *dst, const uint8_t *src, uint8_t *half,
841                       ptrdiff_t dstStride, ptrdiff_t srcStride)
842 {
843     uint8_t *tmp = dst;
844     ptrdiff_t stride_2, stride_3, stride_4;
845     ptrdiff_t dstride_2, dstride_3, dstride_4;
846     __asm__ volatile (
847     "slli.d     %[stride_2],     %[srcStride],   1            \n\t"
848     "add.d      %[stride_3],     %[stride_2],    %[srcStride] \n\t"
849     "slli.d     %[stride_4],     %[stride_2],    1            \n\t"
850     "slli.d     %[dstride_2],    %[dstStride],   1            \n\t"
851     "add.d      %[dstride_3],    %[dstride_2],   %[dstStride] \n\t"
852     "slli.d     %[dstride_4],    %[dstride_2],   1            \n\t"
853     /* h0~h7 */
854     "vld        $vr0,            %[src],         0            \n\t"
855     "vldx       $vr1,            %[src],         %[srcStride] \n\t"
856     "vldx       $vr2,            %[src],         %[stride_2]  \n\t"
857     "vldx       $vr3,            %[src],         %[stride_3]  \n\t"
858     "add.d      %[src],          %[src],         %[stride_4]  \n\t"
859     "vld        $vr4,            %[src],         0            \n\t"
860     "vldx       $vr5,            %[src],         %[srcStride] \n\t"
861     "vldx       $vr6,            %[src],         %[stride_2]  \n\t"
862     "vldx       $vr7,            %[src],         %[stride_3]  \n\t"
863     "add.d      %[src],          %[src],         %[stride_4]  \n\t"
864 
865     "vld        $vr8,            %[half],        0x00         \n\t"
866     "vld        $vr9,            %[half],        0x10         \n\t"
867     "vld        $vr10,           %[half],        0x20         \n\t"
868     "vld        $vr11,           %[half],        0x30         \n\t"
869     "vld        $vr12,           %[half],        0x40         \n\t"
870     "vld        $vr13,           %[half],        0x50         \n\t"
871     "vld        $vr14,           %[half],        0x60         \n\t"
872     "vld        $vr15,           %[half],        0x70         \n\t"
873 
874     "vavgr.bu   $vr0,            $vr8,           $vr0         \n\t"
875     "vavgr.bu   $vr1,            $vr9,           $vr1         \n\t"
876     "vavgr.bu   $vr2,            $vr10,          $vr2         \n\t"
877     "vavgr.bu   $vr3,            $vr11,          $vr3         \n\t"
878     "vavgr.bu   $vr4,            $vr12,          $vr4         \n\t"
879     "vavgr.bu   $vr5,            $vr13,          $vr5         \n\t"
880     "vavgr.bu   $vr6,            $vr14,          $vr6         \n\t"
881     "vavgr.bu   $vr7,            $vr15,          $vr7         \n\t"
882 
883     "vld        $vr8,            %[tmp],         0            \n\t"
884     "vldx       $vr9,            %[tmp],         %[dstStride] \n\t"
885     "vldx       $vr10,           %[tmp],         %[dstride_2] \n\t"
886     "vldx       $vr11,           %[tmp],         %[dstride_3] \n\t"
887     "add.d      %[tmp],          %[tmp],         %[dstride_4] \n\t"
888     "vld        $vr12,           %[tmp],         0            \n\t"
889     "vldx       $vr13,           %[tmp],         %[dstStride] \n\t"
890     "vldx       $vr14,           %[tmp],         %[dstride_2] \n\t"
891     "vldx       $vr15,           %[tmp],         %[dstride_3] \n\t"
892     "add.d      %[tmp],          %[tmp],         %[dstride_4] \n\t"
893 
894     "vavgr.bu    $vr0,           $vr8,           $vr0         \n\t"
895     "vavgr.bu    $vr1,           $vr9,           $vr1         \n\t"
896     "vavgr.bu    $vr2,           $vr10,          $vr2         \n\t"
897     "vavgr.bu    $vr3,           $vr11,          $vr3         \n\t"
898     "vavgr.bu    $vr4,           $vr12,          $vr4         \n\t"
899     "vavgr.bu    $vr5,           $vr13,          $vr5         \n\t"
900     "vavgr.bu    $vr6,           $vr14,          $vr6         \n\t"
901     "vavgr.bu    $vr7,           $vr15,          $vr7         \n\t"
902 
903     "vst        $vr0,            %[dst],         0            \n\t"
904     "vstx       $vr1,            %[dst],         %[dstStride] \n\t"
905     "vstx       $vr2,            %[dst],         %[dstride_2] \n\t"
906     "vstx       $vr3,            %[dst],         %[dstride_3] \n\t"
907     "add.d      %[dst],          %[dst],         %[dstride_4] \n\t"
908     "vst        $vr4,            %[dst],         0            \n\t"
909     "vstx       $vr5,            %[dst],         %[dstStride] \n\t"
910     "vstx       $vr6,            %[dst],         %[dstride_2] \n\t"
911     "vstx       $vr7,            %[dst],         %[dstride_3] \n\t"
912     "add.d      %[dst],          %[dst],         %[dstride_4] \n\t"
913 
914     /* h8~h15    */
915     "vld        $vr0,            %[src],         0            \n\t"
916     "vldx       $vr1,            %[src],         %[srcStride] \n\t"
917     "vldx       $vr2,            %[src],         %[stride_2]  \n\t"
918     "vldx       $vr3,            %[src],         %[stride_3]  \n\t"
919     "add.d      %[src],          %[src],         %[stride_4]  \n\t"
920     "vld        $vr4,            %[src],         0            \n\t"
921     "vldx       $vr5,            %[src],         %[srcStride] \n\t"
922     "vldx       $vr6,            %[src],         %[stride_2]  \n\t"
923     "vldx       $vr7,            %[src],         %[stride_3]  \n\t"
924 
925     "vld        $vr8,            %[half],        0x80         \n\t"
926     "vld        $vr9,            %[half],        0x90         \n\t"
927     "vld        $vr10,           %[half],        0xa0         \n\t"
928     "vld        $vr11,           %[half],        0xb0         \n\t"
929     "vld        $vr12,           %[half],        0xc0         \n\t"
930     "vld        $vr13,           %[half],        0xd0         \n\t"
931     "vld        $vr14,           %[half],        0xe0         \n\t"
932     "vld        $vr15,           %[half],        0xf0         \n\t"
933 
934     "vavgr.bu    $vr0,           $vr8,           $vr0         \n\t"
935     "vavgr.bu    $vr1,           $vr9,           $vr1         \n\t"
936     "vavgr.bu    $vr2,           $vr10,          $vr2         \n\t"
937     "vavgr.bu    $vr3,           $vr11,          $vr3         \n\t"
938     "vavgr.bu    $vr4,           $vr12,          $vr4         \n\t"
939     "vavgr.bu    $vr5,           $vr13,          $vr5         \n\t"
940     "vavgr.bu    $vr6,           $vr14,          $vr6         \n\t"
941     "vavgr.bu    $vr7,           $vr15,          $vr7         \n\t"
942 
943     "vld        $vr8,            %[tmp],         0            \n\t"
944     "vldx       $vr9,            %[tmp],         %[dstStride] \n\t"
945     "vldx       $vr10,           %[tmp],         %[dstride_2] \n\t"
946     "vldx       $vr11,           %[tmp],         %[dstride_3] \n\t"
947     "add.d      %[tmp],          %[tmp],         %[dstride_4] \n\t"
948     "vld        $vr12,           %[tmp],         0            \n\t"
949     "vldx       $vr13,           %[tmp],         %[dstStride] \n\t"
950     "vldx       $vr14,           %[tmp],         %[dstride_2] \n\t"
951     "vldx       $vr15,           %[tmp],         %[dstride_3] \n\t"
952 
953     "vavgr.bu    $vr0,           $vr8,           $vr0         \n\t"
954     "vavgr.bu    $vr1,           $vr9,           $vr1         \n\t"
955     "vavgr.bu    $vr2,           $vr10,          $vr2         \n\t"
956     "vavgr.bu    $vr3,           $vr11,          $vr3         \n\t"
957     "vavgr.bu    $vr4,           $vr12,          $vr4         \n\t"
958     "vavgr.bu    $vr5,           $vr13,          $vr5         \n\t"
959     "vavgr.bu    $vr6,           $vr14,          $vr6         \n\t"
960     "vavgr.bu    $vr7,           $vr15,          $vr7         \n\t"
961 
962     "vst        $vr0,            %[dst],         0            \n\t"
963     "vstx       $vr1,            %[dst],         %[dstStride] \n\t"
964     "vstx       $vr2,            %[dst],         %[dstride_2] \n\t"
965     "vstx       $vr3,            %[dst],         %[dstride_3] \n\t"
966     "add.d      %[dst],          %[dst],         %[dstride_4] \n\t"
967     "vst        $vr4,            %[dst],         0            \n\t"
968     "vstx       $vr5,            %[dst],         %[dstStride] \n\t"
969     "vstx       $vr6,            %[dst],         %[dstride_2] \n\t"
970     "vstx       $vr7,            %[dst],         %[dstride_3] \n\t"
971     : [dst]"+&r"(dst), [tmp]"+&r"(tmp), [half]"+&r"(half), [src]"+&r"(src),
972       [stride_2]"=&r"(stride_2),  [stride_3]"=&r"(stride_3),
973       [stride_4]"=&r"(stride_4),  [dstride_2]"=&r"(dstride_2),
974       [dstride_3]"=&r"(dstride_3), [dstride_4]"=&r"(dstride_4)
975     : [dstStride]"r"(dstStride), [srcStride]"r"(srcStride)
976     : "memory"
977     );
978 }
979 
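/*
 * QPEL8_H_LOWPASS below applies the H.264 6-tap luma half-pel filter
 * (1, -5, 20, 20, -5, 1) horizontally.  Two consecutive rows are packed
 * into the two 128-bit lanes of one LASX register, the mask1..mask5
 * shuffles gather the shifted neighbourhoods of each pixel, and the sum
 * is rounded with +16 and narrowed with a saturating >> 5.
 *
 * Rough scalar equivalent for one output pixel (reference sketch only,
 * using av_clip_uint8() from libavutil/common.h):
 *
 *     dst[x] = av_clip_uint8((20 * (src[x]     + src[x + 1]) -
 *                              5 * (src[x - 1] + src[x + 2]) +
 *                                  (src[x - 2] + src[x + 3]) + 16) >> 5);
 */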
980 #define QPEL8_H_LOWPASS(out_v)                                               \
981     src00 = __lasx_xvld(src, - 2);                                           \
982     src += srcStride;                                                        \
983     src10 = __lasx_xvld(src, - 2);                                           \
984     src += srcStride;                                                        \
985     src00 = __lasx_xvpermi_q(src00, src10, 0x02);                            \
986     src01 = __lasx_xvshuf_b(src00, src00, (__m256i)mask1);                   \
987     src02 = __lasx_xvshuf_b(src00, src00, (__m256i)mask2);                   \
988     src03 = __lasx_xvshuf_b(src00, src00, (__m256i)mask3);                   \
989     src04 = __lasx_xvshuf_b(src00, src00, (__m256i)mask4);                   \
990     src05 = __lasx_xvshuf_b(src00, src00, (__m256i)mask5);                   \
991     DUP2_ARG2(__lasx_xvaddwl_h_bu, src02, src03, src01, src04, src02, src01);\
992     src00 = __lasx_xvaddwl_h_bu(src00, src05);                               \
993     src02 = __lasx_xvmul_h(src02, h_20);                                     \
994     src01 = __lasx_xvmul_h(src01, h_5);                                      \
995     src02 = __lasx_xvssub_h(src02, src01);                                   \
996     src02 = __lasx_xvsadd_h(src02, src00);                                   \
997     src02 = __lasx_xvsadd_h(src02, h_16);                                    \
998     out_v = __lasx_xvssrani_bu_h(src02, src02, 5);                           \
999 
1000 static av_always_inline void
1001 put_h264_qpel8_h_lowpass_lasx(uint8_t *dst, const uint8_t *src, int dstStride,
1002                               int srcStride)
1003 {
1004     int dstStride_2x = dstStride << 1;
1005     __m256i src00, src01, src02, src03, src04, src05, src10;
1006     __m256i out0, out1, out2, out3;
1007     __m256i h_20 = __lasx_xvldi(0x414);
1008     __m256i h_5  = __lasx_xvldi(0x405);
1009     __m256i h_16 = __lasx_xvldi(0x410);
1010     __m256i mask1 = {0x0807060504030201, 0x0, 0x0807060504030201, 0x0};
1011     __m256i mask2 = {0x0908070605040302, 0x0, 0x0908070605040302, 0x0};
1012     __m256i mask3 = {0x0a09080706050403, 0x0, 0x0a09080706050403, 0x0};
1013     __m256i mask4 = {0x0b0a090807060504, 0x0, 0x0b0a090807060504, 0x0};
1014     __m256i mask5 = {0x0c0b0a0908070605, 0x0, 0x0c0b0a0908070605, 0x0};
1015 
1016     QPEL8_H_LOWPASS(out0)
1017     QPEL8_H_LOWPASS(out1)
1018     QPEL8_H_LOWPASS(out2)
1019     QPEL8_H_LOWPASS(out3)
1020     __lasx_xvstelm_d(out0, dst, 0, 0);
1021     __lasx_xvstelm_d(out0, dst + dstStride, 0, 2);
1022     dst += dstStride_2x;
1023     __lasx_xvstelm_d(out1, dst, 0, 0);
1024     __lasx_xvstelm_d(out1, dst + dstStride, 0, 2);
1025     dst += dstStride_2x;
1026     __lasx_xvstelm_d(out2, dst, 0, 0);
1027     __lasx_xvstelm_d(out2, dst + dstStride, 0, 2);
1028     dst += dstStride_2x;
1029     __lasx_xvstelm_d(out3, dst, 0, 0);
1030     __lasx_xvstelm_d(out3, dst + dstStride, 0, 2);
1031 }
1032 
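/*
 * QPEL8_V_LOWPASS applies the same 6-tap filter vertically: xvpermi_q
 * pairs neighbouring rows into the two 128-bit lanes so each invocation
 * yields two 8-pixel output rows, with the same x20 / x5 weighting,
 * +16 rounding and saturating >> 5 as the horizontal case above.
 */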
1033 #define QPEL8_V_LOWPASS(src0, src1, src2, src3, src4, src5, src6,       \
1034                         tmp0, tmp1, tmp2, tmp3, tmp4, tmp5)             \
1035 {                                                                       \
1036     tmp0 = __lasx_xvpermi_q(src0, src1, 0x02);                          \
1037     tmp1 = __lasx_xvpermi_q(src1, src2, 0x02);                          \
1038     tmp2 = __lasx_xvpermi_q(src2, src3, 0x02);                          \
1039     tmp3 = __lasx_xvpermi_q(src3, src4, 0x02);                          \
1040     tmp4 = __lasx_xvpermi_q(src4, src5, 0x02);                          \
1041     tmp5 = __lasx_xvpermi_q(src5, src6, 0x02);                          \
1042     DUP2_ARG2(__lasx_xvaddwl_h_bu, tmp2, tmp3, tmp1, tmp4, tmp2, tmp1); \
1043     tmp0 = __lasx_xvaddwl_h_bu(tmp0, tmp5);                             \
1044     tmp2 = __lasx_xvmul_h(tmp2, h_20);                                  \
1045     tmp1 = __lasx_xvmul_h(tmp1, h_5);                                   \
1046     tmp2 = __lasx_xvssub_h(tmp2, tmp1);                                 \
1047     tmp2 = __lasx_xvsadd_h(tmp2, tmp0);                                 \
1048     tmp2 = __lasx_xvsadd_h(tmp2, h_16);                                 \
1049     tmp2 = __lasx_xvssrani_bu_h(tmp2, tmp2, 5);                         \
1050 }
1051 
1052 static av_always_inline void
1053 put_h264_qpel8_v_lowpass_lasx(uint8_t *dst, uint8_t *src, int dstStride,
1054                               int srcStride)
1055 {
1056     int srcStride_2x = srcStride << 1;
1057     int dstStride_2x = dstStride << 1;
1058     int srcStride_4x = srcStride << 2;
1059     int srcStride_3x = srcStride_2x + srcStride;
1060     __m256i src00, src01, src02, src03, src04, src05, src06;
1061     __m256i src07, src08, src09, src10, src11, src12;
1062     __m256i tmp00, tmp01, tmp02, tmp03, tmp04, tmp05;
1063     __m256i h_20 = __lasx_xvldi(0x414);
1064     __m256i h_5  = __lasx_xvldi(0x405);
1065     __m256i h_16 = __lasx_xvldi(0x410);
1066 
1067     DUP2_ARG2(__lasx_xvld, src - srcStride_2x, 0, src - srcStride, 0,
1068               src00, src01);
1069     src02 = __lasx_xvld(src, 0);
1070     DUP4_ARG2(__lasx_xvldx, src, srcStride, src, srcStride_2x, src,
1071               srcStride_3x, src, srcStride_4x, src03, src04, src05, src06);
1072     src += srcStride_4x;
1073     DUP4_ARG2(__lasx_xvldx, src, srcStride, src, srcStride_2x, src,
1074               srcStride_3x, src, srcStride_4x, src07, src08, src09, src10);
1075     src += srcStride_4x;
1076     DUP2_ARG2(__lasx_xvldx, src, srcStride, src, srcStride_2x, src11, src12);
1077 
1078     QPEL8_V_LOWPASS(src00, src01, src02, src03, src04, src05, src06,
1079                     tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
1080     __lasx_xvstelm_d(tmp02, dst, 0, 0);
1081     __lasx_xvstelm_d(tmp02, dst + dstStride, 0, 2);
1082     dst += dstStride_2x;
1083     QPEL8_V_LOWPASS(src02, src03, src04, src05, src06, src07, src08,
1084                     tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
1085     __lasx_xvstelm_d(tmp02, dst, 0, 0);
1086     __lasx_xvstelm_d(tmp02, dst + dstStride, 0, 2);
1087     dst += dstStride_2x;
1088     QPEL8_V_LOWPASS(src04, src05, src06, src07, src08, src09, src10,
1089                     tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
1090     __lasx_xvstelm_d(tmp02, dst, 0, 0);
1091     __lasx_xvstelm_d(tmp02, dst + dstStride, 0, 2);
1092     dst += dstStride_2x;
1093     QPEL8_V_LOWPASS(src06, src07, src08, src09, src10, src11, src12,
1094                     tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
1095     __lasx_xvstelm_d(tmp02, dst, 0, 0);
1096     __lasx_xvstelm_d(tmp02, dst + dstStride, 0, 2);
1097 }
1098 
1099 static av_always_inline void
1100 avg_h264_qpel8_v_lowpass_lasx(uint8_t *dst, uint8_t *src, int dstStride,
1101                               int srcStride)
1102 {
1103     int srcStride_2x = srcStride << 1;
1104     int srcStride_4x = srcStride << 2;
1105     int dstStride_2x = dstStride << 1;
1106     int dstStride_4x = dstStride << 2;
1107     int srcStride_3x = srcStride_2x + srcStride;
1108     int dstStride_3x = dstStride_2x + dstStride;
1109     __m256i src00, src01, src02, src03, src04, src05, src06;
1110     __m256i src07, src08, src09, src10, src11, src12, tmp00;
1111     __m256i tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07, tmp08, tmp09;
1112     __m256i h_20 = __lasx_xvldi(0x414);
1113     __m256i h_5  = __lasx_xvldi(0x405);
1114     __m256i h_16 = __lasx_xvldi(0x410);
1115 
1117     DUP2_ARG2(__lasx_xvld, src - srcStride_2x, 0, src - srcStride, 0,
1118               src00, src01);
1119     src02 = __lasx_xvld(src, 0);
1120     DUP4_ARG2(__lasx_xvldx, src, srcStride, src, srcStride_2x, src,
1121               srcStride_3x, src, srcStride_4x, src03, src04, src05, src06);
1122     src += srcStride_4x;
1123     DUP4_ARG2(__lasx_xvldx, src, srcStride, src, srcStride_2x, src,
1124               srcStride_3x, src, srcStride_4x, src07, src08, src09, src10);
1125     src += srcStride_4x;
1126     DUP2_ARG2(__lasx_xvldx, src, srcStride, src, srcStride_2x, src11, src12);
1127 
1128     tmp06 = __lasx_xvld(dst, 0);
1129     DUP4_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x,
1130               dst, dstStride_3x, dst, dstStride_4x,
1131               tmp07, tmp02, tmp03, tmp04);
1132     dst += dstStride_4x;
1133     DUP2_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x,
1134               tmp05, tmp00);
1135     tmp01 = __lasx_xvldx(dst, dstStride_3x);
1136     dst -= dstStride_4x;
1137 
1138     tmp06 = __lasx_xvpermi_q(tmp06, tmp07, 0x02);
1139     tmp07 = __lasx_xvpermi_q(tmp02, tmp03, 0x02);
1140     tmp08 = __lasx_xvpermi_q(tmp04, tmp05, 0x02);
1141     tmp09 = __lasx_xvpermi_q(tmp00, tmp01, 0x02);
1142 
1143     QPEL8_V_LOWPASS(src00, src01, src02, src03, src04, src05, src06,
1144                     tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
1145     tmp06 = __lasx_xvavgr_bu(tmp06, tmp02);
1146     __lasx_xvstelm_d(tmp06, dst, 0, 0);
1147     __lasx_xvstelm_d(tmp06, dst + dstStride, 0, 2);
1148     dst += dstStride_2x;
1149     QPEL8_V_LOWPASS(src02, src03, src04, src05, src06, src07, src08,
1150                     tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
1151     tmp07 = __lasx_xvavgr_bu(tmp07, tmp02);
1152     __lasx_xvstelm_d(tmp07, dst, 0, 0);
1153     __lasx_xvstelm_d(tmp07, dst + dstStride, 0, 2);
1154     dst += dstStride_2x;
1155     QPEL8_V_LOWPASS(src04, src05, src06, src07, src08, src09, src10,
1156                     tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
1157     tmp08 = __lasx_xvavgr_bu(tmp08, tmp02);
1158     __lasx_xvstelm_d(tmp08, dst, 0, 0);
1159     __lasx_xvstelm_d(tmp08, dst + dstStride, 0, 2);
1160     dst += dstStride_2x;
1161     QPEL8_V_LOWPASS(src06, src07, src08, src09, src10, src11, src12,
1162                     tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
1163     tmp09 = __lasx_xvavgr_bu(tmp09, tmp02);
1164     __lasx_xvstelm_d(tmp09, dst, 0, 0);
1165     __lasx_xvstelm_d(tmp09, dst + dstStride, 0, 2);
1166 }
1167 
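/*
 * For the centre (hv) positions the filter is separable: QPEL8_HV_LOWPASS_H
 * computes the horizontal 6-tap sum and keeps the 16-bit intermediate
 * without rounding, then QPEL8_HV_LOWPASS_V filters those intermediates
 * vertically in 32-bit precision and rounds once at the end, roughly
 *
 *     dst[x] = av_clip_uint8((sum2d[x] + 512) >> 10);
 *
 * (sketch only; the vector code narrows with __lasx_xvssrani_hu_w and
 * __lasx_xvssrani_bu_h instead of an explicit clip).
 */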
1168 #define QPEL8_HV_LOWPASS_H(tmp)                                              \
1169 {                                                                            \
1170     src00 = __lasx_xvld(src, -2);                                            \
1171     src += srcStride;                                                        \
1172     src10 = __lasx_xvld(src, -2);                                            \
1173     src += srcStride;                                                        \
1174     src00 = __lasx_xvpermi_q(src00, src10, 0x02);                            \
1175     src01 = __lasx_xvshuf_b(src00, src00, (__m256i)mask1);                   \
1176     src02 = __lasx_xvshuf_b(src00, src00, (__m256i)mask2);                   \
1177     src03 = __lasx_xvshuf_b(src00, src00, (__m256i)mask3);                   \
1178     src04 = __lasx_xvshuf_b(src00, src00, (__m256i)mask4);                   \
1179     src05 = __lasx_xvshuf_b(src00, src00, (__m256i)mask5);                   \
1180     DUP2_ARG2(__lasx_xvaddwl_h_bu, src02, src03, src01, src04, src02, src01);\
1181     src00 = __lasx_xvaddwl_h_bu(src00, src05);                               \
1182     src02 = __lasx_xvmul_h(src02, h_20);                                     \
1183     src01 = __lasx_xvmul_h(src01, h_5);                                      \
1184     src02 = __lasx_xvssub_h(src02, src01);                                   \
1185     tmp  = __lasx_xvsadd_h(src02, src00);                                    \
1186 }
1187 
1188 #define QPEL8_HV_LOWPASS_V(src0, src1, src2, src3,                       \
1189                            src4, src5, temp0, temp1,                     \
1190                            temp2, temp3, temp4, temp5,                   \
1191                            out)                                          \
1192 {                                                                        \
1193     DUP2_ARG2(__lasx_xvaddwl_w_h, src2, src3, src1, src4, temp0, temp2); \
1194     DUP2_ARG2(__lasx_xvaddwh_w_h, src2, src3, src1, src4, temp1, temp3); \
1195     temp4 = __lasx_xvaddwl_w_h(src0, src5);                              \
1196     temp5 = __lasx_xvaddwh_w_h(src0, src5);                              \
1197     temp0 = __lasx_xvmul_w(temp0, w_20);                                 \
1198     temp1 = __lasx_xvmul_w(temp1, w_20);                                 \
1199     temp2 = __lasx_xvmul_w(temp2, w_5);                                  \
1200     temp3 = __lasx_xvmul_w(temp3, w_5);                                  \
1201     temp0 = __lasx_xvssub_w(temp0, temp2);                               \
1202     temp1 = __lasx_xvssub_w(temp1, temp3);                               \
1203     temp0 = __lasx_xvsadd_w(temp0, temp4);                               \
1204     temp1 = __lasx_xvsadd_w(temp1, temp5);                               \
1205     temp0 = __lasx_xvsadd_w(temp0, w_512);                               \
1206     temp1 = __lasx_xvsadd_w(temp1, w_512);                               \
1207     temp0 = __lasx_xvssrani_hu_w(temp0, temp0, 10);                      \
1208     temp1 = __lasx_xvssrani_hu_w(temp1, temp1, 10);                      \
1209     temp0 = __lasx_xvpackev_d(temp1, temp0);                             \
1210     out   = __lasx_xvssrani_bu_h(temp0, temp0, 0);                       \
1211 }
1212 
1213 static av_always_inline void
1214 put_h264_qpel8_hv_lowpass_lasx(uint8_t *dst, const uint8_t *src,
1215                                ptrdiff_t dstStride, ptrdiff_t srcStride)
1216 {
1217     __m256i src00, src01, src02, src03, src04, src05, src10;
1218     __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
1219     __m256i tmp7, tmp8, tmp9, tmp10, tmp11, tmp12;
1220     __m256i h_20 = __lasx_xvldi(0x414);
1221     __m256i h_5  = __lasx_xvldi(0x405);
1222     __m256i w_20 = __lasx_xvldi(0x814);
1223     __m256i w_5  = __lasx_xvldi(0x805);
1224     __m256i w_512 = {512};
1225     __m256i mask1 = {0x0807060504030201, 0x0, 0x0807060504030201, 0x0};
1226     __m256i mask2 = {0x0908070605040302, 0x0, 0x0908070605040302, 0x0};
1227     __m256i mask3 = {0x0a09080706050403, 0x0, 0x0a09080706050403, 0x0};
1228     __m256i mask4 = {0x0b0a090807060504, 0x0, 0x0b0a090807060504, 0x0};
1229     __m256i mask5 = {0x0c0b0a0908070605, 0x0, 0x0c0b0a0908070605, 0x0};
1230 
1231     w_512 = __lasx_xvreplve0_w(w_512);
1232 
1233     src -= srcStride << 1;
1234     QPEL8_HV_LOWPASS_H(tmp0)
1235     QPEL8_HV_LOWPASS_H(tmp2)
1236     QPEL8_HV_LOWPASS_H(tmp4)
1237     QPEL8_HV_LOWPASS_H(tmp6)
1238     QPEL8_HV_LOWPASS_H(tmp8)
1239     QPEL8_HV_LOWPASS_H(tmp10)
1240     QPEL8_HV_LOWPASS_H(tmp12)
1241     tmp11 = __lasx_xvpermi_q(tmp12, tmp10, 0x21);
1242     tmp9  = __lasx_xvpermi_q(tmp10, tmp8,  0x21);
1243     tmp7  = __lasx_xvpermi_q(tmp8,  tmp6,  0x21);
1244     tmp5  = __lasx_xvpermi_q(tmp6,  tmp4,  0x21);
1245     tmp3  = __lasx_xvpermi_q(tmp4,  tmp2,  0x21);
1246     tmp1  = __lasx_xvpermi_q(tmp2,  tmp0,  0x21);
1247 
1248     QPEL8_HV_LOWPASS_V(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, src00, src01,
1249                        src02, src03, src04, src05, tmp0)
1250     QPEL8_HV_LOWPASS_V(tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src00, src01,
1251                        src02, src03, src04, src05, tmp2)
1252     QPEL8_HV_LOWPASS_V(tmp4, tmp5, tmp6, tmp7, tmp8, tmp9, src00, src01,
1253                        src02, src03, src04, src05, tmp4)
1254     QPEL8_HV_LOWPASS_V(tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, src00, src01,
1255                        src02, src03, src04, src05, tmp6)
1256     __lasx_xvstelm_d(tmp0, dst, 0, 0);
1257     dst += dstStride;
1258     __lasx_xvstelm_d(tmp0, dst, 0, 2);
1259     dst += dstStride;
1260     __lasx_xvstelm_d(tmp2, dst, 0, 0);
1261     dst += dstStride;
1262     __lasx_xvstelm_d(tmp2, dst, 0, 2);
1263     dst += dstStride;
1264     __lasx_xvstelm_d(tmp4, dst, 0, 0);
1265     dst += dstStride;
1266     __lasx_xvstelm_d(tmp4, dst, 0, 2);
1267     dst += dstStride;
1268     __lasx_xvstelm_d(tmp6, dst, 0, 0);
1269     dst += dstStride;
1270     __lasx_xvstelm_d(tmp6, dst, 0, 2);
1271 }
1272 
1273 static av_always_inline void
1274 avg_h264_qpel8_h_lowpass_lasx(uint8_t *dst, const uint8_t *src, int dstStride,
1275                               int srcStride)
1276 {
1277     int dstStride_2x = dstStride << 1;
1278     int dstStride_4x = dstStride << 2;
1279     int dstStride_3x = dstStride_2x + dstStride;
1280     __m256i src00, src01, src02, src03, src04, src05, src10;
1281     __m256i dst00, dst01, dst0, dst1, dst2, dst3;
1282     __m256i out0, out1, out2, out3;
1283     __m256i h_20 = __lasx_xvldi(0x414);
1284     __m256i h_5  = __lasx_xvldi(0x405);
1285     __m256i h_16 = __lasx_xvldi(0x410);
1286     __m256i mask1 = {0x0807060504030201, 0x0, 0x0807060504030201, 0x0};
1287     __m256i mask2 = {0x0908070605040302, 0x0, 0x0908070605040302, 0x0};
1288     __m256i mask3 = {0x0a09080706050403, 0x0, 0x0a09080706050403, 0x0};
1289     __m256i mask4 = {0x0b0a090807060504, 0x0, 0x0b0a090807060504, 0x0};
1290     __m256i mask5 = {0x0c0b0a0908070605, 0x0, 0x0c0b0a0908070605, 0x0};
1291 
1292     QPEL8_H_LOWPASS(out0)
1293     QPEL8_H_LOWPASS(out1)
1294     QPEL8_H_LOWPASS(out2)
1295     QPEL8_H_LOWPASS(out3)
1296     src00 = __lasx_xvld(dst, 0);
1297     DUP4_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x, dst,
1298               dstStride_3x, dst, dstStride_4x, src01, src02, src03, src04);
1299     dst += dstStride_4x;
1300     DUP2_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x, src05, dst00);
1301     dst01 = __lasx_xvldx(dst, dstStride_3x);
1302     dst -= dstStride_4x;
1303     dst0 = __lasx_xvpermi_q(src00, src01, 0x02);
1304     dst1 = __lasx_xvpermi_q(src02, src03, 0x02);
1305     dst2 = __lasx_xvpermi_q(src04, src05, 0x02);
1306     dst3 = __lasx_xvpermi_q(dst00, dst01, 0x02);
1307     dst0 = __lasx_xvavgr_bu(dst0, out0);
1308     dst1 = __lasx_xvavgr_bu(dst1, out1);
1309     dst2 = __lasx_xvavgr_bu(dst2, out2);
1310     dst3 = __lasx_xvavgr_bu(dst3, out3);
1311     __lasx_xvstelm_d(dst0, dst, 0, 0);
1312     __lasx_xvstelm_d(dst0, dst + dstStride, 0, 2);
1313     __lasx_xvstelm_d(dst1, dst + dstStride_2x, 0, 0);
1314     __lasx_xvstelm_d(dst1, dst + dstStride_3x, 0, 2);
1315     dst += dstStride_4x;
1316     __lasx_xvstelm_d(dst2, dst, 0, 0);
1317     __lasx_xvstelm_d(dst2, dst + dstStride, 0, 2);
1318     __lasx_xvstelm_d(dst3, dst + dstStride_2x, 0, 0);
1319     __lasx_xvstelm_d(dst3, dst + dstStride_3x, 0, 2);
1320 }
1321 
1322 static av_always_inline void
1323 avg_h264_qpel8_hv_lowpass_lasx(uint8_t *dst, const uint8_t *src,
1324                                ptrdiff_t dstStride, ptrdiff_t srcStride)
1325 {
1326     __m256i src00, src01, src02, src03, src04, src05, src10;
1327     __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
1328     __m256i tmp7, tmp8, tmp9, tmp10, tmp11, tmp12;
1329     __m256i h_20 = __lasx_xvldi(0x414);
1330     __m256i h_5  = __lasx_xvldi(0x405);
1331     __m256i w_20 = __lasx_xvldi(0x814);
1332     __m256i w_5  = __lasx_xvldi(0x805);
1333     __m256i w_512 = {512};
1334     __m256i mask1 = {0x0807060504030201, 0x0, 0x0807060504030201, 0x0};
1335     __m256i mask2 = {0x0908070605040302, 0x0, 0x0908070605040302, 0x0};
1336     __m256i mask3 = {0x0a09080706050403, 0x0, 0x0a09080706050403, 0x0};
1337     __m256i mask4 = {0x0b0a090807060504, 0x0, 0x0b0a090807060504, 0x0};
1338     __m256i mask5 = {0x0c0b0a0908070605, 0x0, 0x0c0b0a0908070605, 0x0};
1339     ptrdiff_t dstStride_2x = dstStride << 1;
1340     ptrdiff_t dstStride_4x = dstStride << 2;
1341     ptrdiff_t dstStride_3x = dstStride_2x + dstStride;
1342 
1343     w_512 = __lasx_xvreplve0_w(w_512);
1344 
1345     src -= srcStride << 1;
1346     QPEL8_HV_LOWPASS_H(tmp0)
1347     QPEL8_HV_LOWPASS_H(tmp2)
1348     QPEL8_HV_LOWPASS_H(tmp4)
1349     QPEL8_HV_LOWPASS_H(tmp6)
1350     QPEL8_HV_LOWPASS_H(tmp8)
1351     QPEL8_HV_LOWPASS_H(tmp10)
1352     QPEL8_HV_LOWPASS_H(tmp12)
1353     tmp11 = __lasx_xvpermi_q(tmp12, tmp10, 0x21);
1354     tmp9  = __lasx_xvpermi_q(tmp10, tmp8,  0x21);
1355     tmp7  = __lasx_xvpermi_q(tmp8,  tmp6,  0x21);
1356     tmp5  = __lasx_xvpermi_q(tmp6,  tmp4,  0x21);
1357     tmp3  = __lasx_xvpermi_q(tmp4,  tmp2,  0x21);
1358     tmp1  = __lasx_xvpermi_q(tmp2,  tmp0,  0x21);
1359 
1360     QPEL8_HV_LOWPASS_V(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, src00, src01,
1361                        src02, src03, src04, src05, tmp0)
1362     QPEL8_HV_LOWPASS_V(tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src00, src01,
1363                        src02, src03, src04, src05, tmp2)
1364     QPEL8_HV_LOWPASS_V(tmp4, tmp5, tmp6, tmp7, tmp8, tmp9, src00, src01,
1365                        src02, src03, src04, src05, tmp4)
1366     QPEL8_HV_LOWPASS_V(tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, src00, src01,
1367                        src02, src03, src04, src05, tmp6)
1368 
1369     src00 = __lasx_xvld(dst, 0);
1370     DUP4_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x, dst,
1371               dstStride_3x, dst, dstStride_4x, src01, src02, src03, src04);
1372     dst += dstStride_4x;
1373     DUP2_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x, src05, tmp8);
1374     tmp9 = __lasx_xvldx(dst, dstStride_3x);
1375     dst -= dstStride_4x;
1376     tmp1 = __lasx_xvpermi_q(src00, src01, 0x02);
1377     tmp3 = __lasx_xvpermi_q(src02, src03, 0x02);
1378     tmp5 = __lasx_xvpermi_q(src04, src05, 0x02);
1379     tmp7 = __lasx_xvpermi_q(tmp8,  tmp9,  0x02);
1380     tmp0 = __lasx_xvavgr_bu(tmp0, tmp1);
1381     tmp2 = __lasx_xvavgr_bu(tmp2, tmp3);
1382     tmp4 = __lasx_xvavgr_bu(tmp4, tmp5);
1383     tmp6 = __lasx_xvavgr_bu(tmp6, tmp7);
1384     __lasx_xvstelm_d(tmp0, dst, 0, 0);
1385     dst += dstStride;
1386     __lasx_xvstelm_d(tmp0, dst, 0, 2);
1387     dst += dstStride;
1388     __lasx_xvstelm_d(tmp2, dst, 0, 0);
1389     dst += dstStride;
1390     __lasx_xvstelm_d(tmp2, dst, 0, 2);
1391     dst += dstStride;
1392     __lasx_xvstelm_d(tmp4, dst, 0, 0);
1393     dst += dstStride;
1394     __lasx_xvstelm_d(tmp4, dst, 0, 2);
1395     dst += dstStride;
1396     __lasx_xvstelm_d(tmp6, dst, 0, 0);
1397     dst += dstStride;
1398     __lasx_xvstelm_d(tmp6, dst, 0, 2);
1399 }
1400 
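/*
 * The 16x16 helpers below tile the 8x8 lowpass routines: the left and
 * right 8-pixel halves are filtered side by side, then src and dst are
 * advanced by eight rows for the bottom half.
 */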
1401 static av_always_inline void
1402 put_h264_qpel16_h_lowpass_lasx(uint8_t *dst, const uint8_t *src,
1403                                int dstStride, int srcStride)
1404 {
1405     put_h264_qpel8_h_lowpass_lasx(dst, src, dstStride, srcStride);
1406     put_h264_qpel8_h_lowpass_lasx(dst+8, src+8, dstStride, srcStride);
1407     src += srcStride << 3;
1408     dst += dstStride << 3;
1409     put_h264_qpel8_h_lowpass_lasx(dst, src, dstStride, srcStride);
1410     put_h264_qpel8_h_lowpass_lasx(dst+8, src+8, dstStride, srcStride);
1411 }
1412 
1413 static av_always_inline void
1414 avg_h264_qpel16_h_lowpass_lasx(uint8_t *dst, const uint8_t *src,
1415                                int dstStride, int srcStride)
1416 {
1417     avg_h264_qpel8_h_lowpass_lasx(dst, src, dstStride, srcStride);
1418     avg_h264_qpel8_h_lowpass_lasx(dst+8, src+8, dstStride, srcStride);
1419     src += srcStride << 3;
1420     dst += dstStride << 3;
1421     avg_h264_qpel8_h_lowpass_lasx(dst, src, dstStride, srcStride);
1422     avg_h264_qpel8_h_lowpass_lasx(dst+8, src+8, dstStride, srcStride);
1423 }
1424 
1425 static void put_h264_qpel16_v_lowpass_lasx(uint8_t *dst, const uint8_t *src,
1426                                            int dstStride, int srcStride)
1427 {
1428     put_h264_qpel8_v_lowpass_lasx(dst, (uint8_t*)src, dstStride, srcStride);
1429     put_h264_qpel8_v_lowpass_lasx(dst+8, (uint8_t*)src+8, dstStride, srcStride);
1430     src += 8*srcStride;
1431     dst += 8*dstStride;
1432     put_h264_qpel8_v_lowpass_lasx(dst, (uint8_t*)src, dstStride, srcStride);
1433     put_h264_qpel8_v_lowpass_lasx(dst+8, (uint8_t*)src+8, dstStride, srcStride);
1434 }
1435 
1436 static void avg_h264_qpel16_v_lowpass_lasx(uint8_t *dst, const uint8_t *src,
1437                                            int dstStride, int srcStride)
1438 {
1439     avg_h264_qpel8_v_lowpass_lasx(dst, (uint8_t*)src, dstStride, srcStride);
1440     avg_h264_qpel8_v_lowpass_lasx(dst+8, (uint8_t*)src+8, dstStride, srcStride);
1441     src += 8*srcStride;
1442     dst += 8*dstStride;
1443     avg_h264_qpel8_v_lowpass_lasx(dst, (uint8_t*)src, dstStride, srcStride);
1444     avg_h264_qpel8_v_lowpass_lasx(dst+8, (uint8_t*)src+8, dstStride, srcStride);
1445 }
1446 
1447 static void put_h264_qpel16_hv_lowpass_lasx(uint8_t *dst, const uint8_t *src,
1448                                      ptrdiff_t dstStride, ptrdiff_t srcStride)
1449 {
1450     put_h264_qpel8_hv_lowpass_lasx(dst, src, dstStride, srcStride);
1451     put_h264_qpel8_hv_lowpass_lasx(dst + 8, src + 8, dstStride, srcStride);
1452     src += srcStride << 3;
1453     dst += dstStride << 3;
1454     put_h264_qpel8_hv_lowpass_lasx(dst, src, dstStride, srcStride);
1455     put_h264_qpel8_hv_lowpass_lasx(dst + 8, src + 8, dstStride, srcStride);
1456 }
1457 
1458 static void avg_h264_qpel16_hv_lowpass_lasx(uint8_t *dst, const uint8_t *src,
1459                                      ptrdiff_t dstStride, ptrdiff_t srcStride)
1460 {
1461     avg_h264_qpel8_hv_lowpass_lasx(dst, src, dstStride, srcStride);
1462     avg_h264_qpel8_hv_lowpass_lasx(dst + 8, src + 8, dstStride, srcStride);
1463     src += srcStride << 3;
1464     dst += dstStride << 3;
1465     avg_h264_qpel8_hv_lowpass_lasx(dst, src, dstStride, srcStride);
1466     avg_h264_qpel8_hv_lowpass_lasx(dst + 8, src + 8, dstStride, srcStride);
1467 }
1468 
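/*
 * ff_{put,avg}_h264_qpel{8,16}_mcXY_lasx: X and Y are the horizontal and
 * vertical offsets in quarter-pel units.  mc00 is a plain copy (or average
 * with dst), mc20/mc02 are the pure half-pel cases, mc22 uses the 2-D hv
 * lowpass directly, and most other positions average two intermediate
 * planes with the *_pixels*_l2_8_lsx helpers; some 16x16 quarter-diagonal
 * cases use the dedicated avc_luma_hv_qrt*_16x16_lasx routines instead.
 */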
1469 void ff_put_h264_qpel8_mc00_lasx(uint8_t *dst, const uint8_t *src,
1470                                  ptrdiff_t stride)
1471 {
1472     /* The MMI optimization uses ff_put_pixels8_8_mmi,
1473      * which is implemented in hpeldsp_mmi.c. */
1474     put_pixels8_8_inline_asm(dst, src, stride);
1475 }
1476 
1477 void ff_put_h264_qpel8_mc10_lasx(uint8_t *dst, const uint8_t *src,
1478                                  ptrdiff_t stride)
1479 {
1480     uint8_t half[64];
1481 
1482     put_h264_qpel8_h_lowpass_lasx(half, src, 8, stride);
1483     /* In qpel8, the stride of the half buffer and the block height are both 8. */
1484     put_pixels8_l2_8_lsx(dst, src, half, stride, stride);
1485 }
1486 
1487 void ff_put_h264_qpel8_mc20_lasx(uint8_t *dst, const uint8_t *src,
1488                                  ptrdiff_t stride)
1489 {
1490     put_h264_qpel8_h_lowpass_lasx(dst, src, stride, stride);
1491 }
1492 
1493 void ff_put_h264_qpel8_mc30_lasx(uint8_t *dst, const uint8_t *src,
1494                                  ptrdiff_t stride)
1495 {
1496     uint8_t half[64];
1497 
1498     put_h264_qpel8_h_lowpass_lasx(half, src, 8, stride);
1499     put_pixels8_l2_8_lsx(dst, src+1, half, stride, stride);
1500 }
1501 
1502 void ff_put_h264_qpel8_mc01_lasx(uint8_t *dst, const uint8_t *src,
1503                                  ptrdiff_t stride)
1504 {
1505     uint8_t half[64];
1506 
1507     put_h264_qpel8_v_lowpass_lasx(half, (uint8_t*)src, 8, stride);
1508     put_pixels8_l2_8_lsx(dst, src, half, stride, stride);
1509 }
1510 
1511 void ff_put_h264_qpel8_mc11_lasx(uint8_t *dst, const uint8_t *src,
1512                                  ptrdiff_t stride)
1513 {
1514     uint8_t halfH[64];
1515     uint8_t halfV[64];
1516 
1517     put_h264_qpel8_h_lowpass_lasx(halfH, src, 8, stride);
1518     put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t*)src, 8, stride);
1519     put_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
1520 }
1521 
1522 void ff_put_h264_qpel8_mc21_lasx(uint8_t *dst, const uint8_t *src,
1523                                  ptrdiff_t stride)
1524 {
1525     uint8_t temp[128];
1526     uint8_t *const halfH  = temp;
1527     uint8_t *const halfHV = temp + 64;
1528 
1529     put_h264_qpel8_h_lowpass_lasx(halfH, src, 8, stride);
1530     put_h264_qpel8_hv_lowpass_lasx(halfHV, src, 8, stride);
1531     put_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
1532 }
1533 
1534 void ff_put_h264_qpel8_mc31_lasx(uint8_t *dst, const uint8_t *src,
1535                                  ptrdiff_t stride)
1536 {
1537     uint8_t halfH[64];
1538     uint8_t halfV[64];
1539 
1540     put_h264_qpel8_h_lowpass_lasx(halfH, src, 8, stride);
1541     put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t*)src + 1, 8, stride);
1542     put_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
1543 }
1544 
1545 void ff_put_h264_qpel8_mc02_lasx(uint8_t *dst, const uint8_t *src,
1546                                  ptrdiff_t stride)
1547 {
1548     put_h264_qpel8_v_lowpass_lasx(dst, (uint8_t*)src, stride, stride);
1549 }
1550 
1551 void ff_put_h264_qpel8_mc12_lasx(uint8_t *dst, const uint8_t *src,
1552                                  ptrdiff_t stride)
1553 {
1554     uint8_t temp[128];
1555     uint8_t *const halfHV = temp;
1556     uint8_t *const halfH  = temp + 64;
1557 
1558     put_h264_qpel8_hv_lowpass_lasx(halfHV, src, 8, stride);
1559     put_h264_qpel8_v_lowpass_lasx(halfH, (uint8_t*)src, 8, stride);
1560     put_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
1561 }
1562 
1563 void ff_put_h264_qpel8_mc22_lasx(uint8_t *dst, const uint8_t *src,
1564                                  ptrdiff_t stride)
1565 {
1566     put_h264_qpel8_hv_lowpass_lasx(dst, src, stride, stride);
1567 }
1568 
1569 void ff_put_h264_qpel8_mc32_lasx(uint8_t *dst, const uint8_t *src,
1570                                  ptrdiff_t stride)
1571 {
1572     uint8_t temp[128];
1573     uint8_t *const halfHV = temp;
1574     uint8_t *const halfH  = temp + 64;
1575 
1576     put_h264_qpel8_hv_lowpass_lasx(halfHV, src, 8, stride);
1577     put_h264_qpel8_v_lowpass_lasx(halfH, (uint8_t*)src + 1, 8, stride);
1578     put_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
1579 }
1580 
1581 void ff_put_h264_qpel8_mc03_lasx(uint8_t *dst, const uint8_t *src,
1582                                  ptrdiff_t stride)
1583 {
1584     uint8_t half[64];
1585 
1586     put_h264_qpel8_v_lowpass_lasx(half, (uint8_t*)src, 8, stride);
1587     put_pixels8_l2_8_lsx(dst, src + stride, half, stride, stride);
1588 }
1589 
1590 void ff_put_h264_qpel8_mc13_lasx(uint8_t *dst, const uint8_t *src,
1591                                  ptrdiff_t stride)
1592 {
1593     uint8_t halfH[64];
1594     uint8_t halfV[64];
1595 
1596     put_h264_qpel8_h_lowpass_lasx(halfH, src + stride, 8, stride);
1597     put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t*)src, 8, stride);
1598     put_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
1599 }
1600 
1601 void ff_put_h264_qpel8_mc23_lasx(uint8_t *dst, const uint8_t *src,
1602                                  ptrdiff_t stride)
1603 {
1604     uint8_t temp[128];
1605     uint8_t *const halfH  = temp;
1606     uint8_t *const halfHV = temp + 64;
1607 
1608     put_h264_qpel8_h_lowpass_lasx(halfH, src + stride, 8, stride);
1609     put_h264_qpel8_hv_lowpass_lasx(halfHV, src, 8, stride);
1610     put_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
1611 }
1612 
1613 void ff_put_h264_qpel8_mc33_lasx(uint8_t *dst, const uint8_t *src,
1614                                  ptrdiff_t stride)
1615 {
1616     uint8_t halfH[64];
1617     uint8_t halfV[64];
1618 
1619     put_h264_qpel8_h_lowpass_lasx(halfH, src + stride, 8, stride);
1620     put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t*)src + 1, 8, stride);
1621     put_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
1622 }
1623 
1624 void ff_avg_h264_qpel8_mc00_lasx(uint8_t *dst, const uint8_t *src,
1625                                  ptrdiff_t stride)
1626 {
1627     /* The MMI optimization uses ff_avg_pixels8_8_mmi,
1628      * which is implemented in hpeldsp_mmi.c. */
1629     avg_pixels8_8_lsx(dst, src, stride);
1630 }
1631 
1632 void ff_avg_h264_qpel8_mc10_lasx(uint8_t *dst, const uint8_t *src,
1633                                  ptrdiff_t stride)
1634 {
1635     uint8_t half[64];
1636 
1637     put_h264_qpel8_h_lowpass_lasx(half, src, 8, stride);
1638     avg_pixels8_l2_8_lsx(dst, src, half, stride, stride);
1639 }
1640 
1641 void ff_avg_h264_qpel8_mc20_lasx(uint8_t *dst, const uint8_t *src,
1642                                  ptrdiff_t stride)
1643 {
1644     avg_h264_qpel8_h_lowpass_lasx(dst, src, stride, stride);
1645 }
1646 
1647 void ff_avg_h264_qpel8_mc30_lasx(uint8_t *dst, const uint8_t *src,
1648                                  ptrdiff_t stride)
1649 {
1650     uint8_t half[64];
1651 
1652     put_h264_qpel8_h_lowpass_lasx(half, src, 8, stride);
1653     avg_pixels8_l2_8_lsx(dst, src+1, half, stride, stride);
1654 }
1655 
1656 void ff_avg_h264_qpel8_mc11_lasx(uint8_t *dst, const uint8_t *src,
1657                                  ptrdiff_t stride)
1658 {
1659     uint8_t halfH[64];
1660     uint8_t halfV[64];
1661 
1662     put_h264_qpel8_h_lowpass_lasx(halfH, src, 8, stride);
1663     put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t*)src, 8, stride);
1664     avg_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
1665 }
1666 
1667 void ff_avg_h264_qpel8_mc21_lasx(uint8_t *dst, const uint8_t *src,
1668                                  ptrdiff_t stride)
1669 {
1670     uint8_t temp[128];
1671     uint8_t *const halfH  = temp;
1672     uint8_t *const halfHV = temp + 64;
1673 
1674     put_h264_qpel8_h_lowpass_lasx(halfH, src, 8, stride);
1675     put_h264_qpel8_hv_lowpass_lasx(halfHV, src, 8, stride);
1676     avg_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
1677 }
1678 
1679 void ff_avg_h264_qpel8_mc31_lasx(uint8_t *dst, const uint8_t *src,
1680                                  ptrdiff_t stride)
1681 {
1682     uint8_t halfH[64];
1683     uint8_t halfV[64];
1684 
1685     put_h264_qpel8_h_lowpass_lasx(halfH, src, 8, stride);
1686     put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t*)src + 1, 8, stride);
1687     avg_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
1688 }
1689 
1690 void ff_avg_h264_qpel8_mc02_lasx(uint8_t *dst, const uint8_t *src,
1691                                  ptrdiff_t stride)
1692 {
1693     avg_h264_qpel8_v_lowpass_lasx(dst, (uint8_t*)src, stride, stride);
1694 }
1695 
1696 void ff_avg_h264_qpel8_mc12_lasx(uint8_t *dst, const uint8_t *src,
1697                                  ptrdiff_t stride)
1698 {
1699     uint8_t temp[128];
1700     uint8_t *const halfHV = temp;
1701     uint8_t *const halfH  = temp + 64;
1702 
1703     put_h264_qpel8_hv_lowpass_lasx(halfHV, src, 8, stride);
1704     put_h264_qpel8_v_lowpass_lasx(halfH, (uint8_t*)src, 8, stride);
1705     avg_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
1706 }
1707 
1708 void ff_avg_h264_qpel8_mc22_lasx(uint8_t *dst, const uint8_t *src,
1709                                  ptrdiff_t stride)
1710 {
1711     avg_h264_qpel8_hv_lowpass_lasx(dst, src, stride, stride);
1712 }
1713 
1714 void ff_avg_h264_qpel8_mc32_lasx(uint8_t *dst, const uint8_t *src,
1715                                  ptrdiff_t stride)
1716 {
1717     uint8_t temp[128];
1718     uint8_t *const halfHV = temp;
1719     uint8_t *const halfH  = temp + 64;
1720 
1721     put_h264_qpel8_hv_lowpass_lasx(halfHV, src, 8, stride);
1722     put_h264_qpel8_v_lowpass_lasx(halfH, (uint8_t*)src + 1, 8, stride);
1723     avg_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
1724 }
1725 
1726 void ff_avg_h264_qpel8_mc13_lasx(uint8_t *dst, const uint8_t *src,
1727                                  ptrdiff_t stride)
1728 {
1729     uint8_t halfH[64];
1730     uint8_t halfV[64];
1731 
1732     put_h264_qpel8_h_lowpass_lasx(halfH, src + stride, 8, stride);
1733     put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t*)src, 8, stride);
1734     avg_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
1735 }
1736 
1737 void ff_avg_h264_qpel8_mc23_lasx(uint8_t *dst, const uint8_t *src,
1738                                  ptrdiff_t stride)
1739 {
1740     uint8_t temp[128];
1741     uint8_t *const halfH  = temp;
1742     uint8_t *const halfHV = temp + 64;
1743 
1744     put_h264_qpel8_h_lowpass_lasx(halfH, src + stride, 8, stride);
1745     put_h264_qpel8_hv_lowpass_lasx(halfHV, src, 8, stride);
1746     avg_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
1747 }
1748 
1749 void ff_avg_h264_qpel8_mc33_lasx(uint8_t *dst, const uint8_t *src,
1750                                  ptrdiff_t stride)
1751 {
1752     uint8_t halfH[64];
1753     uint8_t halfV[64];
1754 
1755     put_h264_qpel8_h_lowpass_lasx(halfH, src + stride, 8, stride);
1756     put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t*)src + 1, 8, stride);
1757     avg_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
1758 }
1759 
1760 void ff_put_h264_qpel16_mc00_lasx(uint8_t *dst, const uint8_t *src,
1761                                   ptrdiff_t stride)
1762 {
1763     /* The MMI optimization uses ff_put_pixels16_8_mmi,
1764      * which is implemented in hpeldsp_mmi.c. */
1765     put_pixels16_8_lsx(dst, src, stride);
1766 }
1767 
1768 void ff_put_h264_qpel16_mc10_lasx(uint8_t *dst, const uint8_t *src,
1769                                   ptrdiff_t stride)
1770 {
1771     uint8_t half[256];
1772 
1773     put_h264_qpel16_h_lowpass_lasx(half, src, 16, stride);
1774     put_pixels16_l2_8_lsx(dst, src, half, stride, stride);
1775 }
1776 
1777 void ff_put_h264_qpel16_mc20_lasx(uint8_t *dst, const uint8_t *src,
1778                                   ptrdiff_t stride)
1779 {
1780     put_h264_qpel16_h_lowpass_lasx(dst, src, stride, stride);
1781 }
1782 
1783 void ff_put_h264_qpel16_mc30_lasx(uint8_t *dst, const uint8_t *src,
1784                                   ptrdiff_t stride)
1785 {
1786     uint8_t half[256];
1787 
1788     put_h264_qpel16_h_lowpass_lasx(half, src, 16, stride);
1789     put_pixels16_l2_8_lsx(dst, src+1, half, stride, stride);
1790 }
1791 
1792 void ff_put_h264_qpel16_mc01_lasx(uint8_t *dst, const uint8_t *src,
1793                                   ptrdiff_t stride)
1794 {
1795     uint8_t half[256];
1796 
1797     put_h264_qpel16_v_lowpass_lasx(half, src, 16, stride);
1798     put_pixels16_l2_8_lsx(dst, src, half, stride, stride);
1799 }
1800 
1801 void ff_put_h264_qpel16_mc11_lasx(uint8_t *dst, const uint8_t *src,
1802                                   ptrdiff_t stride)
1803 {
1804     avc_luma_hv_qrt_16x16_lasx((uint8_t*)src - 2, (uint8_t*)src - (stride * 2),
1805                                dst, stride);
1806 }
1807 
1808 void ff_put_h264_qpel16_mc21_lasx(uint8_t *dst, const uint8_t *src,
1809                                   ptrdiff_t stride)
1810 {
1811     uint8_t temp[512];
1812     uint8_t *const halfH  = temp;
1813     uint8_t *const halfHV = temp + 256;
1814 
1815     put_h264_qpel16_h_lowpass_lasx(halfH, src, 16, stride);
1816     put_h264_qpel16_hv_lowpass_lasx(halfHV, src, 16, stride);
1817     put_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
1818 }
1819 
1820 void ff_put_h264_qpel16_mc31_lasx(uint8_t *dst, const uint8_t *src,
1821                                   ptrdiff_t stride)
1822 {
1823     avc_luma_hv_qrt_16x16_lasx((uint8_t*)src - 2, (uint8_t*)src - (stride * 2) + 1,
1824                                dst, stride);
1825 }
1826 
1827 void ff_put_h264_qpel16_mc02_lasx(uint8_t *dst, const uint8_t *src,
1828                                   ptrdiff_t stride)
1829 {
1830     put_h264_qpel16_v_lowpass_lasx(dst, src, stride, stride);
1831 }
1832 
1833 void ff_put_h264_qpel16_mc12_lasx(uint8_t *dst, const uint8_t *src,
1834                                   ptrdiff_t stride)
1835 {
1836     uint8_t temp[512];
1837     uint8_t *const halfHV = temp;
1838     uint8_t *const halfH  = temp + 256;
1839 
1840     put_h264_qpel16_hv_lowpass_lasx(halfHV, src, 16, stride);
1841     put_h264_qpel16_v_lowpass_lasx(halfH, src, 16, stride);
1842     put_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
1843 }
1844 
1845 void ff_put_h264_qpel16_mc22_lasx(uint8_t *dst, const uint8_t *src,
1846                                   ptrdiff_t stride)
1847 {
1848     put_h264_qpel16_hv_lowpass_lasx(dst, src, stride, stride);
1849 }
1850 
1851 void ff_put_h264_qpel16_mc32_lasx(uint8_t *dst, const uint8_t *src,
1852                                   ptrdiff_t stride)
1853 {
1854     uint8_t temp[512];
1855     uint8_t *const halfHV = temp;
1856     uint8_t *const halfH  = temp + 256;
1857 
1858     put_h264_qpel16_hv_lowpass_lasx(halfHV, src, 16, stride);
1859     put_h264_qpel16_v_lowpass_lasx(halfH, src + 1, 16, stride);
1860     put_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
1861 }
1862 
1863 void ff_put_h264_qpel16_mc03_lasx(uint8_t *dst, const uint8_t *src,
1864                                   ptrdiff_t stride)
1865 {
1866     uint8_t half[256];
1867 
1868     put_h264_qpel16_v_lowpass_lasx(half, src, 16, stride);
1869     put_pixels16_l2_8_lsx(dst, src+stride, half, stride, stride);
1870 }
1871 
1872 void ff_put_h264_qpel16_mc13_lasx(uint8_t *dst, const uint8_t *src,
1873                                   ptrdiff_t stride)
1874 {
1875     avc_luma_hv_qrt_16x16_lasx((uint8_t*)src + stride - 2, (uint8_t*)src - (stride * 2),
1876                                dst, stride);
1877 }
1878 
1879 void ff_put_h264_qpel16_mc23_lasx(uint8_t *dst, const uint8_t *src,
1880                                   ptrdiff_t stride)
1881 {
1882     uint8_t temp[512];
1883     uint8_t *const halfH  = temp;
1884     uint8_t *const halfHV = temp + 256;
1885 
1886     put_h264_qpel16_h_lowpass_lasx(halfH, src + stride, 16, stride);
1887     put_h264_qpel16_hv_lowpass_lasx(halfHV, src, 16, stride);
1888     put_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
1889 }
1890 
1891 void ff_put_h264_qpel16_mc33_lasx(uint8_t *dst, const uint8_t *src,
1892                                   ptrdiff_t stride)
1893 {
1894     avc_luma_hv_qrt_16x16_lasx((uint8_t*)src + stride - 2,
1895                                (uint8_t*)src - (stride * 2) + 1, dst, stride);
1896 }
1897 
1898 void ff_avg_h264_qpel16_mc00_lasx(uint8_t *dst, const uint8_t *src,
1899                                   ptrdiff_t stride)
1900 {
1901     /* The MMI optimization uses ff_avg_pixels16_8_mmi,
1902      * which is implemented in hpeldsp_mmi.c. */
1903     avg_pixels16_8_lsx(dst, src, stride);
1904 }
1905 
1906 void ff_avg_h264_qpel16_mc10_lasx(uint8_t *dst, const uint8_t *src,
1907                                   ptrdiff_t stride)
1908 {
1909     uint8_t half[256];
1910 
1911     put_h264_qpel16_h_lowpass_lasx(half, src, 16, stride);
1912     avg_pixels16_l2_8_lsx(dst, src, half, stride, stride);
1913 }
1914 
1915 void ff_avg_h264_qpel16_mc20_lasx(uint8_t *dst, const uint8_t *src,
1916                                   ptrdiff_t stride)
1917 {
1918     avg_h264_qpel16_h_lowpass_lasx(dst, src, stride, stride);
1919 }
1920 
1921 void ff_avg_h264_qpel16_mc30_lasx(uint8_t *dst, const uint8_t *src,
1922                                   ptrdiff_t stride)
1923 {
1924     uint8_t half[256];
1925 
1926     put_h264_qpel16_h_lowpass_lasx(half, src, 16, stride);
1927     avg_pixels16_l2_8_lsx(dst, src+1, half, stride, stride);
1928 }
1929 
1930 void ff_avg_h264_qpel16_mc01_lasx(uint8_t *dst, const uint8_t *src,
1931                                   ptrdiff_t stride)
1932 {
1933     uint8_t half[256];
1934 
1935     put_h264_qpel16_v_lowpass_lasx(half, src, 16, stride);
1936     avg_pixels16_l2_8_lsx(dst, src, half, stride, stride);
1937 }
1938 
1939 void ff_avg_h264_qpel16_mc11_lasx(uint8_t *dst, const uint8_t *src,
1940                                   ptrdiff_t stride)
1941 {
1942     avc_luma_hv_qrt_and_aver_dst_16x16_lasx((uint8_t*)src - 2,
1943                                            (uint8_t*)src - (stride * 2),
1944                                            dst, stride);
1945 }
1946 
1947 void ff_avg_h264_qpel16_mc21_lasx(uint8_t *dst, const uint8_t *src,
1948                                   ptrdiff_t stride)
1949 {
1950     uint8_t temp[512];
1951     uint8_t *const halfH  = temp;
1952     uint8_t *const halfHV = temp + 256;
1953 
1954     put_h264_qpel16_h_lowpass_lasx(halfH, src, 16, stride);
1955     put_h264_qpel16_hv_lowpass_lasx(halfHV, src, 16, stride);
1956     avg_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
1957 }
1958 
1959 void ff_avg_h264_qpel16_mc31_lasx(uint8_t *dst, const uint8_t *src,
1960                                   ptrdiff_t stride)
1961 {
1962     avc_luma_hv_qrt_and_aver_dst_16x16_lasx((uint8_t*)src - 2,
1963                                             (uint8_t*)src - (stride * 2) + 1,
1964                                             dst, stride);
1965 }
1966 
1967 void ff_avg_h264_qpel16_mc02_lasx(uint8_t *dst, const uint8_t *src,
1968                                   ptrdiff_t stride)
1969 {
1970     avg_h264_qpel16_v_lowpass_lasx(dst, src, stride, stride);
1971 }
1972 
1973 void ff_avg_h264_qpel16_mc12_lasx(uint8_t *dst, const uint8_t *src,
1974                                   ptrdiff_t stride)
1975 {
1976     uint8_t temp[512];
1977     uint8_t *const halfHV = temp;
1978     uint8_t *const halfH  = temp + 256;
1979 
1980     put_h264_qpel16_hv_lowpass_lasx(halfHV, src, 16, stride);
1981     put_h264_qpel16_v_lowpass_lasx(halfH, src, 16, stride);
1982     avg_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
1983 }
1984 
1985 void ff_avg_h264_qpel16_mc22_lasx(uint8_t *dst, const uint8_t *src,
1986                                   ptrdiff_t stride)
1987 {
1988     avg_h264_qpel16_hv_lowpass_lasx(dst, src, stride, stride);
1989 }
1990 
1991 void ff_avg_h264_qpel16_mc32_lasx(uint8_t *dst, const uint8_t *src,
1992                                   ptrdiff_t stride)
1993 {
1994     uint8_t temp[512];
1995     uint8_t *const halfHV = temp;
1996     uint8_t *const halfH  = temp + 256;
1997 
1998     put_h264_qpel16_hv_lowpass_lasx(halfHV, src, 16, stride);
1999     put_h264_qpel16_v_lowpass_lasx(halfH, src + 1, 16, stride);
2000     avg_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
2001 }
2002 
2003 void ff_avg_h264_qpel16_mc03_lasx(uint8_t *dst, const uint8_t *src,
2004                                   ptrdiff_t stride)
2005 {
2006     uint8_t half[256];
2007 
2008     put_h264_qpel16_v_lowpass_lasx(half, src, 16, stride);
2009     avg_pixels16_l2_8_lsx(dst, src + stride, half, stride, stride);
2010 }
2011 
2012 void ff_avg_h264_qpel16_mc13_lasx(uint8_t *dst, const uint8_t *src,
2013                                   ptrdiff_t stride)
2014 {
2015     avc_luma_hv_qrt_and_aver_dst_16x16_lasx((uint8_t*)src + stride - 2,
2016                                             (uint8_t*)src - (stride * 2),
2017                                             dst, stride);
2018 }
2019 
2020 void ff_avg_h264_qpel16_mc23_lasx(uint8_t *dst, const uint8_t *src,
2021                                   ptrdiff_t stride)
2022 {
2023     uint8_t temp[512];
2024     uint8_t *const halfH  = temp;
2025     uint8_t *const halfHV = temp + 256;
2026 
2027     put_h264_qpel16_h_lowpass_lasx(halfH, src + stride, 16, stride);
2028     put_h264_qpel16_hv_lowpass_lasx(halfHV, src, 16, stride);
2029     avg_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
2030 }
2031 
2032 void ff_avg_h264_qpel16_mc33_lasx(uint8_t *dst, const uint8_t *src,
2033                                   ptrdiff_t stride)
2034 {
2035     avc_luma_hv_qrt_and_aver_dst_16x16_lasx((uint8_t*)src + stride - 2,
2036                                             (uint8_t*)src - (stride * 2) + 1,
2037                                             dst, stride);
2038 }
2039