1 /*
2 * Copyright (c) 2021 Loongson Technology Corporation Limited
3 * Contributed by Hao Chen <chenhao@loongson.cn>
4 *
5 * This file is part of FFmpeg.
6 *
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
11 *
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 */
21
22 #include "libavcodec/vp9dsp.h"
23 #include "libavutil/loongarch/loongson_intrinsics.h"
24 #include "vp9dsp_loongarch.h"
25
/* Byte-shuffle control patterns for __lsx_vshuf_b: each 16-byte row gathers
 * the overlapping tap windows needed by the horizontal 8-tap filters.
 * Row 0 (offset 0) serves 8-pixel-wide blocks; rows 1-2 (offsets 16/32)
 * serve 4-pixel-wide blocks, pairing two 4-pixel groups per vector
 * (indices >= 16 select bytes from the second source operand). */
static const uint8_t mc_filt_mask_arr[16 * 3] = {
    /* 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    /* 4 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
    /* 4 width cases */
    8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
};
34
35
/* Horizontal 8-tap filter for 4-pixel-wide data, four input vectors at once.
 * The four _maskN patterns shuffle the source pairs into the sliding tap
 * windows; vdp2_h_b/vdp2add_h_b accumulate signed byte dot products per
 * tap pair, and the two partial sums are combined with a saturating add.
 * Produces two halfword result vectors (_out0, _out1). */
#define HORIZ_8TAP_4WID_4VECS_FILT(_src0, _src1, _src2, _src3,             \
                                   _mask0, _mask1, _mask2, _mask3,         \
                                   _filter0, _filter1, _filter2, _filter3, \
                                   _out0, _out1)                           \
{                                                                          \
    __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7;       \
    __m128i _reg0, _reg1, _reg2, _reg3;                                    \
                                                                           \
    DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask0, _src3, _src2, _mask0,   \
              _tmp0, _tmp1);                                               \
    DUP2_ARG2(__lsx_vdp2_h_b, _tmp0, _filter0, _tmp1, _filter0, _reg0, _reg1); \
    DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask1, _src3, _src2, _mask1,   \
              _tmp2, _tmp3);                                               \
    DUP2_ARG3(__lsx_vdp2add_h_b, _reg0, _tmp2, _filter1, _reg1, _tmp3,     \
              _filter1, _reg0, _reg1);                                     \
    DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask2, _src3, _src2, _mask2,   \
              _tmp4, _tmp5);                                               \
    DUP2_ARG2(__lsx_vdp2_h_b, _tmp4, _filter2, _tmp5, _filter2, _reg2, _reg3); \
    DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask3, _src3, _src2, _mask3,   \
              _tmp6, _tmp7);                                               \
    DUP2_ARG3(__lsx_vdp2add_h_b, _reg2, _tmp6, _filter3, _reg3, _tmp7,     \
              _filter3, _reg2, _reg3);                                     \
    DUP2_ARG2(__lsx_vsadd_h, _reg0, _reg2, _reg1, _reg3, _out0, _out1);    \
}
60
/* Horizontal 8-tap filter for 8-pixel-wide data, four source vectors at
 * once.  Unlike the 4-wide variant, each source vector is shuffled against
 * itself (windows come from a single row).  Tap pairs 0/1 accumulate into
 * _reg0.._reg3, tap pairs 2/3 into _reg4.._reg7, then the halves are merged
 * with a saturating halfword add into _out0.._out3. */
#define HORIZ_8TAP_8WID_4VECS_FILT(_src0, _src1, _src2, _src3,             \
                                   _mask0, _mask1, _mask2, _mask3,         \
                                   _filter0, _filter1, _filter2, _filter3, \
                                   _out0, _out1, _out2, _out3)             \
{                                                                          \
    __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7;       \
    __m128i _reg0, _reg1, _reg2, _reg3, _reg4, _reg5, _reg6, _reg7;       \
                                                                           \
    DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask0, _src1, _src1, _mask0, _src2,\
              _src2, _mask0, _src3, _src3, _mask0, _tmp0, _tmp1, _tmp2, _tmp3);\
    DUP4_ARG2(__lsx_vdp2_h_b, _tmp0, _filter0, _tmp1, _filter0, _tmp2,     \
              _filter0, _tmp3, _filter0, _reg0, _reg1, _reg2, _reg3);      \
    DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask2, _src1, _src1, _mask2, _src2,\
              _src2, _mask2, _src3, _src3, _mask2, _tmp0, _tmp1, _tmp2, _tmp3);\
    DUP4_ARG2(__lsx_vdp2_h_b, _tmp0, _filter2, _tmp1, _filter2, _tmp2,     \
              _filter2, _tmp3, _filter2, _reg4, _reg5, _reg6, _reg7);      \
    DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask1, _src1, _src1, _mask1, _src2,\
              _src2, _mask1, _src3, _src3, _mask1, _tmp4, _tmp5, _tmp6, _tmp7);\
    DUP4_ARG3(__lsx_vdp2add_h_b, _reg0, _tmp4, _filter1, _reg1, _tmp5,     \
              _filter1, _reg2, _tmp6, _filter1, _reg3, _tmp7, _filter1, _reg0, \
              _reg1, _reg2, _reg3);                                        \
    DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask3, _src1, _src1, _mask3, _src2,\
              _src2, _mask3, _src3, _src3, _mask3, _tmp4, _tmp5, _tmp6, _tmp7);\
    DUP4_ARG3(__lsx_vdp2add_h_b, _reg4, _tmp4, _filter3, _reg5, _tmp5,     \
              _filter3, _reg6, _tmp6, _filter3, _reg7, _tmp7, _filter3, _reg4, \
              _reg5, _reg6, _reg7);                                        \
    DUP4_ARG2(__lsx_vsadd_h, _reg0, _reg4, _reg1, _reg5, _reg2, _reg6, _reg3, \
              _reg7, _out0, _out1, _out2, _out3);                          \
}
90
/* Apply all four 8-tap filter coefficient pairs to four pre-interleaved
 * byte vectors and return the halfword sums.  Tap pairs 0/1 accumulate in
 * _vec0, pairs 2/3 in _vec1; the halves are combined with a saturating add.
 * Evaluates to a __m128i (GNU statement-expression). */
#define FILT_8TAP_DPADD_S_H(_reg0, _reg1, _reg2, _reg3,            \
                            _filter0, _filter1, _filter2, _filter3)\
( {                                                                \
    __m128i _vec0, _vec1;                                          \
                                                                   \
    _vec0 = __lsx_vdp2_h_b(_reg0, _filter0);                       \
    _vec0 = __lsx_vdp2add_h_b(_vec0, _reg1, _filter1);             \
    _vec1 = __lsx_vdp2_h_b(_reg2, _filter2);                       \
    _vec1 = __lsx_vdp2add_h_b(_vec1, _reg3, _filter3);             \
    _vec0 = __lsx_vsadd_h(_vec0, _vec1);                           \
                                                                   \
    _vec0;                                                         \
} )
104
/* One horizontal 8-tap filter pass over a (_src0,_src1) pair: shuffle the
 * four tap windows, run the dot-product accumulation, then round-shift by 7
 * and saturate to 7 bits.  Evaluates to the filtered halfword vector. */
#define HORIZ_8TAP_FILT(_src0, _src1, _mask0, _mask1, _mask2, _mask3,      \
                        _filt_h0, _filt_h1, _filt_h2, _filt_h3)            \
( {                                                                        \
    __m128i _tmp0, _tmp1, _tmp2, _tmp3;                                    \
    __m128i _out;                                                          \
                                                                           \
    DUP4_ARG3(__lsx_vshuf_b, _src1, _src0, _mask0, _src1, _src0, _mask1, _src1,\
              _src0, _mask2, _src1, _src0, _mask3, _tmp0, _tmp1, _tmp2, _tmp3);\
    _out = FILT_8TAP_DPADD_S_H(_tmp0, _tmp1, _tmp2, _tmp3, _filt_h0, _filt_h1, \
                               _filt_h2, _filt_h3);                        \
    _out = __lsx_vsrari_h(_out, 7);                                        \
    _out = __lsx_vsat_h(_out, 7);                                          \
                                                                           \
    _out;                                                                  \
} )
120
/* Load four consecutive rows from _src into _src0.._src3.
 * NOTE: _src is advanced by three strides only — it is left pointing at
 * the last row loaded, not one past it. */
#define LSX_LD_4(_src, _stride, _src0, _src1, _src2, _src3) \
{                                                           \
    _src0 = __lsx_vld(_src, 0);                             \
    _src += _stride;                                        \
    _src1 = __lsx_vld(_src, 0);                             \
    _src += _stride;                                        \
    _src2 = __lsx_vld(_src, 0);                             \
    _src += _stride;                                        \
    _src3 = __lsx_vld(_src, 0);                             \
}
131
/* 8-tap horizontal filter, 4x4 block: filter four 4-pixel rows and store
 * the rounded, saturated result to dst one 32-bit element per row. */
static void common_hz_8t_4x4_lsx(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    __m128i src0, src1, src2, src3;
    __m128i filter0, filter1, filter2, filter3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i out, out0, out1;

    /* offset 16 selects the "4 width cases" shuffle row of the table. */
    mask0 = __lsx_vld(mc_filt_mask_arr, 16);
    src -= 3; /* start 3 pixels left so the 8 taps are centered. */
    /* Each filterN replicates one pair of int8 taps across the vector. */
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);

    LSX_LD_4(src, src_stride, src0, src1, src2, src3);
    /* Toggle the sign bit (x ^ 128) so unsigned pixels can be fed to the
     * signed byte dot-product instructions; undone after narrowing. */
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
              mask3, filter0, filter1, filter2, filter3, out0, out1);
    /* Round-shift by 7 and narrow with saturation to bytes. */
    out = __lsx_vssrarni_b_h(out1, out0, 7);
    out = __lsx_vxori_b(out, 128);
    __lsx_vstelm_w(out, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_w(out, dst, 0, 1);
    dst += dst_stride;
    __lsx_vstelm_w(out, dst, 0, 2);
    dst += dst_stride;
    __lsx_vstelm_w(out, dst, 0, 3);
}
163
/* 8-tap horizontal filter, 4x8 block: two batches of four rows, stored as
 * eight 32-bit row writes. */
static void common_hz_8t_4x8_lsx(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    __m128i src0, src1, src2, src3;
    __m128i filter0, filter1, filter2, filter3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i out0, out1, out2, out3;
    uint8_t *_src = (uint8_t*)src - 3; /* center the 8 taps. */

    /* offset 16: "4 width cases" shuffle patterns. */
    mask0 = __lsx_vld(mc_filt_mask_arr, 16);
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);
    /* Replicate the four int8 tap pairs. */
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);

    /* Rows 0-3. */
    src0 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    _src += src_stride4;
    /* Sign-bias pixels for the signed dot products. */
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
              mask3, filter0, filter1, filter2, filter3, out0, out1);
    /* Rows 4-7. */
    src0 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
              mask3, filter0, filter1, filter2, filter3, out2, out3);
    /* Round (>>7), saturate-narrow to bytes, undo the sign bias. */
    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
    __lsx_vstelm_w(out0, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_w(out0, dst, 0, 1);
    dst += dst_stride;
    __lsx_vstelm_w(out0, dst, 0, 2);
    dst += dst_stride;
    __lsx_vstelm_w(out0, dst, 0, 3);
    dst += dst_stride;
    __lsx_vstelm_w(out1, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_w(out1, dst, 0, 1);
    dst += dst_stride;
    __lsx_vstelm_w(out1, dst, 0, 2);
    dst += dst_stride;
    __lsx_vstelm_w(out1, dst, 0, 3);
}
216
/* 8-tap horizontal filter for 4-pixel-wide blocks: dispatch on height.
 * Only heights 4 and 8 are handled; any other height is a no-op. */
static void common_hz_8t_4w_lsx(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    switch (height) {
    case 4:
        common_hz_8t_4x4_lsx(src, src_stride, dst, dst_stride, filter);
        break;
    case 8:
        common_hz_8t_4x8_lsx(src, src_stride, dst, dst_stride, filter);
        break;
    default:
        break;
    }
}
227
/* 8-tap horizontal filter, 8x4 block: filter four 8-pixel rows and store
 * each as one 64-bit element. */
static void common_hz_8t_8x4_lsx(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    __m128i src0, src1, src2, src3;
    __m128i filter0, filter1, filter2, filter3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i out0, out1, out2, out3;

    /* offset 0: the "8 width cases" shuffle row of the table. */
    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
    src -= 3; /* center the 8 taps on the output pixel. */
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);
    /* Replicate the four int8 tap pairs across halfword lanes. */
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);

    LSX_LD_4(src, src_stride, src0, src1, src2, src3);
    /* Sign-bias pixels for the signed byte dot products. */
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
              mask3, filter0, filter1, filter2, filter3, out0, out1, out2, out3);
    /* Round (>>7), saturate-narrow, undo the sign bias. */
    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
    __lsx_vstelm_d(out0, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_d(out0, dst, 0, 1);
    dst += dst_stride;
    __lsx_vstelm_d(out1, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_d(out1, dst, 0, 1);
}
259
/* 8-tap horizontal filter for 8-wide blocks of any height divisible by 4:
 * processes 4 rows per loop iteration. */
static void common_hz_8t_8x8mult_lsx(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt = height >> 2; /* 4 rows per iteration. */
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    __m128i src0, src1, src2, src3;
    __m128i filter0, filter1, filter2, filter3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i out0, out1, out2, out3;
    uint8_t* _src = (uint8_t*)src - 3; /* center the 8 taps. */

    /* offset 0: "8 width cases" shuffle patterns. */
    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);

    for (; loop_cnt--;) {
        src0 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
        src3 = __lsx_vldx(_src, src_stride3);
        _src += src_stride4;
        /* Sign-bias pixels for the signed dot products. */
        DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
                  src0, src1, src2, src3);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                  mask3, filter0, filter1, filter2, filter3, out0, out1, out2, out3);
        /* Round, saturate-narrow, un-bias, then store 8 bytes per row. */
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        __lsx_vstelm_d(out0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 1);
        dst += dst_stride;
    }
}
301
/* 8-tap horizontal filter for 8-pixel-wide blocks: pick the fixed 8x4
 * path for height 4, otherwise the generic multiple-of-4-rows loop. */
static void common_hz_8t_8w_lsx(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    if (height != 4) {
        common_hz_8t_8x8mult_lsx(src, src_stride, dst, dst_stride,
                                 filter, height);
        return;
    }
    common_hz_8t_8x4_lsx(src, src_stride, dst, dst_stride, filter);
}
313
/* 8-tap horizontal filter for 16-wide blocks: two rows per iteration,
 * each row split into a low half (offset 0) and high half (offset 8). */
static void common_hz_8t_16w_lsx(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt = height >> 1; /* 2 rows per iteration. */
    int32_t stride = src_stride << 1;
    __m128i src0, src1, src2, src3;
    __m128i filter0, filter1, filter2, filter3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i out0, out1, out2, out3;

    /* offset 0: "8 width cases" shuffle patterns. */
    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
    src -= 3; /* center the 8 taps. */
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);

    for (; loop_cnt--;) {
        const uint8_t* _src = src + src_stride; /* second row of the pair. */
        DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src0, src2);
        /* offset 8 re-loads overlapping bytes for the right half's windows. */
        DUP2_ARG2(__lsx_vld, src, 8, _src, 8, src1, src3);
        DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
                  src0, src1, src2, src3);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                  mask3, filter0, filter1, filter2, filter3, out0, out1, out2, out3);
        /* Round, saturate-narrow, un-bias; one 16-byte store per row. */
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        __lsx_vst(out0, dst, 0);
        dst += dst_stride;
        __lsx_vst(out1, dst, 0);
        dst += dst_stride;
        src += stride;
    }
}
349
/* 8-tap horizontal filter for 32-wide blocks: two rows per iteration, each
 * row handled as two 16-pixel halves.  The middle source vector (src1) is
 * built by shuffling bytes 8..23 out of src0/src2 via `shuff`. */
static void common_hz_8t_32w_lsx(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt = height >> 1; /* 2 rows per iteration. */
    __m128i src0, src1, src2, src3;
    __m128i filter0, filter1, filter2, filter3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i out0, out1, out2, out3;
    /* Selects bytes 8..15 of the low operand and 0..7 of the high one,
     * i.e. the 16 bytes straddling the two loads. */
    __m128i shuff = {0x0F0E0D0C0B0A0908, 0x1716151413121110};

    /* offset 0: "8 width cases" shuffle patterns. */
    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
    src -= 3; /* center the 8 taps. */
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);

    for (; loop_cnt--;) {
        /* First row of the pair. */
        DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2);
        src3 = __lsx_vld(src, 24);
        src1 = __lsx_vshuf_b(src2, src0, shuff);
        src += src_stride;
        DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
                  src0, src1, src2, src3);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                  mask3, filter0, filter1, filter2, filter3, out0, out1, out2, out3);
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        __lsx_vst(out0, dst, 0);
        __lsx_vst(out1, dst, 16);

        /* Second row of the pair. */
        DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2);
        src3 = __lsx_vld(src, 24);
        src1 = __lsx_vshuf_b(src2, src0, shuff);
        src += src_stride;

        dst += dst_stride;
        DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
                  src0, src1, src2, src3);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                  mask3, filter0, filter1, filter2, filter3, out0, out1, out2, out3);
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        __lsx_vst(out0, dst, 0);
        __lsx_vst(out1, dst, 16);
        dst += dst_stride;
    }
}
399
/* 8-tap horizontal filter for 64-wide blocks: one row per iteration,
 * processed as two 32-pixel halves (byte offsets 0..31 and 32..63). */
static void common_hz_8t_64w_lsx(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    int32_t loop_cnt = height; /* 1 row per iteration. */
    __m128i src0, src1, src2, src3;
    __m128i filter0, filter1, filter2, filter3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i out0, out1, out2, out3;
    /* Gathers the 16 bytes straddling two adjacent 16-byte loads. */
    __m128i shuff = {0x0F0E0D0C0B0A0908, 0x1716151413121110};

    /* offset 0: "8 width cases" shuffle patterns. */
    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
    src -= 3; /* center the 8 taps. */
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);

    for (; loop_cnt--;) {
        /* Left 32 pixels of the row. */
        DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2);
        src3 = __lsx_vld(src, 24);
        src1 = __lsx_vshuf_b(src2, src0, shuff);
        DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
                  src0, src1, src2, src3);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                  mask3, filter0, filter1, filter2, filter3, out0, out1, out2, out3);
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        __lsx_vst(out0, dst, 0);
        __lsx_vst(out1, dst, 16);

        /* Right 32 pixels of the row. */
        DUP2_ARG2(__lsx_vld, src, 32, src, 48, src0, src2);
        src3 = __lsx_vld(src, 56);
        src1 = __lsx_vshuf_b(src2, src0, shuff);
        DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
                  src0, src1, src2, src3);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                  mask3, filter0, filter1, filter2, filter3, out0, out1, out2, out3);
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        __lsx_vst(out0, dst, 32);
        __lsx_vst(out1, dst, 48);
        src += src_stride;
        dst += dst_stride;
    }
}
446
/* 8-tap vertical filter for 4-pixel-wide blocks.  Seven history rows are
 * loaded starting 3 rows above src and interleaved into byte-pair vectors
 * (reg0..reg2); each loop iteration consumes 4 new rows and emits 4 output
 * rows, then slides the history window down. */
static void common_vt_8t_4w_lsx(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt = height >> 2; /* 4 output rows per iteration. */
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    __m128i reg0, reg1, reg2, reg3, reg4;
    __m128i filter0, filter1, filter2, filter3;
    __m128i out0, out1;
    uint8_t* _src = (uint8_t*)src - src_stride3; /* 3 rows above: taps centered. */

    /* Replicate the four int8 tap pairs across halfword lanes. */
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);
    /* Load the 7 history rows src0..src6. */
    src0 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    _src += src_stride4;
    src4 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
    _src += src_stride3;
    /* Interleave adjacent rows bytewise, then pack two 4-pixel groups per
     * vector with vilvl_d so one dot product covers both. */
    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, tmp0,
              tmp1, tmp2, tmp3);
    DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, tmp4, tmp5);
    DUP2_ARG2(__lsx_vilvl_d, tmp3, tmp0, tmp4, tmp1, reg0, reg1);
    reg2 = __lsx_vilvl_d(tmp5, tmp2);
    /* Sign-bias for the signed byte dot products. */
    DUP2_ARG2(__lsx_vxori_b, reg0, 128, reg1, 128, reg0, reg1);
    reg2 = __lsx_vxori_b(reg2, 128);

    for (;loop_cnt--;) {
        src7 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
        src10 = __lsx_vldx(_src, src_stride3);
        _src += src_stride4;
        DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10,
                  src9, tmp0, tmp1, tmp2, tmp3);
        DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, reg3, reg4);
        DUP2_ARG2(__lsx_vxori_b, reg3, 128, reg4, 128, reg3, reg4);
        out0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, reg3, filter0, filter1,
                                   filter2, filter3);
        out1 = FILT_8TAP_DPADD_S_H(reg1, reg2, reg3, reg4, filter0, filter1,
                                   filter2, filter3);
        /* Round (>>7), saturate-narrow, undo the sign bias. */
        out0 = __lsx_vssrarni_b_h(out1, out0, 7);
        out0 = __lsx_vxori_b(out0, 128);
        __lsx_vstelm_w(out0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 2);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 3);
        dst += dst_stride;

        /* Slide the 8-tap history window down by 4 rows. */
        reg0 = reg2;
        reg1 = reg3;
        reg2 = reg4;
        src6 = src10;
    }
}
509
/* 8-tap vertical filter for 8-pixel-wide blocks.  reg0..reg5 hold the
 * interleaved low halves of consecutive row pairs for the 7-row history;
 * each iteration consumes 4 new rows and writes 4 output rows. */
static void common_vt_8t_8w_lsx(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt = height >> 2; /* 4 output rows per iteration. */
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i tmp0, tmp1, tmp2, tmp3;
    __m128i reg0, reg1, reg2, reg3, reg4, reg5;
    __m128i filter0, filter1, filter2, filter3;
    __m128i out0, out1, out2, out3;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    uint8_t* _src = (uint8_t*)src - src_stride3; /* 3 rows above: taps centered. */

    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);

    /* Load the 7 history rows src0..src6. */
    src0 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    _src += src_stride4;
    src4 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
    _src += src_stride3;

    /* Sign-bias all history rows once, before interleaving. */
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
    src6 = __lsx_vxori_b(src6, 128);
    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
              reg0, reg1, reg2, reg3);
    DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5);

    for (;loop_cnt--;) {
        src7 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
        src10 = __lsx_vldx(_src, src_stride3);
        _src += src_stride4;
        DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128,
                  src7, src8, src9, src10);
        DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10,
                  src9, tmp0, tmp1, tmp2, tmp3);
        /* out0/out2: even output rows; out1/out3: odd output rows. */
        out0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, tmp0, filter0, filter1,
                                   filter2, filter3);
        out1 = FILT_8TAP_DPADD_S_H(reg3, reg4, reg5, tmp1, filter0, filter1,
                                   filter2, filter3);
        out2 = FILT_8TAP_DPADD_S_H(reg1, reg2, tmp0, tmp2, filter0, filter1,
                                   filter2, filter3);
        out3 = FILT_8TAP_DPADD_S_H(reg4, reg5, tmp1, tmp3, filter0, filter1,
                                   filter2, filter3);
        /* Round, saturate-narrow, un-bias; store 8 bytes per row. */
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        __lsx_vstelm_d(out0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 1);
        dst += dst_stride;

        /* Slide the history window down by 4 rows. */
        reg0 = reg2;
        reg1 = tmp0;
        reg2 = tmp2;
        reg3 = reg5;
        reg4 = tmp1;
        reg5 = tmp3;
        src6 = src10;
    }
}
581
/* 8-tap vertical filter for 16-pixel-wide blocks.  Maintains two history
 * sets: reg0..reg5 interleave the low byte halves of the 7 history rows,
 * reg6..reg11 the high halves; each iteration consumes 4 new rows and
 * writes 4 full 16-byte output rows. */
static void common_vt_8t_16w_lsx(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt = height >> 2; /* 4 output rows per iteration. */
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i filter0, filter1, filter2, filter3;
    __m128i reg0, reg1, reg2, reg3, reg4, reg5;
    __m128i reg6, reg7, reg8, reg9, reg10, reg11;
    __m128i tmp0, tmp1, tmp2, tmp3;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    uint8_t* _src = (uint8_t*)src - src_stride3; /* 3 rows above: taps centered. */

    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);
    /* Load the 7 history rows src0..src6. */
    src0 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    _src += src_stride4;
    src4 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
    _src += src_stride3;
    /* Sign-bias once, then build the low-half and high-half interleaves. */
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
    src6 = __lsx_vxori_b(src6, 128);
    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
              reg0, reg1, reg2, reg3);
    DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5);
    DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1,
              reg6, reg7, reg8, reg9);
    DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, reg10, reg11);

    for (;loop_cnt--;) {
        src7 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
        src10 = __lsx_vldx(_src, src_stride3);
        _src += src_stride4;
        DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128,
                  src7, src8, src9, src10);
        /* New interleaves: src0..src3 = low halves, src4/src5/src7/src8 =
         * high halves of the four fresh row pairs (src regs reused). */
        DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9,
                  src0, src1, src2, src3);
        DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10, src9,
                  src4, src5, src7, src8);
        /* Output rows 0 and 1 (low then high halves). */
        tmp0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, src0, filter0, filter1,
                                   filter2, filter3);
        tmp1 = FILT_8TAP_DPADD_S_H(reg3, reg4, reg5, src1, filter0, filter1,
                                   filter2, filter3);
        tmp2 = FILT_8TAP_DPADD_S_H(reg6, reg7, reg8, src4, filter0, filter1,
                                   filter2, filter3);
        tmp3 = FILT_8TAP_DPADD_S_H(reg9, reg10, reg11, src5, filter0, filter1,
                                   filter2, filter3);
        DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
        DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
        __lsx_vst(tmp0, dst, 0);
        dst += dst_stride;
        __lsx_vst(tmp1, dst, 0);
        dst += dst_stride;
        /* Output rows 2 and 3. */
        tmp0 = FILT_8TAP_DPADD_S_H(reg1, reg2, src0, src2, filter0, filter1,
                                   filter2, filter3);
        tmp1 = FILT_8TAP_DPADD_S_H(reg4, reg5, src1, src3, filter0, filter1,
                                   filter2, filter3);
        tmp2 = FILT_8TAP_DPADD_S_H(reg7, reg8, src4, src7, filter0, filter1,
                                   filter2, filter3);
        tmp3 = FILT_8TAP_DPADD_S_H(reg10, reg11, src5, src8, filter0, filter1,
                                   filter2, filter3);
        DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
        DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
        __lsx_vst(tmp0, dst, 0);
        dst += dst_stride;
        __lsx_vst(tmp1, dst, 0);
        dst += dst_stride;

        /* Slide both history windows down by 4 rows. */
        reg0 = reg2;
        reg1 = src0;
        reg2 = src2;
        reg3 = reg5;
        reg4 = src1;
        reg5 = src3;
        reg6 = reg8;
        reg7 = src4;
        reg8 = src7;
        reg9 = reg11;
        reg10 = src5;
        reg11 = src8;
        src6 = src10;
    }
}
672
/* 8-tap vertical filter for blocks whose width is a multiple of 16:
 * iterates `width >> 4` column passes, each pass being the same algorithm
 * as common_vt_8t_16w_lsx (low/high-half interleaved 7-row history,
 * 4 output rows per inner iteration). */
static void common_vt_8t_16w_mult_lsx(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter, int32_t height,
                                      int32_t width)
{
    uint8_t *src_tmp;
    uint8_t *dst_tmp;
    uint32_t cnt = width >> 4; /* one pass per 16-pixel column strip. */
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i filter0, filter1, filter2, filter3;
    __m128i reg0, reg1, reg2, reg3, reg4, reg5;
    __m128i reg6, reg7, reg8, reg9, reg10, reg11;
    __m128i tmp0, tmp1, tmp2, tmp3;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    int32_t dst_stride2 = dst_stride << 1;
    int32_t dst_stride3 = dst_stride2 + dst_stride;
    int32_t dst_stride4 = dst_stride2 << 1;
    uint8_t* _src = (uint8_t*)src - src_stride3; /* 3 rows above: taps centered. */

    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);
    for (;cnt--;) {
        uint32_t loop_cnt = height >> 2; /* 4 output rows per iteration. */

        src_tmp = _src;
        dst_tmp = dst;

        /* Load the 7 history rows for this column strip. */
        src0 = __lsx_vld(src_tmp, 0);
        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2,
                  src1, src2);
        src3 = __lsx_vldx(src_tmp, src_stride3);
        src_tmp += src_stride4;
        src4 = __lsx_vld(src_tmp, 0);
        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2,
                  src5, src6);
        src_tmp += src_stride3;

        /* Sign-bias, then build low-half (reg0..5) and high-half
         * (reg6..11) row-pair interleaves. */
        DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
                  src0, src1, src2, src3);
        DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
        src6 = __lsx_vxori_b(src6, 128);
        DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
                  reg0, reg1, reg2, reg3);
        DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5);
        DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1,
                  reg6, reg7, reg8, reg9);
        DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, reg10, reg11);

        for (;loop_cnt--;) {
            src7 = __lsx_vld(src_tmp, 0);
            DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2,
                      src8, src9);
            src10 = __lsx_vldx(src_tmp, src_stride3);
            src_tmp += src_stride4;
            DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10,
                      128, src7, src8, src9, src10);
            DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8,
                      src10, src9, src0, src1, src2, src3);
            DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8,
                      src10, src9, src4, src5, src7, src8);
            /* Output rows 0 and 1 (low then high halves). */
            tmp0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, src0, filter0,
                                       filter1, filter2, filter3);
            tmp1 = FILT_8TAP_DPADD_S_H(reg3, reg4, reg5, src1, filter0,
                                       filter1, filter2, filter3);
            tmp2 = FILT_8TAP_DPADD_S_H(reg6, reg7, reg8, src4, filter0,
                                       filter1, filter2, filter3);
            tmp3 = FILT_8TAP_DPADD_S_H(reg9, reg10, reg11, src5, filter0,
                                       filter1, filter2, filter3);
            DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7,
                      tmp0, tmp1);
            DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
            __lsx_vst(tmp0, dst_tmp, 0);
            __lsx_vstx(tmp1, dst_tmp, dst_stride);
            /* Output rows 2 and 3. */
            tmp0 = FILT_8TAP_DPADD_S_H(reg1, reg2, src0, src2, filter0,
                                       filter1, filter2, filter3);
            tmp1 = FILT_8TAP_DPADD_S_H(reg4, reg5, src1, src3, filter0,
                                       filter1, filter2, filter3);
            tmp2 = FILT_8TAP_DPADD_S_H(reg7, reg8, src4, src7, filter0,
                                       filter1, filter2, filter3);
            tmp3 = FILT_8TAP_DPADD_S_H(reg10, reg11, src5, src8, filter0,
                                       filter1, filter2, filter3);
            DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7,
                      tmp0, tmp1);
            DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
            __lsx_vstx(tmp0, dst_tmp, dst_stride2);
            __lsx_vstx(tmp1, dst_tmp, dst_stride3);
            dst_tmp += dst_stride4;

            /* Slide both history windows down by 4 rows. */
            reg0 = reg2;
            reg1 = src0;
            reg2 = src2;
            reg3 = reg5;
            reg4 = src1;
            reg5 = src3;
            reg6 = reg8;
            reg7 = src4;
            reg8 = src7;
            reg9 = reg11;
            reg10 = src5;
            reg11 = src8;
            src6 = src10;
        }
        /* Advance to the next 16-pixel column strip. */
        _src += 16;
        dst += 16;
    }
}
781
/* Vertical 8-tap filter for a 32-pixel-wide block: delegate to the
 * generic multiple-of-16 implementation with width = 32. */
static void common_vt_8t_32w_lsx(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_lsx(src, src_stride, dst, dst_stride,
                              filter, height, 32);
}
788
/* Vertical 8-tap filter for a 64-pixel-wide block: delegate to the
 * generic multiple-of-16 implementation with width = 64. */
static void common_vt_8t_64w_lsx(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_lsx(src, src_stride, dst, dst_stride,
                              filter, height, 64);
}
796
/* 2-D 8-tap filter (horizontal then vertical) for a 4-pixel-wide column,
 * producing four output rows per loop iteration.  The horizontal pass is
 * run on rows ahead of the output row so the vertical 8-tap window is
 * always fully populated.
 * filter_horiz / filter_vert: eight signed 8-bit taps each, read as four
 * 16-bit tap pairs. */
static void common_hv_8ht_8vt_4w_lsx(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
                                     int32_t height)
{
    uint32_t loop_cnt = height >> 2;  /* four output rows per iteration */
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3;
    __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    __m128i out0, out1;
    /* vshuf_b control: picks the high 8 bytes of one operand followed by
     * the low 8 bytes of the other. */
    __m128i shuff = {0x0F0E0D0C0B0A0908, 0x1716151413121110};
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    /* Start 3 rows up and 3 columns left so the 8-tap windows are centered. */
    uint8_t* _src = (uint8_t*)src - src_stride3 - 3;

    /* Offset 16 into mc_filt_mask_arr: shuffle masks for the 4-width case. */
    mask0 = __lsx_vld(mc_filt_mask_arr, 16);
    DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz, 4,
              filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);

    /* Load the seven rows that prime the vertical filter history. */
    src0 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    _src += src_stride4;
    src4 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
    _src += src_stride3;
    /* XOR with 128 re-biases unsigned pixels into signed range for the
     * signed dot-product intrinsics; undone after narrowing. */
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
    src6 = __lsx_vxori_b(src6, 128);

    /* Horizontal 8-tap pass on the priming rows (two rows per call). */
    tmp0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    tmp2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    tmp4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    tmp5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    /* Re-align odd/even row pairs and interleave them for the vertical
     * dot products. */
    DUP2_ARG3(__lsx_vshuf_b, tmp2, tmp0, shuff, tmp4, tmp2, shuff, tmp1, tmp3);
    DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4,
              filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
    DUP2_ARG2(__lsx_vpackev_b, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    tmp2 = __lsx_vpackev_b(tmp5, tmp4);

    for (;loop_cnt--;) {
        /* Four new source rows per iteration. */
        src7 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
        src10 = __lsx_vldx(_src, src_stride3);
        _src += src_stride4;
        DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128,
                  src7, src8, src9, src10);
        /* Horizontal pass on the new rows, then vertical 8-tap combine. */
        tmp3 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
                               filt_hz1, filt_hz2, filt_hz3);
        tmp4 = __lsx_vshuf_b(tmp3, tmp5, shuff);
        tmp4 = __lsx_vpackev_b(tmp3, tmp4);
        out0 = FILT_8TAP_DPADD_S_H(tmp0, tmp1, tmp2, tmp4, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        src1 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3,
                               filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        src0 = __lsx_vshuf_b(src1, tmp3, shuff);
        src0 = __lsx_vpackev_b(src1, src0);
        out1 = FILT_8TAP_DPADD_S_H(tmp1, tmp2, tmp4, src0, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        /* Round-shift by 7, saturate-narrow to bytes, undo the sign bias. */
        out0 = __lsx_vssrarni_b_h(out1, out0, 7);
        out0 = __lsx_vxori_b(out0, 128);
        /* Store four 4-byte output rows. */
        __lsx_vstelm_w(out0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 2);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 3);
        dst += dst_stride;

        /* Slide the vertical-filter window down by four rows. */
        tmp5 = src1;
        tmp0 = tmp2;
        tmp1 = tmp4;
        tmp2 = src0;
    }
}
884
/* 2-D 8-tap filter (horizontal then vertical) for an 8-pixel-wide column,
 * producing four output rows per loop iteration.  Each row gets its own
 * horizontal pass; the vertical pass combines eight filtered rows held in
 * the tmp* history registers. */
static void common_hv_8ht_8vt_8w_lsx(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
                                     int32_t height)
{
    uint32_t loop_cnt = height >> 2;  /* four output rows per iteration */
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3;
    __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
    __m128i out0, out1;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    /* Start 3 rows up and 3 columns left so the 8-tap windows are centered. */
    uint8_t* _src = (uint8_t*)src - src_stride3 - 3;

    /* Offset 0 into mc_filt_mask_arr: shuffle masks for the 8-width case. */
    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
    DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz,
              4, filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);

    /* Load the seven rows that prime the vertical filter history. */
    src0 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    _src += src_stride4;
    src4 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
    _src += src_stride3;
    /* Bias unsigned pixels into signed range for the signed dot products. */
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
    src6 = __lsx_vxori_b(src6, 128);

    /* Horizontal 8-tap pass, one row per call. */
    src0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);

    DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4,
              filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
    /* Interleave adjacent filtered rows: tmp0..tmp2 feed even output rows,
     * tmp4..tmp6 feed odd output rows. */
    DUP4_ARG2(__lsx_vpackev_b, src1, src0, src3, src2, src5, src4,
              src2, src1, tmp0, tmp1, tmp2, tmp4);
    DUP2_ARG2(__lsx_vpackev_b, src4, src3, src6, src5, tmp5, tmp6);

    for (;loop_cnt--;) {
        /* Four new source rows per iteration. */
        src7 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
        src10 = __lsx_vldx(_src, src_stride3);
        _src += src_stride4;

        DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128,
                  src7, src8, src9, src10);
        /* For each new row: horizontal pass, pair with the previous row,
         * then vertical 8-tap combine with the history registers. */
        src7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0,
                               filt_hz1, filt_hz2, filt_hz3);
        tmp3 = __lsx_vpackev_b(src7, src6);
        out0 = FILT_8TAP_DPADD_S_H(tmp0, tmp1, tmp2, tmp3, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        src8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0,
                               filt_hz1, filt_hz2, filt_hz3);
        src0 = __lsx_vpackev_b(src8, src7);
        out1 = FILT_8TAP_DPADD_S_H(tmp4, tmp5, tmp6, src0, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        src9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0,
                               filt_hz1, filt_hz2, filt_hz3);
        src1 = __lsx_vpackev_b(src9, src8);
        src3 = FILT_8TAP_DPADD_S_H(tmp1, tmp2, tmp3, src1, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        src10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
                                filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        src2 = __lsx_vpackev_b(src10, src9);
        src4 = FILT_8TAP_DPADD_S_H(tmp5, tmp6, src0, src2, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        /* Round-shift by 7, saturate-narrow to bytes, undo the sign bias. */
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, src4, src3, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        /* Store four 8-byte output rows. */
        __lsx_vstelm_d(out0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 1);
        dst += dst_stride;

        /* Slide the vertical-filter window down by four rows. */
        src6 = src10;
        tmp0 = tmp2;
        tmp1 = tmp3;
        tmp2 = src1;
        tmp4 = tmp6;
        tmp5 = src0;
        tmp6 = src2;
    }
}
990
/* 2-D 8-tap filter for a 16-wide block: processed as two independent
 * 8-pixel-wide columns. */
static void common_hv_8ht_8vt_16w_lsx(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
                                      int32_t height)
{
    int32_t col;

    for (col = 0; col < 2; col++) {
        common_hv_8ht_8vt_8w_lsx(src, src_stride, dst, dst_stride,
                                 filter_horiz, filter_vert, height);
        src += 8;
        dst += 8;
    }
}
1006
/* 2-D 8-tap filter for a 32-wide block: processed as four independent
 * 8-pixel-wide columns. */
static void common_hv_8ht_8vt_32w_lsx(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
                                      int32_t height)
{
    int32_t col;

    for (col = 0; col < 4; col++) {
        common_hv_8ht_8vt_8w_lsx(src, src_stride, dst, dst_stride,
                                 filter_horiz, filter_vert, height);
        src += 8;
        dst += 8;
    }
}
1022
/* 2-D 8-tap filter for a 64-wide block: processed as eight independent
 * 8-pixel-wide columns. */
static void common_hv_8ht_8vt_64w_lsx(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
                                      int32_t height)
{
    int32_t col;

    for (col = 0; col < 8; col++) {
        common_hv_8ht_8vt_8w_lsx(src, src_stride, dst, dst_stride,
                                 filter_horiz, filter_vert, height);
        src += 8;
        dst += 8;
    }
}
1038
copy_width8_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)1039 static void copy_width8_lsx(const uint8_t *src, int32_t src_stride,
1040 uint8_t *dst, int32_t dst_stride,
1041 int32_t height)
1042 {
1043 int32_t cnt = height >> 2;
1044 __m128i src0, src1, src2, src3;
1045
1046 for (;cnt--;) {
1047 src0 = __lsx_vldrepl_d(src, 0);
1048 src += src_stride;
1049 src1 = __lsx_vldrepl_d(src, 0);
1050 src += src_stride;
1051 src2 = __lsx_vldrepl_d(src, 0);
1052 src += src_stride;
1053 src3 = __lsx_vldrepl_d(src, 0);
1054 src += src_stride;
1055 __lsx_vstelm_d(src0, dst, 0, 0);
1056 dst += dst_stride;
1057 __lsx_vstelm_d(src1, dst, 0, 0);
1058 dst += dst_stride;
1059 __lsx_vstelm_d(src2, dst, 0, 0);
1060 dst += dst_stride;
1061 __lsx_vstelm_d(src3, dst, 0, 0);
1062 dst += dst_stride;
1063 }
1064 }
1065
copy_width16_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)1066 static void copy_width16_lsx(const uint8_t *src, int32_t src_stride,
1067 uint8_t *dst, int32_t dst_stride,
1068 int32_t height)
1069 {
1070 int32_t cnt = height >> 2;
1071 __m128i src0, src1, src2, src3;
1072 int32_t src_stride2 = src_stride << 1;
1073 int32_t src_stride3 = src_stride + src_stride2;
1074 int32_t src_stride4 = src_stride2 << 1;
1075 int32_t dst_stride2 = dst_stride << 1;
1076 int32_t dst_stride3 = dst_stride2 + dst_stride;
1077 int32_t dst_stride4 = dst_stride2 << 1;
1078 uint8_t *_src = (uint8_t*)src;
1079
1080 for (;cnt--;) {
1081 src0 = __lsx_vld(_src, 0);
1082 DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
1083 src3 = __lsx_vldx(_src, src_stride3);
1084 _src += src_stride4;
1085 __lsx_vst(src0, dst, 0);
1086 __lsx_vstx(src1, dst, dst_stride);
1087 __lsx_vstx(src2, dst, dst_stride2);
1088 __lsx_vstx(src3, dst, dst_stride3);
1089 dst += dst_stride4;
1090 }
1091 }
1092
copy_width32_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)1093 static void copy_width32_lsx(const uint8_t *src, int32_t src_stride,
1094 uint8_t *dst, int32_t dst_stride,
1095 int32_t height)
1096 {
1097 int32_t cnt = height >> 2;
1098 uint8_t *src_tmp1 = (uint8_t*)src;
1099 uint8_t *dst_tmp1 = dst;
1100 uint8_t *src_tmp2 = src_tmp1 + 16;
1101 uint8_t *dst_tmp2 = dst_tmp1 + 16;
1102 int32_t src_stride2 = src_stride << 1;
1103 int32_t src_stride3 = src_stride + src_stride2;
1104 int32_t src_stride4 = src_stride2 << 1;
1105 int32_t dst_stride2 = dst_stride << 1;
1106 int32_t dst_stride3 = dst_stride2 + dst_stride;
1107 int32_t dst_stride4 = dst_stride2 << 1;
1108 __m128i src0, src1, src2, src3, src4, src5, src6, src7;
1109
1110 for (;cnt--;) {
1111 src0 = __lsx_vld(src_tmp1, 0);
1112 DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2,
1113 src1, src2);
1114 src3 = __lsx_vldx(src_tmp1, src_stride3);
1115 src_tmp1 += src_stride4;
1116
1117 src4 = __lsx_vld(src_tmp2, 0);
1118 DUP2_ARG2(__lsx_vldx, src_tmp2, src_stride, src_tmp2, src_stride2,
1119 src5, src6);
1120 src7 = __lsx_vldx(src_tmp2, src_stride3);
1121 src_tmp2 += src_stride4;
1122
1123 __lsx_vst(src0, dst_tmp1, 0);
1124 __lsx_vstx(src1, dst_tmp1, dst_stride);
1125 __lsx_vstx(src2, dst_tmp1, dst_stride2);
1126 __lsx_vstx(src3, dst_tmp1, dst_stride3);
1127 dst_tmp1 += dst_stride4;
1128 __lsx_vst(src4, dst_tmp2, 0);
1129 __lsx_vstx(src5, dst_tmp2, dst_stride);
1130 __lsx_vstx(src6, dst_tmp2, dst_stride2);
1131 __lsx_vstx(src7, dst_tmp2, dst_stride3);
1132 dst_tmp2 += dst_stride4;
1133 }
1134 }
1135
copy_width64_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)1136 static void copy_width64_lsx(const uint8_t *src, int32_t src_stride,
1137 uint8_t *dst, int32_t dst_stride,
1138 int32_t height)
1139 {
1140 int32_t cnt = height >> 2;
1141 __m128i src0, src1, src2, src3, src4, src5, src6, src7;
1142 __m128i src8, src9, src10, src11, src12, src13, src14, src15;
1143
1144 for (;cnt--;) {
1145 DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
1146 src0, src1, src2, src3);
1147 src += src_stride;
1148 DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
1149 src4, src5, src6, src7);
1150 src += src_stride;
1151 DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
1152 src8, src9, src10, src11);
1153 src += src_stride;
1154 DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
1155 src12, src13, src14, src15);
1156 src += src_stride;
1157 __lsx_vst(src0, dst, 0);
1158 __lsx_vst(src1, dst, 16);
1159 __lsx_vst(src2, dst, 32);
1160 __lsx_vst(src3, dst, 48);
1161 dst += dst_stride;
1162 __lsx_vst(src4, dst, 0);
1163 __lsx_vst(src5, dst, 16);
1164 __lsx_vst(src6, dst, 32);
1165 __lsx_vst(src7, dst, 48);
1166 dst += dst_stride;
1167 __lsx_vst(src8, dst, 0);
1168 __lsx_vst(src9, dst, 16);
1169 __lsx_vst(src10, dst, 32);
1170 __lsx_vst(src11, dst, 48);
1171 dst += dst_stride;
1172 __lsx_vst(src12, dst, 0);
1173 __lsx_vst(src13, dst, 16);
1174 __lsx_vst(src14, dst, 32);
1175 __lsx_vst(src15, dst, 48);
1176 dst += dst_stride;
1177 }
1178 }
1179
/* Horizontal 8-tap filter on a 4x4 block, with the result rounded-averaged
 * into the existing destination pixels (the "avg" MC variant). */
static void common_hz_8t_and_aver_dst_4x4_lsx(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              const int8_t *filter)
{
    uint8_t *dst_tmp = dst;
    __m128i src0, src1, src2, src3;
    __m128i filter0, filter1, filter2, filter3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i tmp0, tmp1;
    __m128i dst0, dst1, dst2, dst3;

    /* Offset 16 into mc_filt_mask_arr: shuffle masks for the 4-width case. */
    mask0 = __lsx_vld(mc_filt_mask_arr, 16);
    src -= 3;   /* back up to the first of the 8 filter taps */
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);
    /* Filter taps loaded as four replicated 16-bit pairs. */
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);
    LSX_LD_4(src, src_stride, src0, src1, src2, src3);
    /* Bias unsigned pixels into signed range for the signed dot products. */
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                               filter0, filter1, filter2, filter3, tmp0, tmp1);
    /* Gather the four 4-byte destination rows into one vector. */
    dst0 = __lsx_vldrepl_w(dst_tmp, 0);
    dst_tmp += dst_stride;
    dst1 = __lsx_vldrepl_w(dst_tmp, 0);
    dst_tmp += dst_stride;
    dst2 = __lsx_vldrepl_w(dst_tmp, 0);
    dst_tmp += dst_stride;
    dst3 = __lsx_vldrepl_w(dst_tmp, 0);
    dst0 = __lsx_vilvl_w(dst1, dst0);
    dst1 = __lsx_vilvl_w(dst3, dst2);
    dst0 = __lsx_vilvl_d(dst1, dst0);
    /* Round-shift by 7, saturate-narrow, undo sign bias, then average. */
    tmp0 = __lsx_vssrarni_b_h(tmp1, tmp0, 7);
    tmp0 = __lsx_vxori_b(tmp0, 128);
    dst0 = __lsx_vavgr_bu(tmp0, dst0);
    /* Store four 4-byte output rows. */
    __lsx_vstelm_w(dst0, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_w(dst0, dst, 0, 1);
    dst += dst_stride;
    __lsx_vstelm_w(dst0, dst, 0, 2);
    dst += dst_stride;
    __lsx_vstelm_w(dst0, dst, 0, 3);
}
1224
/* Horizontal 8-tap filter on a 4x8 block, with the result rounded-averaged
 * into the existing destination pixels (the "avg" MC variant). */
static void common_hz_8t_and_aver_dst_4x8_lsx(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              const int8_t *filter)
{
    uint8_t *dst_tmp = dst;
    __m128i src0, src1, src2, src3, filter0, filter1, filter2, filter3;
    __m128i mask0, mask1, mask2, mask3, tmp0, tmp1, tmp2, tmp3;
    __m128i dst0, dst1;

    /* Offset 16 into mc_filt_mask_arr: shuffle masks for the 4-width case. */
    mask0 = __lsx_vld(mc_filt_mask_arr, 16);
    src -= 3;   /* back up to the first of the 8 filter taps */
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);

    /* First four source rows; bias pixels into signed range. */
    LSX_LD_4(src, src_stride, src0, src1, src2, src3);
    src += src_stride;
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    /* Pack destination rows 0..3 into dst0 (four 4-byte rows). */
    tmp0 = __lsx_vldrepl_w(dst_tmp, 0);
    dst_tmp += dst_stride;
    tmp1 = __lsx_vldrepl_w(dst_tmp, 0);
    dst_tmp += dst_stride;
    tmp2 = __lsx_vldrepl_w(dst_tmp, 0);
    dst_tmp += dst_stride;
    tmp3 = __lsx_vldrepl_w(dst_tmp, 0);
    dst_tmp += dst_stride;
    tmp0 = __lsx_vilvl_w(tmp1, tmp0);
    tmp1 = __lsx_vilvl_w(tmp3, tmp2);
    dst0 = __lsx_vilvl_d(tmp1, tmp0);

    /* Pack destination rows 4..7 into dst1. */
    tmp0 = __lsx_vldrepl_w(dst_tmp, 0);
    dst_tmp += dst_stride;
    tmp1 = __lsx_vldrepl_w(dst_tmp, 0);
    dst_tmp += dst_stride;
    tmp2 = __lsx_vldrepl_w(dst_tmp, 0);
    dst_tmp += dst_stride;
    tmp3 = __lsx_vldrepl_w(dst_tmp, 0);
    tmp0 = __lsx_vilvl_w(tmp1, tmp0);
    tmp1 = __lsx_vilvl_w(tmp3, tmp2);
    dst1 = __lsx_vilvl_d(tmp1, tmp0);
    /* Filter rows 0..3, then load and filter rows 4..7. */
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                               filter0, filter1, filter2, filter3, tmp0, tmp1);
    LSX_LD_4(src, src_stride, src0, src1, src2, src3);
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                               filter0, filter1, filter2, filter3, tmp2, tmp3);
    /* Round-shift by 7, saturate-narrow, undo sign bias, then average. */
    DUP4_ARG3(__lsx_vssrarni_b_h, tmp0, tmp0, 7, tmp1, tmp1, 7, tmp2, tmp2, 7,
              tmp3, tmp3, 7, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
    DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp1, dst1, dst0, dst1);
    /* Store eight 4-byte output rows. */
    __lsx_vstelm_w(dst0, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_w(dst0, dst, 0, 1);
    dst += dst_stride;
    __lsx_vstelm_w(dst0, dst, 0, 2);
    dst += dst_stride;
    __lsx_vstelm_w(dst0, dst, 0, 3);
    dst += dst_stride;
    __lsx_vstelm_w(dst1, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_w(dst1, dst, 0, 1);
    dst += dst_stride;
    __lsx_vstelm_w(dst1, dst, 0, 2);
    dst += dst_stride;
    __lsx_vstelm_w(dst1, dst, 0, 3);
}
1296
/* Dispatch the 4-wide horizontal 8-tap + average case on block height.
 * Heights other than 4 and 8 are a no-op, matching the original if/else. */
static void common_hz_8t_and_aver_dst_4w_lsx(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst, int32_t dst_stride,
                                             const int8_t *filter,
                                             int32_t height)
{
    switch (height) {
    case 4:
        common_hz_8t_and_aver_dst_4x4_lsx(src, src_stride, dst,
                                          dst_stride, filter);
        break;
    case 8:
        common_hz_8t_and_aver_dst_4x8_lsx(src, src_stride, dst,
                                          dst_stride, filter);
        break;
    default:
        break;
    }
}
1309
/* Horizontal 8-tap filter on an 8-wide column, four rows per iteration,
 * with the result rounded-averaged into the existing destination pixels. */
static void common_hz_8t_and_aver_dst_8w_lsx(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst, int32_t dst_stride,
                                             const int8_t *filter,
                                             int32_t height)
{
    int32_t loop_cnt = height >> 2;  /* four rows per iteration */
    uint8_t *dst_tmp = dst;
    __m128i src0, src1, src2, src3, filter0, filter1, filter2, filter3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i tmp0, tmp1, tmp2, tmp3;
    __m128i dst0, dst1, dst2, dst3;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride2 + src_stride;
    int32_t src_stride4 = src_stride2 << 1;
    uint8_t *_src = (uint8_t*)src - 3;  /* back up to the first filter tap */

    /* Offset 0 into mc_filt_mask_arr: shuffle masks for the 8-width case. */
    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);

    for (;loop_cnt--;) {
        /* Load four source rows; bias pixels into signed range. */
        src0 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
        src3 = __lsx_vldx(_src, src_stride3);
        _src += src_stride4;
        DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
                  src0, src1, src2, src3);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
            mask3,filter0, filter1, filter2, filter3, tmp0, tmp1, tmp2, tmp3);
        /* Gather four 8-byte destination rows, two per vector. */
        dst0 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        dst1 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        dst2 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        dst3 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1);
        /* Round-shift by 7, saturate-narrow, undo sign bias, then average. */
        DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, tmp0, tmp1);
        DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
        DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp1, dst1, dst0, dst1);
        /* Store four 8-byte output rows. */
        __lsx_vstelm_d(dst0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(dst0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_d(dst1, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(dst1, dst, 0, 1);
        dst += dst_stride;
    }
}
1364
/* Horizontal 8-tap filter on a 16-wide block, two rows per iteration,
 * with the result rounded-averaged into the existing destination pixels.
 * Each row is filtered as two 8-byte halves loaded at offsets 0 and 8. */
static void common_hz_8t_and_aver_dst_16w_lsx(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              const int8_t *filter,
                                              int32_t height)
{
    int32_t loop_cnt = height >> 1;  /* two rows per iteration */
    int32_t dst_stride2 = dst_stride << 1;
    uint8_t *dst_tmp = dst;
    __m128i src0, src1, src2, src3, filter0, filter1, filter2, filter3;
    __m128i mask0, mask1, mask2, mask3, dst0, dst1, dst2, dst3;
    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    __m128i tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;

    /* Offset 0 into mc_filt_mask_arr: shuffle masks for the 8-width case. */
    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
    src -= 3;   /* back up to the first of the 8 filter taps */
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);

    for (;loop_cnt--;) {
        /* Two rows, each as two overlapping 16-byte loads. */
        DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
        src += src_stride;
        DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3);
        src += src_stride;
        dst0 = __lsx_vld(dst_tmp, 0);
        dst1 = __lsx_vldx(dst_tmp, dst_stride);
        dst_tmp += dst_stride2;
        /* Bias pixels into signed range for the signed dot products. */
        DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
                  src0, src1, src2, src3);
        /* Build the four shifted byte sequences for each source vector. */
        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, src2, src2,
                  mask0, src3, src3, mask0, tmp0, tmp1, tmp2, tmp3);
        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, src2, src2,
                  mask1, src3, src3, mask1, tmp4, tmp5, tmp6, tmp7);
        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2, src2, src2,
                  mask2, src3, src3, mask2, tmp8, tmp9, tmp10, tmp11);
        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src1, mask3, src2, src2,
                  mask3, src3, src3, mask3, tmp12, tmp13, tmp14, tmp15);
        /* 8-tap dot products: taps 0/1 accumulate into tmp0..3, taps 2/3
         * into tmp8..11, then the two halves are saturating-added. */
        DUP4_ARG2(__lsx_vdp2_h_b, tmp0, filter0, tmp1, filter0, tmp2, filter0, tmp3,
                  filter0, tmp0, tmp1, tmp2, tmp3);
        DUP4_ARG2(__lsx_vdp2_h_b, tmp8, filter2, tmp9, filter2, tmp10, filter2, tmp11,
                  filter2, tmp8, tmp9, tmp10, tmp11);
        DUP4_ARG3(__lsx_vdp2add_h_b, tmp0, tmp4, filter1, tmp1, tmp5, filter1, tmp2,
                  tmp6, filter1, tmp3, tmp7, filter1, tmp0, tmp1, tmp2, tmp3);
        DUP4_ARG3(__lsx_vdp2add_h_b, tmp8, tmp12, filter3, tmp9, tmp13, filter3, tmp10,
                  tmp14, filter3, tmp11, tmp15, filter3, tmp4, tmp5, tmp6, tmp7);
        DUP4_ARG2(__lsx_vsadd_h, tmp0, tmp4, tmp1, tmp5, tmp2, tmp6, tmp3, tmp7,
                  tmp0, tmp1, tmp2, tmp3);
        /* Round-shift by 7, saturate-narrow, undo sign bias, then average. */
        DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, dst2, dst3);
        DUP2_ARG2(__lsx_vxori_b, dst2, 128, dst3, 128, dst2, dst3);
        DUP2_ARG2(__lsx_vavgr_bu, dst0, dst2, dst1, dst3, dst0, dst1);
        __lsx_vst(dst0, dst, 0);
        __lsx_vstx(dst1, dst, dst_stride);
        dst += dst_stride2;
    }
}
1422
common_hz_8t_and_aver_dst_32w_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)1423 static void common_hz_8t_and_aver_dst_32w_lsx(const uint8_t *src,
1424 int32_t src_stride,
1425 uint8_t *dst, int32_t dst_stride,
1426 const int8_t *filter,
1427 int32_t height)
1428 {
1429 uint32_t loop_cnt = height;
1430 uint8_t *dst_tmp = dst;
1431 __m128i src0, src1, src2, src3, filter0, filter1, filter2, filter3;
1432 __m128i mask0, mask1, mask2, mask3, dst0, dst1;
1433 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1434 __m128i tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
1435 __m128i shuff = {0x0F0E0D0C0B0A0908, 0x1716151413121110};
1436
1437 mask0 = __lsx_vld(mc_filt_mask_arr, 0);
1438 src -= 3;
1439 DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
1440 mask3 = __lsx_vaddi_bu(mask0, 6);
1441 DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
1442 filter0, filter1, filter2, filter3);
1443
1444 for (;loop_cnt--;) {
1445 DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2);
1446 src3 = __lsx_vld(src, 24);
1447 src1 = __lsx_vshuf_b(src2, src0, shuff);
1448 src += src_stride;
1449 DUP2_ARG2(__lsx_vld, dst_tmp, 0, dst, 16, dst0, dst1);
1450 dst_tmp += dst_stride;
1451 DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
1452 src0, src1, src2, src3);
1453 DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, src2,
1454 src2, mask0, src3, src3, mask0, tmp0, tmp1, tmp2, tmp3);
1455 DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, src2,
1456 src2, mask1, src3, src3, mask1, tmp4, tmp5, tmp6, tmp7);
1457 DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2, src2,
1458 src2, mask2, src3, src3, mask2, tmp8, tmp9, tmp10, tmp11);
1459 DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src1, mask3, src2,
1460 src2, mask3, src3, src3, mask3, tmp12, tmp13, tmp14, tmp15);
1461 DUP4_ARG2(__lsx_vdp2_h_b, tmp0, filter0, tmp1, filter0, tmp2, filter0,
1462 tmp3, filter0, tmp0, tmp1, tmp2, tmp3);
1463 DUP4_ARG2(__lsx_vdp2_h_b, tmp8, filter2, tmp9, filter2, tmp10, filter2,
1464 tmp11, filter2, tmp8, tmp9, tmp10, tmp11);
1465 DUP4_ARG3(__lsx_vdp2add_h_b, tmp0, tmp4, filter1, tmp1, tmp5, filter1,
1466 tmp2, tmp6, filter1, tmp3, tmp7, filter1, tmp0, tmp1, tmp2, tmp3);
1467 DUP4_ARG3(__lsx_vdp2add_h_b, tmp8, tmp12, filter3, tmp9, tmp13, filter3,
1468 tmp10, tmp14, filter3, tmp11, tmp15, filter3, tmp4, tmp5, tmp6, tmp7);
1469 DUP4_ARG2(__lsx_vsadd_h, tmp0, tmp4, tmp1, tmp5, tmp2, tmp6, tmp3, tmp7,
1470 tmp0, tmp1, tmp2, tmp3);
1471 DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, tmp0, tmp1);
1472 DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
1473 DUP2_ARG2(__lsx_vavgr_bu, dst0, tmp0, dst1, tmp1, dst0, dst1);
1474 __lsx_vst(dst0, dst, 0);
1475 __lsx_vst(dst1, dst, 16);
1476 dst += dst_stride;
1477 }
1478 }
1479
/* Horizontal 8-tap filter on a 64-wide block, one row per iteration,
 * with the result rounded-averaged into the existing destination pixels.
 * Each row is processed as two 32-byte halves (offsets 0 and 32); within
 * a half, the middle lane (bytes 8..23) is built with vshuf_b. */
static void common_hz_8t_and_aver_dst_64w_lsx(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              const int8_t *filter,
                                              int32_t height)
{
    int32_t loop_cnt = height;  /* one 64-pixel row per iteration */
    __m128i src0, src1, src2, src3;
    __m128i filter0, filter1, filter2, filter3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i out0, out1, out2, out3, dst0, dst1;
    /* vshuf_b control: high 8 bytes of one operand, low 8 of the other. */
    __m128i shuff = {0x0F0E0D0C0B0A0908, 0x1716151413121110};

    /* Offset 0 into mc_filt_mask_arr: shuffle masks for the 8-width case. */
    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
    src -= 3;   /* back up to the first of the 8 filter taps */
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);

    for (;loop_cnt--;) {
        /* First 32-byte half of the row. */
        DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2);
        src3 = __lsx_vld(src, 24);
        src1 = __lsx_vshuf_b(src2, src0, shuff);  /* bytes 8..23 */
        DUP2_ARG2(__lsx_vld, dst, 0, dst, 16, dst0, dst1);
        /* Bias to signed, filter, round-shift 7, narrow, unbias, average. */
        DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
                  src0, src1, src2, src3);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
            mask3, filter0, filter1, filter2, filter3, out0, out1, out2, out3);
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        DUP2_ARG2(__lsx_vavgr_bu, out0, dst0, out1, dst1, out0, out1);
        __lsx_vst(out0, dst, 0);
        __lsx_vst(out1, dst, 16);

        /* Second 32-byte half of the row (offsets 32..56). */
        DUP2_ARG2(__lsx_vld, src, 32, src, 48, src0, src2);
        src3 = __lsx_vld(src, 56);
        src1 = __lsx_vshuf_b(src2, src0, shuff);
        DUP2_ARG2(__lsx_vld, dst, 32, dst, 48, dst0, dst1);
        DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
                  src0, src1, src2, src3);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
            mask3, filter0, filter1, filter2, filter3, out0, out1, out2, out3);
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        DUP2_ARG2(__lsx_vavgr_bu, out0, dst0, out1, dst1, out0, out1);
        __lsx_vst(out0, dst, 32);
        __lsx_vst(out1, dst, 48);
        src += src_stride;
        dst += dst_stride;
    }
}
1532
/* Vertical 8-tap subpel filter over a 4-pixel-wide block; the result is
 * rounded-averaged with the existing destination.  Four output rows are
 * produced per loop iteration, so height is assumed to be a multiple of
 * 4 (standard for VP9 block sizes -- NOTE(review): confirm with callers).
 */
static void common_vt_8t_and_aver_dst_4w_lsx(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst, int32_t dst_stride,
                                             const int8_t *filter,
                                             int32_t height)
{
    uint32_t loop_cnt = height >> 2;   /* 4 rows per iteration */
    uint8_t *dst_tmp = dst;            /* read pointer for the avg input */
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    __m128i reg0, reg1, reg2, reg3, reg4;
    __m128i filter0, filter1, filter2, filter3;
    __m128i out0, out1;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    /* Start three rows above dst for the 8-tap vertical window. */
    uint8_t* _src = (uint8_t*)src - src_stride3;

    /* Broadcast the four 16-bit tap pairs of the 8-tap filter. */
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);
    /* Prologue: load the first 7 rows and interleave them into the
       column-pair registers reg0..reg2 kept live across iterations. */
    src0 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    _src += src_stride4;
    src4 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
    _src += src_stride3;
    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
              tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, tmp4, tmp5);
    DUP2_ARG2(__lsx_vilvl_d, tmp3, tmp0, tmp4, tmp1, reg0, reg1);
    reg2 = __lsx_vilvl_d(tmp5, tmp2);
    /* Bias into signed range for the signed dot products. */
    DUP2_ARG2(__lsx_vxori_b, reg0, 128, reg1, 128, reg0, reg1);
    reg2 = __lsx_vxori_b(reg2, 128);

    for (;loop_cnt--;) {
        /* Next four source rows. */
        src7 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
        src10 = __lsx_vldx(_src, src_stride3);
        _src += src_stride4;
        /* Four 4-byte destination rows, packed into one vector for the avg. */
        src0 = __lsx_vldrepl_w(dst_tmp, 0);
        dst_tmp += dst_stride;
        src1 = __lsx_vldrepl_w(dst_tmp, 0);
        dst_tmp += dst_stride;
        src2 = __lsx_vldrepl_w(dst_tmp, 0);
        dst_tmp += dst_stride;
        src3 = __lsx_vldrepl_w(dst_tmp, 0);
        dst_tmp += dst_stride;
        DUP2_ARG2(__lsx_vilvl_w, src1, src0, src3, src2, src0, src1);
        src0 = __lsx_vilvl_d(src1, src0);
        DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10,
                  src9, tmp0, tmp1, tmp2, tmp3);
        DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, reg3, reg4);
        DUP2_ARG2(__lsx_vxori_b, reg3, 128, reg4, 128, reg3, reg4);
        out0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, reg3, filter0,
                                   filter1, filter2, filter3);
        out1 = FILT_8TAP_DPADD_S_H(reg1, reg2, reg3, reg4, filter0,
                                   filter1, filter2, filter3);
        /* Round+saturate to 8 bit, undo bias, average, store 4 rows. */
        out0 = __lsx_vssrarni_b_h(out1, out0, 7);
        out0 = __lsx_vxori_b(out0, 128);
        out0 = __lsx_vavgr_bu(out0, src0);
        __lsx_vstelm_w(out0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 2);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 3);
        dst += dst_stride;
        /* Rotate the vertical window by four rows. */
        reg0 = reg2;
        reg1 = reg3;
        reg2 = reg4;
        src6 = src10;
    }
}
1608
/* Vertical 8-tap subpel filter over an 8-pixel-wide block; the result is
 * rounded-averaged with the existing destination.  Four output rows are
 * produced per loop iteration.
 */
static void common_vt_8t_and_aver_dst_8w_lsx(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst, int32_t dst_stride,
                                             const int8_t *filter,
                                             int32_t height)
{
    uint32_t loop_cnt = height >> 2;   /* 4 rows per iteration */
    uint8_t *dst_tmp = dst;            /* read pointer for the avg input */
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i tmp0, tmp1, tmp2, tmp3;
    __m128i reg0, reg1, reg2, reg3, reg4, reg5;
    __m128i filter0, filter1, filter2, filter3;
    __m128i out0, out1, out2, out3;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    /* Start three rows above dst for the 8-tap vertical window. */
    uint8_t* _src = (uint8_t*)src - src_stride3;

    /* Broadcast the four 16-bit tap pairs of the 8-tap filter. */
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);

    /* Prologue: load the first 7 rows, bias to signed range, and build the
       interleaved row-pair registers reg0..reg5 kept live across iterations. */
    src0 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    _src += src_stride4;
    src4 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
    _src += src_stride3;
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
    src6 = __lsx_vxori_b(src6, 128);
    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2,
              src1, reg0, reg1, reg2, reg3);
    DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5);

    for (;loop_cnt--;) {
        /* Next four source rows. */
        src7 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
        src10 = __lsx_vldx(_src, src_stride3);
        _src += src_stride4;
        /* Four 8-byte destination rows, packed pairwise for the avg. */
        src0 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        src1 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        src2 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        src3 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        DUP2_ARG2(__lsx_vilvl_d, src1, src0, src3, src2, src0, src1);
        DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128,
                  src7, src8, src9, src10);
        DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10,
                  src9, tmp0, tmp1, tmp2, tmp3);
        out0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, tmp0, filter0,
                                   filter1, filter2, filter3);
        out1 = FILT_8TAP_DPADD_S_H(reg3, reg4, reg5, tmp1, filter0,
                                   filter1, filter2, filter3);
        out2 = FILT_8TAP_DPADD_S_H(reg1, reg2, tmp0, tmp2, filter0,
                                   filter1, filter2, filter3);
        out3 = FILT_8TAP_DPADD_S_H(reg4, reg5, tmp1, tmp3, filter0,
                                   filter1, filter2, filter3);
        /* Round+saturate (>> 7), undo bias, average with dst, store 4 rows. */
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        DUP2_ARG2(__lsx_vavgr_bu, out0, src0, out1, src1, out0, out1);
        __lsx_vstelm_d(out0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 1);
        dst += dst_stride;

        /* Rotate the vertical window by four rows. */
        reg0 = reg2;
        reg1 = tmp0;
        reg2 = tmp2;
        reg3 = reg5;
        reg4 = tmp1;
        reg5 = tmp3;
        src6 = src10;
    }
}
1692
/* Vertical 8-tap subpel filter plus destination average for blocks whose
 * width is a multiple of 16.  The outer loop walks 16-pixel-wide column
 * tiles; the inner loop emits four output rows per iteration, keeping the
 * low and high halves of each 16-pixel row in separate register sets
 * (reg0..reg5 low, reg6..reg11 high).
 */
static void common_vt_8t_and_aver_dst_16w_mult_lsx(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   const int8_t *filter,
                                                   int32_t height,
                                                   int32_t width)
{
    uint8_t *src_tmp;
    uint32_t cnt = width >> 4;   /* number of 16-wide tiles */
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i filter0, filter1, filter2, filter3;
    __m128i reg0, reg1, reg2, reg3, reg4, reg5;
    __m128i reg6, reg7, reg8, reg9, reg10, reg11;
    __m128i tmp0, tmp1, tmp2, tmp3;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    int32_t dst_stride2 = dst_stride << 1;
    int32_t dst_stride3 = dst_stride2 + dst_stride;
    int32_t dst_stride4 = dst_stride2 << 1;
    /* Start three rows above dst for the 8-tap vertical window. */
    uint8_t *_src = (uint8_t*)src - src_stride3;

    /* Broadcast the four 16-bit tap pairs of the 8-tap filter. */
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);
    for (;cnt--;) {
        uint32_t loop_cnt = height >> 2;   /* 4 rows per inner iteration */
        uint8_t *dst_reg = dst;

        /* Prologue per tile: 7 rows loaded, biased, and interleaved. */
        src_tmp = _src;
        src0 = __lsx_vld(src_tmp, 0);
        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2,
                  src1, src2);
        src3 = __lsx_vldx(src_tmp, src_stride3);
        src_tmp += src_stride4;
        src4 = __lsx_vld(src_tmp, 0);
        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2,
                  src5, src6);
        src_tmp += src_stride3;
        DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
                  src0, src1, src2, src3);
        DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
        src6 = __lsx_vxori_b(src6, 128);
        DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
                  reg0, reg1, reg2, reg3);
        DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5);
        DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1,
                  reg6, reg7, reg8, reg9);
        DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, reg10, reg11);

        for (;loop_cnt--;) {
            src7 = __lsx_vld(src_tmp, 0);
            DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2,
                      src8, src9);
            src10 = __lsx_vldx(src_tmp, src_stride3);
            src_tmp += src_stride4;
            DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10,
                      128, src7, src8, src9, src10);
            DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8,
                      src10, src9, src0, src1, src2, src3);
            DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8,
                      src10, src9, src4, src5, src7, src8);
            /* Rows 0 and 1 of the quad (low halves tmp0/tmp1, high tmp2/tmp3). */
            tmp0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, src0, filter0,
                                       filter1, filter2, filter3);
            tmp1 = FILT_8TAP_DPADD_S_H(reg3, reg4, reg5, src1, filter0,
                                       filter1, filter2, filter3);
            tmp2 = FILT_8TAP_DPADD_S_H(reg6, reg7, reg8, src4, filter0,
                                       filter1, filter2, filter3);
            tmp3 = FILT_8TAP_DPADD_S_H(reg9, reg10, reg11, src5, filter0,
                                       filter1, filter2, filter3);
            DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7,
                      tmp0, tmp1);
            DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
            tmp2 = __lsx_vld(dst_reg, 0);
            tmp3 = __lsx_vldx(dst_reg, dst_stride);
            DUP2_ARG2(__lsx_vavgr_bu, tmp0, tmp2, tmp1, tmp3, tmp0, tmp1);
            __lsx_vst(tmp0, dst_reg, 0);
            __lsx_vstx(tmp1, dst_reg, dst_stride);
            /* Rows 2 and 3 of the quad. */
            tmp0 = FILT_8TAP_DPADD_S_H(reg1, reg2, src0, src2, filter0,
                                       filter1, filter2, filter3);
            tmp1 = FILT_8TAP_DPADD_S_H(reg4, reg5, src1, src3, filter0,
                                       filter1, filter2, filter3);
            tmp2 = FILT_8TAP_DPADD_S_H(reg7, reg8, src4, src7, filter0,
                                       filter1, filter2, filter3);
            tmp3 = FILT_8TAP_DPADD_S_H(reg10, reg11, src5, src8, filter0,
                                       filter1, filter2, filter3);
            DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7,
                      tmp0, tmp1);
            DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
            tmp2 = __lsx_vldx(dst_reg, dst_stride2);
            tmp3 = __lsx_vldx(dst_reg, dst_stride3);
            DUP2_ARG2(__lsx_vavgr_bu, tmp0, tmp2, tmp1, tmp3, tmp0, tmp1);
            __lsx_vstx(tmp0, dst_reg, dst_stride2);
            __lsx_vstx(tmp1, dst_reg, dst_stride3);
            dst_reg += dst_stride4;

            /* Rotate the vertical window by four rows (both halves). */
            reg0 = reg2;
            reg1 = src0;
            reg2 = src2;
            reg3 = reg5;
            reg4 = src1;
            reg5 = src3;
            reg6 = reg8;
            reg7 = src4;
            reg8 = src7;
            reg9 = reg11;
            reg10 = src5;
            reg11 = src8;
            src6 = src10;
        }
        /* Advance to the next 16-pixel-wide tile. */
        _src += 16;
        dst += 16;
    }
}
1807
/* 16-wide vertical 8-tap + average: one tile of the generic
 * multiple-of-16 routine. */
static void common_vt_8t_and_aver_dst_16w_lsx(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              const int8_t *filter,
                                              int32_t height)
{
    common_vt_8t_and_aver_dst_16w_mult_lsx(src, src_stride, dst, dst_stride,
                                           filter, height, 16);
}
1817
/* 32-wide vertical 8-tap + average: two tiles of the generic
 * multiple-of-16 routine. */
static void common_vt_8t_and_aver_dst_32w_lsx(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              const int8_t *filter,
                                              int32_t height)
{
    common_vt_8t_and_aver_dst_16w_mult_lsx(src, src_stride, dst, dst_stride,
                                           filter, height, 32);
}
1827
/* 64-wide vertical 8-tap + average: four tiles of the generic
 * multiple-of-16 routine. */
static void common_vt_8t_and_aver_dst_64w_lsx(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              const int8_t *filter,
                                              int32_t height)
{
    common_vt_8t_and_aver_dst_16w_mult_lsx(src, src_stride, dst, dst_stride,
                                           filter, height, 64);
}
1837
/* Combined horizontal + vertical 8-tap subpel filter over a 4-pixel-wide
 * block, result rounded-averaged with the existing destination.  Rows are
 * first filtered horizontally (HORIZ_8TAP_FILT), then the intermediate
 * rows are filtered vertically; four output rows per loop iteration.
 */
static void common_hv_8ht_8vt_and_aver_dst_4w_lsx(const uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *dst,
                                                  int32_t dst_stride,
                                                  const int8_t *filter_horiz,
                                                  const int8_t *filter_vert,
                                                  int32_t height)
{
    uint32_t loop_cnt = height >> 2;   /* 4 rows per iteration */
    uint8_t *dst_tmp = dst;            /* read pointer for the avg input */
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3;
    __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    __m128i out0, out1;
    __m128i shuff = {0x0F0E0D0C0B0A0908, 0x1716151413121110};
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    /* Back up 3 columns (horizontal taps) and 3 rows (vertical taps). */
    uint8_t* _src = (uint8_t*)src - 3 - src_stride3;

    /* 4-width variant of the shuffle masks (second row of the table). */
    mask0 = __lsx_vld(mc_filt_mask_arr, 16);
    DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz,
              4, filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);

    /* Prologue: load and horizontally filter the first 7 rows. */
    src0 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    _src += src_stride4;
    src4 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
    _src += src_stride3;

    /* Bias into signed range for the signed dot products. */
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
    src6 = __lsx_vxori_b(src6, 128);

    tmp0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    tmp2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    tmp4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    tmp5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    DUP2_ARG3(__lsx_vshuf_b, tmp2, tmp0, shuff, tmp4, tmp2, shuff, tmp1, tmp3);
    DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4,
              filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
    /* Pack intermediate rows pairwise for the vertical pass. */
    DUP2_ARG2(__lsx_vpackev_b, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    tmp2 = __lsx_vpackev_b(tmp5, tmp4);

    for (;loop_cnt--;) {
        src7 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
        src10 = __lsx_vldx(_src, src_stride3);
        _src += src_stride4;
        /* Four 4-byte destination rows, packed into src2 for the avg. */
        src2 = __lsx_vldrepl_w(dst_tmp, 0);
        dst_tmp += dst_stride;
        src3 = __lsx_vldrepl_w(dst_tmp, 0);
        dst_tmp += dst_stride;
        src4 = __lsx_vldrepl_w(dst_tmp, 0);
        dst_tmp += dst_stride;
        src5 = __lsx_vldrepl_w(dst_tmp, 0);
        dst_tmp += dst_stride;
        DUP2_ARG2(__lsx_vilvl_w, src3, src2, src5, src4, src2, src3);
        src2 = __lsx_vilvl_d(src3, src2);
        DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128,
                  src7, src8, src9, src10);
        /* Horizontal pass on the new rows, then the vertical dot products. */
        tmp3 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
                               filt_hz1, filt_hz2, filt_hz3);
        tmp4 = __lsx_vshuf_b(tmp3, tmp5, shuff);
        tmp4 = __lsx_vpackev_b(tmp3, tmp4);
        out0 = FILT_8TAP_DPADD_S_H(tmp0, tmp1, tmp2, tmp4, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        src1 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3,
                               filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        src0 = __lsx_vshuf_b(src1, tmp3, shuff);
        src0 = __lsx_vpackev_b(src1, src0);
        out1 = FILT_8TAP_DPADD_S_H(tmp1, tmp2, tmp4, src0, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        /* Round+saturate, undo bias, average with dst, store 4 rows. */
        out0 = __lsx_vssrarni_b_h(out1, out0, 7);
        out0 = __lsx_vxori_b(out0, 128);
        out0 = __lsx_vavgr_bu(out0, src2);
        __lsx_vstelm_w(out0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 2);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 3);
        dst += dst_stride;

        /* Rotate the vertical window by four intermediate rows. */
        tmp5 = src1;
        tmp0 = tmp2;
        tmp1 = tmp4;
        tmp2 = src0;
    }
}
1940
/* Combined horizontal + vertical 8-tap subpel filter over an 8-pixel-wide
 * block, result rounded-averaged with the existing destination.  Each row
 * is filtered horizontally on its own (one HORIZ_8TAP_FILT per row), then
 * the vertical pass runs on the packed intermediate rows; four output rows
 * per loop iteration.
 */
static void common_hv_8ht_8vt_and_aver_dst_8w_lsx(const uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *dst,
                                                  int32_t dst_stride,
                                                  const int8_t *filter_horiz,
                                                  const int8_t *filter_vert,
                                                  int32_t height)
{
    uint32_t loop_cnt = height >> 2;   /* 4 rows per iteration */
    uint8_t *dst_tmp = dst;            /* read pointer for the avg input */
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3;
    __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
    __m128i out0, out1;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    /* Back up 3 columns (horizontal taps) and 3 rows (vertical taps). */
    uint8_t* _src = (uint8_t*)src - 3 - src_stride3;

    /* 8-width variant of the shuffle masks (first row of the table). */
    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
    DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz,
              4, filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);

    /* Prologue: load, bias, and horizontally filter the first 7 rows. */
    src0 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    _src += src_stride4;
    src4 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
    _src += src_stride3;
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
    src6 = __lsx_vxori_b(src6, 128);

    src0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);

    DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4,
              filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
    /* Pack intermediate rows pairwise for the vertical dot products. */
    DUP4_ARG2(__lsx_vpackev_b, src1, src0, src3, src2, src5, src4,
              src2, src1, tmp0, tmp1, tmp2, tmp4);
    DUP2_ARG2(__lsx_vpackev_b, src4, src3, src6, src5, tmp5, tmp6);

    for (;loop_cnt--;) {
        src7 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
        src10 = __lsx_vldx(_src, src_stride3);
        _src += src_stride4;

        DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128,
                  src7, src8, src9, src10);
        /* Horizontal pass on each new row, interleaved with the vertical
           dot products to keep the register window rolling. */
        src7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0,
                               filt_hz1, filt_hz2, filt_hz3);
        tmp3 = __lsx_vpackev_b(src7, src6);
        out0 = FILT_8TAP_DPADD_S_H(tmp0, tmp1, tmp2, tmp3, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        src8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0,
                               filt_hz1, filt_hz2, filt_hz3);
        src0 = __lsx_vpackev_b(src8, src7);
        out1 = FILT_8TAP_DPADD_S_H(tmp4, tmp5, tmp6, src0, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        src9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0,
                               filt_hz1, filt_hz2, filt_hz3);
        src1 = __lsx_vpackev_b(src9, src8);
        src3 = FILT_8TAP_DPADD_S_H(tmp1, tmp2, tmp3, src1, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        src10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3, filt_hz0,
                                filt_hz1, filt_hz2, filt_hz3);
        src2 = __lsx_vpackev_b(src10, src9);
        src4 = FILT_8TAP_DPADD_S_H(tmp5, tmp6, src0, src2, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        /* Round+saturate, undo bias, average with the four dst rows, store. */
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, src4, src3, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        src5 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        src7 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        src8 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        src9 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        DUP2_ARG2(__lsx_vilvl_d, src7, src5, src9, src8, src5, src7);
        DUP2_ARG2(__lsx_vavgr_bu, out0, src5, out1, src7, out0, out1);
        __lsx_vstelm_d(out0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 1);
        dst += dst_stride;

        /* Rotate the vertical window by four intermediate rows. */
        src6 = src10;
        tmp0 = tmp2;
        tmp1 = tmp3;
        tmp2 = src1;
        tmp4 = tmp6;
        tmp5 = src0;
        tmp6 = src2;
    }
}
2059
/* 16-wide HV 8-tap + average: processed as two independent 8-wide columns. */
static void common_hv_8ht_8vt_and_aver_dst_16w_lsx(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   const int8_t *filter_horiz,
                                                   const int8_t *filter_vert,
                                                   int32_t height)
{
    int32_t col;

    for (col = 0; col < 2; col++) {
        common_hv_8ht_8vt_and_aver_dst_8w_lsx(src + (col << 3), src_stride,
                                              dst + (col << 3), dst_stride,
                                              filter_horiz, filter_vert,
                                              height);
    }
}
2079
/* 32-wide HV 8-tap + average: processed as four independent 8-wide columns. */
static void common_hv_8ht_8vt_and_aver_dst_32w_lsx(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   const int8_t *filter_horiz,
                                                   const int8_t *filter_vert,
                                                   int32_t height)
{
    int32_t col;

    for (col = 0; col < 4; col++) {
        common_hv_8ht_8vt_and_aver_dst_8w_lsx(src + (col << 3), src_stride,
                                              dst + (col << 3), dst_stride,
                                              filter_horiz, filter_vert,
                                              height);
    }
}
2099
/* 64-wide HV 8-tap + average: processed as eight independent 8-wide columns. */
static void common_hv_8ht_8vt_and_aver_dst_64w_lsx(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   const int8_t *filter_horiz,
                                                   const int8_t *filter_vert,
                                                   int32_t height)
{
    int32_t col;

    for (col = 0; col < 8; col++) {
        common_hv_8ht_8vt_and_aver_dst_8w_lsx(src + (col << 3), src_stride,
                                              dst + (col << 3), dst_stride,
                                              filter_horiz, filter_vert,
                                              height);
    }
}
2119
avg_width8_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)2120 static void avg_width8_lsx(const uint8_t *src, int32_t src_stride,
2121 uint8_t *dst, int32_t dst_stride,
2122 int32_t height)
2123 {
2124 int32_t cnt = height >> 2;
2125 uint8_t *dst_tmp = dst;
2126 __m128i src0, src1, dst0, dst1;
2127 __m128i tmp0, tmp1, tmp2, tmp3;
2128
2129 for (;cnt--;) {
2130 tmp0 = __lsx_vldrepl_d(src, 0);
2131 src += src_stride;
2132 tmp1 = __lsx_vldrepl_d(src, 0);
2133 src += src_stride;
2134 tmp2 = __lsx_vldrepl_d(src, 0);
2135 src += src_stride;
2136 tmp3 = __lsx_vldrepl_d(src, 0);
2137 src += src_stride;
2138 DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, src0, src1);
2139 tmp0 = __lsx_vldrepl_d(dst_tmp, 0);
2140 dst_tmp += dst_stride;
2141 tmp1 = __lsx_vldrepl_d(dst_tmp, 0);
2142 dst_tmp += dst_stride;
2143 tmp2 = __lsx_vldrepl_d(dst_tmp, 0);
2144 dst_tmp += dst_stride;
2145 tmp3 = __lsx_vldrepl_d(dst_tmp, 0);
2146 dst_tmp += dst_stride;
2147 DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, dst0, dst1);
2148 DUP2_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1, dst0, dst1);
2149 __lsx_vstelm_d(dst0, dst, 0, 0);
2150 dst += dst_stride;
2151 __lsx_vstelm_d(dst0, dst, 0, 1);
2152 dst += dst_stride;
2153 __lsx_vstelm_d(dst1, dst, 0, 0);
2154 dst += dst_stride;
2155 __lsx_vstelm_d(dst1, dst, 0, 1);
2156 dst += dst_stride;
2157 }
2158 }
2159
avg_width16_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)2160 static void avg_width16_lsx(const uint8_t *src, int32_t src_stride,
2161 uint8_t *dst, int32_t dst_stride,
2162 int32_t height)
2163 {
2164 int32_t cnt = height >> 2;
2165 __m128i src0, src1, src2, src3;
2166 __m128i dst0, dst1, dst2, dst3;
2167 int32_t src_stride2 = src_stride << 1;
2168 int32_t src_stride3 = src_stride + src_stride2;
2169 int32_t src_stride4 = src_stride2 << 1;
2170 int32_t dst_stride2 = dst_stride << 1;
2171 int32_t dst_stride3 = dst_stride2 + dst_stride;
2172 int32_t dst_stride4 = dst_stride2 << 1;
2173 uint8_t* _src = (uint8_t*)src;
2174
2175 for (;cnt--;) {
2176 src0 = __lsx_vld(_src, 0);
2177 DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
2178 src3 = __lsx_vldx(_src, src_stride3);
2179 _src += src_stride4;
2180
2181 dst0 = __lsx_vld(dst, 0);
2182 DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2,
2183 dst1, dst2);
2184 dst3 = __lsx_vldx(dst, dst_stride3);
2185 DUP4_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1,
2186 src2, dst2, src3, dst3, dst0, dst1, dst2, dst3);
2187 __lsx_vst(dst0, dst, 0);
2188 __lsx_vstx(dst1, dst, dst_stride);
2189 __lsx_vstx(dst2, dst, dst_stride2);
2190 __lsx_vstx(dst3, dst, dst_stride3);
2191 dst += dst_stride4;
2192 }
2193 }
2194
/* Rounded average of a 32-pixel-wide block into dst (dst = avg(src, dst)).
 * The 32-byte row is split into two 16-byte columns, each with its own
 * source and destination pointer pair; four rows per iteration. */
static void avg_width32_lsx(const uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride,
                            int32_t height)
{
    int32_t cnt = height >> 2;             /* 4 rows per iteration */
    uint8_t *src_tmp1 = (uint8_t*)src;     /* left 16-byte column */
    uint8_t *src_tmp2 = src_tmp1 + 16;     /* right 16-byte column */
    uint8_t *dst_tmp1, *dst_tmp2;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    int32_t dst_stride2 = dst_stride << 1;
    int32_t dst_stride3 = dst_stride2 + dst_stride;
    int32_t dst_stride4 = dst_stride2 << 1;

    dst_tmp1 = dst;
    dst_tmp2 = dst + 16;
    for (;cnt--;) {
        /* Even-numbered vectors: left column, four rows. */
        src0 = __lsx_vld(src_tmp1, 0);
        DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2,
                  src2, src4);
        src6 = __lsx_vldx(src_tmp1, src_stride3);
        src_tmp1 += src_stride4;

        /* Odd-numbered vectors: right column, four rows. */
        src1 = __lsx_vld(src_tmp2, 0);
        DUP2_ARG2(__lsx_vldx, src_tmp2, src_stride, src_tmp2, src_stride2,
                  src3, src5);
        src7 = __lsx_vldx(src_tmp2, src_stride3);
        src_tmp2 += src_stride4;

        /* Corresponding destination vectors, same layout. */
        dst0 = __lsx_vld(dst_tmp1, 0);
        DUP2_ARG2(__lsx_vldx, dst_tmp1, dst_stride, dst_tmp1, dst_stride2,
                  dst2, dst4);
        dst6 = __lsx_vldx(dst_tmp1, dst_stride3);
        dst1 = __lsx_vld(dst_tmp2, 0);
        DUP2_ARG2(__lsx_vldx, dst_tmp2, dst_stride, dst_tmp2, dst_stride2,
                  dst3, dst5);
        dst7 = __lsx_vldx(dst_tmp2, dst_stride3);

        /* Rounded byte averages, written back in the same pattern. */
        DUP4_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1,
                  src2, dst2, src3, dst3, dst0, dst1, dst2, dst3);
        DUP4_ARG2(__lsx_vavgr_bu, src4, dst4, src5, dst5,
                  src6, dst6, src7, dst7, dst4, dst5, dst6, dst7);
        __lsx_vst(dst0, dst_tmp1, 0);
        __lsx_vstx(dst2, dst_tmp1, dst_stride);
        __lsx_vstx(dst4, dst_tmp1, dst_stride2);
        __lsx_vstx(dst6, dst_tmp1, dst_stride3);
        dst_tmp1 += dst_stride4;
        __lsx_vst(dst1, dst_tmp2, 0);
        __lsx_vstx(dst3, dst_tmp2, dst_stride);
        __lsx_vstx(dst5, dst_tmp2, dst_stride2);
        __lsx_vstx(dst7, dst_tmp2, dst_stride3);
        dst_tmp2 += dst_stride4;
    }
}
2252
/* Rounded average of a 64-pixel-wide block into dst (dst = avg(src, dst)).
 * Each iteration loads four full 64-byte rows of src and dst (sixteen
 * vectors each), averages them, and stores the four rows back. */
static void avg_width64_lsx(const uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride,
                            int32_t height)
{
    int32_t cnt = height >> 2;   /* 4 rows per iteration */
    uint8_t *dst_tmp = dst;      /* read pointer for the avg input */
    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
    __m128i src8, src9, src10, src11, src12, src13, src14, src15;
    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    __m128i dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;

    for (;cnt--;) {
        /* Four 64-byte source rows. */
        DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
                  src0, src1, src2, src3);
        src += src_stride;
        DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
                  src4, src5, src6, src7);
        src += src_stride;
        DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
                  src8, src9, src10, src11);
        src += src_stride;
        DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
                  src12, src13, src14, src15);
        src += src_stride;
        /* Matching four destination rows. */
        DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48,
                  dst0, dst1, dst2, dst3);
        dst_tmp += dst_stride;
        DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48,
                  dst4, dst5, dst6, dst7);
        dst_tmp += dst_stride;
        DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48,
                  dst8, dst9, dst10, dst11);
        dst_tmp += dst_stride;
        DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48,
                  dst12, dst13, dst14, dst15);
        dst_tmp += dst_stride;
        /* Rounded byte averages for all sixteen vector pairs. */
        DUP4_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1,
                  src2, dst2, src3, dst3, dst0, dst1, dst2, dst3);
        DUP4_ARG2(__lsx_vavgr_bu, src4, dst4, src5, dst5,
                  src6, dst6, src7, dst7, dst4, dst5, dst6, dst7);
        DUP4_ARG2(__lsx_vavgr_bu, src8, dst8, src9, dst9, src10,
                  dst10, src11, dst11, dst8, dst9, dst10, dst11);
        DUP4_ARG2(__lsx_vavgr_bu, src12, dst12, src13, dst13, src14,
                  dst14, src15, dst15, dst12, dst13, dst14, dst15);
        /* Store the four averaged rows. */
        __lsx_vst(dst0, dst, 0);
        __lsx_vst(dst1, dst, 16);
        __lsx_vst(dst2, dst, 32);
        __lsx_vst(dst3, dst, 48);
        dst += dst_stride;
        __lsx_vst(dst4, dst, 0);
        __lsx_vst(dst5, dst, 16);
        __lsx_vst(dst6, dst, 32);
        __lsx_vst(dst7, dst, 48);
        dst += dst_stride;
        __lsx_vst(dst8, dst, 0);
        __lsx_vst(dst9, dst, 16);
        __lsx_vst(dst10, dst, 32);
        __lsx_vst(dst11, dst, 48);
        dst += dst_stride;
        __lsx_vst(dst12, dst, 0);
        __lsx_vst(dst13, dst, 16);
        __lsx_vst(dst14, dst, 32);
        __lsx_vst(dst15, dst, 48);
        dst += dst_stride;
    }
}
2319
/* VP9 8-tap sub-pixel interpolation filter coefficients.
 *
 * First index: filter family (FILTER_8TAP_REGULAR / _SHARP / _SMOOTH).
 * Second index: fractional sample position minus one — positions 1..15
 * are stored at [0..14]; position 0 (full-pel) needs no filtering and is
 * served by the copy/avg functions instead.
 * Each 8-entry row sums to 128, i.e. the filters are in 7-bit fixed point.
 */
static const int8_t vp9_subpel_filters_lsx[3][15][8] = {
    [FILTER_8TAP_REGULAR] = {
        {0, 1, -5, 126, 8, -3, 1, 0},
        {-1, 3, -10, 122, 18, -6, 2, 0},
        {-1, 4, -13, 118, 27, -9, 3, -1},
        {-1, 4, -16, 112, 37, -11, 4, -1},
        {-1, 5, -18, 105, 48, -14, 4, -1},
        {-1, 5, -19, 97, 58, -16, 5, -1},
        {-1, 6, -19, 88, 68, -18, 5, -1},
        {-1, 6, -19, 78, 78, -19, 6, -1},
        {-1, 5, -18, 68, 88, -19, 6, -1},
        {-1, 5, -16, 58, 97, -19, 5, -1},
        {-1, 4, -14, 48, 105, -18, 5, -1},
        {-1, 4, -11, 37, 112, -16, 4, -1},
        {-1, 3, -9, 27, 118, -13, 4, -1},
        {0, 2, -6, 18, 122, -10, 3, -1},
        {0, 1, -3, 8, 126, -5, 1, 0},
    }, [FILTER_8TAP_SHARP] = {
        {-1, 3, -7, 127, 8, -3, 1, 0},
        {-2, 5, -13, 125, 17, -6, 3, -1},
        {-3, 7, -17, 121, 27, -10, 5, -2},
        {-4, 9, -20, 115, 37, -13, 6, -2},
        {-4, 10, -23, 108, 48, -16, 8, -3},
        {-4, 10, -24, 100, 59, -19, 9, -3},
        {-4, 11, -24, 90, 70, -21, 10, -4},
        {-4, 11, -23, 80, 80, -23, 11, -4},
        {-4, 10, -21, 70, 90, -24, 11, -4},
        {-3, 9, -19, 59, 100, -24, 10, -4},
        {-3, 8, -16, 48, 108, -23, 10, -4},
        {-2, 6, -13, 37, 115, -20, 9, -4},
        {-2, 5, -10, 27, 121, -17, 7, -3},
        {-1, 3, -6, 17, 125, -13, 5, -2},
        {0, 1, -3, 8, 127, -7, 3, -1},
    }, [FILTER_8TAP_SMOOTH] = {
        {-3, -1, 32, 64, 38, 1, -3, 0},
        {-2, -2, 29, 63, 41, 2, -3, 0},
        {-2, -2, 26, 63, 43, 4, -4, 0},
        {-2, -3, 24, 62, 46, 5, -4, 0},
        {-2, -3, 21, 60, 49, 7, -4, 0},
        {-1, -4, 18, 59, 51, 9, -4, 0},
        {-1, -4, 16, 57, 53, 12, -4, -1},
        {-1, -4, 14, 55, 55, 14, -4, -1},
        {-1, -4, 12, 53, 57, 16, -4, -1},
        {0, -4, 9, 51, 59, 18, -4, -1},
        {0, -4, 7, 49, 60, 21, -3, -2},
        {0, -4, 5, 46, 62, 24, -3, -2},
        {0, -4, 4, 43, 63, 26, -2, -2},
        {0, -3, 2, 41, 63, 29, -2, -2},
        {0, -3, 1, 38, 64, 32, -1, -3},
    }
};
2371
/* Generate the six public 8-tap motion-compensation entry points for one
 * block width SIZE and one filter family (type = regular/sharp/smooth,
 * type_idx = the matching FILTER_8TAP_* table index):
 *
 *   ff_put_8tap_<type>_<SIZE>{h,v,hv}_lsx — write filtered pixels to dst
 *   ff_avg_8tap_<type>_<SIZE>{h,v,hv}_lsx — filter, then round-average
 *                                           with the existing dst pixels
 *
 * mx/my are the fractional positions (1..15 here); [mx-1]/[my-1] selects
 * the coefficient row from vp9_subpel_filters_lsx. The h variants filter
 * horizontally only, v vertically only, hv both (horizontal then vertical).
 */
#define VP9_8TAP_LOONGARCH_LSX_FUNC(SIZE, type, type_idx)                    \
void ff_put_8tap_##type##_##SIZE##h_lsx(uint8_t *dst, ptrdiff_t dststride,   \
                                        const uint8_t *src,                  \
                                        ptrdiff_t srcstride,                 \
                                        int h, int mx, int my)               \
{                                                                            \
    const int8_t *filter = vp9_subpel_filters_lsx[type_idx][mx-1];           \
                                                                             \
    common_hz_8t_##SIZE##w_lsx(src, srcstride, dst, dststride, filter, h);   \
}                                                                            \
                                                                             \
void ff_put_8tap_##type##_##SIZE##v_lsx(uint8_t *dst, ptrdiff_t dststride,   \
                                        const uint8_t *src,                  \
                                        ptrdiff_t srcstride,                 \
                                        int h, int mx, int my)               \
{                                                                            \
    const int8_t *filter = vp9_subpel_filters_lsx[type_idx][my-1];           \
                                                                             \
    common_vt_8t_##SIZE##w_lsx(src, srcstride, dst, dststride, filter, h);   \
}                                                                            \
                                                                             \
void ff_put_8tap_##type##_##SIZE##hv_lsx(uint8_t *dst, ptrdiff_t dststride,  \
                                         const uint8_t *src,                 \
                                         ptrdiff_t srcstride,                \
                                         int h, int mx, int my)              \
{                                                                            \
    const int8_t *hfilter = vp9_subpel_filters_lsx[type_idx][mx-1];          \
    const int8_t *vfilter = vp9_subpel_filters_lsx[type_idx][my-1];          \
                                                                             \
    common_hv_8ht_8vt_##SIZE##w_lsx(src, srcstride, dst, dststride, hfilter, \
                                    vfilter, h);                             \
}                                                                            \
                                                                             \
void ff_avg_8tap_##type##_##SIZE##h_lsx(uint8_t *dst, ptrdiff_t dststride,   \
                                        const uint8_t *src,                  \
                                        ptrdiff_t srcstride,                 \
                                        int h, int mx, int my)               \
{                                                                            \
    const int8_t *filter = vp9_subpel_filters_lsx[type_idx][mx-1];           \
                                                                             \
    common_hz_8t_and_aver_dst_##SIZE##w_lsx(src, srcstride, dst,             \
                                            dststride, filter, h);           \
}                                                                            \
                                                                             \
void ff_avg_8tap_##type##_##SIZE##v_lsx(uint8_t *dst, ptrdiff_t dststride,   \
                                        const uint8_t *src,                  \
                                        ptrdiff_t srcstride,                 \
                                        int h, int mx, int my)               \
{                                                                            \
    const int8_t *filter = vp9_subpel_filters_lsx[type_idx][my-1];           \
                                                                             \
    common_vt_8t_and_aver_dst_##SIZE##w_lsx(src, srcstride, dst, dststride,  \
                                            filter, h);                      \
}                                                                            \
                                                                             \
void ff_avg_8tap_##type##_##SIZE##hv_lsx(uint8_t *dst, ptrdiff_t dststride,  \
                                         const uint8_t *src,                 \
                                         ptrdiff_t srcstride,                \
                                         int h, int mx, int my)              \
{                                                                            \
    const int8_t *hfilter = vp9_subpel_filters_lsx[type_idx][mx-1];          \
    const int8_t *vfilter = vp9_subpel_filters_lsx[type_idx][my-1];          \
                                                                             \
    common_hv_8ht_8vt_and_aver_dst_##SIZE##w_lsx(src, srcstride, dst,        \
                                                 dststride, hfilter,         \
                                                 vfilter, h);                \
}
2439
/* Generate the full-pel (no filtering) entry points for one block width:
 *   ff_copy<SIZE>_lsx — plain block copy from src to dst
 *   ff_avg<SIZE>_lsx  — round-average src with the existing dst pixels
 * mx/my are unused: these handle fractional position 0. */
#define VP9_COPY_LOONGARCH_LSX_FUNC(SIZE)                        \
void ff_copy##SIZE##_lsx(uint8_t *dst, ptrdiff_t dststride,      \
                         const uint8_t *src, ptrdiff_t srcstride,\
                         int h, int mx, int my)                  \
{                                                                \
                                                                 \
    copy_width##SIZE##_lsx(src, srcstride, dst, dststride, h);   \
}                                                                \
void ff_avg##SIZE##_lsx(uint8_t *dst, ptrdiff_t dststride,       \
                        const uint8_t *src, ptrdiff_t srcstride, \
                        int h, int mx, int my)                   \
{                                                                \
                                                                 \
    avg_width##SIZE##_lsx(src, srcstride, dst, dststride, h);    \
}
2455
/* Instantiate the 8-tap put/avg functions for every block width and
 * all three filter families. */
VP9_8TAP_LOONGARCH_LSX_FUNC(64, regular, FILTER_8TAP_REGULAR);
VP9_8TAP_LOONGARCH_LSX_FUNC(32, regular, FILTER_8TAP_REGULAR);
VP9_8TAP_LOONGARCH_LSX_FUNC(16, regular, FILTER_8TAP_REGULAR);
VP9_8TAP_LOONGARCH_LSX_FUNC(8, regular, FILTER_8TAP_REGULAR);
VP9_8TAP_LOONGARCH_LSX_FUNC(4, regular, FILTER_8TAP_REGULAR);

VP9_8TAP_LOONGARCH_LSX_FUNC(64, sharp, FILTER_8TAP_SHARP);
VP9_8TAP_LOONGARCH_LSX_FUNC(32, sharp, FILTER_8TAP_SHARP);
VP9_8TAP_LOONGARCH_LSX_FUNC(16, sharp, FILTER_8TAP_SHARP);
VP9_8TAP_LOONGARCH_LSX_FUNC(8, sharp, FILTER_8TAP_SHARP);
VP9_8TAP_LOONGARCH_LSX_FUNC(4, sharp, FILTER_8TAP_SHARP);

VP9_8TAP_LOONGARCH_LSX_FUNC(64, smooth, FILTER_8TAP_SMOOTH);
VP9_8TAP_LOONGARCH_LSX_FUNC(32, smooth, FILTER_8TAP_SMOOTH);
VP9_8TAP_LOONGARCH_LSX_FUNC(16, smooth, FILTER_8TAP_SMOOTH);
VP9_8TAP_LOONGARCH_LSX_FUNC(8, smooth, FILTER_8TAP_SMOOTH);
VP9_8TAP_LOONGARCH_LSX_FUNC(4, smooth, FILTER_8TAP_SMOOTH);

/* Full-pel copy/avg variants (note: no width-4 copy/avg here — narrower
 * widths are presumably handled elsewhere; confirm against vp9dsp init). */
VP9_COPY_LOONGARCH_LSX_FUNC(64);
VP9_COPY_LOONGARCH_LSX_FUNC(32);
VP9_COPY_LOONGARCH_LSX_FUNC(16);
VP9_COPY_LOONGARCH_LSX_FUNC(8);

/* The generator macros are purely local to this file. */
#undef VP9_8TAP_LOONGARCH_LSX_FUNC
#undef VP9_COPY_LOONGARCH_LSX_FUNC
2481