1 /*
2 * Loongson LASX optimized h264qpel
3 *
4 * Copyright (c) 2020 Loongson Technology Corporation Limited
5 * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
6 *
7 * This file is part of FFmpeg.
8 *
9 * FFmpeg is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * FFmpeg is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with FFmpeg; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23
24 #include "h264qpel_lasx.h"
25 #include "libavutil/loongarch/loongson_intrinsics.h"
26 #include "libavutil/attributes.h"
27
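/* Shuffle masks for the horizontal 6-tap filter. Each mask pairs the two
 * pixels that share a coefficient in the H.264 luma half-pel filter
 * [1, -5, 20, 20, -5, 1]: rows 0-1 pair taps 0 and 5 (coefficient 1),
 * rows 2-3 pair taps 1 and 4 (coefficient -5) and rows 4-5 pair taps 2 and 3
 * (coefficient 20), so the filter can be evaluated with pairwise byte dot
 * products in AVC_HORZ_FILTER_SH(). */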
28 static const uint8_t luma_mask_arr[16 * 6] __attribute__((aligned(0x40))) = {
29 /* 8 width cases */
30 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12,
31 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12,
32 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11,
33 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11,
34 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
35 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
36 };
37
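/* Apply the horizontal 6-tap filter to the bytes selected by the three masks
 * above: the tap-1 pairs are summed with xvhaddw, then the -5 and +20 pairs
 * are accumulated with byte dot products. The result is a 16-bit sum; the
 * (x + 16) >> 5 rounding is done by the caller. */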
38 #define AVC_HORZ_FILTER_SH(in0, in1, mask0, mask1, mask2) \
39 ( { \
40 __m256i out0_m; \
41 __m256i tmp0_m; \
42 \
43 tmp0_m = __lasx_xvshuf_b(in1, in0, mask0); \
44 out0_m = __lasx_xvhaddw_h_b(tmp0_m, tmp0_m); \
45 tmp0_m = __lasx_xvshuf_b(in1, in0, mask1); \
46 out0_m = __lasx_xvdp2add_h_b(out0_m, minus5b, tmp0_m); \
47 tmp0_m = __lasx_xvshuf_b(in1, in0, mask2); \
48 out0_m = __lasx_xvdp2add_h_b(out0_m, plus20b, tmp0_m); \
49 \
50 out0_m; \
51 } )
52
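/* Vertical 6-tap filter on byte-interleaved row pairs: three dot products
 * with the packed coefficient pairs (1, -5), (20, 20) and (-5, 1) accumulate
 * into 16-bit sums; rounding and narrowing are done by the caller. */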
53 #define AVC_DOT_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2) \
54 ( { \
55 __m256i out0_m; \
56 \
57 out0_m = __lasx_xvdp2_h_b(in0, coeff0); \
58 DUP2_ARG3(__lasx_xvdp2add_h_b, out0_m, in1, coeff1, out0_m,\
59 in2, coeff2, out0_m, out0_m); \
60 \
61 out0_m; \
62 } )
63
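/* 16x16 luma interpolation for the diagonal quarter-pel positions: the
 * horizontal half-pel result (filtered from src_x) and the vertical half-pel
 * result (filtered from src_y) are averaged with rounding, and the block is
 * then additionally averaged with the existing contents of dst
 * (the "aver_dst" variant). */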
64 static av_always_inline
65 void avc_luma_hv_qrt_and_aver_dst_16x16_lasx(uint8_t *src_x,
66 uint8_t *src_y,
67 uint8_t *dst, ptrdiff_t stride)
68 {
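    /* 6-tap coefficients packed as byte pairs for the vertical dot products:
     * 0xfb01 -> (1, -5), 0x1414 -> (20, 20), 0x01fb -> (-5, 1). */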
69 const int16_t filt_const0 = 0xfb01;
70 const int16_t filt_const1 = 0x1414;
71 const int16_t filt_const2 = 0x1fb;
72 uint32_t loop_cnt;
73 ptrdiff_t stride_2x = stride << 1;
74 ptrdiff_t stride_3x = stride_2x + stride;
75 ptrdiff_t stride_4x = stride << 2;
76 __m256i tmp0, tmp1;
77 __m256i src_hz0, src_hz1, src_hz2, src_hz3, mask0, mask1, mask2;
78 __m256i src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
79 __m256i src_vt7, src_vt8;
80 __m256i src_vt10_h, src_vt21_h, src_vt32_h, src_vt43_h, src_vt54_h;
81 __m256i src_vt65_h, src_vt76_h, src_vt87_h, filt0, filt1, filt2;
82 __m256i hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2;
83 __m256i vt_out3, out0, out1, out2, out3;
84 __m256i minus5b = __lasx_xvldi(0xFB);
85 __m256i plus20b = __lasx_xvldi(20);
86
87 filt0 = __lasx_xvreplgr2vr_h(filt_const0);
88 filt1 = __lasx_xvreplgr2vr_h(filt_const1);
89 filt2 = __lasx_xvreplgr2vr_h(filt_const2);
90
91 mask0 = __lasx_xvld(luma_mask_arr, 0);
92 DUP2_ARG2(__lasx_xvld, luma_mask_arr, 32, luma_mask_arr, 64, mask1, mask2);
93 src_vt0 = __lasx_xvld(src_y, 0);
94 DUP4_ARG2(__lasx_xvldx, src_y, stride, src_y, stride_2x, src_y, stride_3x,
95 src_y, stride_4x, src_vt1, src_vt2, src_vt3, src_vt4);
96 src_y += stride_4x;
97
98 src_vt0 = __lasx_xvxori_b(src_vt0, 128);
99 DUP4_ARG2(__lasx_xvxori_b, src_vt1, 128, src_vt2, 128, src_vt3, 128,
100 src_vt4, 128, src_vt1, src_vt2, src_vt3, src_vt4);
101
102 for (loop_cnt = 4; loop_cnt--;) {
103 src_hz0 = __lasx_xvld(src_x, 0);
104 DUP2_ARG2(__lasx_xvldx, src_x, stride, src_x, stride_2x,
105 src_hz1, src_hz2);
106 src_hz3 = __lasx_xvldx(src_x, stride_3x);
107 src_x += stride_4x;
108 src_hz0 = __lasx_xvpermi_d(src_hz0, 0x94);
109 src_hz1 = __lasx_xvpermi_d(src_hz1, 0x94);
110 src_hz2 = __lasx_xvpermi_d(src_hz2, 0x94);
111 src_hz3 = __lasx_xvpermi_d(src_hz3, 0x94);
112 DUP4_ARG2(__lasx_xvxori_b, src_hz0, 128, src_hz1, 128, src_hz2, 128,
113 src_hz3, 128, src_hz0, src_hz1, src_hz2, src_hz3);
114
115 hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
116 hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
117 hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
118 hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);
119 hz_out0 = __lasx_xvssrarni_b_h(hz_out1, hz_out0, 5);
120 hz_out2 = __lasx_xvssrarni_b_h(hz_out3, hz_out2, 5);
121
122 DUP4_ARG2(__lasx_xvldx, src_y, stride, src_y, stride_2x,
123 src_y, stride_3x, src_y, stride_4x,
124 src_vt5, src_vt6, src_vt7, src_vt8);
125 src_y += stride_4x;
126
127 DUP4_ARG2(__lasx_xvxori_b, src_vt5, 128, src_vt6, 128, src_vt7, 128,
128 src_vt8, 128, src_vt5, src_vt6, src_vt7, src_vt8);
129
130 DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_vt4, 0x02, src_vt1, src_vt5,
131 0x02, src_vt2, src_vt6, 0x02, src_vt3, src_vt7, 0x02,
132 src_vt0, src_vt1, src_vt2, src_vt3);
133 src_vt87_h = __lasx_xvpermi_q(src_vt4, src_vt8, 0x02);
134 DUP4_ARG2(__lasx_xvilvh_b, src_vt1, src_vt0, src_vt2, src_vt1,
135 src_vt3, src_vt2, src_vt87_h, src_vt3,
136 src_hz0, src_hz1, src_hz2, src_hz3);
137 DUP4_ARG2(__lasx_xvilvl_b, src_vt1, src_vt0, src_vt2, src_vt1,
138 src_vt3, src_vt2, src_vt87_h, src_vt3,
139 src_vt0, src_vt1, src_vt2, src_vt3);
140 DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_hz0, 0x02, src_vt1, src_hz1,
141 0x02, src_vt2, src_hz2, 0x02, src_vt3, src_hz3, 0x02,
142 src_vt10_h, src_vt21_h, src_vt32_h, src_vt43_h);
143 DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_hz0, 0x13, src_vt1, src_hz1,
144 0x13, src_vt2, src_hz2, 0x13, src_vt3, src_hz3, 0x13,
145 src_vt54_h, src_vt65_h, src_vt76_h, src_vt87_h);
146 vt_out0 = AVC_DOT_SH3_SH(src_vt10_h, src_vt32_h, src_vt54_h, filt0,
147 filt1, filt2);
148 vt_out1 = AVC_DOT_SH3_SH(src_vt21_h, src_vt43_h, src_vt65_h, filt0,
149 filt1, filt2);
150 vt_out2 = AVC_DOT_SH3_SH(src_vt32_h, src_vt54_h, src_vt76_h, filt0,
151 filt1, filt2);
152 vt_out3 = AVC_DOT_SH3_SH(src_vt43_h, src_vt65_h, src_vt87_h, filt0,
153 filt1, filt2);
154 vt_out0 = __lasx_xvssrarni_b_h(vt_out1, vt_out0, 5);
155 vt_out2 = __lasx_xvssrarni_b_h(vt_out3, vt_out2, 5);
156
157 DUP2_ARG2(__lasx_xvaddwl_h_b, hz_out0, vt_out0, hz_out2, vt_out2,
158 out0, out2);
159 DUP2_ARG2(__lasx_xvaddwh_h_b, hz_out0, vt_out0, hz_out2, vt_out2,
160 out1, out3);
161 tmp0 = __lasx_xvssrarni_b_h(out1, out0, 1);
162 tmp1 = __lasx_xvssrarni_b_h(out3, out2, 1);
163
164 DUP2_ARG2(__lasx_xvxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
165 out0 = __lasx_xvld(dst, 0);
166 DUP2_ARG2(__lasx_xvldx, dst, stride, dst, stride_2x, out1, out2);
167 out3 = __lasx_xvldx(dst, stride_3x);
168 out0 = __lasx_xvpermi_q(out0, out2, 0x02);
169 out1 = __lasx_xvpermi_q(out1, out3, 0x02);
170 out2 = __lasx_xvilvl_d(out1, out0);
171 out3 = __lasx_xvilvh_d(out1, out0);
172 out0 = __lasx_xvpermi_q(out2, out3, 0x02);
173 out1 = __lasx_xvpermi_q(out2, out3, 0x13);
174 tmp0 = __lasx_xvavgr_bu(out0, tmp0);
175 tmp1 = __lasx_xvavgr_bu(out1, tmp1);
176
177 __lasx_xvstelm_d(tmp0, dst, 0, 0);
178 __lasx_xvstelm_d(tmp0, dst + stride, 0, 1);
179 __lasx_xvstelm_d(tmp1, dst + stride_2x, 0, 0);
180 __lasx_xvstelm_d(tmp1, dst + stride_3x, 0, 1);
181
182 __lasx_xvstelm_d(tmp0, dst, 8, 2);
183 __lasx_xvstelm_d(tmp0, dst + stride, 8, 3);
184 __lasx_xvstelm_d(tmp1, dst + stride_2x, 8, 2);
185 __lasx_xvstelm_d(tmp1, dst + stride_3x, 8, 3);
186
187 dst += stride_4x;
188 src_vt0 = src_vt4;
189 src_vt1 = src_vt5;
190 src_vt2 = src_vt6;
191 src_vt3 = src_vt7;
192 src_vt4 = src_vt8;
193 }
194 }
195
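/* Same as avc_luma_hv_qrt_and_aver_dst_16x16_lasx() above, but the averaged
 * half-pel result is stored directly instead of being averaged with dst. */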
196 static av_always_inline void
197 avc_luma_hv_qrt_16x16_lasx(uint8_t *src_x, uint8_t *src_y,
198 uint8_t *dst, ptrdiff_t stride)
199 {
200 const int16_t filt_const0 = 0xfb01;
201 const int16_t filt_const1 = 0x1414;
202 const int16_t filt_const2 = 0x1fb;
203 uint32_t loop_cnt;
204 ptrdiff_t stride_2x = stride << 1;
205 ptrdiff_t stride_3x = stride_2x + stride;
206 ptrdiff_t stride_4x = stride << 2;
207 __m256i tmp0, tmp1;
208 __m256i src_hz0, src_hz1, src_hz2, src_hz3, mask0, mask1, mask2;
209 __m256i src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
210 __m256i src_vt7, src_vt8;
211 __m256i src_vt10_h, src_vt21_h, src_vt32_h, src_vt43_h, src_vt54_h;
212 __m256i src_vt65_h, src_vt76_h, src_vt87_h, filt0, filt1, filt2;
213 __m256i hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2;
214 __m256i vt_out3, out0, out1, out2, out3;
215 __m256i minus5b = __lasx_xvldi(0xFB);
216 __m256i plus20b = __lasx_xvldi(20);
217
218 filt0 = __lasx_xvreplgr2vr_h(filt_const0);
219 filt1 = __lasx_xvreplgr2vr_h(filt_const1);
220 filt2 = __lasx_xvreplgr2vr_h(filt_const2);
221
222 mask0 = __lasx_xvld(luma_mask_arr, 0);
223 DUP2_ARG2(__lasx_xvld, luma_mask_arr, 32, luma_mask_arr, 64, mask1, mask2);
224 src_vt0 = __lasx_xvld(src_y, 0);
225 DUP4_ARG2(__lasx_xvldx, src_y, stride, src_y, stride_2x, src_y, stride_3x,
226 src_y, stride_4x, src_vt1, src_vt2, src_vt3, src_vt4);
227 src_y += stride_4x;
228
229 src_vt0 = __lasx_xvxori_b(src_vt0, 128);
230 DUP4_ARG2(__lasx_xvxori_b, src_vt1, 128, src_vt2, 128, src_vt3, 128,
231 src_vt4, 128, src_vt1, src_vt2, src_vt3, src_vt4);
232
233 for (loop_cnt = 4; loop_cnt--;) {
234 src_hz0 = __lasx_xvld(src_x, 0);
235 DUP2_ARG2(__lasx_xvldx, src_x, stride, src_x, stride_2x,
236 src_hz1, src_hz2);
237 src_hz3 = __lasx_xvldx(src_x, stride_3x);
238 src_x += stride_4x;
239 src_hz0 = __lasx_xvpermi_d(src_hz0, 0x94);
240 src_hz1 = __lasx_xvpermi_d(src_hz1, 0x94);
241 src_hz2 = __lasx_xvpermi_d(src_hz2, 0x94);
242 src_hz3 = __lasx_xvpermi_d(src_hz3, 0x94);
243 DUP4_ARG2(__lasx_xvxori_b, src_hz0, 128, src_hz1, 128, src_hz2, 128,
244 src_hz3, 128, src_hz0, src_hz1, src_hz2, src_hz3);
245
246 hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
247 hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
248 hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
249 hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);
250 hz_out0 = __lasx_xvssrarni_b_h(hz_out1, hz_out0, 5);
251 hz_out2 = __lasx_xvssrarni_b_h(hz_out3, hz_out2, 5);
252
253 DUP4_ARG2(__lasx_xvldx, src_y, stride, src_y, stride_2x,
254 src_y, stride_3x, src_y, stride_4x,
255 src_vt5, src_vt6, src_vt7, src_vt8);
256 src_y += stride_4x;
257
258 DUP4_ARG2(__lasx_xvxori_b, src_vt5, 128, src_vt6, 128, src_vt7, 128,
259 src_vt8, 128, src_vt5, src_vt6, src_vt7, src_vt8);
260 DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_vt4, 0x02, src_vt1, src_vt5,
261 0x02, src_vt2, src_vt6, 0x02, src_vt3, src_vt7, 0x02,
262 src_vt0, src_vt1, src_vt2, src_vt3);
263 src_vt87_h = __lasx_xvpermi_q(src_vt4, src_vt8, 0x02);
264 DUP4_ARG2(__lasx_xvilvh_b, src_vt1, src_vt0, src_vt2, src_vt1,
265 src_vt3, src_vt2, src_vt87_h, src_vt3,
266 src_hz0, src_hz1, src_hz2, src_hz3);
267 DUP4_ARG2(__lasx_xvilvl_b, src_vt1, src_vt0, src_vt2, src_vt1,
268 src_vt3, src_vt2, src_vt87_h, src_vt3,
269 src_vt0, src_vt1, src_vt2, src_vt3);
270 DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_hz0, 0x02, src_vt1,
271 src_hz1, 0x02, src_vt2, src_hz2, 0x02, src_vt3, src_hz3,
272 0x02, src_vt10_h, src_vt21_h, src_vt32_h, src_vt43_h);
273 DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_hz0, 0x13, src_vt1,
274 src_hz1, 0x13, src_vt2, src_hz2, 0x13, src_vt3, src_hz3,
275 0x13, src_vt54_h, src_vt65_h, src_vt76_h, src_vt87_h);
276
277 vt_out0 = AVC_DOT_SH3_SH(src_vt10_h, src_vt32_h, src_vt54_h,
278 filt0, filt1, filt2);
279 vt_out1 = AVC_DOT_SH3_SH(src_vt21_h, src_vt43_h, src_vt65_h,
280 filt0, filt1, filt2);
281 vt_out2 = AVC_DOT_SH3_SH(src_vt32_h, src_vt54_h, src_vt76_h,
282 filt0, filt1, filt2);
283 vt_out3 = AVC_DOT_SH3_SH(src_vt43_h, src_vt65_h, src_vt87_h,
284 filt0, filt1, filt2);
285 vt_out0 = __lasx_xvssrarni_b_h(vt_out1, vt_out0, 5);
286 vt_out2 = __lasx_xvssrarni_b_h(vt_out3, vt_out2, 5);
287
288 DUP2_ARG2(__lasx_xvaddwl_h_b, hz_out0, vt_out0, hz_out2, vt_out2,
289 out0, out2);
290 DUP2_ARG2(__lasx_xvaddwh_h_b, hz_out0, vt_out0, hz_out2, vt_out2,
291 out1, out3);
292 tmp0 = __lasx_xvssrarni_b_h(out1, out0, 1);
293 tmp1 = __lasx_xvssrarni_b_h(out3, out2, 1);
294
295 DUP2_ARG2(__lasx_xvxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
296 __lasx_xvstelm_d(tmp0, dst, 0, 0);
297 __lasx_xvstelm_d(tmp0, dst + stride, 0, 1);
298 __lasx_xvstelm_d(tmp1, dst + stride_2x, 0, 0);
299 __lasx_xvstelm_d(tmp1, dst + stride_3x, 0, 1);
300
301 __lasx_xvstelm_d(tmp0, dst, 8, 2);
302 __lasx_xvstelm_d(tmp0, dst + stride, 8, 3);
303 __lasx_xvstelm_d(tmp1, dst + stride_2x, 8, 2);
304 __lasx_xvstelm_d(tmp1, dst + stride_3x, 8, 3);
305
306 dst += stride_4x;
307 src_vt0 = src_vt4;
308 src_vt1 = src_vt5;
309 src_vt2 = src_vt6;
310 src_vt3 = src_vt7;
311 src_vt4 = src_vt8;
312 }
313 }
314
315 /* put_pixels8_8_inline_asm: dst = src */
316 static av_always_inline void
317 put_pixels8_8_inline_asm(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
318 {
319 uint64_t tmp[8];
320 ptrdiff_t stride_2, stride_3, stride_4;
321 __asm__ volatile (
322 "slli.d %[stride_2], %[stride], 1 \n\t"
323 "add.d %[stride_3], %[stride_2], %[stride] \n\t"
324 "slli.d %[stride_4], %[stride_2], 1 \n\t"
325 "ld.d %[tmp0], %[src], 0x0 \n\t"
326 "ldx.d %[tmp1], %[src], %[stride] \n\t"
327 "ldx.d %[tmp2], %[src], %[stride_2] \n\t"
328 "ldx.d %[tmp3], %[src], %[stride_3] \n\t"
329 "add.d %[src], %[src], %[stride_4] \n\t"
330 "ld.d %[tmp4], %[src], 0x0 \n\t"
331 "ldx.d %[tmp5], %[src], %[stride] \n\t"
332 "ldx.d %[tmp6], %[src], %[stride_2] \n\t"
333 "ldx.d %[tmp7], %[src], %[stride_3] \n\t"
334
335 "st.d %[tmp0], %[dst], 0x0 \n\t"
336 "stx.d %[tmp1], %[dst], %[stride] \n\t"
337 "stx.d %[tmp2], %[dst], %[stride_2] \n\t"
338 "stx.d %[tmp3], %[dst], %[stride_3] \n\t"
339 "add.d %[dst], %[dst], %[stride_4] \n\t"
340 "st.d %[tmp4], %[dst], 0x0 \n\t"
341 "stx.d %[tmp5], %[dst], %[stride] \n\t"
342 "stx.d %[tmp6], %[dst], %[stride_2] \n\t"
343 "stx.d %[tmp7], %[dst], %[stride_3] \n\t"
344 : [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
345 [tmp2]"=&r"(tmp[2]), [tmp3]"=&r"(tmp[3]),
346 [tmp4]"=&r"(tmp[4]), [tmp5]"=&r"(tmp[5]),
347 [tmp6]"=&r"(tmp[6]), [tmp7]"=&r"(tmp[7]),
348 [stride_2]"=&r"(stride_2), [stride_3]"=&r"(stride_3),
349 [stride_4]"=&r"(stride_4),
350 [dst]"+&r"(dst), [src]"+&r"(src)
351 : [stride]"r"(stride)
352 : "memory"
353 );
354 }
355
356 /* avg_pixels8_8_lsx: dst = avg(dst, src)
357  * Rounding average of an 8x8 source block with the 8x8 block already
358  * stored at dst. */
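/* Reference sketch of the operation performed below (not part of the build):
 *   for (int i = 0; i < 8; i++)
 *       for (int j = 0; j < 8; j++)
 *           dst[i * stride + j] = (dst[i * stride + j] + src[i * stride + j] + 1) >> 1;
 */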
359 static av_always_inline void
360 avg_pixels8_8_lsx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
361 {
362 uint8_t *tmp = dst;
363 ptrdiff_t stride_2, stride_3, stride_4;
364 __asm__ volatile (
365 /* h0~h7 */
366 "slli.d %[stride_2], %[stride], 1 \n\t"
367 "add.d %[stride_3], %[stride_2], %[stride] \n\t"
368 "slli.d %[stride_4], %[stride_2], 1 \n\t"
369 "vld $vr0, %[src], 0 \n\t"
370 "vldx $vr1, %[src], %[stride] \n\t"
371 "vldx $vr2, %[src], %[stride_2] \n\t"
372 "vldx $vr3, %[src], %[stride_3] \n\t"
373 "add.d %[src], %[src], %[stride_4] \n\t"
374 "vld $vr4, %[src], 0 \n\t"
375 "vldx $vr5, %[src], %[stride] \n\t"
376 "vldx $vr6, %[src], %[stride_2] \n\t"
377 "vldx $vr7, %[src], %[stride_3] \n\t"
378
379 "vld $vr8, %[tmp], 0 \n\t"
380 "vldx $vr9, %[tmp], %[stride] \n\t"
381 "vldx $vr10, %[tmp], %[stride_2] \n\t"
382 "vldx $vr11, %[tmp], %[stride_3] \n\t"
383 "add.d %[tmp], %[tmp], %[stride_4] \n\t"
384 "vld $vr12, %[tmp], 0 \n\t"
385 "vldx $vr13, %[tmp], %[stride] \n\t"
386 "vldx $vr14, %[tmp], %[stride_2] \n\t"
387 "vldx $vr15, %[tmp], %[stride_3] \n\t"
388
389 "vavgr.bu $vr0, $vr8, $vr0 \n\t"
390 "vavgr.bu $vr1, $vr9, $vr1 \n\t"
391 "vavgr.bu $vr2, $vr10, $vr2 \n\t"
392 "vavgr.bu $vr3, $vr11, $vr3 \n\t"
393 "vavgr.bu $vr4, $vr12, $vr4 \n\t"
394 "vavgr.bu $vr5, $vr13, $vr5 \n\t"
395 "vavgr.bu $vr6, $vr14, $vr6 \n\t"
396 "vavgr.bu $vr7, $vr15, $vr7 \n\t"
397
398 "vstelm.d $vr0, %[dst], 0, 0 \n\t"
399 "add.d %[dst], %[dst], %[stride] \n\t"
400 "vstelm.d $vr1, %[dst], 0, 0 \n\t"
401 "add.d %[dst], %[dst], %[stride] \n\t"
402 "vstelm.d $vr2, %[dst], 0, 0 \n\t"
403 "add.d %[dst], %[dst], %[stride] \n\t"
404 "vstelm.d $vr3, %[dst], 0, 0 \n\t"
405 "add.d %[dst], %[dst], %[stride] \n\t"
406 "vstelm.d $vr4, %[dst], 0, 0 \n\t"
407 "add.d %[dst], %[dst], %[stride] \n\t"
408 "vstelm.d $vr5, %[dst], 0, 0 \n\t"
409 "add.d %[dst], %[dst], %[stride] \n\t"
410 "vstelm.d $vr6, %[dst], 0, 0 \n\t"
411 "add.d %[dst], %[dst], %[stride] \n\t"
412 "vstelm.d $vr7, %[dst], 0, 0 \n\t"
413 : [dst]"+&r"(dst), [tmp]"+&r"(tmp), [src]"+&r"(src),
414 [stride_2]"=&r"(stride_2), [stride_3]"=&r"(stride_3),
415 [stride_4]"=&r"(stride_4)
416 : [stride]"r"(stride)
417 : "memory"
418 );
419 }
420
421 /* put_pixels8_l2_8_lsx: dst = avg(src, half)
422  * Rounding average of an 8x8 source block with an 8x8 half-pel block;
423  * the half buffer has a stride of 8. */
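/* Reference sketch (the half rows are loaded at offsets 0x00, 0x08, ..., 0x38):
 *   dst[i * dstStride + j] = (src[i * srcStride + j] + half[i * 8 + j] + 1) >> 1;
 */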
424 static av_always_inline void
425 put_pixels8_l2_8_lsx(uint8_t *dst, const uint8_t *src, const uint8_t *half,
426 ptrdiff_t dstStride, ptrdiff_t srcStride)
427 {
428 ptrdiff_t stride_2, stride_3, stride_4;
429 __asm__ volatile (
430 /* h0~h7 */
431 "slli.d %[stride_2], %[srcStride], 1 \n\t"
432 "add.d %[stride_3], %[stride_2], %[srcStride] \n\t"
433 "slli.d %[stride_4], %[stride_2], 1 \n\t"
434 "vld $vr0, %[src], 0 \n\t"
435 "vldx $vr1, %[src], %[srcStride] \n\t"
436 "vldx $vr2, %[src], %[stride_2] \n\t"
437 "vldx $vr3, %[src], %[stride_3] \n\t"
438 "add.d %[src], %[src], %[stride_4] \n\t"
439 "vld $vr4, %[src], 0 \n\t"
440 "vldx $vr5, %[src], %[srcStride] \n\t"
441 "vldx $vr6, %[src], %[stride_2] \n\t"
442 "vldx $vr7, %[src], %[stride_3] \n\t"
443
444 "vld $vr8, %[half], 0x00 \n\t"
445 "vld $vr9, %[half], 0x08 \n\t"
446 "vld $vr10, %[half], 0x10 \n\t"
447 "vld $vr11, %[half], 0x18 \n\t"
448 "vld $vr12, %[half], 0x20 \n\t"
449 "vld $vr13, %[half], 0x28 \n\t"
450 "vld $vr14, %[half], 0x30 \n\t"
451 "vld $vr15, %[half], 0x38 \n\t"
452
453 "vavgr.bu $vr0, $vr8, $vr0 \n\t"
454 "vavgr.bu $vr1, $vr9, $vr1 \n\t"
455 "vavgr.bu $vr2, $vr10, $vr2 \n\t"
456 "vavgr.bu $vr3, $vr11, $vr3 \n\t"
457 "vavgr.bu $vr4, $vr12, $vr4 \n\t"
458 "vavgr.bu $vr5, $vr13, $vr5 \n\t"
459 "vavgr.bu $vr6, $vr14, $vr6 \n\t"
460 "vavgr.bu $vr7, $vr15, $vr7 \n\t"
461
462 "vstelm.d $vr0, %[dst], 0, 0 \n\t"
463 "add.d %[dst], %[dst], %[dstStride] \n\t"
464 "vstelm.d $vr1, %[dst], 0, 0 \n\t"
465 "add.d %[dst], %[dst], %[dstStride] \n\t"
466 "vstelm.d $vr2, %[dst], 0, 0 \n\t"
467 "add.d %[dst], %[dst], %[dstStride] \n\t"
468 "vstelm.d $vr3, %[dst], 0, 0 \n\t"
469 "add.d %[dst], %[dst], %[dstStride] \n\t"
470 "vstelm.d $vr4, %[dst], 0, 0 \n\t"
471 "add.d %[dst], %[dst], %[dstStride] \n\t"
472 "vstelm.d $vr5, %[dst], 0, 0 \n\t"
473 "add.d %[dst], %[dst], %[dstStride] \n\t"
474 "vstelm.d $vr6, %[dst], 0, 0 \n\t"
475 "add.d %[dst], %[dst], %[dstStride] \n\t"
476 "vstelm.d $vr7, %[dst], 0, 0 \n\t"
477 : [dst]"+&r"(dst), [half]"+&r"(half), [src]"+&r"(src),
478 [stride_2]"=&r"(stride_2), [stride_3]"=&r"(stride_3),
479 [stride_4]"=&r"(stride_4)
480 : [srcStride]"r"(srcStride), [dstStride]"r"(dstStride)
481 : "memory"
482 );
483 }
484
485 /* avg_pixels8_l2_8_lsx: dst = avg(avg(src, half), dst)
486  * The src/half average is computed first and then averaged again with the
487  * block already stored at dst; the half buffer has a stride of 8. */
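/* Reference sketch, both averages rounding:
 *   uint8_t t = (src[i * srcStride + j] + half[i * 8 + j] + 1) >> 1;
 *   dst[i * dstStride + j] = (dst[i * dstStride + j] + t + 1) >> 1;
 */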
488 static av_always_inline void
489 avg_pixels8_l2_8_lsx(uint8_t *dst, const uint8_t *src, const uint8_t *half,
490 ptrdiff_t dstStride, ptrdiff_t srcStride)
491 {
492 uint8_t *tmp = dst;
493 ptrdiff_t stride_2, stride_3, stride_4;
494 __asm__ volatile (
495 /* h0~h7 */
496 "slli.d %[stride_2], %[srcStride], 1 \n\t"
497 "add.d %[stride_3], %[stride_2], %[srcStride] \n\t"
498 "slli.d %[stride_4], %[stride_2], 1 \n\t"
499 "vld $vr0, %[src], 0 \n\t"
500 "vldx $vr1, %[src], %[srcStride] \n\t"
501 "vldx $vr2, %[src], %[stride_2] \n\t"
502 "vldx $vr3, %[src], %[stride_3] \n\t"
503 "add.d %[src], %[src], %[stride_4] \n\t"
504 "vld $vr4, %[src], 0 \n\t"
505 "vldx $vr5, %[src], %[srcStride] \n\t"
506 "vldx $vr6, %[src], %[stride_2] \n\t"
507 "vldx $vr7, %[src], %[stride_3] \n\t"
508
509 "vld $vr8, %[half], 0x00 \n\t"
510 "vld $vr9, %[half], 0x08 \n\t"
511 "vld $vr10, %[half], 0x10 \n\t"
512 "vld $vr11, %[half], 0x18 \n\t"
513 "vld $vr12, %[half], 0x20 \n\t"
514 "vld $vr13, %[half], 0x28 \n\t"
515 "vld $vr14, %[half], 0x30 \n\t"
516 "vld $vr15, %[half], 0x38 \n\t"
517
518 "vavgr.bu $vr0, $vr8, $vr0 \n\t"
519 "vavgr.bu $vr1, $vr9, $vr1 \n\t"
520 "vavgr.bu $vr2, $vr10, $vr2 \n\t"
521 "vavgr.bu $vr3, $vr11, $vr3 \n\t"
522 "vavgr.bu $vr4, $vr12, $vr4 \n\t"
523 "vavgr.bu $vr5, $vr13, $vr5 \n\t"
524 "vavgr.bu $vr6, $vr14, $vr6 \n\t"
525 "vavgr.bu $vr7, $vr15, $vr7 \n\t"
526
527 "slli.d %[stride_2], %[dstStride], 1 \n\t"
528 "add.d %[stride_3], %[stride_2], %[dstStride] \n\t"
529 "slli.d %[stride_4], %[stride_2], 1 \n\t"
530 "vld $vr8, %[tmp], 0 \n\t"
531 "vldx $vr9, %[tmp], %[dstStride] \n\t"
532 "vldx $vr10, %[tmp], %[stride_2] \n\t"
533 "vldx $vr11, %[tmp], %[stride_3] \n\t"
534 "add.d %[tmp], %[tmp], %[stride_4] \n\t"
535 "vld $vr12, %[tmp], 0 \n\t"
536 "vldx $vr13, %[tmp], %[dstStride] \n\t"
537 "vldx $vr14, %[tmp], %[stride_2] \n\t"
538 "vldx $vr15, %[tmp], %[stride_3] \n\t"
539
540 "vavgr.bu $vr0, $vr8, $vr0 \n\t"
541 "vavgr.bu $vr1, $vr9, $vr1 \n\t"
542 "vavgr.bu $vr2, $vr10, $vr2 \n\t"
543 "vavgr.bu $vr3, $vr11, $vr3 \n\t"
544 "vavgr.bu $vr4, $vr12, $vr4 \n\t"
545 "vavgr.bu $vr5, $vr13, $vr5 \n\t"
546 "vavgr.bu $vr6, $vr14, $vr6 \n\t"
547 "vavgr.bu $vr7, $vr15, $vr7 \n\t"
548
549 "vstelm.d $vr0, %[dst], 0, 0 \n\t"
550 "add.d %[dst], %[dst], %[dstStride] \n\t"
551 "vstelm.d $vr1, %[dst], 0, 0 \n\t"
552 "add.d %[dst], %[dst], %[dstStride] \n\t"
553 "vstelm.d $vr2, %[dst], 0, 0 \n\t"
554 "add.d %[dst], %[dst], %[dstStride] \n\t"
555 "vstelm.d $vr3, %[dst], 0, 0 \n\t"
556 "add.d %[dst], %[dst], %[dstStride] \n\t"
557 "vstelm.d $vr4, %[dst], 0, 0 \n\t"
558 "add.d %[dst], %[dst], %[dstStride] \n\t"
559 "vstelm.d $vr5, %[dst], 0, 0 \n\t"
560 "add.d %[dst], %[dst], %[dstStride] \n\t"
561 "vstelm.d $vr6, %[dst], 0, 0 \n\t"
562 "add.d %[dst], %[dst], %[dstStride] \n\t"
563 "vstelm.d $vr7, %[dst], 0, 0 \n\t"
564 : [dst]"+&r"(dst), [tmp]"+&r"(tmp), [half]"+&r"(half),
565 [src]"+&r"(src), [stride_2]"=&r"(stride_2),
566 [stride_3]"=&r"(stride_3), [stride_4]"=&r"(stride_4)
567 : [dstStride]"r"(dstStride), [srcStride]"r"(srcStride)
568 : "memory"
569 );
570 }
571
572 /* put_pixels16_8_lsx: dst = src */
573 static av_always_inline void
574 put_pixels16_8_lsx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
575 {
576 ptrdiff_t stride_2, stride_3, stride_4;
577 __asm__ volatile (
578 "slli.d %[stride_2], %[stride], 1 \n\t"
579 "add.d %[stride_3], %[stride_2], %[stride] \n\t"
580 "slli.d %[stride_4], %[stride_2], 1 \n\t"
581 "vld $vr0, %[src], 0 \n\t"
582 "vldx $vr1, %[src], %[stride] \n\t"
583 "vldx $vr2, %[src], %[stride_2] \n\t"
584 "vldx $vr3, %[src], %[stride_3] \n\t"
585 "add.d %[src], %[src], %[stride_4] \n\t"
586 "vld $vr4, %[src], 0 \n\t"
587 "vldx $vr5, %[src], %[stride] \n\t"
588 "vldx $vr6, %[src], %[stride_2] \n\t"
589 "vldx $vr7, %[src], %[stride_3] \n\t"
590 "add.d %[src], %[src], %[stride_4] \n\t"
591
592 "vst $vr0, %[dst], 0 \n\t"
593 "vstx $vr1, %[dst], %[stride] \n\t"
594 "vstx $vr2, %[dst], %[stride_2] \n\t"
595 "vstx $vr3, %[dst], %[stride_3] \n\t"
596 "add.d %[dst], %[dst], %[stride_4] \n\t"
597 "vst $vr4, %[dst], 0 \n\t"
598 "vstx $vr5, %[dst], %[stride] \n\t"
599 "vstx $vr6, %[dst], %[stride_2] \n\t"
600 "vstx $vr7, %[dst], %[stride_3] \n\t"
601 "add.d %[dst], %[dst], %[stride_4] \n\t"
602
603 "vld $vr0, %[src], 0 \n\t"
604 "vldx $vr1, %[src], %[stride] \n\t"
605 "vldx $vr2, %[src], %[stride_2] \n\t"
606 "vldx $vr3, %[src], %[stride_3] \n\t"
607 "add.d %[src], %[src], %[stride_4] \n\t"
608 "vld $vr4, %[src], 0 \n\t"
609 "vldx $vr5, %[src], %[stride] \n\t"
610 "vldx $vr6, %[src], %[stride_2] \n\t"
611 "vldx $vr7, %[src], %[stride_3] \n\t"
612
613 "vst $vr0, %[dst], 0 \n\t"
614 "vstx $vr1, %[dst], %[stride] \n\t"
615 "vstx $vr2, %[dst], %[stride_2] \n\t"
616 "vstx $vr3, %[dst], %[stride_3] \n\t"
617 "add.d %[dst], %[dst], %[stride_4] \n\t"
618 "vst $vr4, %[dst], 0 \n\t"
619 "vstx $vr5, %[dst], %[stride] \n\t"
620 "vstx $vr6, %[dst], %[stride_2] \n\t"
621 "vstx $vr7, %[dst], %[stride_3] \n\t"
622 : [dst]"+&r"(dst), [src]"+&r"(src),
623 [stride_2]"=&r"(stride_2), [stride_3]"=&r"(stride_3),
624 [stride_4]"=&r"(stride_4)
625 : [stride]"r"(stride)
626 : "memory"
627 );
628 }
629
630 /* avg_pixels16_8_lsx: dst = avg(dst, src)
631  * Rounding average of a 16x16 source block with the 16x16 block already
632  * stored at dst. */
633 static av_always_inline void
634 avg_pixels16_8_lsx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
635 {
636 uint8_t *tmp = dst;
637 ptrdiff_t stride_2, stride_3, stride_4;
638 __asm__ volatile (
639 /* h0~h7 */
640 "slli.d %[stride_2], %[stride], 1 \n\t"
641 "add.d %[stride_3], %[stride_2], %[stride] \n\t"
642 "slli.d %[stride_4], %[stride_2], 1 \n\t"
643 "vld $vr0, %[src], 0 \n\t"
644 "vldx $vr1, %[src], %[stride] \n\t"
645 "vldx $vr2, %[src], %[stride_2] \n\t"
646 "vldx $vr3, %[src], %[stride_3] \n\t"
647 "add.d %[src], %[src], %[stride_4] \n\t"
648 "vld $vr4, %[src], 0 \n\t"
649 "vldx $vr5, %[src], %[stride] \n\t"
650 "vldx $vr6, %[src], %[stride_2] \n\t"
651 "vldx $vr7, %[src], %[stride_3] \n\t"
652 "add.d %[src], %[src], %[stride_4] \n\t"
653
654 "vld $vr8, %[tmp], 0 \n\t"
655 "vldx $vr9, %[tmp], %[stride] \n\t"
656 "vldx $vr10, %[tmp], %[stride_2] \n\t"
657 "vldx $vr11, %[tmp], %[stride_3] \n\t"
658 "add.d %[tmp], %[tmp], %[stride_4] \n\t"
659 "vld $vr12, %[tmp], 0 \n\t"
660 "vldx $vr13, %[tmp], %[stride] \n\t"
661 "vldx $vr14, %[tmp], %[stride_2] \n\t"
662 "vldx $vr15, %[tmp], %[stride_3] \n\t"
663 "add.d %[tmp], %[tmp], %[stride_4] \n\t"
664
665 "vavgr.bu $vr0, $vr8, $vr0 \n\t"
666 "vavgr.bu $vr1, $vr9, $vr1 \n\t"
667 "vavgr.bu $vr2, $vr10, $vr2 \n\t"
668 "vavgr.bu $vr3, $vr11, $vr3 \n\t"
669 "vavgr.bu $vr4, $vr12, $vr4 \n\t"
670 "vavgr.bu $vr5, $vr13, $vr5 \n\t"
671 "vavgr.bu $vr6, $vr14, $vr6 \n\t"
672 "vavgr.bu $vr7, $vr15, $vr7 \n\t"
673
674 "vst $vr0, %[dst], 0 \n\t"
675 "vstx $vr1, %[dst], %[stride] \n\t"
676 "vstx $vr2, %[dst], %[stride_2] \n\t"
677 "vstx $vr3, %[dst], %[stride_3] \n\t"
678 "add.d %[dst], %[dst], %[stride_4] \n\t"
679 "vst $vr4, %[dst], 0 \n\t"
680 "vstx $vr5, %[dst], %[stride] \n\t"
681 "vstx $vr6, %[dst], %[stride_2] \n\t"
682 "vstx $vr7, %[dst], %[stride_3] \n\t"
683 "add.d %[dst], %[dst], %[stride_4] \n\t"
684
685 /* h8~h15 */
686 "vld $vr0, %[src], 0 \n\t"
687 "vldx $vr1, %[src], %[stride] \n\t"
688 "vldx $vr2, %[src], %[stride_2] \n\t"
689 "vldx $vr3, %[src], %[stride_3] \n\t"
690 "add.d %[src], %[src], %[stride_4] \n\t"
691 "vld $vr4, %[src], 0 \n\t"
692 "vldx $vr5, %[src], %[stride] \n\t"
693 "vldx $vr6, %[src], %[stride_2] \n\t"
694 "vldx $vr7, %[src], %[stride_3] \n\t"
695
696 "vld $vr8, %[tmp], 0 \n\t"
697 "vldx $vr9, %[tmp], %[stride] \n\t"
698 "vldx $vr10, %[tmp], %[stride_2] \n\t"
699 "vldx $vr11, %[tmp], %[stride_3] \n\t"
700 "add.d %[tmp], %[tmp], %[stride_4] \n\t"
701 "vld $vr12, %[tmp], 0 \n\t"
702 "vldx $vr13, %[tmp], %[stride] \n\t"
703 "vldx $vr14, %[tmp], %[stride_2] \n\t"
704 "vldx $vr15, %[tmp], %[stride_3] \n\t"
705
706 "vavgr.bu $vr0, $vr8, $vr0 \n\t"
707 "vavgr.bu $vr1, $vr9, $vr1 \n\t"
708 "vavgr.bu $vr2, $vr10, $vr2 \n\t"
709 "vavgr.bu $vr3, $vr11, $vr3 \n\t"
710 "vavgr.bu $vr4, $vr12, $vr4 \n\t"
711 "vavgr.bu $vr5, $vr13, $vr5 \n\t"
712 "vavgr.bu $vr6, $vr14, $vr6 \n\t"
713 "vavgr.bu $vr7, $vr15, $vr7 \n\t"
714
715 "vst $vr0, %[dst], 0 \n\t"
716 "vstx $vr1, %[dst], %[stride] \n\t"
717 "vstx $vr2, %[dst], %[stride_2] \n\t"
718 "vstx $vr3, %[dst], %[stride_3] \n\t"
719 "add.d %[dst], %[dst], %[stride_4] \n\t"
720 "vst $vr4, %[dst], 0 \n\t"
721 "vstx $vr5, %[dst], %[stride] \n\t"
722 "vstx $vr6, %[dst], %[stride_2] \n\t"
723 "vstx $vr7, %[dst], %[stride_3] \n\t"
724 : [dst]"+&r"(dst), [tmp]"+&r"(tmp), [src]"+&r"(src),
725 [stride_2]"=&r"(stride_2), [stride_3]"=&r"(stride_3),
726 [stride_4]"=&r"(stride_4)
727 : [stride]"r"(stride)
728 : "memory"
729 );
730 }
731
732 /* put_pixels16_l2_8_lsx: dst = avg(src, half)
733  * Rounding average of a 16x16 source block with a 16x16 half-pel block;
734  * the half buffer has a stride of 16. */
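/* Reference sketch (the half rows are loaded at offsets 0x00, 0x10, ..., 0xf0,
 * i.e. with a stride of 16):
 *   dst[i * dstStride + j] = (src[i * srcStride + j] + half[i * 16 + j] + 1) >> 1;
 */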
735 static av_always_inline void
736 put_pixels16_l2_8_lsx(uint8_t *dst, const uint8_t *src, uint8_t *half,
737 ptrdiff_t dstStride, ptrdiff_t srcStride)
738 {
739 ptrdiff_t stride_2, stride_3, stride_4;
740 ptrdiff_t dstride_2, dstride_3, dstride_4;
741 __asm__ volatile (
742 "slli.d %[stride_2], %[srcStride], 1 \n\t"
743 "add.d %[stride_3], %[stride_2], %[srcStride] \n\t"
744 "slli.d %[stride_4], %[stride_2], 1 \n\t"
745 "slli.d %[dstride_2], %[dstStride], 1 \n\t"
746 "add.d %[dstride_3], %[dstride_2], %[dstStride] \n\t"
747 "slli.d %[dstride_4], %[dstride_2], 1 \n\t"
748 /* h0~h7 */
749 "vld $vr0, %[src], 0 \n\t"
750 "vldx $vr1, %[src], %[srcStride] \n\t"
751 "vldx $vr2, %[src], %[stride_2] \n\t"
752 "vldx $vr3, %[src], %[stride_3] \n\t"
753 "add.d %[src], %[src], %[stride_4] \n\t"
754 "vld $vr4, %[src], 0 \n\t"
755 "vldx $vr5, %[src], %[srcStride] \n\t"
756 "vldx $vr6, %[src], %[stride_2] \n\t"
757 "vldx $vr7, %[src], %[stride_3] \n\t"
758 "add.d %[src], %[src], %[stride_4] \n\t"
759
760 "vld $vr8, %[half], 0x00 \n\t"
761 "vld $vr9, %[half], 0x10 \n\t"
762 "vld $vr10, %[half], 0x20 \n\t"
763 "vld $vr11, %[half], 0x30 \n\t"
764 "vld $vr12, %[half], 0x40 \n\t"
765 "vld $vr13, %[half], 0x50 \n\t"
766 "vld $vr14, %[half], 0x60 \n\t"
767 "vld $vr15, %[half], 0x70 \n\t"
768
769 "vavgr.bu $vr0, $vr8, $vr0 \n\t"
770 "vavgr.bu $vr1, $vr9, $vr1 \n\t"
771 "vavgr.bu $vr2, $vr10, $vr2 \n\t"
772 "vavgr.bu $vr3, $vr11, $vr3 \n\t"
773 "vavgr.bu $vr4, $vr12, $vr4 \n\t"
774 "vavgr.bu $vr5, $vr13, $vr5 \n\t"
775 "vavgr.bu $vr6, $vr14, $vr6 \n\t"
776 "vavgr.bu $vr7, $vr15, $vr7 \n\t"
777
778 "vst $vr0, %[dst], 0 \n\t"
779 "vstx $vr1, %[dst], %[dstStride] \n\t"
780 "vstx $vr2, %[dst], %[dstride_2] \n\t"
781 "vstx $vr3, %[dst], %[dstride_3] \n\t"
782 "add.d %[dst], %[dst], %[dstride_4] \n\t"
783 "vst $vr4, %[dst], 0 \n\t"
784 "vstx $vr5, %[dst], %[dstStride] \n\t"
785 "vstx $vr6, %[dst], %[dstride_2] \n\t"
786 "vstx $vr7, %[dst], %[dstride_3] \n\t"
787 "add.d %[dst], %[dst], %[dstride_4] \n\t"
788
789 /* h8~h15 */
790 "vld $vr0, %[src], 0 \n\t"
791 "vldx $vr1, %[src], %[srcStride] \n\t"
792 "vldx $vr2, %[src], %[stride_2] \n\t"
793 "vldx $vr3, %[src], %[stride_3] \n\t"
794 "add.d %[src], %[src], %[stride_4] \n\t"
795 "vld $vr4, %[src], 0 \n\t"
796 "vldx $vr5, %[src], %[srcStride] \n\t"
797 "vldx $vr6, %[src], %[stride_2] \n\t"
798 "vldx $vr7, %[src], %[stride_3] \n\t"
799
800 "vld $vr8, %[half], 0x80 \n\t"
801 "vld $vr9, %[half], 0x90 \n\t"
802 "vld $vr10, %[half], 0xa0 \n\t"
803 "vld $vr11, %[half], 0xb0 \n\t"
804 "vld $vr12, %[half], 0xc0 \n\t"
805 "vld $vr13, %[half], 0xd0 \n\t"
806 "vld $vr14, %[half], 0xe0 \n\t"
807 "vld $vr15, %[half], 0xf0 \n\t"
808
809 "vavgr.bu $vr0, $vr8, $vr0 \n\t"
810 "vavgr.bu $vr1, $vr9, $vr1 \n\t"
811 "vavgr.bu $vr2, $vr10, $vr2 \n\t"
812 "vavgr.bu $vr3, $vr11, $vr3 \n\t"
813 "vavgr.bu $vr4, $vr12, $vr4 \n\t"
814 "vavgr.bu $vr5, $vr13, $vr5 \n\t"
815 "vavgr.bu $vr6, $vr14, $vr6 \n\t"
816 "vavgr.bu $vr7, $vr15, $vr7 \n\t"
817
818 "vst $vr0, %[dst], 0 \n\t"
819 "vstx $vr1, %[dst], %[dstStride] \n\t"
820 "vstx $vr2, %[dst], %[dstride_2] \n\t"
821 "vstx $vr3, %[dst], %[dstride_3] \n\t"
822 "add.d %[dst], %[dst], %[dstride_4] \n\t"
823 "vst $vr4, %[dst], 0 \n\t"
824 "vstx $vr5, %[dst], %[dstStride] \n\t"
825 "vstx $vr6, %[dst], %[dstride_2] \n\t"
826 "vstx $vr7, %[dst], %[dstride_3] \n\t"
827 : [dst]"+&r"(dst), [half]"+&r"(half), [src]"+&r"(src),
828 [stride_2]"=&r"(stride_2), [stride_3]"=&r"(stride_3),
829 [stride_4]"=&r"(stride_4), [dstride_2]"=&r"(dstride_2),
830 [dstride_3]"=&r"(dstride_3), [dstride_4]"=&r"(dstride_4)
831 : [dstStride]"r"(dstStride), [srcStride]"r"(srcStride)
832 : "memory"
833 );
834 }
835
836 /* avg_pixels16_l2_8_lsx: dst = avg(avg(src, half), dst)
837  * The src/half average is averaged again with the block already stored at
838  * dst; the half buffer has a stride of 16. */
839 static av_always_inline void
840 avg_pixels16_l2_8_lsx(uint8_t *dst, const uint8_t *src, uint8_t *half,
841 ptrdiff_t dstStride, ptrdiff_t srcStride)
842 {
843 uint8_t *tmp = dst;
844 ptrdiff_t stride_2, stride_3, stride_4;
845 ptrdiff_t dstride_2, dstride_3, dstride_4;
846 __asm__ volatile (
847 "slli.d %[stride_2], %[srcStride], 1 \n\t"
848 "add.d %[stride_3], %[stride_2], %[srcStride] \n\t"
849 "slli.d %[stride_4], %[stride_2], 1 \n\t"
850 "slli.d %[dstride_2], %[dstStride], 1 \n\t"
851 "add.d %[dstride_3], %[dstride_2], %[dstStride] \n\t"
852 "slli.d %[dstride_4], %[dstride_2], 1 \n\t"
853 /* h0~h7 */
854 "vld $vr0, %[src], 0 \n\t"
855 "vldx $vr1, %[src], %[srcStride] \n\t"
856 "vldx $vr2, %[src], %[stride_2] \n\t"
857 "vldx $vr3, %[src], %[stride_3] \n\t"
858 "add.d %[src], %[src], %[stride_4] \n\t"
859 "vld $vr4, %[src], 0 \n\t"
860 "vldx $vr5, %[src], %[srcStride] \n\t"
861 "vldx $vr6, %[src], %[stride_2] \n\t"
862 "vldx $vr7, %[src], %[stride_3] \n\t"
863 "add.d %[src], %[src], %[stride_4] \n\t"
864
865 "vld $vr8, %[half], 0x00 \n\t"
866 "vld $vr9, %[half], 0x10 \n\t"
867 "vld $vr10, %[half], 0x20 \n\t"
868 "vld $vr11, %[half], 0x30 \n\t"
869 "vld $vr12, %[half], 0x40 \n\t"
870 "vld $vr13, %[half], 0x50 \n\t"
871 "vld $vr14, %[half], 0x60 \n\t"
872 "vld $vr15, %[half], 0x70 \n\t"
873
874 "vavgr.bu $vr0, $vr8, $vr0 \n\t"
875 "vavgr.bu $vr1, $vr9, $vr1 \n\t"
876 "vavgr.bu $vr2, $vr10, $vr2 \n\t"
877 "vavgr.bu $vr3, $vr11, $vr3 \n\t"
878 "vavgr.bu $vr4, $vr12, $vr4 \n\t"
879 "vavgr.bu $vr5, $vr13, $vr5 \n\t"
880 "vavgr.bu $vr6, $vr14, $vr6 \n\t"
881 "vavgr.bu $vr7, $vr15, $vr7 \n\t"
882
883 "vld $vr8, %[tmp], 0 \n\t"
884 "vldx $vr9, %[tmp], %[dstStride] \n\t"
885 "vldx $vr10, %[tmp], %[dstride_2] \n\t"
886 "vldx $vr11, %[tmp], %[dstride_3] \n\t"
887 "add.d %[tmp], %[tmp], %[dstride_4] \n\t"
888 "vld $vr12, %[tmp], 0 \n\t"
889 "vldx $vr13, %[tmp], %[dstStride] \n\t"
890 "vldx $vr14, %[tmp], %[dstride_2] \n\t"
891 "vldx $vr15, %[tmp], %[dstride_3] \n\t"
892 "add.d %[tmp], %[tmp], %[dstride_4] \n\t"
893
894 "vavgr.bu $vr0, $vr8, $vr0 \n\t"
895 "vavgr.bu $vr1, $vr9, $vr1 \n\t"
896 "vavgr.bu $vr2, $vr10, $vr2 \n\t"
897 "vavgr.bu $vr3, $vr11, $vr3 \n\t"
898 "vavgr.bu $vr4, $vr12, $vr4 \n\t"
899 "vavgr.bu $vr5, $vr13, $vr5 \n\t"
900 "vavgr.bu $vr6, $vr14, $vr6 \n\t"
901 "vavgr.bu $vr7, $vr15, $vr7 \n\t"
902
903 "vst $vr0, %[dst], 0 \n\t"
904 "vstx $vr1, %[dst], %[dstStride] \n\t"
905 "vstx $vr2, %[dst], %[dstride_2] \n\t"
906 "vstx $vr3, %[dst], %[dstride_3] \n\t"
907 "add.d %[dst], %[dst], %[dstride_4] \n\t"
908 "vst $vr4, %[dst], 0 \n\t"
909 "vstx $vr5, %[dst], %[dstStride] \n\t"
910 "vstx $vr6, %[dst], %[dstride_2] \n\t"
911 "vstx $vr7, %[dst], %[dstride_3] \n\t"
912 "add.d %[dst], %[dst], %[dstride_4] \n\t"
913
914 /* h8~h15 */
915 "vld $vr0, %[src], 0 \n\t"
916 "vldx $vr1, %[src], %[srcStride] \n\t"
917 "vldx $vr2, %[src], %[stride_2] \n\t"
918 "vldx $vr3, %[src], %[stride_3] \n\t"
919 "add.d %[src], %[src], %[stride_4] \n\t"
920 "vld $vr4, %[src], 0 \n\t"
921 "vldx $vr5, %[src], %[srcStride] \n\t"
922 "vldx $vr6, %[src], %[stride_2] \n\t"
923 "vldx $vr7, %[src], %[stride_3] \n\t"
924
925 "vld $vr8, %[half], 0x80 \n\t"
926 "vld $vr9, %[half], 0x90 \n\t"
927 "vld $vr10, %[half], 0xa0 \n\t"
928 "vld $vr11, %[half], 0xb0 \n\t"
929 "vld $vr12, %[half], 0xc0 \n\t"
930 "vld $vr13, %[half], 0xd0 \n\t"
931 "vld $vr14, %[half], 0xe0 \n\t"
932 "vld $vr15, %[half], 0xf0 \n\t"
933
934 "vavgr.bu $vr0, $vr8, $vr0 \n\t"
935 "vavgr.bu $vr1, $vr9, $vr1 \n\t"
936 "vavgr.bu $vr2, $vr10, $vr2 \n\t"
937 "vavgr.bu $vr3, $vr11, $vr3 \n\t"
938 "vavgr.bu $vr4, $vr12, $vr4 \n\t"
939 "vavgr.bu $vr5, $vr13, $vr5 \n\t"
940 "vavgr.bu $vr6, $vr14, $vr6 \n\t"
941 "vavgr.bu $vr7, $vr15, $vr7 \n\t"
942
943 "vld $vr8, %[tmp], 0 \n\t"
944 "vldx $vr9, %[tmp], %[dstStride] \n\t"
945 "vldx $vr10, %[tmp], %[dstride_2] \n\t"
946 "vldx $vr11, %[tmp], %[dstride_3] \n\t"
947 "add.d %[tmp], %[tmp], %[dstride_4] \n\t"
948 "vld $vr12, %[tmp], 0 \n\t"
949 "vldx $vr13, %[tmp], %[dstStride] \n\t"
950 "vldx $vr14, %[tmp], %[dstride_2] \n\t"
951 "vldx $vr15, %[tmp], %[dstride_3] \n\t"
952
953 "vavgr.bu $vr0, $vr8, $vr0 \n\t"
954 "vavgr.bu $vr1, $vr9, $vr1 \n\t"
955 "vavgr.bu $vr2, $vr10, $vr2 \n\t"
956 "vavgr.bu $vr3, $vr11, $vr3 \n\t"
957 "vavgr.bu $vr4, $vr12, $vr4 \n\t"
958 "vavgr.bu $vr5, $vr13, $vr5 \n\t"
959 "vavgr.bu $vr6, $vr14, $vr6 \n\t"
960 "vavgr.bu $vr7, $vr15, $vr7 \n\t"
961
962 "vst $vr0, %[dst], 0 \n\t"
963 "vstx $vr1, %[dst], %[dstStride] \n\t"
964 "vstx $vr2, %[dst], %[dstride_2] \n\t"
965 "vstx $vr3, %[dst], %[dstride_3] \n\t"
966 "add.d %[dst], %[dst], %[dstride_4] \n\t"
967 "vst $vr4, %[dst], 0 \n\t"
968 "vstx $vr5, %[dst], %[dstStride] \n\t"
969 "vstx $vr6, %[dst], %[dstride_2] \n\t"
970 "vstx $vr7, %[dst], %[dstride_3] \n\t"
971 : [dst]"+&r"(dst), [tmp]"+&r"(tmp), [half]"+&r"(half), [src]"+&r"(src),
972 [stride_2]"=&r"(stride_2), [stride_3]"=&r"(stride_3),
973 [stride_4]"=&r"(stride_4), [dstride_2]"=&r"(dstride_2),
974 [dstride_3]"=&r"(dstride_3), [dstride_4]"=&r"(dstride_4)
975 : [dstStride]"r"(dstStride), [srcStride]"r"(srcStride)
976 : "memory"
977 );
978 }
979
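/* Horizontal 6-tap low-pass on two rows at a time (one row per 128-bit lane),
 * matching the H.264 half-pel filter:
 *   out = clip_u8((src[-2] - 5*src[-1] + 20*src[0] + 20*src[1]
 *                  - 5*src[2] + src[3] + 16) >> 5)
 * evaluated with saturating 16-bit adds/subs. */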
980 #define QPEL8_H_LOWPASS(out_v) \
981 src00 = __lasx_xvld(src, - 2); \
982 src += srcStride; \
983 src10 = __lasx_xvld(src, - 2); \
984 src += srcStride; \
985 src00 = __lasx_xvpermi_q(src00, src10, 0x02); \
986 src01 = __lasx_xvshuf_b(src00, src00, (__m256i)mask1); \
987 src02 = __lasx_xvshuf_b(src00, src00, (__m256i)mask2); \
988 src03 = __lasx_xvshuf_b(src00, src00, (__m256i)mask3); \
989 src04 = __lasx_xvshuf_b(src00, src00, (__m256i)mask4); \
990 src05 = __lasx_xvshuf_b(src00, src00, (__m256i)mask5); \
991 DUP2_ARG2(__lasx_xvaddwl_h_bu, src02, src03, src01, src04, src02, src01);\
992 src00 = __lasx_xvaddwl_h_bu(src00, src05); \
993 src02 = __lasx_xvmul_h(src02, h_20); \
994 src01 = __lasx_xvmul_h(src01, h_5); \
995 src02 = __lasx_xvssub_h(src02, src01); \
996 src02 = __lasx_xvsadd_h(src02, src00); \
997 src02 = __lasx_xvsadd_h(src02, h_16); \
998 out_v = __lasx_xvssrani_bu_h(src02, src02, 5); \
999
1000 static av_always_inline void
1001 put_h264_qpel8_h_lowpass_lasx(uint8_t *dst, const uint8_t *src, int dstStride,
1002 int srcStride)
1003 {
1004 int dstStride_2x = dstStride << 1;
1005 __m256i src00, src01, src02, src03, src04, src05, src10;
1006 __m256i out0, out1, out2, out3;
1007 __m256i h_20 = __lasx_xvldi(0x414);
1008 __m256i h_5 = __lasx_xvldi(0x405);
1009 __m256i h_16 = __lasx_xvldi(0x410);
1010 __m256i mask1 = {0x0807060504030201, 0x0, 0x0807060504030201, 0x0};
1011 __m256i mask2 = {0x0908070605040302, 0x0, 0x0908070605040302, 0x0};
1012 __m256i mask3 = {0x0a09080706050403, 0x0, 0x0a09080706050403, 0x0};
1013 __m256i mask4 = {0x0b0a090807060504, 0x0, 0x0b0a090807060504, 0x0};
1014 __m256i mask5 = {0x0c0b0a0908070605, 0x0, 0x0c0b0a0908070605, 0x0};
1015
1016 QPEL8_H_LOWPASS(out0)
1017 QPEL8_H_LOWPASS(out1)
1018 QPEL8_H_LOWPASS(out2)
1019 QPEL8_H_LOWPASS(out3)
1020 __lasx_xvstelm_d(out0, dst, 0, 0);
1021 __lasx_xvstelm_d(out0, dst + dstStride, 0, 2);
1022 dst += dstStride_2x;
1023 __lasx_xvstelm_d(out1, dst, 0, 0);
1024 __lasx_xvstelm_d(out1, dst + dstStride, 0, 2);
1025 dst += dstStride_2x;
1026 __lasx_xvstelm_d(out2, dst, 0, 0);
1027 __lasx_xvstelm_d(out2, dst + dstStride, 0, 2);
1028 dst += dstStride_2x;
1029 __lasx_xvstelm_d(out3, dst, 0, 0);
1030 __lasx_xvstelm_d(out3, dst + dstStride, 0, 2);
1031 }
1032
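/* Vertical 6-tap low-pass over six consecutive rows (two output rows per
 * invocation, one per 128-bit lane), using the same 1/-5/20 weighting,
 * (x + 16) >> 5 rounding and unsigned saturation as the horizontal macro. */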
1033 #define QPEL8_V_LOWPASS(src0, src1, src2, src3, src4, src5, src6, \
1034 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5) \
1035 { \
1036 tmp0 = __lasx_xvpermi_q(src0, src1, 0x02); \
1037 tmp1 = __lasx_xvpermi_q(src1, src2, 0x02); \
1038 tmp2 = __lasx_xvpermi_q(src2, src3, 0x02); \
1039 tmp3 = __lasx_xvpermi_q(src3, src4, 0x02); \
1040 tmp4 = __lasx_xvpermi_q(src4, src5, 0x02); \
1041 tmp5 = __lasx_xvpermi_q(src5, src6, 0x02); \
1042 DUP2_ARG2(__lasx_xvaddwl_h_bu, tmp2, tmp3, tmp1, tmp4, tmp2, tmp1); \
1043 tmp0 = __lasx_xvaddwl_h_bu(tmp0, tmp5); \
1044 tmp2 = __lasx_xvmul_h(tmp2, h_20); \
1045 tmp1 = __lasx_xvmul_h(tmp1, h_5); \
1046 tmp2 = __lasx_xvssub_h(tmp2, tmp1); \
1047 tmp2 = __lasx_xvsadd_h(tmp2, tmp0); \
1048 tmp2 = __lasx_xvsadd_h(tmp2, h_16); \
1049 tmp2 = __lasx_xvssrani_bu_h(tmp2, tmp2, 5); \
1050 }
1051
1052 static av_always_inline void
1053 put_h264_qpel8_v_lowpass_lasx(uint8_t *dst, uint8_t *src, int dstStride,
1054 int srcStride)
1055 {
1056 int srcStride_2x = srcStride << 1;
1057 int dstStride_2x = dstStride << 1;
1058 int srcStride_4x = srcStride << 2;
1059 int srcStride_3x = srcStride_2x + srcStride;
1060 __m256i src00, src01, src02, src03, src04, src05, src06;
1061 __m256i src07, src08, src09, src10, src11, src12;
1062 __m256i tmp00, tmp01, tmp02, tmp03, tmp04, tmp05;
1063 __m256i h_20 = __lasx_xvldi(0x414);
1064 __m256i h_5 = __lasx_xvldi(0x405);
1065 __m256i h_16 = __lasx_xvldi(0x410);
1066
1067 DUP2_ARG2(__lasx_xvld, src - srcStride_2x, 0, src - srcStride, 0,
1068 src00, src01);
1069 src02 = __lasx_xvld(src, 0);
1070 DUP4_ARG2(__lasx_xvldx, src, srcStride, src, srcStride_2x, src,
1071 srcStride_3x, src, srcStride_4x, src03, src04, src05, src06);
1072 src += srcStride_4x;
1073 DUP4_ARG2(__lasx_xvldx, src, srcStride, src, srcStride_2x, src,
1074 srcStride_3x, src, srcStride_4x, src07, src08, src09, src10);
1075 src += srcStride_4x;
1076 DUP2_ARG2(__lasx_xvldx, src, srcStride, src, srcStride_2x, src11, src12);
1077
1078 QPEL8_V_LOWPASS(src00, src01, src02, src03, src04, src05, src06,
1079 tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
1080 __lasx_xvstelm_d(tmp02, dst, 0, 0);
1081 __lasx_xvstelm_d(tmp02, dst + dstStride, 0, 2);
1082 dst += dstStride_2x;
1083 QPEL8_V_LOWPASS(src02, src03, src04, src05, src06, src07, src08,
1084 tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
1085 __lasx_xvstelm_d(tmp02, dst, 0, 0);
1086 __lasx_xvstelm_d(tmp02, dst + dstStride, 0, 2);
1087 dst += dstStride_2x;
1088 QPEL8_V_LOWPASS(src04, src05, src06, src07, src08, src09, src10,
1089 tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
1090 __lasx_xvstelm_d(tmp02, dst, 0, 0);
1091 __lasx_xvstelm_d(tmp02, dst + dstStride, 0, 2);
1092 dst += dstStride_2x;
1093 QPEL8_V_LOWPASS(src06, src07, src08, src09, src10, src11, src12,
1094 tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
1095 __lasx_xvstelm_d(tmp02, dst, 0, 0);
1096 __lasx_xvstelm_d(tmp02, dst + dstStride, 0, 2);
1097 }
1098
1099 static av_always_inline void
1100 avg_h264_qpel8_v_lowpass_lasx(uint8_t *dst, uint8_t *src, int dstStride,
1101 int srcStride)
1102 {
1103 int srcStride_2x = srcStride << 1;
1104 int srcStride_4x = srcStride << 2;
1105 int dstStride_2x = dstStride << 1;
1106 int dstStride_4x = dstStride << 2;
1107 int srcStride_3x = srcStride_2x + srcStride;
1108 int dstStride_3x = dstStride_2x + dstStride;
1109 __m256i src00, src01, src02, src03, src04, src05, src06;
1110 __m256i src07, src08, src09, src10, src11, src12, tmp00;
1111 __m256i tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07, tmp08, tmp09;
1112 __m256i h_20 = __lasx_xvldi(0x414);
1113 __m256i h_5 = __lasx_xvldi(0x405);
1114 __m256i h_16 = __lasx_xvldi(0x410);
1115
1116
1117 DUP2_ARG2(__lasx_xvld, src - srcStride_2x, 0, src - srcStride, 0,
1118 src00, src01);
1119 src02 = __lasx_xvld(src, 0);
1120 DUP4_ARG2(__lasx_xvldx, src, srcStride, src, srcStride_2x, src,
1121 srcStride_3x, src, srcStride_4x, src03, src04, src05, src06);
1122 src += srcStride_4x;
1123 DUP4_ARG2(__lasx_xvldx, src, srcStride, src, srcStride_2x, src,
1124 srcStride_3x, src, srcStride_4x, src07, src08, src09, src10);
1125 src += srcStride_4x;
1126 DUP2_ARG2(__lasx_xvldx, src, srcStride, src, srcStride_2x, src11, src12);
1127
1128 tmp06 = __lasx_xvld(dst, 0);
1129 DUP4_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x,
1130 dst, dstStride_3x, dst, dstStride_4x,
1131 tmp07, tmp02, tmp03, tmp04);
1132 dst += dstStride_4x;
1133 DUP2_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x,
1134 tmp05, tmp00);
1135 tmp01 = __lasx_xvldx(dst, dstStride_3x);
1136 dst -= dstStride_4x;
1137
1138 tmp06 = __lasx_xvpermi_q(tmp06, tmp07, 0x02);
1139 tmp07 = __lasx_xvpermi_q(tmp02, tmp03, 0x02);
1140 tmp08 = __lasx_xvpermi_q(tmp04, tmp05, 0x02);
1141 tmp09 = __lasx_xvpermi_q(tmp00, tmp01, 0x02);
1142
1143 QPEL8_V_LOWPASS(src00, src01, src02, src03, src04, src05, src06,
1144 tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
1145 tmp06 = __lasx_xvavgr_bu(tmp06, tmp02);
1146 __lasx_xvstelm_d(tmp06, dst, 0, 0);
1147 __lasx_xvstelm_d(tmp06, dst + dstStride, 0, 2);
1148 dst += dstStride_2x;
1149 QPEL8_V_LOWPASS(src02, src03, src04, src05, src06, src07, src08,
1150 tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
1151 tmp07 = __lasx_xvavgr_bu(tmp07, tmp02);
1152 __lasx_xvstelm_d(tmp07, dst, 0, 0);
1153 __lasx_xvstelm_d(tmp07, dst + dstStride, 0, 2);
1154 dst += dstStride_2x;
1155 QPEL8_V_LOWPASS(src04, src05, src06, src07, src08, src09, src10,
1156 tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
1157 tmp08 = __lasx_xvavgr_bu(tmp08, tmp02);
1158 __lasx_xvstelm_d(tmp08, dst, 0, 0);
1159 __lasx_xvstelm_d(tmp08, dst + dstStride, 0, 2);
1160 dst += dstStride_2x;
1161 QPEL8_V_LOWPASS(src06, src07, src08, src09, src10, src11, src12,
1162 tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
1163 tmp09 = __lasx_xvavgr_bu(tmp09, tmp02);
1164 __lasx_xvstelm_d(tmp09, dst, 0, 0);
1165 __lasx_xvstelm_d(tmp09, dst + dstStride, 0, 2);
1166 }
1167
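/* First pass of the centre (hv) position: horizontal 6-tap filter kept at
 * full 16-bit precision; rounding is deferred to QPEL8_HV_LOWPASS_V. */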
1168 #define QPEL8_HV_LOWPASS_H(tmp) \
1169 { \
1170 src00 = __lasx_xvld(src, -2); \
1171 src += srcStride; \
1172 src10 = __lasx_xvld(src, -2); \
1173 src += srcStride; \
1174 src00 = __lasx_xvpermi_q(src00, src10, 0x02); \
1175 src01 = __lasx_xvshuf_b(src00, src00, (__m256i)mask1); \
1176 src02 = __lasx_xvshuf_b(src00, src00, (__m256i)mask2); \
1177 src03 = __lasx_xvshuf_b(src00, src00, (__m256i)mask3); \
1178 src04 = __lasx_xvshuf_b(src00, src00, (__m256i)mask4); \
1179 src05 = __lasx_xvshuf_b(src00, src00, (__m256i)mask5); \
1180 DUP2_ARG2(__lasx_xvaddwl_h_bu, src02, src03, src01, src04, src02, src01);\
1181 src00 = __lasx_xvaddwl_h_bu(src00, src05); \
1182 src02 = __lasx_xvmul_h(src02, h_20); \
1183 src01 = __lasx_xvmul_h(src01, h_5); \
1184 src02 = __lasx_xvssub_h(src02, src01); \
1185 tmp = __lasx_xvsadd_h(src02, src00); \
1186 }
1187
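/* Second pass: vertical 6-tap filter on the 16-bit intermediates, widened to
 * 32 bits, rounded with (x + 512) >> 10 and saturated back to unsigned bytes. */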
1188 #define QPEL8_HV_LOWPASS_V(src0, src1, src2, src3, \
1189 src4, src5, temp0, temp1, \
1190 temp2, temp3, temp4, temp5, \
1191 out) \
1192 { \
1193 DUP2_ARG2(__lasx_xvaddwl_w_h, src2, src3, src1, src4, temp0, temp2); \
1194 DUP2_ARG2(__lasx_xvaddwh_w_h, src2, src3, src1, src4, temp1, temp3); \
1195 temp4 = __lasx_xvaddwl_w_h(src0, src5); \
1196 temp5 = __lasx_xvaddwh_w_h(src0, src5); \
1197 temp0 = __lasx_xvmul_w(temp0, w_20); \
1198 temp1 = __lasx_xvmul_w(temp1, w_20); \
1199 temp2 = __lasx_xvmul_w(temp2, w_5); \
1200 temp3 = __lasx_xvmul_w(temp3, w_5); \
1201 temp0 = __lasx_xvssub_w(temp0, temp2); \
1202 temp1 = __lasx_xvssub_w(temp1, temp3); \
1203 temp0 = __lasx_xvsadd_w(temp0, temp4); \
1204 temp1 = __lasx_xvsadd_w(temp1, temp5); \
1205 temp0 = __lasx_xvsadd_w(temp0, w_512); \
1206 temp1 = __lasx_xvsadd_w(temp1, w_512); \
1207 temp0 = __lasx_xvssrani_hu_w(temp0, temp0, 10); \
1208 temp1 = __lasx_xvssrani_hu_w(temp1, temp1, 10); \
1209 temp0 = __lasx_xvpackev_d(temp1, temp0); \
1210 out = __lasx_xvssrani_bu_h(temp0, temp0, 0); \
1211 }
1212
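/* Centre (hv) half-pel interpolation of an 8x8 block: the source rows are
 * first filtered horizontally at 16-bit precision (QPEL8_HV_LOWPASS_H, two
 * rows per call), then the vertical pass (QPEL8_HV_LOWPASS_V) produces the
 * output rows in pairs. */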
1213 static av_always_inline void
1214 put_h264_qpel8_hv_lowpass_lasx(uint8_t *dst, const uint8_t *src,
1215 ptrdiff_t dstStride, ptrdiff_t srcStride)
1216 {
1217 __m256i src00, src01, src02, src03, src04, src05, src10;
1218 __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
1219 __m256i tmp7, tmp8, tmp9, tmp10, tmp11, tmp12;
1220 __m256i h_20 = __lasx_xvldi(0x414);
1221 __m256i h_5 = __lasx_xvldi(0x405);
1222 __m256i w_20 = __lasx_xvldi(0x814);
1223 __m256i w_5 = __lasx_xvldi(0x805);
1224 __m256i w_512 = {512};
1225 __m256i mask1 = {0x0807060504030201, 0x0, 0x0807060504030201, 0x0};
1226 __m256i mask2 = {0x0908070605040302, 0x0, 0x0908070605040302, 0x0};
1227 __m256i mask3 = {0x0a09080706050403, 0x0, 0x0a09080706050403, 0x0};
1228 __m256i mask4 = {0x0b0a090807060504, 0x0, 0x0b0a090807060504, 0x0};
1229 __m256i mask5 = {0x0c0b0a0908070605, 0x0, 0x0c0b0a0908070605, 0x0};
1230
1231 w_512 = __lasx_xvreplve0_w(w_512);
1232
1233 src -= srcStride << 1;
1234 QPEL8_HV_LOWPASS_H(tmp0)
1235 QPEL8_HV_LOWPASS_H(tmp2)
1236 QPEL8_HV_LOWPASS_H(tmp4)
1237 QPEL8_HV_LOWPASS_H(tmp6)
1238 QPEL8_HV_LOWPASS_H(tmp8)
1239 QPEL8_HV_LOWPASS_H(tmp10)
1240 QPEL8_HV_LOWPASS_H(tmp12)
1241 tmp11 = __lasx_xvpermi_q(tmp12, tmp10, 0x21);
1242 tmp9 = __lasx_xvpermi_q(tmp10, tmp8, 0x21);
1243 tmp7 = __lasx_xvpermi_q(tmp8, tmp6, 0x21);
1244 tmp5 = __lasx_xvpermi_q(tmp6, tmp4, 0x21);
1245 tmp3 = __lasx_xvpermi_q(tmp4, tmp2, 0x21);
1246 tmp1 = __lasx_xvpermi_q(tmp2, tmp0, 0x21);
1247
1248 QPEL8_HV_LOWPASS_V(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, src00, src01,
1249 src02, src03, src04, src05, tmp0)
1250 QPEL8_HV_LOWPASS_V(tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src00, src01,
1251 src02, src03, src04, src05, tmp2)
1252 QPEL8_HV_LOWPASS_V(tmp4, tmp5, tmp6, tmp7, tmp8, tmp9, src00, src01,
1253 src02, src03, src04, src05, tmp4)
1254 QPEL8_HV_LOWPASS_V(tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, src00, src01,
1255 src02, src03, src04, src05, tmp6)
1256 __lasx_xvstelm_d(tmp0, dst, 0, 0);
1257 dst += dstStride;
1258 __lasx_xvstelm_d(tmp0, dst, 0, 2);
1259 dst += dstStride;
1260 __lasx_xvstelm_d(tmp2, dst, 0, 0);
1261 dst += dstStride;
1262 __lasx_xvstelm_d(tmp2, dst, 0, 2);
1263 dst += dstStride;
1264 __lasx_xvstelm_d(tmp4, dst, 0, 0);
1265 dst += dstStride;
1266 __lasx_xvstelm_d(tmp4, dst, 0, 2);
1267 dst += dstStride;
1268 __lasx_xvstelm_d(tmp6, dst, 0, 0);
1269 dst += dstStride;
1270 __lasx_xvstelm_d(tmp6, dst, 0, 2);
1271 }
1272
1273 static av_always_inline void
1274 avg_h264_qpel8_h_lowpass_lasx(uint8_t *dst, const uint8_t *src, int dstStride,
1275 int srcStride)
1276 {
1277 int dstStride_2x = dstStride << 1;
1278 int dstStride_4x = dstStride << 2;
1279 int dstStride_3x = dstStride_2x + dstStride;
1280 __m256i src00, src01, src02, src03, src04, src05, src10;
1281 __m256i dst00, dst01, dst0, dst1, dst2, dst3;
1282 __m256i out0, out1, out2, out3;
1283 __m256i h_20 = __lasx_xvldi(0x414);
1284 __m256i h_5 = __lasx_xvldi(0x405);
1285 __m256i h_16 = __lasx_xvldi(0x410);
1286 __m256i mask1 = {0x0807060504030201, 0x0, 0x0807060504030201, 0x0};
1287 __m256i mask2 = {0x0908070605040302, 0x0, 0x0908070605040302, 0x0};
1288 __m256i mask3 = {0x0a09080706050403, 0x0, 0x0a09080706050403, 0x0};
1289 __m256i mask4 = {0x0b0a090807060504, 0x0, 0x0b0a090807060504, 0x0};
1290 __m256i mask5 = {0x0c0b0a0908070605, 0x0, 0x0c0b0a0908070605, 0x0};
1291
1292 QPEL8_H_LOWPASS(out0)
1293 QPEL8_H_LOWPASS(out1)
1294 QPEL8_H_LOWPASS(out2)
1295 QPEL8_H_LOWPASS(out3)
1296 src00 = __lasx_xvld(dst, 0);
1297 DUP4_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x, dst,
1298 dstStride_3x, dst, dstStride_4x, src01, src02, src03, src04);
1299 dst += dstStride_4x;
1300 DUP2_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x, src05, dst00);
1301 dst01 = __lasx_xvldx(dst, dstStride_3x);
1302 dst -= dstStride_4x;
1303 dst0 = __lasx_xvpermi_q(src00, src01, 0x02);
1304 dst1 = __lasx_xvpermi_q(src02, src03, 0x02);
1305 dst2 = __lasx_xvpermi_q(src04, src05, 0x02);
1306 dst3 = __lasx_xvpermi_q(dst00, dst01, 0x02);
1307 dst0 = __lasx_xvavgr_bu(dst0, out0);
1308 dst1 = __lasx_xvavgr_bu(dst1, out1);
1309 dst2 = __lasx_xvavgr_bu(dst2, out2);
1310 dst3 = __lasx_xvavgr_bu(dst3, out3);
1311 __lasx_xvstelm_d(dst0, dst, 0, 0);
1312 __lasx_xvstelm_d(dst0, dst + dstStride, 0, 2);
1313 __lasx_xvstelm_d(dst1, dst + dstStride_2x, 0, 0);
1314 __lasx_xvstelm_d(dst1, dst + dstStride_3x, 0, 2);
1315 dst += dstStride_4x;
1316 __lasx_xvstelm_d(dst2, dst, 0, 0);
1317 __lasx_xvstelm_d(dst2, dst + dstStride, 0, 2);
1318 __lasx_xvstelm_d(dst3, dst + dstStride_2x, 0, 0);
1319 __lasx_xvstelm_d(dst3, dst + dstStride_3x, 0, 2);
1320 }
1321
1322 static av_always_inline void
1323 avg_h264_qpel8_hv_lowpass_lasx(uint8_t *dst, const uint8_t *src,
1324 ptrdiff_t dstStride, ptrdiff_t srcStride)
1325 {
1326 __m256i src00, src01, src02, src03, src04, src05, src10;
1327 __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
1328 __m256i tmp7, tmp8, tmp9, tmp10, tmp11, tmp12;
1329 __m256i h_20 = __lasx_xvldi(0x414);
1330 __m256i h_5 = __lasx_xvldi(0x405);
1331 __m256i w_20 = __lasx_xvldi(0x814);
1332 __m256i w_5 = __lasx_xvldi(0x805);
1333 __m256i w_512 = {512};
1334 __m256i mask1 = {0x0807060504030201, 0x0, 0x0807060504030201, 0x0};
1335 __m256i mask2 = {0x0908070605040302, 0x0, 0x0908070605040302, 0x0};
1336 __m256i mask3 = {0x0a09080706050403, 0x0, 0x0a09080706050403, 0x0};
1337 __m256i mask4 = {0x0b0a090807060504, 0x0, 0x0b0a090807060504, 0x0};
1338 __m256i mask5 = {0x0c0b0a0908070605, 0x0, 0x0c0b0a0908070605, 0x0};
1339 ptrdiff_t dstStride_2x = dstStride << 1;
1340 ptrdiff_t dstStride_4x = dstStride << 2;
1341 ptrdiff_t dstStride_3x = dstStride_2x + dstStride;
1342
1343 w_512 = __lasx_xvreplve0_w(w_512);
1344
1345 src -= srcStride << 1;
1346 QPEL8_HV_LOWPASS_H(tmp0)
1347 QPEL8_HV_LOWPASS_H(tmp2)
1348 QPEL8_HV_LOWPASS_H(tmp4)
1349 QPEL8_HV_LOWPASS_H(tmp6)
1350 QPEL8_HV_LOWPASS_H(tmp8)
1351 QPEL8_HV_LOWPASS_H(tmp10)
1352 QPEL8_HV_LOWPASS_H(tmp12)
1353 tmp11 = __lasx_xvpermi_q(tmp12, tmp10, 0x21);
1354 tmp9 = __lasx_xvpermi_q(tmp10, tmp8, 0x21);
1355 tmp7 = __lasx_xvpermi_q(tmp8, tmp6, 0x21);
1356 tmp5 = __lasx_xvpermi_q(tmp6, tmp4, 0x21);
1357 tmp3 = __lasx_xvpermi_q(tmp4, tmp2, 0x21);
1358 tmp1 = __lasx_xvpermi_q(tmp2, tmp0, 0x21);
1359
1360 QPEL8_HV_LOWPASS_V(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, src00, src01,
1361 src02, src03, src04, src05, tmp0)
1362 QPEL8_HV_LOWPASS_V(tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src00, src01,
1363 src02, src03, src04, src05, tmp2)
1364 QPEL8_HV_LOWPASS_V(tmp4, tmp5, tmp6, tmp7, tmp8, tmp9, src00, src01,
1365 src02, src03, src04, src05, tmp4)
1366 QPEL8_HV_LOWPASS_V(tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, src00, src01,
1367 src02, src03, src04, src05, tmp6)
1368
1369 src00 = __lasx_xvld(dst, 0);
1370 DUP4_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x, dst,
1371 dstStride_3x, dst, dstStride_4x, src01, src02, src03, src04);
1372 dst += dstStride_4x;
1373 DUP2_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x, src05, tmp8);
1374 tmp9 = __lasx_xvldx(dst, dstStride_3x);
1375 dst -= dstStride_4x;
1376 tmp1 = __lasx_xvpermi_q(src00, src01, 0x02);
1377 tmp3 = __lasx_xvpermi_q(src02, src03, 0x02);
1378 tmp5 = __lasx_xvpermi_q(src04, src05, 0x02);
1379 tmp7 = __lasx_xvpermi_q(tmp8, tmp9, 0x02);
1380 tmp0 = __lasx_xvavgr_bu(tmp0, tmp1);
1381 tmp2 = __lasx_xvavgr_bu(tmp2, tmp3);
1382 tmp4 = __lasx_xvavgr_bu(tmp4, tmp5);
1383 tmp6 = __lasx_xvavgr_bu(tmp6, tmp7);
1384 __lasx_xvstelm_d(tmp0, dst, 0, 0);
1385 dst += dstStride;
1386 __lasx_xvstelm_d(tmp0, dst, 0, 2);
1387 dst += dstStride;
1388 __lasx_xvstelm_d(tmp2, dst, 0, 0);
1389 dst += dstStride;
1390 __lasx_xvstelm_d(tmp2, dst, 0, 2);
1391 dst += dstStride;
1392 __lasx_xvstelm_d(tmp4, dst, 0, 0);
1393 dst += dstStride;
1394 __lasx_xvstelm_d(tmp4, dst, 0, 2);
1395 dst += dstStride;
1396 __lasx_xvstelm_d(tmp6, dst, 0, 0);
1397 dst += dstStride;
1398 __lasx_xvstelm_d(tmp6, dst, 0, 2);
1399 }
1400
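/* The 16-wide lowpass helpers below simply tile the corresponding 8-wide
 * kernels: left and right 8-pixel halves first, then the same pair again
 * eight rows further down. */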
static av_always_inline void
put_h264_qpel16_h_lowpass_lasx(uint8_t *dst, const uint8_t *src,
                               int dstStride, int srcStride)
{
    put_h264_qpel8_h_lowpass_lasx(dst, src, dstStride, srcStride);
    put_h264_qpel8_h_lowpass_lasx(dst+8, src+8, dstStride, srcStride);
    src += srcStride << 3;
    dst += dstStride << 3;
    put_h264_qpel8_h_lowpass_lasx(dst, src, dstStride, srcStride);
    put_h264_qpel8_h_lowpass_lasx(dst+8, src+8, dstStride, srcStride);
}

static av_always_inline void
avg_h264_qpel16_h_lowpass_lasx(uint8_t *dst, const uint8_t *src,
                               int dstStride, int srcStride)
{
    avg_h264_qpel8_h_lowpass_lasx(dst, src, dstStride, srcStride);
    avg_h264_qpel8_h_lowpass_lasx(dst+8, src+8, dstStride, srcStride);
    src += srcStride << 3;
    dst += dstStride << 3;
    avg_h264_qpel8_h_lowpass_lasx(dst, src, dstStride, srcStride);
    avg_h264_qpel8_h_lowpass_lasx(dst+8, src+8, dstStride, srcStride);
}

static void put_h264_qpel16_v_lowpass_lasx(uint8_t *dst, const uint8_t *src,
                                           int dstStride, int srcStride)
{
    put_h264_qpel8_v_lowpass_lasx(dst, (uint8_t*)src, dstStride, srcStride);
    put_h264_qpel8_v_lowpass_lasx(dst+8, (uint8_t*)src+8, dstStride, srcStride);
    src += 8*srcStride;
    dst += 8*dstStride;
    put_h264_qpel8_v_lowpass_lasx(dst, (uint8_t*)src, dstStride, srcStride);
    put_h264_qpel8_v_lowpass_lasx(dst+8, (uint8_t*)src+8, dstStride, srcStride);
}

static void avg_h264_qpel16_v_lowpass_lasx(uint8_t *dst, const uint8_t *src,
                                           int dstStride, int srcStride)
{
    avg_h264_qpel8_v_lowpass_lasx(dst, (uint8_t*)src, dstStride, srcStride);
    avg_h264_qpel8_v_lowpass_lasx(dst+8, (uint8_t*)src+8, dstStride, srcStride);
    src += 8*srcStride;
    dst += 8*dstStride;
    avg_h264_qpel8_v_lowpass_lasx(dst, (uint8_t*)src, dstStride, srcStride);
    avg_h264_qpel8_v_lowpass_lasx(dst+8, (uint8_t*)src+8, dstStride, srcStride);
}

static void put_h264_qpel16_hv_lowpass_lasx(uint8_t *dst, const uint8_t *src,
                                            ptrdiff_t dstStride, ptrdiff_t srcStride)
{
    put_h264_qpel8_hv_lowpass_lasx(dst, src, dstStride, srcStride);
    put_h264_qpel8_hv_lowpass_lasx(dst + 8, src + 8, dstStride, srcStride);
    src += srcStride << 3;
    dst += dstStride << 3;
    put_h264_qpel8_hv_lowpass_lasx(dst, src, dstStride, srcStride);
    put_h264_qpel8_hv_lowpass_lasx(dst + 8, src + 8, dstStride, srcStride);
}

static void avg_h264_qpel16_hv_lowpass_lasx(uint8_t *dst, const uint8_t *src,
                                            ptrdiff_t dstStride, ptrdiff_t srcStride)
{
    avg_h264_qpel8_hv_lowpass_lasx(dst, src, dstStride, srcStride);
    avg_h264_qpel8_hv_lowpass_lasx(dst + 8, src + 8, dstStride, srcStride);
    src += srcStride << 3;
    dst += dstStride << 3;
    avg_h264_qpel8_hv_lowpass_lasx(dst, src, dstStride, srcStride);
    avg_h264_qpel8_hv_lowpass_lasx(dst + 8, src + 8, dstStride, srcStride);
}

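/* Exported qpel entry points. In ff_{put,avg}_h264_qpel{8,16}_mcXY, X/4
 * and Y/4 are the horizontal and vertical quarter-pel offsets; mc00 is the
 * integer-pel copy. put_ stores the prediction, avg_ averages it with the
 * pixels already in dst. */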
void ff_put_h264_qpel8_mc00_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    /* The MMI optimization uses ff_put_pixels8_8_mmi, which is
     * implemented in hpeldsp_mmi.c. */
    put_pixels8_8_inline_asm(dst, src, stride);
}

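/* mc10/mc30: horizontal quarter-pel samples, formed by averaging the
 * horizontal half-pel result with the nearest integer-pel column
 * (src for mc10, src + 1 for mc30). */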
void ff_put_h264_qpel8_mc10_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint8_t half[64];

    put_h264_qpel8_h_lowpass_lasx(half, src, 8, stride);
    /* For qpel8, both the stride of the half buffer and the block height are 8. */
    put_pixels8_l2_8_lsx(dst, src, half, stride, stride);
}

void ff_put_h264_qpel8_mc20_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    put_h264_qpel8_h_lowpass_lasx(dst, src, stride, stride);
}

void ff_put_h264_qpel8_mc30_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint8_t half[64];

    put_h264_qpel8_h_lowpass_lasx(half, src, 8, stride);
    put_pixels8_l2_8_lsx(dst, src+1, half, stride, stride);
}

void ff_put_h264_qpel8_mc01_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint8_t half[64];

    put_h264_qpel8_v_lowpass_lasx(half, (uint8_t*)src, 8, stride);
    put_pixels8_l2_8_lsx(dst, src, half, stride, stride);
}

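/* mc11/mc13/mc31/mc33: diagonal quarter-pel positions, formed by averaging
 * a horizontal half-pel plane with a vertical half-pel plane; the +1 and
 * +stride source offsets select which of the four diagonals is produced. */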
void ff_put_h264_qpel8_mc11_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint8_t halfH[64];
    uint8_t halfV[64];

    put_h264_qpel8_h_lowpass_lasx(halfH, src, 8, stride);
    put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t*)src, 8, stride);
    put_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
}

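/* mc21/mc23 and mc12/mc32: averages of a horizontal or vertical half-pel
 * plane with the centre (HV) half-pel plane; a 128-byte scratch buffer
 * holds the two intermediate 8x8 planes. */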
void ff_put_h264_qpel8_mc21_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint8_t temp[128];
    uint8_t *const halfH = temp;
    uint8_t *const halfHV = temp + 64;

    put_h264_qpel8_h_lowpass_lasx(halfH, src, 8, stride);
    put_h264_qpel8_hv_lowpass_lasx(halfHV, src, 8, stride);
    put_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
}

void ff_put_h264_qpel8_mc31_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint8_t halfH[64];
    uint8_t halfV[64];

    put_h264_qpel8_h_lowpass_lasx(halfH, src, 8, stride);
    put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t*)src + 1, 8, stride);
    put_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
}

void ff_put_h264_qpel8_mc02_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    put_h264_qpel8_v_lowpass_lasx(dst, (uint8_t*)src, stride, stride);
}

void ff_put_h264_qpel8_mc12_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint8_t temp[128];
    uint8_t *const halfHV = temp;
    uint8_t *const halfH = temp + 64;

    put_h264_qpel8_hv_lowpass_lasx(halfHV, src, 8, stride);
    put_h264_qpel8_v_lowpass_lasx(halfH, (uint8_t*)src, 8, stride);
    put_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
}

void ff_put_h264_qpel8_mc22_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    put_h264_qpel8_hv_lowpass_lasx(dst, src, stride, stride);
}

void ff_put_h264_qpel8_mc32_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint8_t temp[128];
    uint8_t *const halfHV = temp;
    uint8_t *const halfH = temp + 64;

    put_h264_qpel8_hv_lowpass_lasx(halfHV, src, 8, stride);
    put_h264_qpel8_v_lowpass_lasx(halfH, (uint8_t*)src + 1, 8, stride);
    put_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
}

void ff_put_h264_qpel8_mc03_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint8_t half[64];

    put_h264_qpel8_v_lowpass_lasx(half, (uint8_t*)src, 8, stride);
    put_pixels8_l2_8_lsx(dst, src + stride, half, stride, stride);
}

void ff_put_h264_qpel8_mc13_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint8_t halfH[64];
    uint8_t halfV[64];

    put_h264_qpel8_h_lowpass_lasx(halfH, src + stride, 8, stride);
    put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t*)src, 8, stride);
    put_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
}

void ff_put_h264_qpel8_mc23_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint8_t temp[128];
    uint8_t *const halfH = temp;
    uint8_t *const halfHV = temp + 64;

    put_h264_qpel8_h_lowpass_lasx(halfH, src + stride, 8, stride);
    put_h264_qpel8_hv_lowpass_lasx(halfHV, src, 8, stride);
    put_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
}

void ff_put_h264_qpel8_mc33_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint8_t halfH[64];
    uint8_t halfV[64];

    put_h264_qpel8_h_lowpass_lasx(halfH, src + stride, 8, stride);
    put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t*)src + 1, 8, stride);
    put_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
}

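/* The ff_avg_h264_qpel8_* entry points mirror the put_ versions above:
 * the interpolation is identical, but the final step averages the
 * prediction with the contents of dst. */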
void ff_avg_h264_qpel8_mc00_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    /* The MMI optimization uses ff_avg_pixels8_8_mmi, which is
     * implemented in hpeldsp_mmi.c. */
    avg_pixels8_8_lsx(dst, src, stride);
}

void ff_avg_h264_qpel8_mc10_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint8_t half[64];

    put_h264_qpel8_h_lowpass_lasx(half, src, 8, stride);
    avg_pixels8_l2_8_lsx(dst, src, half, stride, stride);
}

void ff_avg_h264_qpel8_mc20_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avg_h264_qpel8_h_lowpass_lasx(dst, src, stride, stride);
}

void ff_avg_h264_qpel8_mc30_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint8_t half[64];

    put_h264_qpel8_h_lowpass_lasx(half, src, 8, stride);
    avg_pixels8_l2_8_lsx(dst, src+1, half, stride, stride);
}

void ff_avg_h264_qpel8_mc11_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint8_t halfH[64];
    uint8_t halfV[64];

    put_h264_qpel8_h_lowpass_lasx(halfH, src, 8, stride);
    put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t*)src, 8, stride);
    avg_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
}

void ff_avg_h264_qpel8_mc21_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint8_t temp[128];
    uint8_t *const halfH = temp;
    uint8_t *const halfHV = temp + 64;

    put_h264_qpel8_h_lowpass_lasx(halfH, src, 8, stride);
    put_h264_qpel8_hv_lowpass_lasx(halfHV, src, 8, stride);
    avg_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
}

void ff_avg_h264_qpel8_mc31_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint8_t halfH[64];
    uint8_t halfV[64];

    put_h264_qpel8_h_lowpass_lasx(halfH, src, 8, stride);
    put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t*)src + 1, 8, stride);
    avg_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
}

void ff_avg_h264_qpel8_mc02_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avg_h264_qpel8_v_lowpass_lasx(dst, (uint8_t*)src, stride, stride);
}

void ff_avg_h264_qpel8_mc12_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint8_t temp[128];
    uint8_t *const halfHV = temp;
    uint8_t *const halfH = temp + 64;

    put_h264_qpel8_hv_lowpass_lasx(halfHV, src, 8, stride);
    put_h264_qpel8_v_lowpass_lasx(halfH, (uint8_t*)src, 8, stride);
    avg_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
}

void ff_avg_h264_qpel8_mc22_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avg_h264_qpel8_hv_lowpass_lasx(dst, src, stride, stride);
}

void ff_avg_h264_qpel8_mc32_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint8_t temp[128];
    uint8_t *const halfHV = temp;
    uint8_t *const halfH = temp + 64;

    put_h264_qpel8_hv_lowpass_lasx(halfHV, src, 8, stride);
    put_h264_qpel8_v_lowpass_lasx(halfH, (uint8_t*)src + 1, 8, stride);
    avg_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
}

void ff_avg_h264_qpel8_mc13_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint8_t halfH[64];
    uint8_t halfV[64];

    put_h264_qpel8_h_lowpass_lasx(halfH, src + stride, 8, stride);
    put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t*)src, 8, stride);
    avg_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
}

void ff_avg_h264_qpel8_mc23_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint8_t temp[128];
    uint8_t *const halfH = temp;
    uint8_t *const halfHV = temp + 64;

    put_h264_qpel8_h_lowpass_lasx(halfH, src + stride, 8, stride);
    put_h264_qpel8_hv_lowpass_lasx(halfHV, src, 8, stride);
    avg_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
}

void ff_avg_h264_qpel8_mc33_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint8_t halfH[64];
    uint8_t halfV[64];

    put_h264_qpel8_h_lowpass_lasx(halfH, src + stride, 8, stride);
    put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t*)src + 1, 8, stride);
    avg_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
}

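/* The 16x16 entry points follow the same pattern as the 8x8 ones, using
 * 256-byte intermediate buffers and the 16-wide lowpass helpers defined
 * above. */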
void ff_put_h264_qpel16_mc00_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    /* The MMI optimization uses ff_put_pixels16_8_mmi, which is
     * implemented in hpeldsp_mmi.c. */
    put_pixels16_8_lsx(dst, src, stride);
}

void ff_put_h264_qpel16_mc10_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    uint8_t half[256];

    put_h264_qpel16_h_lowpass_lasx(half, src, 16, stride);
    put_pixels16_l2_8_lsx(dst, src, half, stride, stride);
}

void ff_put_h264_qpel16_mc20_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    put_h264_qpel16_h_lowpass_lasx(dst, src, stride, stride);
}

void ff_put_h264_qpel16_mc30_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    uint8_t half[256];

    put_h264_qpel16_h_lowpass_lasx(half, src, 16, stride);
    put_pixels16_l2_8_lsx(dst, src+1, half, stride, stride);
}

void ff_put_h264_qpel16_mc01_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    uint8_t half[256];

    put_h264_qpel16_v_lowpass_lasx(half, src, 16, stride);
    put_pixels16_l2_8_lsx(dst, src, half, stride, stride);
}

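/* For the diagonal 16x16 put cases (mc11/mc31/mc13/mc33) the fused helper
 * avc_luma_hv_qrt_16x16_lasx filters horizontally and vertically in a
 * single pass and averages the two planes internally; src - 2 and
 * src - 2 * stride position the 6-tap filter windows, while the +1 and
 * +stride offsets pick the quarter-pel diagonal. */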
void ff_put_h264_qpel16_mc11_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    avc_luma_hv_qrt_16x16_lasx((uint8_t*)src - 2, (uint8_t*)src - (stride * 2),
                               dst, stride);
}

void ff_put_h264_qpel16_mc21_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    uint8_t temp[512];
    uint8_t *const halfH = temp;
    uint8_t *const halfHV = temp + 256;

    put_h264_qpel16_h_lowpass_lasx(halfH, src, 16, stride);
    put_h264_qpel16_hv_lowpass_lasx(halfHV, src, 16, stride);
    put_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
}

void ff_put_h264_qpel16_mc31_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    avc_luma_hv_qrt_16x16_lasx((uint8_t*)src - 2, (uint8_t*)src - (stride * 2) + 1,
                               dst, stride);
}

void ff_put_h264_qpel16_mc02_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    put_h264_qpel16_v_lowpass_lasx(dst, src, stride, stride);
}

void ff_put_h264_qpel16_mc12_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    uint8_t temp[512];
    uint8_t *const halfHV = temp;
    uint8_t *const halfH = temp + 256;

    put_h264_qpel16_hv_lowpass_lasx(halfHV, src, 16, stride);
    put_h264_qpel16_v_lowpass_lasx(halfH, src, 16, stride);
    put_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
}

void ff_put_h264_qpel16_mc22_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    put_h264_qpel16_hv_lowpass_lasx(dst, src, stride, stride);
}

void ff_put_h264_qpel16_mc32_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    uint8_t temp[512];
    uint8_t *const halfHV = temp;
    uint8_t *const halfH = temp + 256;

    put_h264_qpel16_hv_lowpass_lasx(halfHV, src, 16, stride);
    put_h264_qpel16_v_lowpass_lasx(halfH, src + 1, 16, stride);
    put_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
}

void ff_put_h264_qpel16_mc03_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    uint8_t half[256];

    put_h264_qpel16_v_lowpass_lasx(half, src, 16, stride);
    put_pixels16_l2_8_lsx(dst, src+stride, half, stride, stride);
}

void ff_put_h264_qpel16_mc13_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    avc_luma_hv_qrt_16x16_lasx((uint8_t*)src + stride - 2, (uint8_t*)src - (stride * 2),
                               dst, stride);
}

void ff_put_h264_qpel16_mc23_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    uint8_t temp[512];
    uint8_t *const halfH = temp;
    uint8_t *const halfHV = temp + 256;

    put_h264_qpel16_h_lowpass_lasx(halfH, src + stride, 16, stride);
    put_h264_qpel16_hv_lowpass_lasx(halfHV, src, 16, stride);
    put_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
}

void ff_put_h264_qpel16_mc33_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    avc_luma_hv_qrt_16x16_lasx((uint8_t*)src + stride - 2,
                               (uint8_t*)src - (stride * 2) + 1, dst, stride);
}

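/* The ff_avg_h264_qpel16_* entry points repeat the layout above with
 * averaging into dst; the diagonal cases use
 * avc_luma_hv_qrt_and_aver_dst_16x16_lasx, which also folds the average
 * with dst into the same pass. */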
void ff_avg_h264_qpel16_mc00_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    /* The MMI optimization uses ff_avg_pixels16_8_mmi, which is
     * implemented in hpeldsp_mmi.c. */
    avg_pixels16_8_lsx(dst, src, stride);
}

void ff_avg_h264_qpel16_mc10_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    uint8_t half[256];

    put_h264_qpel16_h_lowpass_lasx(half, src, 16, stride);
    avg_pixels16_l2_8_lsx(dst, src, half, stride, stride);
}

void ff_avg_h264_qpel16_mc20_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    avg_h264_qpel16_h_lowpass_lasx(dst, src, stride, stride);
}

void ff_avg_h264_qpel16_mc30_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    uint8_t half[256];

    put_h264_qpel16_h_lowpass_lasx(half, src, 16, stride);
    avg_pixels16_l2_8_lsx(dst, src+1, half, stride, stride);
}

void ff_avg_h264_qpel16_mc01_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    uint8_t half[256];

    put_h264_qpel16_v_lowpass_lasx(half, src, 16, stride);
    avg_pixels16_l2_8_lsx(dst, src, half, stride, stride);
}

void ff_avg_h264_qpel16_mc11_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    avc_luma_hv_qrt_and_aver_dst_16x16_lasx((uint8_t*)src - 2,
                                            (uint8_t*)src - (stride * 2),
                                            dst, stride);
}

void ff_avg_h264_qpel16_mc21_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    uint8_t temp[512];
    uint8_t *const halfH = temp;
    uint8_t *const halfHV = temp + 256;

    put_h264_qpel16_h_lowpass_lasx(halfH, src, 16, stride);
    put_h264_qpel16_hv_lowpass_lasx(halfHV, src, 16, stride);
    avg_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
}

void ff_avg_h264_qpel16_mc31_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    avc_luma_hv_qrt_and_aver_dst_16x16_lasx((uint8_t*)src - 2,
                                            (uint8_t*)src - (stride * 2) + 1,
                                            dst, stride);
}

void ff_avg_h264_qpel16_mc02_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    avg_h264_qpel16_v_lowpass_lasx(dst, src, stride, stride);
}

void ff_avg_h264_qpel16_mc12_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    uint8_t temp[512];
    uint8_t *const halfHV = temp;
    uint8_t *const halfH = temp + 256;

    put_h264_qpel16_hv_lowpass_lasx(halfHV, src, 16, stride);
    put_h264_qpel16_v_lowpass_lasx(halfH, src, 16, stride);
    avg_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
}

void ff_avg_h264_qpel16_mc22_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    avg_h264_qpel16_hv_lowpass_lasx(dst, src, stride, stride);
}

void ff_avg_h264_qpel16_mc32_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    uint8_t temp[512];
    uint8_t *const halfHV = temp;
    uint8_t *const halfH = temp + 256;

    put_h264_qpel16_hv_lowpass_lasx(halfHV, src, 16, stride);
    put_h264_qpel16_v_lowpass_lasx(halfH, src + 1, 16, stride);
    avg_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
}

void ff_avg_h264_qpel16_mc03_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    uint8_t half[256];

    put_h264_qpel16_v_lowpass_lasx(half, src, 16, stride);
    avg_pixels16_l2_8_lsx(dst, src + stride, half, stride, stride);
}

void ff_avg_h264_qpel16_mc13_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    avc_luma_hv_qrt_and_aver_dst_16x16_lasx((uint8_t*)src + stride - 2,
                                            (uint8_t*)src - (stride * 2),
                                            dst, stride);
}

void ff_avg_h264_qpel16_mc23_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    uint8_t temp[512];
    uint8_t *const halfH = temp;
    uint8_t *const halfHV = temp + 256;

    put_h264_qpel16_h_lowpass_lasx(halfH, src + stride, 16, stride);
    put_h264_qpel16_hv_lowpass_lasx(halfHV, src, 16, stride);
    avg_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
}

void ff_avg_h264_qpel16_mc33_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    avc_luma_hv_qrt_and_aver_dst_16x16_lasx((uint8_t*)src + stride - 2,
                                            (uint8_t*)src - (stride * 2) + 1,
                                            dst, stride);
}