/*
 * Loongson LASX optimized h264qpel
 *
 * Copyright (c) 2020 Loongson Technology Corporation Limited
 * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
23cabdff1aSopenharmony_ci
24cabdff1aSopenharmony_ci#include "h264qpel_lasx.h"
25cabdff1aSopenharmony_ci#include "libavutil/loongarch/loongson_intrinsics.h"
26cabdff1aSopenharmony_ci#include "libavutil/attributes.h"
27cabdff1aSopenharmony_ci
/* Byte-shuffle index tables for the horizontal 6-tap luma filter.
 * The table holds three 32-byte masks (loaded at byte offsets 0, 32 and 64);
 * each mask repeats the same 16 indices for both 128-bit halves. */
static const uint8_t luma_mask_arr[96] __attribute__((aligned(64))) = {
    /* 8 width cases */

    /* mask at offset 0: byte pairs (i, i + 5) */
    0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12,
    0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12,

    /* mask at offset 32: byte pairs (i + 1, i + 4) */
    1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11,
    1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11,

    /* mask at offset 64: byte pairs (i + 2, i + 3) */
    2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
    2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
};
37cabdff1aSopenharmony_ci
/* Horizontal filter step: gathers three sets of byte pairs from in0/in1
 * with mask0..mask2, sums the first set as halfwords, then accumulates the
 * second set weighted by minus5b and the third weighted by plus20b.
 * NOTE: minus5b and plus20b are not parameters; they must be __m256i
 * variables visible at the point of expansion. Evaluates to the __m256i
 * halfword accumulator. */
#define AVC_HORZ_FILTER_SH(in0, in1, mask0, mask1, mask2)            \
( {                                                                  \
    __m256i outer_m = __lasx_xvshuf_b((in1), (in0), (mask0));        \
    __m256i inner_m = __lasx_xvshuf_b((in1), (in0), (mask1));        \
    __m256i centre_m = __lasx_xvshuf_b((in1), (in0), (mask2));       \
    __m256i acc_m;                                                   \
                                                                     \
    acc_m = __lasx_xvhaddw_h_b(outer_m, outer_m);                    \
    acc_m = __lasx_xvdp2add_h_b(acc_m, minus5b, inner_m);            \
    acc_m = __lasx_xvdp2add_h_b(acc_m, plus20b, centre_m);           \
                                                                     \
    acc_m;                                                           \
} )
52cabdff1aSopenharmony_ci
/* Three-term byte dot product accumulated into halfwords:
 * dp2(in0, coeff0) + dp2(in1, coeff1) + dp2(in2, coeff2).
 * Evaluates to the resulting __m256i. */
#define AVC_DOT_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2)        \
( {                                                                  \
    __m256i dot_m = __lasx_xvdp2_h_b((in0), (coeff0));               \
                                                                     \
    dot_m = __lasx_xvdp2add_h_b(dot_m, (in1), (coeff1));             \
    dot_m = __lasx_xvdp2add_h_b(dot_m, (in2), (coeff2));             \
                                                                     \
    dot_m;                                                           \
} )
63cabdff1aSopenharmony_ci
64cabdff1aSopenharmony_cistatic av_always_inline
65cabdff1aSopenharmony_civoid avc_luma_hv_qrt_and_aver_dst_16x16_lasx(uint8_t *src_x,
66cabdff1aSopenharmony_ci                                             uint8_t *src_y,
67cabdff1aSopenharmony_ci                                             uint8_t *dst, ptrdiff_t stride)
68cabdff1aSopenharmony_ci{
69cabdff1aSopenharmony_ci    const int16_t filt_const0 = 0xfb01;
70cabdff1aSopenharmony_ci    const int16_t filt_const1 = 0x1414;
71cabdff1aSopenharmony_ci    const int16_t filt_const2 = 0x1fb;
72cabdff1aSopenharmony_ci    uint32_t loop_cnt;
73cabdff1aSopenharmony_ci    ptrdiff_t stride_2x = stride << 1;
74cabdff1aSopenharmony_ci    ptrdiff_t stride_3x = stride_2x + stride;
75cabdff1aSopenharmony_ci    ptrdiff_t stride_4x = stride << 2;
76cabdff1aSopenharmony_ci    __m256i tmp0, tmp1;
77cabdff1aSopenharmony_ci    __m256i src_hz0, src_hz1, src_hz2, src_hz3, mask0, mask1, mask2;
78cabdff1aSopenharmony_ci    __m256i src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
79cabdff1aSopenharmony_ci    __m256i src_vt7, src_vt8;
80cabdff1aSopenharmony_ci    __m256i src_vt10_h, src_vt21_h, src_vt32_h, src_vt43_h, src_vt54_h;
81cabdff1aSopenharmony_ci    __m256i src_vt65_h, src_vt76_h, src_vt87_h, filt0, filt1, filt2;
82cabdff1aSopenharmony_ci    __m256i hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2;
83cabdff1aSopenharmony_ci    __m256i vt_out3, out0, out1, out2, out3;
84cabdff1aSopenharmony_ci    __m256i minus5b = __lasx_xvldi(0xFB);
85cabdff1aSopenharmony_ci    __m256i plus20b = __lasx_xvldi(20);
86cabdff1aSopenharmony_ci
87cabdff1aSopenharmony_ci    filt0 = __lasx_xvreplgr2vr_h(filt_const0);
88cabdff1aSopenharmony_ci    filt1 = __lasx_xvreplgr2vr_h(filt_const1);
89cabdff1aSopenharmony_ci    filt2 = __lasx_xvreplgr2vr_h(filt_const2);
90cabdff1aSopenharmony_ci
91cabdff1aSopenharmony_ci    mask0 = __lasx_xvld(luma_mask_arr, 0);
92cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvld, luma_mask_arr, 32, luma_mask_arr, 64, mask1, mask2);
93cabdff1aSopenharmony_ci    src_vt0 = __lasx_xvld(src_y, 0);
94cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, src_y, stride, src_y, stride_2x, src_y, stride_3x,
95cabdff1aSopenharmony_ci              src_y, stride_4x, src_vt1, src_vt2, src_vt3, src_vt4);
96cabdff1aSopenharmony_ci    src_y += stride_4x;
97cabdff1aSopenharmony_ci
98cabdff1aSopenharmony_ci    src_vt0 = __lasx_xvxori_b(src_vt0, 128);
99cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvxori_b, src_vt1, 128, src_vt2, 128, src_vt3, 128,
100cabdff1aSopenharmony_ci              src_vt4, 128, src_vt1, src_vt2, src_vt3, src_vt4);
101cabdff1aSopenharmony_ci
102cabdff1aSopenharmony_ci    for (loop_cnt = 4; loop_cnt--;) {
103cabdff1aSopenharmony_ci        src_hz0 = __lasx_xvld(src_x, 0);
104cabdff1aSopenharmony_ci        DUP2_ARG2(__lasx_xvldx, src_x, stride, src_x, stride_2x,
105cabdff1aSopenharmony_ci                  src_hz1, src_hz2);
106cabdff1aSopenharmony_ci        src_hz3 = __lasx_xvldx(src_x, stride_3x);
107cabdff1aSopenharmony_ci        src_x  += stride_4x;
108cabdff1aSopenharmony_ci        src_hz0 = __lasx_xvpermi_d(src_hz0, 0x94);
109cabdff1aSopenharmony_ci        src_hz1 = __lasx_xvpermi_d(src_hz1, 0x94);
110cabdff1aSopenharmony_ci        src_hz2 = __lasx_xvpermi_d(src_hz2, 0x94);
111cabdff1aSopenharmony_ci        src_hz3 = __lasx_xvpermi_d(src_hz3, 0x94);
112cabdff1aSopenharmony_ci        DUP4_ARG2(__lasx_xvxori_b, src_hz0, 128, src_hz1, 128, src_hz2, 128,
113cabdff1aSopenharmony_ci                  src_hz3, 128, src_hz0, src_hz1, src_hz2, src_hz3);
114cabdff1aSopenharmony_ci
115cabdff1aSopenharmony_ci        hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
116cabdff1aSopenharmony_ci        hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
117cabdff1aSopenharmony_ci        hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
118cabdff1aSopenharmony_ci        hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);
119cabdff1aSopenharmony_ci        hz_out0 = __lasx_xvssrarni_b_h(hz_out1, hz_out0, 5);
120cabdff1aSopenharmony_ci        hz_out2 = __lasx_xvssrarni_b_h(hz_out3, hz_out2, 5);
121cabdff1aSopenharmony_ci
122cabdff1aSopenharmony_ci        DUP4_ARG2(__lasx_xvldx, src_y, stride, src_y, stride_2x,
123cabdff1aSopenharmony_ci                  src_y, stride_3x, src_y, stride_4x,
124cabdff1aSopenharmony_ci                  src_vt5, src_vt6, src_vt7, src_vt8);
125cabdff1aSopenharmony_ci        src_y += stride_4x;
126cabdff1aSopenharmony_ci
127cabdff1aSopenharmony_ci        DUP4_ARG2(__lasx_xvxori_b, src_vt5, 128, src_vt6, 128, src_vt7, 128,
128cabdff1aSopenharmony_ci                  src_vt8, 128, src_vt5, src_vt6, src_vt7, src_vt8);
129cabdff1aSopenharmony_ci
130cabdff1aSopenharmony_ci        DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_vt4, 0x02, src_vt1, src_vt5,
131cabdff1aSopenharmony_ci                  0x02, src_vt2, src_vt6, 0x02, src_vt3, src_vt7, 0x02,
132cabdff1aSopenharmony_ci                  src_vt0, src_vt1, src_vt2, src_vt3);
133cabdff1aSopenharmony_ci        src_vt87_h = __lasx_xvpermi_q(src_vt4, src_vt8, 0x02);
134cabdff1aSopenharmony_ci        DUP4_ARG2(__lasx_xvilvh_b, src_vt1, src_vt0, src_vt2, src_vt1,
135cabdff1aSopenharmony_ci                  src_vt3, src_vt2, src_vt87_h, src_vt3,
136cabdff1aSopenharmony_ci                  src_hz0, src_hz1, src_hz2, src_hz3);
137cabdff1aSopenharmony_ci        DUP4_ARG2(__lasx_xvilvl_b, src_vt1, src_vt0, src_vt2, src_vt1,
138cabdff1aSopenharmony_ci                  src_vt3, src_vt2, src_vt87_h, src_vt3,
139cabdff1aSopenharmony_ci                  src_vt0, src_vt1, src_vt2, src_vt3);
140cabdff1aSopenharmony_ci        DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_hz0, 0x02, src_vt1, src_hz1,
141cabdff1aSopenharmony_ci                  0x02, src_vt2, src_hz2, 0x02, src_vt3, src_hz3, 0x02,
142cabdff1aSopenharmony_ci                  src_vt10_h, src_vt21_h, src_vt32_h, src_vt43_h);
143cabdff1aSopenharmony_ci        DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_hz0, 0x13, src_vt1, src_hz1,
144cabdff1aSopenharmony_ci                  0x13, src_vt2, src_hz2, 0x13, src_vt3, src_hz3, 0x13,
145cabdff1aSopenharmony_ci                  src_vt54_h, src_vt65_h, src_vt76_h, src_vt87_h);
146cabdff1aSopenharmony_ci        vt_out0 = AVC_DOT_SH3_SH(src_vt10_h, src_vt32_h, src_vt54_h, filt0,
147cabdff1aSopenharmony_ci                                 filt1, filt2);
148cabdff1aSopenharmony_ci        vt_out1 = AVC_DOT_SH3_SH(src_vt21_h, src_vt43_h, src_vt65_h, filt0,
149cabdff1aSopenharmony_ci                                 filt1, filt2);
150cabdff1aSopenharmony_ci        vt_out2 = AVC_DOT_SH3_SH(src_vt32_h, src_vt54_h, src_vt76_h, filt0,
151cabdff1aSopenharmony_ci                                 filt1, filt2);
152cabdff1aSopenharmony_ci        vt_out3 = AVC_DOT_SH3_SH(src_vt43_h, src_vt65_h, src_vt87_h, filt0,
153cabdff1aSopenharmony_ci                                 filt1, filt2);
154cabdff1aSopenharmony_ci        vt_out0 = __lasx_xvssrarni_b_h(vt_out1, vt_out0, 5);
155cabdff1aSopenharmony_ci        vt_out2 = __lasx_xvssrarni_b_h(vt_out3, vt_out2, 5);
156cabdff1aSopenharmony_ci
157cabdff1aSopenharmony_ci        DUP2_ARG2(__lasx_xvaddwl_h_b, hz_out0, vt_out0, hz_out2, vt_out2,
158cabdff1aSopenharmony_ci                  out0, out2);
159cabdff1aSopenharmony_ci        DUP2_ARG2(__lasx_xvaddwh_h_b, hz_out0, vt_out0, hz_out2, vt_out2,
160cabdff1aSopenharmony_ci                  out1, out3);
161cabdff1aSopenharmony_ci        tmp0 = __lasx_xvssrarni_b_h(out1, out0, 1);
162cabdff1aSopenharmony_ci        tmp1 = __lasx_xvssrarni_b_h(out3, out2, 1);
163cabdff1aSopenharmony_ci
164cabdff1aSopenharmony_ci        DUP2_ARG2(__lasx_xvxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
165cabdff1aSopenharmony_ci        out0 = __lasx_xvld(dst, 0);
166cabdff1aSopenharmony_ci        DUP2_ARG2(__lasx_xvldx, dst, stride, dst, stride_2x, out1, out2);
167cabdff1aSopenharmony_ci        out3 = __lasx_xvldx(dst, stride_3x);
168cabdff1aSopenharmony_ci        out0 = __lasx_xvpermi_q(out0, out2, 0x02);
169cabdff1aSopenharmony_ci        out1 = __lasx_xvpermi_q(out1, out3, 0x02);
170cabdff1aSopenharmony_ci        out2 = __lasx_xvilvl_d(out1, out0);
171cabdff1aSopenharmony_ci        out3 = __lasx_xvilvh_d(out1, out0);
172cabdff1aSopenharmony_ci        out0 = __lasx_xvpermi_q(out2, out3, 0x02);
173cabdff1aSopenharmony_ci        out1 = __lasx_xvpermi_q(out2, out3, 0x13);
174cabdff1aSopenharmony_ci        tmp0 = __lasx_xvavgr_bu(out0, tmp0);
175cabdff1aSopenharmony_ci        tmp1 = __lasx_xvavgr_bu(out1, tmp1);
176cabdff1aSopenharmony_ci
177cabdff1aSopenharmony_ci        __lasx_xvstelm_d(tmp0, dst, 0, 0);
178cabdff1aSopenharmony_ci        __lasx_xvstelm_d(tmp0, dst + stride, 0, 1);
179cabdff1aSopenharmony_ci        __lasx_xvstelm_d(tmp1, dst + stride_2x, 0, 0);
180cabdff1aSopenharmony_ci        __lasx_xvstelm_d(tmp1, dst + stride_3x, 0, 1);
181cabdff1aSopenharmony_ci
182cabdff1aSopenharmony_ci        __lasx_xvstelm_d(tmp0, dst, 8, 2);
183cabdff1aSopenharmony_ci        __lasx_xvstelm_d(tmp0, dst + stride, 8, 3);
184cabdff1aSopenharmony_ci        __lasx_xvstelm_d(tmp1, dst + stride_2x, 8, 2);
185cabdff1aSopenharmony_ci        __lasx_xvstelm_d(tmp1, dst + stride_3x, 8, 3);
186cabdff1aSopenharmony_ci
187cabdff1aSopenharmony_ci        dst    += stride_4x;
188cabdff1aSopenharmony_ci        src_vt0 = src_vt4;
189cabdff1aSopenharmony_ci        src_vt1 = src_vt5;
190cabdff1aSopenharmony_ci        src_vt2 = src_vt6;
191cabdff1aSopenharmony_ci        src_vt3 = src_vt7;
192cabdff1aSopenharmony_ci        src_vt4 = src_vt8;
193cabdff1aSopenharmony_ci    }
194cabdff1aSopenharmony_ci}
195cabdff1aSopenharmony_ci
196cabdff1aSopenharmony_cistatic av_always_inline void
197cabdff1aSopenharmony_ciavc_luma_hv_qrt_16x16_lasx(uint8_t *src_x, uint8_t *src_y,
198cabdff1aSopenharmony_ci                           uint8_t *dst, ptrdiff_t stride)
199cabdff1aSopenharmony_ci{
200cabdff1aSopenharmony_ci    const int16_t filt_const0 = 0xfb01;
201cabdff1aSopenharmony_ci    const int16_t filt_const1 = 0x1414;
202cabdff1aSopenharmony_ci    const int16_t filt_const2 = 0x1fb;
203cabdff1aSopenharmony_ci    uint32_t loop_cnt;
204cabdff1aSopenharmony_ci    ptrdiff_t stride_2x = stride << 1;
205cabdff1aSopenharmony_ci    ptrdiff_t stride_3x = stride_2x + stride;
206cabdff1aSopenharmony_ci    ptrdiff_t stride_4x = stride << 2;
207cabdff1aSopenharmony_ci    __m256i tmp0, tmp1;
208cabdff1aSopenharmony_ci    __m256i src_hz0, src_hz1, src_hz2, src_hz3, mask0, mask1, mask2;
209cabdff1aSopenharmony_ci    __m256i src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
210cabdff1aSopenharmony_ci    __m256i src_vt7, src_vt8;
211cabdff1aSopenharmony_ci    __m256i src_vt10_h, src_vt21_h, src_vt32_h, src_vt43_h, src_vt54_h;
212cabdff1aSopenharmony_ci    __m256i src_vt65_h, src_vt76_h, src_vt87_h, filt0, filt1, filt2;
213cabdff1aSopenharmony_ci    __m256i hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2;
214cabdff1aSopenharmony_ci    __m256i vt_out3, out0, out1, out2, out3;
215cabdff1aSopenharmony_ci    __m256i minus5b = __lasx_xvldi(0xFB);
216cabdff1aSopenharmony_ci    __m256i plus20b = __lasx_xvldi(20);
217cabdff1aSopenharmony_ci
218cabdff1aSopenharmony_ci    filt0 = __lasx_xvreplgr2vr_h(filt_const0);
219cabdff1aSopenharmony_ci    filt1 = __lasx_xvreplgr2vr_h(filt_const1);
220cabdff1aSopenharmony_ci    filt2 = __lasx_xvreplgr2vr_h(filt_const2);
221cabdff1aSopenharmony_ci
222cabdff1aSopenharmony_ci    mask0 = __lasx_xvld(luma_mask_arr, 0);
223cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvld, luma_mask_arr, 32, luma_mask_arr, 64, mask1, mask2);
224cabdff1aSopenharmony_ci    src_vt0 = __lasx_xvld(src_y, 0);
225cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, src_y, stride, src_y, stride_2x, src_y, stride_3x,
226cabdff1aSopenharmony_ci              src_y, stride_4x, src_vt1, src_vt2, src_vt3, src_vt4);
227cabdff1aSopenharmony_ci    src_y += stride_4x;
228cabdff1aSopenharmony_ci
229cabdff1aSopenharmony_ci    src_vt0 = __lasx_xvxori_b(src_vt0, 128);
230cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvxori_b, src_vt1, 128, src_vt2, 128, src_vt3, 128,
231cabdff1aSopenharmony_ci              src_vt4, 128, src_vt1, src_vt2, src_vt3, src_vt4);
232cabdff1aSopenharmony_ci
233cabdff1aSopenharmony_ci    for (loop_cnt = 4; loop_cnt--;) {
234cabdff1aSopenharmony_ci        src_hz0 = __lasx_xvld(src_x, 0);
235cabdff1aSopenharmony_ci        DUP2_ARG2(__lasx_xvldx, src_x, stride, src_x, stride_2x,
236cabdff1aSopenharmony_ci                  src_hz1, src_hz2);
237cabdff1aSopenharmony_ci        src_hz3 = __lasx_xvldx(src_x, stride_3x);
238cabdff1aSopenharmony_ci        src_x  += stride_4x;
239cabdff1aSopenharmony_ci        src_hz0 = __lasx_xvpermi_d(src_hz0, 0x94);
240cabdff1aSopenharmony_ci        src_hz1 = __lasx_xvpermi_d(src_hz1, 0x94);
241cabdff1aSopenharmony_ci        src_hz2 = __lasx_xvpermi_d(src_hz2, 0x94);
242cabdff1aSopenharmony_ci        src_hz3 = __lasx_xvpermi_d(src_hz3, 0x94);
243cabdff1aSopenharmony_ci        DUP4_ARG2(__lasx_xvxori_b, src_hz0, 128, src_hz1, 128, src_hz2, 128,
244cabdff1aSopenharmony_ci                  src_hz3, 128, src_hz0, src_hz1, src_hz2, src_hz3);
245cabdff1aSopenharmony_ci
246cabdff1aSopenharmony_ci        hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
247cabdff1aSopenharmony_ci        hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
248cabdff1aSopenharmony_ci        hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
249cabdff1aSopenharmony_ci        hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);
250cabdff1aSopenharmony_ci        hz_out0 = __lasx_xvssrarni_b_h(hz_out1, hz_out0, 5);
251cabdff1aSopenharmony_ci        hz_out2 = __lasx_xvssrarni_b_h(hz_out3, hz_out2, 5);
252cabdff1aSopenharmony_ci
253cabdff1aSopenharmony_ci        DUP4_ARG2(__lasx_xvldx, src_y, stride, src_y, stride_2x,
254cabdff1aSopenharmony_ci                  src_y, stride_3x, src_y, stride_4x,
255cabdff1aSopenharmony_ci                  src_vt5, src_vt6, src_vt7, src_vt8);
256cabdff1aSopenharmony_ci        src_y += stride_4x;
257cabdff1aSopenharmony_ci
258cabdff1aSopenharmony_ci        DUP4_ARG2(__lasx_xvxori_b, src_vt5, 128, src_vt6, 128, src_vt7, 128,
259cabdff1aSopenharmony_ci                  src_vt8, 128, src_vt5, src_vt6, src_vt7, src_vt8);
260cabdff1aSopenharmony_ci        DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_vt4, 0x02, src_vt1, src_vt5,
261cabdff1aSopenharmony_ci                  0x02, src_vt2, src_vt6, 0x02, src_vt3, src_vt7, 0x02,
262cabdff1aSopenharmony_ci                  src_vt0, src_vt1, src_vt2, src_vt3);
263cabdff1aSopenharmony_ci        src_vt87_h = __lasx_xvpermi_q(src_vt4, src_vt8, 0x02);
264cabdff1aSopenharmony_ci        DUP4_ARG2(__lasx_xvilvh_b, src_vt1, src_vt0, src_vt2, src_vt1,
265cabdff1aSopenharmony_ci                  src_vt3, src_vt2, src_vt87_h, src_vt3,
266cabdff1aSopenharmony_ci                  src_hz0, src_hz1, src_hz2, src_hz3);
267cabdff1aSopenharmony_ci        DUP4_ARG2(__lasx_xvilvl_b, src_vt1, src_vt0, src_vt2, src_vt1,
268cabdff1aSopenharmony_ci                  src_vt3, src_vt2, src_vt87_h, src_vt3,
269cabdff1aSopenharmony_ci                  src_vt0, src_vt1, src_vt2, src_vt3);
270cabdff1aSopenharmony_ci        DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_hz0, 0x02, src_vt1,
271cabdff1aSopenharmony_ci                  src_hz1, 0x02, src_vt2, src_hz2, 0x02, src_vt3, src_hz3,
272cabdff1aSopenharmony_ci                  0x02, src_vt10_h, src_vt21_h, src_vt32_h, src_vt43_h);
273cabdff1aSopenharmony_ci        DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_hz0, 0x13, src_vt1,
274cabdff1aSopenharmony_ci                  src_hz1, 0x13, src_vt2, src_hz2, 0x13, src_vt3, src_hz3,
275cabdff1aSopenharmony_ci                  0x13, src_vt54_h, src_vt65_h, src_vt76_h, src_vt87_h);
276cabdff1aSopenharmony_ci
277cabdff1aSopenharmony_ci        vt_out0 = AVC_DOT_SH3_SH(src_vt10_h, src_vt32_h, src_vt54_h,
278cabdff1aSopenharmony_ci                                 filt0, filt1, filt2);
279cabdff1aSopenharmony_ci        vt_out1 = AVC_DOT_SH3_SH(src_vt21_h, src_vt43_h, src_vt65_h,
280cabdff1aSopenharmony_ci                                 filt0, filt1, filt2);
281cabdff1aSopenharmony_ci        vt_out2 = AVC_DOT_SH3_SH(src_vt32_h, src_vt54_h, src_vt76_h,
282cabdff1aSopenharmony_ci                                 filt0, filt1, filt2);
283cabdff1aSopenharmony_ci        vt_out3 = AVC_DOT_SH3_SH(src_vt43_h, src_vt65_h, src_vt87_h,
284cabdff1aSopenharmony_ci                                 filt0, filt1, filt2);
285cabdff1aSopenharmony_ci        vt_out0 = __lasx_xvssrarni_b_h(vt_out1, vt_out0, 5);
286cabdff1aSopenharmony_ci        vt_out2 = __lasx_xvssrarni_b_h(vt_out3, vt_out2, 5);
287cabdff1aSopenharmony_ci
288cabdff1aSopenharmony_ci        DUP2_ARG2(__lasx_xvaddwl_h_b, hz_out0, vt_out0, hz_out2, vt_out2,
289cabdff1aSopenharmony_ci                  out0, out2);
290cabdff1aSopenharmony_ci        DUP2_ARG2(__lasx_xvaddwh_h_b, hz_out0, vt_out0, hz_out2, vt_out2,
291cabdff1aSopenharmony_ci                  out1, out3);
292cabdff1aSopenharmony_ci        tmp0 = __lasx_xvssrarni_b_h(out1, out0, 1);
293cabdff1aSopenharmony_ci        tmp1 = __lasx_xvssrarni_b_h(out3, out2, 1);
294cabdff1aSopenharmony_ci
295cabdff1aSopenharmony_ci        DUP2_ARG2(__lasx_xvxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
296cabdff1aSopenharmony_ci        __lasx_xvstelm_d(tmp0, dst, 0, 0);
297cabdff1aSopenharmony_ci        __lasx_xvstelm_d(tmp0, dst + stride, 0, 1);
298cabdff1aSopenharmony_ci        __lasx_xvstelm_d(tmp1, dst + stride_2x, 0, 0);
299cabdff1aSopenharmony_ci        __lasx_xvstelm_d(tmp1, dst + stride_3x, 0, 1);
300cabdff1aSopenharmony_ci
301cabdff1aSopenharmony_ci        __lasx_xvstelm_d(tmp0, dst, 8, 2);
302cabdff1aSopenharmony_ci        __lasx_xvstelm_d(tmp0, dst + stride, 8, 3);
303cabdff1aSopenharmony_ci        __lasx_xvstelm_d(tmp1, dst + stride_2x, 8, 2);
304cabdff1aSopenharmony_ci        __lasx_xvstelm_d(tmp1, dst + stride_3x, 8, 3);
305cabdff1aSopenharmony_ci
306cabdff1aSopenharmony_ci        dst    += stride_4x;
307cabdff1aSopenharmony_ci        src_vt0 = src_vt4;
308cabdff1aSopenharmony_ci        src_vt1 = src_vt5;
309cabdff1aSopenharmony_ci        src_vt2 = src_vt6;
310cabdff1aSopenharmony_ci        src_vt3 = src_vt7;
311cabdff1aSopenharmony_ci        src_vt4 = src_vt8;
312cabdff1aSopenharmony_ci    }
313cabdff1aSopenharmony_ci}
314cabdff1aSopenharmony_ci
315cabdff1aSopenharmony_ci/* put_pixels8_8_inline_asm: dst = src */
316cabdff1aSopenharmony_cistatic av_always_inline void
317cabdff1aSopenharmony_ciput_pixels8_8_inline_asm(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
318cabdff1aSopenharmony_ci{
319cabdff1aSopenharmony_ci    uint64_t tmp[8];
320cabdff1aSopenharmony_ci    ptrdiff_t stride_2, stride_3, stride_4;
321cabdff1aSopenharmony_ci    __asm__ volatile (
322cabdff1aSopenharmony_ci    "slli.d     %[stride_2],     %[stride],   1           \n\t"
323cabdff1aSopenharmony_ci    "add.d      %[stride_3],     %[stride_2], %[stride]   \n\t"
324cabdff1aSopenharmony_ci    "slli.d     %[stride_4],     %[stride_2], 1           \n\t"
325cabdff1aSopenharmony_ci    "ld.d       %[tmp0],         %[src],      0x0         \n\t"
326cabdff1aSopenharmony_ci    "ldx.d      %[tmp1],         %[src],      %[stride]   \n\t"
327cabdff1aSopenharmony_ci    "ldx.d      %[tmp2],         %[src],      %[stride_2] \n\t"
328cabdff1aSopenharmony_ci    "ldx.d      %[tmp3],         %[src],      %[stride_3] \n\t"
329cabdff1aSopenharmony_ci    "add.d      %[src],          %[src],      %[stride_4] \n\t"
330cabdff1aSopenharmony_ci    "ld.d       %[tmp4],         %[src],      0x0         \n\t"
331cabdff1aSopenharmony_ci    "ldx.d      %[tmp5],         %[src],      %[stride]   \n\t"
332cabdff1aSopenharmony_ci    "ldx.d      %[tmp6],         %[src],      %[stride_2] \n\t"
333cabdff1aSopenharmony_ci    "ldx.d      %[tmp7],         %[src],      %[stride_3] \n\t"
334cabdff1aSopenharmony_ci
335cabdff1aSopenharmony_ci    "st.d       %[tmp0],         %[dst],      0x0         \n\t"
336cabdff1aSopenharmony_ci    "stx.d      %[tmp1],         %[dst],      %[stride]   \n\t"
337cabdff1aSopenharmony_ci    "stx.d      %[tmp2],         %[dst],      %[stride_2] \n\t"
338cabdff1aSopenharmony_ci    "stx.d      %[tmp3],         %[dst],      %[stride_3] \n\t"
339cabdff1aSopenharmony_ci    "add.d      %[dst],          %[dst],      %[stride_4] \n\t"
340cabdff1aSopenharmony_ci    "st.d       %[tmp4],         %[dst],      0x0         \n\t"
341cabdff1aSopenharmony_ci    "stx.d      %[tmp5],         %[dst],      %[stride]   \n\t"
342cabdff1aSopenharmony_ci    "stx.d      %[tmp6],         %[dst],      %[stride_2] \n\t"
343cabdff1aSopenharmony_ci    "stx.d      %[tmp7],         %[dst],      %[stride_3] \n\t"
344cabdff1aSopenharmony_ci    : [tmp0]"=&r"(tmp[0]),        [tmp1]"=&r"(tmp[1]),
345cabdff1aSopenharmony_ci      [tmp2]"=&r"(tmp[2]),        [tmp3]"=&r"(tmp[3]),
346cabdff1aSopenharmony_ci      [tmp4]"=&r"(tmp[4]),        [tmp5]"=&r"(tmp[5]),
347cabdff1aSopenharmony_ci      [tmp6]"=&r"(tmp[6]),        [tmp7]"=&r"(tmp[7]),
348cabdff1aSopenharmony_ci      [stride_2]"=&r"(stride_2),  [stride_3]"=&r"(stride_3),
349cabdff1aSopenharmony_ci      [stride_4]"=&r"(stride_4),
350cabdff1aSopenharmony_ci      [dst]"+&r"(dst),            [src]"+&r"(src)
351cabdff1aSopenharmony_ci    : [stride]"r"(stride)
352cabdff1aSopenharmony_ci    : "memory"
353cabdff1aSopenharmony_ci    );
354cabdff1aSopenharmony_ci}
355cabdff1aSopenharmony_ci
356cabdff1aSopenharmony_ci/* avg_pixels8_8_lsx   : dst = avg(src, dst)
357cabdff1aSopenharmony_ci * put_pixels8_l2_8_lsx: dst = avg(src, half) , half stride is 8.
358cabdff1aSopenharmony_ci * avg_pixels8_l2_8_lsx: dst = avg(avg(src, half), dst) , half stride is 8.*/
359cabdff1aSopenharmony_cistatic av_always_inline void
360cabdff1aSopenharmony_ciavg_pixels8_8_lsx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
361cabdff1aSopenharmony_ci{
362cabdff1aSopenharmony_ci    uint8_t *tmp = dst;
363cabdff1aSopenharmony_ci    ptrdiff_t stride_2, stride_3, stride_4;
364cabdff1aSopenharmony_ci    __asm__ volatile (
365cabdff1aSopenharmony_ci    /* h0~h7 */
366cabdff1aSopenharmony_ci    "slli.d     %[stride_2],     %[stride],   1           \n\t"
367cabdff1aSopenharmony_ci    "add.d      %[stride_3],     %[stride_2], %[stride]   \n\t"
368cabdff1aSopenharmony_ci    "slli.d     %[stride_4],     %[stride_2], 1           \n\t"
369cabdff1aSopenharmony_ci    "vld        $vr0,            %[src],      0           \n\t"
370cabdff1aSopenharmony_ci    "vldx       $vr1,            %[src],      %[stride]   \n\t"
371cabdff1aSopenharmony_ci    "vldx       $vr2,            %[src],      %[stride_2] \n\t"
372cabdff1aSopenharmony_ci    "vldx       $vr3,            %[src],      %[stride_3] \n\t"
373cabdff1aSopenharmony_ci    "add.d      %[src],          %[src],      %[stride_4] \n\t"
374cabdff1aSopenharmony_ci    "vld        $vr4,            %[src],      0           \n\t"
375cabdff1aSopenharmony_ci    "vldx       $vr5,            %[src],      %[stride]   \n\t"
376cabdff1aSopenharmony_ci    "vldx       $vr6,            %[src],      %[stride_2] \n\t"
377cabdff1aSopenharmony_ci    "vldx       $vr7,            %[src],      %[stride_3] \n\t"
378cabdff1aSopenharmony_ci
379cabdff1aSopenharmony_ci    "vld        $vr8,            %[tmp],      0           \n\t"
380cabdff1aSopenharmony_ci    "vldx       $vr9,            %[tmp],      %[stride]   \n\t"
381cabdff1aSopenharmony_ci    "vldx       $vr10,           %[tmp],      %[stride_2] \n\t"
382cabdff1aSopenharmony_ci    "vldx       $vr11,           %[tmp],      %[stride_3] \n\t"
383cabdff1aSopenharmony_ci    "add.d      %[tmp],          %[tmp],      %[stride_4] \n\t"
384cabdff1aSopenharmony_ci    "vld        $vr12,           %[tmp],      0           \n\t"
385cabdff1aSopenharmony_ci    "vldx       $vr13,           %[tmp],      %[stride]   \n\t"
386cabdff1aSopenharmony_ci    "vldx       $vr14,           %[tmp],      %[stride_2] \n\t"
387cabdff1aSopenharmony_ci    "vldx       $vr15,           %[tmp],      %[stride_3] \n\t"
388cabdff1aSopenharmony_ci
389cabdff1aSopenharmony_ci    "vavgr.bu    $vr0,           $vr8,        $vr0        \n\t"
390cabdff1aSopenharmony_ci    "vavgr.bu    $vr1,           $vr9,        $vr1        \n\t"
391cabdff1aSopenharmony_ci    "vavgr.bu    $vr2,           $vr10,       $vr2        \n\t"
392cabdff1aSopenharmony_ci    "vavgr.bu    $vr3,           $vr11,       $vr3        \n\t"
393cabdff1aSopenharmony_ci    "vavgr.bu    $vr4,           $vr12,       $vr4        \n\t"
394cabdff1aSopenharmony_ci    "vavgr.bu    $vr5,           $vr13,       $vr5        \n\t"
395cabdff1aSopenharmony_ci    "vavgr.bu    $vr6,           $vr14,       $vr6        \n\t"
396cabdff1aSopenharmony_ci    "vavgr.bu    $vr7,           $vr15,       $vr7        \n\t"
397cabdff1aSopenharmony_ci
398cabdff1aSopenharmony_ci    "vstelm.d    $vr0,           %[dst],      0,  0       \n\t"
399cabdff1aSopenharmony_ci    "add.d       %[dst],         %[dst],      %[stride]   \n\t"
400cabdff1aSopenharmony_ci    "vstelm.d    $vr1,           %[dst],      0,  0       \n\t"
401cabdff1aSopenharmony_ci    "add.d       %[dst],         %[dst],      %[stride]   \n\t"
402cabdff1aSopenharmony_ci    "vstelm.d    $vr2,           %[dst],      0,  0       \n\t"
403cabdff1aSopenharmony_ci    "add.d       %[dst],         %[dst],      %[stride]   \n\t"
404cabdff1aSopenharmony_ci    "vstelm.d    $vr3,           %[dst],      0,  0       \n\t"
405cabdff1aSopenharmony_ci    "add.d       %[dst],         %[dst],      %[stride]   \n\t"
406cabdff1aSopenharmony_ci    "vstelm.d    $vr4,           %[dst],      0,  0       \n\t"
407cabdff1aSopenharmony_ci    "add.d       %[dst],         %[dst],      %[stride]   \n\t"
408cabdff1aSopenharmony_ci    "vstelm.d    $vr5,           %[dst],      0,  0       \n\t"
409cabdff1aSopenharmony_ci    "add.d       %[dst],         %[dst],      %[stride]   \n\t"
410cabdff1aSopenharmony_ci    "vstelm.d    $vr6,           %[dst],      0,  0       \n\t"
411cabdff1aSopenharmony_ci    "add.d       %[dst],         %[dst],      %[stride]   \n\t"
412cabdff1aSopenharmony_ci    "vstelm.d    $vr7,           %[dst],      0,  0       \n\t"
413cabdff1aSopenharmony_ci    : [dst]"+&r"(dst), [tmp]"+&r"(tmp), [src]"+&r"(src),
414cabdff1aSopenharmony_ci      [stride_2]"=&r"(stride_2),  [stride_3]"=&r"(stride_3),
415cabdff1aSopenharmony_ci      [stride_4]"=&r"(stride_4)
416cabdff1aSopenharmony_ci    : [stride]"r"(stride)
417cabdff1aSopenharmony_ci    : "memory"
418cabdff1aSopenharmony_ci    );
419cabdff1aSopenharmony_ci}
420cabdff1aSopenharmony_ci
/* avg_pixels8_8_lsx   : dst = avg(src, dst)
 * put_pixels8_l2_8_lsx: dst = avg(src, half), half stride is 8.
 * avg_pixels8_l2_8_lsx: dst = avg(avg(src, half), dst), half stride is 8. */
424cabdff1aSopenharmony_cistatic av_always_inline void
425cabdff1aSopenharmony_ciput_pixels8_l2_8_lsx(uint8_t *dst, const uint8_t *src, const uint8_t *half,
426cabdff1aSopenharmony_ci                     ptrdiff_t dstStride, ptrdiff_t srcStride)
427cabdff1aSopenharmony_ci{
428cabdff1aSopenharmony_ci    ptrdiff_t stride_2, stride_3, stride_4;
429cabdff1aSopenharmony_ci    __asm__ volatile (
430cabdff1aSopenharmony_ci    /* h0~h7 */
431cabdff1aSopenharmony_ci    "slli.d     %[stride_2],     %[srcStride],   1            \n\t"
432cabdff1aSopenharmony_ci    "add.d      %[stride_3],     %[stride_2],    %[srcStride] \n\t"
433cabdff1aSopenharmony_ci    "slli.d     %[stride_4],     %[stride_2],    1            \n\t"
434cabdff1aSopenharmony_ci    "vld        $vr0,            %[src],         0            \n\t"
435cabdff1aSopenharmony_ci    "vldx       $vr1,            %[src],         %[srcStride] \n\t"
436cabdff1aSopenharmony_ci    "vldx       $vr2,            %[src],         %[stride_2]  \n\t"
437cabdff1aSopenharmony_ci    "vldx       $vr3,            %[src],         %[stride_3]  \n\t"
438cabdff1aSopenharmony_ci    "add.d      %[src],          %[src],         %[stride_4]  \n\t"
439cabdff1aSopenharmony_ci    "vld        $vr4,            %[src],         0            \n\t"
440cabdff1aSopenharmony_ci    "vldx       $vr5,            %[src],         %[srcStride] \n\t"
441cabdff1aSopenharmony_ci    "vldx       $vr6,            %[src],         %[stride_2]  \n\t"
442cabdff1aSopenharmony_ci    "vldx       $vr7,            %[src],         %[stride_3]  \n\t"
443cabdff1aSopenharmony_ci
444cabdff1aSopenharmony_ci    "vld        $vr8,            %[half],        0x00         \n\t"
445cabdff1aSopenharmony_ci    "vld        $vr9,            %[half],        0x08         \n\t"
446cabdff1aSopenharmony_ci    "vld        $vr10,           %[half],        0x10         \n\t"
447cabdff1aSopenharmony_ci    "vld        $vr11,           %[half],        0x18         \n\t"
448cabdff1aSopenharmony_ci    "vld        $vr12,           %[half],        0x20         \n\t"
449cabdff1aSopenharmony_ci    "vld        $vr13,           %[half],        0x28         \n\t"
450cabdff1aSopenharmony_ci    "vld        $vr14,           %[half],        0x30         \n\t"
451cabdff1aSopenharmony_ci    "vld        $vr15,           %[half],        0x38         \n\t"
452cabdff1aSopenharmony_ci
453cabdff1aSopenharmony_ci    "vavgr.bu   $vr0,            $vr8,           $vr0         \n\t"
454cabdff1aSopenharmony_ci    "vavgr.bu   $vr1,            $vr9,           $vr1         \n\t"
455cabdff1aSopenharmony_ci    "vavgr.bu   $vr2,            $vr10,          $vr2         \n\t"
456cabdff1aSopenharmony_ci    "vavgr.bu   $vr3,            $vr11,          $vr3         \n\t"
457cabdff1aSopenharmony_ci    "vavgr.bu   $vr4,            $vr12,          $vr4         \n\t"
458cabdff1aSopenharmony_ci    "vavgr.bu   $vr5,            $vr13,          $vr5         \n\t"
459cabdff1aSopenharmony_ci    "vavgr.bu   $vr6,            $vr14,          $vr6         \n\t"
460cabdff1aSopenharmony_ci    "vavgr.bu   $vr7,            $vr15,          $vr7         \n\t"
461cabdff1aSopenharmony_ci
462cabdff1aSopenharmony_ci    "vstelm.d   $vr0,            %[dst],         0,  0        \n\t"
463cabdff1aSopenharmony_ci    "add.d      %[dst],          %[dst],         %[dstStride] \n\t"
464cabdff1aSopenharmony_ci    "vstelm.d   $vr1,            %[dst],         0,  0        \n\t"
465cabdff1aSopenharmony_ci    "add.d      %[dst],          %[dst],         %[dstStride] \n\t"
466cabdff1aSopenharmony_ci    "vstelm.d   $vr2,            %[dst],         0,  0        \n\t"
467cabdff1aSopenharmony_ci    "add.d      %[dst],          %[dst],         %[dstStride] \n\t"
468cabdff1aSopenharmony_ci    "vstelm.d   $vr3,            %[dst],         0,  0        \n\t"
469cabdff1aSopenharmony_ci    "add.d      %[dst],          %[dst],         %[dstStride] \n\t"
470cabdff1aSopenharmony_ci    "vstelm.d   $vr4,            %[dst],         0,  0        \n\t"
471cabdff1aSopenharmony_ci    "add.d      %[dst],          %[dst],         %[dstStride] \n\t"
472cabdff1aSopenharmony_ci    "vstelm.d   $vr5,            %[dst],         0,  0        \n\t"
473cabdff1aSopenharmony_ci    "add.d      %[dst],          %[dst],         %[dstStride] \n\t"
474cabdff1aSopenharmony_ci    "vstelm.d   $vr6,            %[dst],         0,  0        \n\t"
475cabdff1aSopenharmony_ci    "add.d      %[dst],          %[dst],         %[dstStride] \n\t"
476cabdff1aSopenharmony_ci    "vstelm.d   $vr7,            %[dst],         0,  0        \n\t"
477cabdff1aSopenharmony_ci    : [dst]"+&r"(dst), [half]"+&r"(half), [src]"+&r"(src),
478cabdff1aSopenharmony_ci      [stride_2]"=&r"(stride_2),  [stride_3]"=&r"(stride_3),
479cabdff1aSopenharmony_ci      [stride_4]"=&r"(stride_4)
480cabdff1aSopenharmony_ci    : [srcStride]"r"(srcStride), [dstStride]"r"(dstStride)
481cabdff1aSopenharmony_ci    : "memory"
482cabdff1aSopenharmony_ci    );
483cabdff1aSopenharmony_ci}
484cabdff1aSopenharmony_ci
485cabdff1aSopenharmony_ci/* avg_pixels8_l2_8_lsx: dst = avg(avg(src, half), dst) for 8 rows of
486cabdff1aSopenharmony_ci * 8 pixels. Rows of src are srcStride bytes apart, rows of half are
487cabdff1aSopenharmony_ci * 8 bytes apart (offsets 0x00..0x38), rows of dst are dstStride apart.
 * The previous dst contents are read through tmp (a copy of dst taken
 * before the asm), while dst itself is only advanced by the stores.
 * stride_2/3/4 first hold multiples of srcStride and are then recomputed
 * as multiples of dstStride before the old dst rows are loaded.
 * Only doubleword element 0 of each result is stored (vstelm.d ..., 0, 0).
 *
 * The asm overwrites $vr0-$vr15. These share storage with $f0-$f15, which
 * are therefore listed as clobbers so the compiler does not keep live
 * values in them across the asm statement.
 *
 * NOTE(review): vld/vldx load a whole LSX vector, which is wider than the
 * 8 bytes used per row; the last half load starts at half + 0x38. Confirm
 * that the callers' buffers tolerate the extra bytes being read. */
488cabdff1aSopenharmony_cistatic av_always_inline void
489cabdff1aSopenharmony_ciavg_pixels8_l2_8_lsx(uint8_t *dst, const uint8_t *src, const uint8_t *half,
490cabdff1aSopenharmony_ci                     ptrdiff_t dstStride, ptrdiff_t srcStride)
491cabdff1aSopenharmony_ci{
492cabdff1aSopenharmony_ci    uint8_t *tmp = dst;
493cabdff1aSopenharmony_ci    ptrdiff_t stride_2, stride_3, stride_4;
494cabdff1aSopenharmony_ci    __asm__ volatile (
495cabdff1aSopenharmony_ci    /* h0~h7: stride_N = N * srcStride, then load 8 rows of src */
496cabdff1aSopenharmony_ci    "slli.d     %[stride_2],     %[srcStride],   1            \n\t"
497cabdff1aSopenharmony_ci    "add.d      %[stride_3],     %[stride_2],    %[srcStride] \n\t"
498cabdff1aSopenharmony_ci    "slli.d     %[stride_4],     %[stride_2],    1            \n\t"
499cabdff1aSopenharmony_ci    "vld        $vr0,            %[src],         0            \n\t"
500cabdff1aSopenharmony_ci    "vldx       $vr1,            %[src],         %[srcStride] \n\t"
501cabdff1aSopenharmony_ci    "vldx       $vr2,            %[src],         %[stride_2]  \n\t"
502cabdff1aSopenharmony_ci    "vldx       $vr3,            %[src],         %[stride_3]  \n\t"
503cabdff1aSopenharmony_ci    "add.d      %[src],          %[src],         %[stride_4]  \n\t"
504cabdff1aSopenharmony_ci    "vld        $vr4,            %[src],         0            \n\t"
505cabdff1aSopenharmony_ci    "vldx       $vr5,            %[src],         %[srcStride] \n\t"
506cabdff1aSopenharmony_ci    "vldx       $vr6,            %[src],         %[stride_2]  \n\t"
507cabdff1aSopenharmony_ci    "vldx       $vr7,            %[src],         %[stride_3]  \n\t"
508cabdff1aSopenharmony_ci
    /* 8 rows of half, 8 bytes apart */
509cabdff1aSopenharmony_ci    "vld        $vr8,            %[half],        0x00         \n\t"
510cabdff1aSopenharmony_ci    "vld        $vr9,            %[half],        0x08         \n\t"
511cabdff1aSopenharmony_ci    "vld        $vr10,           %[half],        0x10         \n\t"
512cabdff1aSopenharmony_ci    "vld        $vr11,           %[half],        0x18         \n\t"
513cabdff1aSopenharmony_ci    "vld        $vr12,           %[half],        0x20         \n\t"
514cabdff1aSopenharmony_ci    "vld        $vr13,           %[half],        0x28         \n\t"
515cabdff1aSopenharmony_ci    "vld        $vr14,           %[half],        0x30         \n\t"
516cabdff1aSopenharmony_ci    "vld        $vr15,           %[half],        0x38         \n\t"
517cabdff1aSopenharmony_ci
    /* vr0..vr7 = avg(half row, src row) */
518cabdff1aSopenharmony_ci    "vavgr.bu    $vr0,           $vr8,           $vr0         \n\t"
519cabdff1aSopenharmony_ci    "vavgr.bu    $vr1,           $vr9,           $vr1         \n\t"
520cabdff1aSopenharmony_ci    "vavgr.bu    $vr2,           $vr10,          $vr2         \n\t"
521cabdff1aSopenharmony_ci    "vavgr.bu    $vr3,           $vr11,          $vr3         \n\t"
522cabdff1aSopenharmony_ci    "vavgr.bu    $vr4,           $vr12,          $vr4         \n\t"
523cabdff1aSopenharmony_ci    "vavgr.bu    $vr5,           $vr13,          $vr5         \n\t"
524cabdff1aSopenharmony_ci    "vavgr.bu    $vr6,           $vr14,          $vr6         \n\t"
525cabdff1aSopenharmony_ci    "vavgr.bu    $vr7,           $vr15,          $vr7         \n\t"
526cabdff1aSopenharmony_ci
    /* stride_N = N * dstStride, then load the current 8 rows of dst via tmp */
527cabdff1aSopenharmony_ci    "slli.d     %[stride_2],     %[dstStride],   1            \n\t"
528cabdff1aSopenharmony_ci    "add.d      %[stride_3],     %[stride_2],    %[dstStride] \n\t"
529cabdff1aSopenharmony_ci    "slli.d     %[stride_4],     %[stride_2],    1            \n\t"
530cabdff1aSopenharmony_ci    "vld        $vr8,            %[tmp],         0            \n\t"
531cabdff1aSopenharmony_ci    "vldx       $vr9,            %[tmp],         %[dstStride] \n\t"
532cabdff1aSopenharmony_ci    "vldx       $vr10,           %[tmp],         %[stride_2]  \n\t"
533cabdff1aSopenharmony_ci    "vldx       $vr11,           %[tmp],         %[stride_3]  \n\t"
534cabdff1aSopenharmony_ci    "add.d      %[tmp],          %[tmp],         %[stride_4]  \n\t"
535cabdff1aSopenharmony_ci    "vld        $vr12,           %[tmp],         0            \n\t"
536cabdff1aSopenharmony_ci    "vldx       $vr13,           %[tmp],         %[dstStride] \n\t"
537cabdff1aSopenharmony_ci    "vldx       $vr14,           %[tmp],         %[stride_2]  \n\t"
538cabdff1aSopenharmony_ci    "vldx       $vr15,           %[tmp],         %[stride_3]  \n\t"
539cabdff1aSopenharmony_ci
    /* vr0..vr7 = avg(old dst row, avg(half row, src row)) */
540cabdff1aSopenharmony_ci    "vavgr.bu    $vr0,           $vr8,           $vr0         \n\t"
541cabdff1aSopenharmony_ci    "vavgr.bu    $vr1,           $vr9,           $vr1         \n\t"
542cabdff1aSopenharmony_ci    "vavgr.bu    $vr2,           $vr10,          $vr2         \n\t"
543cabdff1aSopenharmony_ci    "vavgr.bu    $vr3,           $vr11,          $vr3         \n\t"
544cabdff1aSopenharmony_ci    "vavgr.bu    $vr4,           $vr12,          $vr4         \n\t"
545cabdff1aSopenharmony_ci    "vavgr.bu    $vr5,           $vr13,          $vr5         \n\t"
546cabdff1aSopenharmony_ci    "vavgr.bu    $vr6,           $vr14,          $vr6         \n\t"
547cabdff1aSopenharmony_ci    "vavgr.bu    $vr7,           $vr15,          $vr7         \n\t"
548cabdff1aSopenharmony_ci
    /* store doubleword 0 of each row, stepping dst by dstStride */
549cabdff1aSopenharmony_ci    "vstelm.d    $vr0,           %[dst],         0,  0        \n\t"
550cabdff1aSopenharmony_ci    "add.d       %[dst],         %[dst],         %[dstStride] \n\t"
551cabdff1aSopenharmony_ci    "vstelm.d    $vr1,           %[dst],         0,  0        \n\t"
552cabdff1aSopenharmony_ci    "add.d       %[dst],         %[dst],         %[dstStride] \n\t"
553cabdff1aSopenharmony_ci    "vstelm.d    $vr2,           %[dst],         0,  0        \n\t"
554cabdff1aSopenharmony_ci    "add.d       %[dst],         %[dst],         %[dstStride] \n\t"
555cabdff1aSopenharmony_ci    "vstelm.d    $vr3,           %[dst],         0,  0        \n\t"
556cabdff1aSopenharmony_ci    "add.d       %[dst],         %[dst],         %[dstStride] \n\t"
557cabdff1aSopenharmony_ci    "vstelm.d    $vr4,           %[dst],         0,  0        \n\t"
558cabdff1aSopenharmony_ci    "add.d       %[dst],         %[dst],         %[dstStride] \n\t"
559cabdff1aSopenharmony_ci    "vstelm.d    $vr5,           %[dst],         0,  0        \n\t"
560cabdff1aSopenharmony_ci    "add.d       %[dst],         %[dst],         %[dstStride] \n\t"
561cabdff1aSopenharmony_ci    "vstelm.d    $vr6,           %[dst],         0,  0        \n\t"
562cabdff1aSopenharmony_ci    "add.d       %[dst],         %[dst],         %[dstStride] \n\t"
563cabdff1aSopenharmony_ci    "vstelm.d    $vr7,           %[dst],         0,  0        \n\t"
564cabdff1aSopenharmony_ci    : [dst]"+&r"(dst), [tmp]"+&r"(tmp), [half]"+&r"(half),
565cabdff1aSopenharmony_ci      [src]"+&r"(src), [stride_2]"=&r"(stride_2),
566cabdff1aSopenharmony_ci      [stride_3]"=&r"(stride_3), [stride_4]"=&r"(stride_4)
567cabdff1aSopenharmony_ci    : [dstStride]"r"(dstStride), [srcStride]"r"(srcStride)
    /* $vr0-$vr15 are written above and overlap $f0-$f15 */
568cabdff1aSopenharmony_ci    : "memory",
      "$f0", "$f1", "$f2",  "$f3",  "$f4",  "$f5",  "$f6",  "$f7",
      "$f8", "$f9", "$f10", "$f11", "$f12", "$f13", "$f14", "$f15"
569cabdff1aSopenharmony_ci    );
570cabdff1aSopenharmony_ci}
571cabdff1aSopenharmony_ci
572cabdff1aSopenharmony_ci/* put_pixels16_8_lsx: dst = src */
573cabdff1aSopenharmony_cistatic av_always_inline void
574cabdff1aSopenharmony_ciput_pixels16_8_lsx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
575cabdff1aSopenharmony_ci{
576cabdff1aSopenharmony_ci    ptrdiff_t stride_2, stride_3, stride_4;
577cabdff1aSopenharmony_ci    __asm__ volatile (
578cabdff1aSopenharmony_ci    "slli.d     %[stride_2],     %[stride],      1            \n\t"
579cabdff1aSopenharmony_ci    "add.d      %[stride_3],     %[stride_2],    %[stride]    \n\t"
580cabdff1aSopenharmony_ci    "slli.d     %[stride_4],     %[stride_2],    1            \n\t"
581cabdff1aSopenharmony_ci    "vld        $vr0,            %[src],         0            \n\t"
582cabdff1aSopenharmony_ci    "vldx       $vr1,            %[src],         %[stride]    \n\t"
583cabdff1aSopenharmony_ci    "vldx       $vr2,            %[src],         %[stride_2]  \n\t"
584cabdff1aSopenharmony_ci    "vldx       $vr3,            %[src],         %[stride_3]  \n\t"
585cabdff1aSopenharmony_ci    "add.d      %[src],          %[src],         %[stride_4]  \n\t"
586cabdff1aSopenharmony_ci    "vld        $vr4,            %[src],         0            \n\t"
587cabdff1aSopenharmony_ci    "vldx       $vr5,            %[src],         %[stride]    \n\t"
588cabdff1aSopenharmony_ci    "vldx       $vr6,            %[src],         %[stride_2]  \n\t"
589cabdff1aSopenharmony_ci    "vldx       $vr7,            %[src],         %[stride_3]  \n\t"
590cabdff1aSopenharmony_ci    "add.d      %[src],          %[src],         %[stride_4]  \n\t"
591cabdff1aSopenharmony_ci
592cabdff1aSopenharmony_ci    "vst        $vr0,            %[dst],         0            \n\t"
593cabdff1aSopenharmony_ci    "vstx       $vr1,            %[dst],         %[stride]    \n\t"
594cabdff1aSopenharmony_ci    "vstx       $vr2,            %[dst],         %[stride_2]  \n\t"
595cabdff1aSopenharmony_ci    "vstx       $vr3,            %[dst],         %[stride_3]  \n\t"
596cabdff1aSopenharmony_ci    "add.d      %[dst],          %[dst],         %[stride_4]  \n\t"
597cabdff1aSopenharmony_ci    "vst        $vr4,            %[dst],         0            \n\t"
598cabdff1aSopenharmony_ci    "vstx       $vr5,            %[dst],         %[stride]    \n\t"
599cabdff1aSopenharmony_ci    "vstx       $vr6,            %[dst],         %[stride_2]  \n\t"
600cabdff1aSopenharmony_ci    "vstx       $vr7,            %[dst],         %[stride_3]  \n\t"
601cabdff1aSopenharmony_ci    "add.d      %[dst],          %[dst],         %[stride_4]  \n\t"
602cabdff1aSopenharmony_ci
603cabdff1aSopenharmony_ci    "vld        $vr0,            %[src],         0            \n\t"
604cabdff1aSopenharmony_ci    "vldx       $vr1,            %[src],         %[stride]    \n\t"
605cabdff1aSopenharmony_ci    "vldx       $vr2,            %[src],         %[stride_2]  \n\t"
606cabdff1aSopenharmony_ci    "vldx       $vr3,            %[src],         %[stride_3]  \n\t"
607cabdff1aSopenharmony_ci    "add.d      %[src],          %[src],         %[stride_4]  \n\t"
608cabdff1aSopenharmony_ci    "vld        $vr4,            %[src],         0            \n\t"
609cabdff1aSopenharmony_ci    "vldx       $vr5,            %[src],         %[stride]    \n\t"
610cabdff1aSopenharmony_ci    "vldx       $vr6,            %[src],         %[stride_2]  \n\t"
611cabdff1aSopenharmony_ci    "vldx       $vr7,            %[src],         %[stride_3]  \n\t"
612cabdff1aSopenharmony_ci
613cabdff1aSopenharmony_ci    "vst        $vr0,            %[dst],         0            \n\t"
614cabdff1aSopenharmony_ci    "vstx       $vr1,            %[dst],         %[stride]    \n\t"
615cabdff1aSopenharmony_ci    "vstx       $vr2,            %[dst],         %[stride_2]  \n\t"
616cabdff1aSopenharmony_ci    "vstx       $vr3,            %[dst],         %[stride_3]  \n\t"
617cabdff1aSopenharmony_ci    "add.d      %[dst],          %[dst],         %[stride_4]  \n\t"
618cabdff1aSopenharmony_ci    "vst        $vr4,            %[dst],         0            \n\t"
619cabdff1aSopenharmony_ci    "vstx       $vr5,            %[dst],         %[stride]    \n\t"
620cabdff1aSopenharmony_ci    "vstx       $vr6,            %[dst],         %[stride_2]  \n\t"
621cabdff1aSopenharmony_ci    "vstx       $vr7,            %[dst],         %[stride_3]  \n\t"
622cabdff1aSopenharmony_ci    : [dst]"+&r"(dst),            [src]"+&r"(src),
623cabdff1aSopenharmony_ci      [stride_2]"=&r"(stride_2),  [stride_3]"=&r"(stride_3),
624cabdff1aSopenharmony_ci      [stride_4]"=&r"(stride_4)
625cabdff1aSopenharmony_ci    : [stride]"r"(stride)
626cabdff1aSopenharmony_ci    : "memory"
627cabdff1aSopenharmony_ci    );
628cabdff1aSopenharmony_ci}
629cabdff1aSopenharmony_ci
630cabdff1aSopenharmony_ci/* avg_pixels16_8_lsx: dst = avg(src, dst)
631cabdff1aSopenharmony_ci * Processes 16 rows, one whole LSX vector per row, in two batches of
632cabdff1aSopenharmony_ci * 8 rows. src and dst use the same row pitch (stride).
 * The previous dst contents are read through tmp (a copy of dst taken
 * before the asm), while dst itself is only advanced by the stores.
 * The average is computed per unsigned byte with vavgr.bu.
 *
 * The asm overwrites $vr0-$vr15. These share storage with $f0-$f15, which
 * are therefore listed as clobbers so the compiler does not keep live
 * values in them across the asm statement. */
633cabdff1aSopenharmony_cistatic av_always_inline void
634cabdff1aSopenharmony_ciavg_pixels16_8_lsx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
635cabdff1aSopenharmony_ci{
636cabdff1aSopenharmony_ci    uint8_t *tmp = dst;
637cabdff1aSopenharmony_ci    ptrdiff_t stride_2, stride_3, stride_4;
638cabdff1aSopenharmony_ci    __asm__ volatile (
639cabdff1aSopenharmony_ci    /* h0~h7: stride_N = N * stride, then load 8 rows of src */
640cabdff1aSopenharmony_ci    "slli.d     %[stride_2],     %[stride],      1            \n\t"
641cabdff1aSopenharmony_ci    "add.d      %[stride_3],     %[stride_2],    %[stride]    \n\t"
642cabdff1aSopenharmony_ci    "slli.d     %[stride_4],     %[stride_2],    1            \n\t"
643cabdff1aSopenharmony_ci    "vld        $vr0,            %[src],         0            \n\t"
644cabdff1aSopenharmony_ci    "vldx       $vr1,            %[src],         %[stride]    \n\t"
645cabdff1aSopenharmony_ci    "vldx       $vr2,            %[src],         %[stride_2]  \n\t"
646cabdff1aSopenharmony_ci    "vldx       $vr3,            %[src],         %[stride_3]  \n\t"
647cabdff1aSopenharmony_ci    "add.d      %[src],          %[src],         %[stride_4]  \n\t"
648cabdff1aSopenharmony_ci    "vld        $vr4,            %[src],         0            \n\t"
649cabdff1aSopenharmony_ci    "vldx       $vr5,            %[src],         %[stride]    \n\t"
650cabdff1aSopenharmony_ci    "vldx       $vr6,            %[src],         %[stride_2]  \n\t"
651cabdff1aSopenharmony_ci    "vldx       $vr7,            %[src],         %[stride_3]  \n\t"
652cabdff1aSopenharmony_ci    "add.d      %[src],          %[src],         %[stride_4]  \n\t"
653cabdff1aSopenharmony_ci
    /* current contents of the same 8 dst rows, read via tmp */
654cabdff1aSopenharmony_ci    "vld        $vr8,            %[tmp],         0            \n\t"
655cabdff1aSopenharmony_ci    "vldx       $vr9,            %[tmp],         %[stride]    \n\t"
656cabdff1aSopenharmony_ci    "vldx       $vr10,           %[tmp],         %[stride_2]  \n\t"
657cabdff1aSopenharmony_ci    "vldx       $vr11,           %[tmp],         %[stride_3]  \n\t"
658cabdff1aSopenharmony_ci    "add.d      %[tmp],          %[tmp],         %[stride_4]  \n\t"
659cabdff1aSopenharmony_ci    "vld        $vr12,           %[tmp],         0            \n\t"
660cabdff1aSopenharmony_ci    "vldx       $vr13,           %[tmp],         %[stride]    \n\t"
661cabdff1aSopenharmony_ci    "vldx       $vr14,           %[tmp],         %[stride_2]  \n\t"
662cabdff1aSopenharmony_ci    "vldx       $vr15,           %[tmp],         %[stride_3]  \n\t"
663cabdff1aSopenharmony_ci    "add.d      %[tmp],          %[tmp],         %[stride_4]  \n\t"
664cabdff1aSopenharmony_ci
    /* vr0..vr7 = avg(old dst row, src row) */
665cabdff1aSopenharmony_ci    "vavgr.bu   $vr0,            $vr8,           $vr0         \n\t"
666cabdff1aSopenharmony_ci    "vavgr.bu   $vr1,            $vr9,           $vr1         \n\t"
667cabdff1aSopenharmony_ci    "vavgr.bu   $vr2,            $vr10,          $vr2         \n\t"
668cabdff1aSopenharmony_ci    "vavgr.bu   $vr3,            $vr11,          $vr3         \n\t"
669cabdff1aSopenharmony_ci    "vavgr.bu   $vr4,            $vr12,          $vr4         \n\t"
670cabdff1aSopenharmony_ci    "vavgr.bu   $vr5,            $vr13,          $vr5         \n\t"
671cabdff1aSopenharmony_ci    "vavgr.bu   $vr6,            $vr14,          $vr6         \n\t"
672cabdff1aSopenharmony_ci    "vavgr.bu   $vr7,            $vr15,          $vr7         \n\t"
673cabdff1aSopenharmony_ci
674cabdff1aSopenharmony_ci    "vst        $vr0,            %[dst],         0            \n\t"
675cabdff1aSopenharmony_ci    "vstx       $vr1,            %[dst],         %[stride]    \n\t"
676cabdff1aSopenharmony_ci    "vstx       $vr2,            %[dst],         %[stride_2]  \n\t"
677cabdff1aSopenharmony_ci    "vstx       $vr3,            %[dst],         %[stride_3]  \n\t"
678cabdff1aSopenharmony_ci    "add.d      %[dst],          %[dst],         %[stride_4]  \n\t"
679cabdff1aSopenharmony_ci    "vst        $vr4,            %[dst],         0            \n\t"
680cabdff1aSopenharmony_ci    "vstx       $vr5,            %[dst],         %[stride]    \n\t"
681cabdff1aSopenharmony_ci    "vstx       $vr6,            %[dst],         %[stride_2]  \n\t"
682cabdff1aSopenharmony_ci    "vstx       $vr7,            %[dst],         %[stride_3]  \n\t"
683cabdff1aSopenharmony_ci    "add.d      %[dst],          %[dst],         %[stride_4]  \n\t"
684cabdff1aSopenharmony_ci
685cabdff1aSopenharmony_ci    /* h8~h15: same sequence for the second batch of 8 rows */
686cabdff1aSopenharmony_ci    "vld        $vr0,            %[src],         0            \n\t"
687cabdff1aSopenharmony_ci    "vldx       $vr1,            %[src],         %[stride]    \n\t"
688cabdff1aSopenharmony_ci    "vldx       $vr2,            %[src],         %[stride_2]  \n\t"
689cabdff1aSopenharmony_ci    "vldx       $vr3,            %[src],         %[stride_3]  \n\t"
690cabdff1aSopenharmony_ci    "add.d      %[src],          %[src],         %[stride_4]  \n\t"
691cabdff1aSopenharmony_ci    "vld        $vr4,            %[src],         0            \n\t"
692cabdff1aSopenharmony_ci    "vldx       $vr5,            %[src],         %[stride]    \n\t"
693cabdff1aSopenharmony_ci    "vldx       $vr6,            %[src],         %[stride_2]  \n\t"
694cabdff1aSopenharmony_ci    "vldx       $vr7,            %[src],         %[stride_3]  \n\t"
695cabdff1aSopenharmony_ci
696cabdff1aSopenharmony_ci    "vld        $vr8,            %[tmp],         0            \n\t"
697cabdff1aSopenharmony_ci    "vldx       $vr9,            %[tmp],         %[stride]    \n\t"
698cabdff1aSopenharmony_ci    "vldx       $vr10,           %[tmp],         %[stride_2]  \n\t"
699cabdff1aSopenharmony_ci    "vldx       $vr11,           %[tmp],         %[stride_3]  \n\t"
700cabdff1aSopenharmony_ci    "add.d      %[tmp],          %[tmp],         %[stride_4]  \n\t"
701cabdff1aSopenharmony_ci    "vld        $vr12,           %[tmp],         0            \n\t"
702cabdff1aSopenharmony_ci    "vldx       $vr13,           %[tmp],         %[stride]    \n\t"
703cabdff1aSopenharmony_ci    "vldx       $vr14,           %[tmp],         %[stride_2]  \n\t"
704cabdff1aSopenharmony_ci    "vldx       $vr15,           %[tmp],         %[stride_3]  \n\t"
705cabdff1aSopenharmony_ci
706cabdff1aSopenharmony_ci    "vavgr.bu    $vr0,           $vr8,           $vr0         \n\t"
707cabdff1aSopenharmony_ci    "vavgr.bu    $vr1,           $vr9,           $vr1         \n\t"
708cabdff1aSopenharmony_ci    "vavgr.bu    $vr2,           $vr10,          $vr2         \n\t"
709cabdff1aSopenharmony_ci    "vavgr.bu    $vr3,           $vr11,          $vr3         \n\t"
710cabdff1aSopenharmony_ci    "vavgr.bu    $vr4,           $vr12,          $vr4         \n\t"
711cabdff1aSopenharmony_ci    "vavgr.bu    $vr5,           $vr13,          $vr5         \n\t"
712cabdff1aSopenharmony_ci    "vavgr.bu    $vr6,           $vr14,          $vr6         \n\t"
713cabdff1aSopenharmony_ci    "vavgr.bu    $vr7,           $vr15,          $vr7         \n\t"
714cabdff1aSopenharmony_ci
715cabdff1aSopenharmony_ci    "vst        $vr0,            %[dst],         0            \n\t"
716cabdff1aSopenharmony_ci    "vstx       $vr1,            %[dst],         %[stride]    \n\t"
717cabdff1aSopenharmony_ci    "vstx       $vr2,            %[dst],         %[stride_2]  \n\t"
718cabdff1aSopenharmony_ci    "vstx       $vr3,            %[dst],         %[stride_3]  \n\t"
719cabdff1aSopenharmony_ci    "add.d      %[dst],          %[dst],         %[stride_4]  \n\t"
720cabdff1aSopenharmony_ci    "vst        $vr4,            %[dst],         0            \n\t"
721cabdff1aSopenharmony_ci    "vstx       $vr5,            %[dst],         %[stride]    \n\t"
722cabdff1aSopenharmony_ci    "vstx       $vr6,            %[dst],         %[stride_2]  \n\t"
723cabdff1aSopenharmony_ci    "vstx       $vr7,            %[dst],         %[stride_3]  \n\t"
724cabdff1aSopenharmony_ci    : [dst]"+&r"(dst), [tmp]"+&r"(tmp), [src]"+&r"(src),
725cabdff1aSopenharmony_ci      [stride_2]"=&r"(stride_2),  [stride_3]"=&r"(stride_3),
726cabdff1aSopenharmony_ci      [stride_4]"=&r"(stride_4)
727cabdff1aSopenharmony_ci    : [stride]"r"(stride)
    /* $vr0-$vr15 are written above and overlap $f0-$f15 */
728cabdff1aSopenharmony_ci    : "memory",
      "$f0", "$f1", "$f2",  "$f3",  "$f4",  "$f5",  "$f6",  "$f7",
      "$f8", "$f9", "$f10", "$f11", "$f12", "$f13", "$f14", "$f15"
729cabdff1aSopenharmony_ci    );
730cabdff1aSopenharmony_ci}
731cabdff1aSopenharmony_ci
732cabdff1aSopenharmony_ci/* put_pixels16_l2_8_lsx: dst = avg(src, half) for 16 rows, one whole
733cabdff1aSopenharmony_ci * LSX vector per row, in two batches of 8 rows. Rows of half are
734cabdff1aSopenharmony_ci * 16 bytes apart (offsets 0x00..0xf0), i.e. the half stride is 16.
 * Rows of src are srcStride bytes apart and rows of dst are dstStride
 * bytes apart; stride_2/3/4 hold multiples of srcStride and
 * dstride_2/3/4 hold multiples of dstStride.
 * The average is computed per unsigned byte with vavgr.bu.
 *
 * The asm overwrites $vr0-$vr15. These share storage with $f0-$f15, which
 * are therefore listed as clobbers so the compiler does not keep live
 * values in them across the asm statement. */
735cabdff1aSopenharmony_cistatic av_always_inline void
736cabdff1aSopenharmony_ciput_pixels16_l2_8_lsx(uint8_t *dst, const uint8_t *src, uint8_t *half,
737cabdff1aSopenharmony_ci                      ptrdiff_t dstStride, ptrdiff_t srcStride)
738cabdff1aSopenharmony_ci{
739cabdff1aSopenharmony_ci    ptrdiff_t stride_2, stride_3, stride_4;
740cabdff1aSopenharmony_ci    ptrdiff_t dstride_2, dstride_3, dstride_4;
741cabdff1aSopenharmony_ci    __asm__ volatile (
    /* stride_N = N * srcStride, dstride_N = N * dstStride */
742cabdff1aSopenharmony_ci    "slli.d     %[stride_2],     %[srcStride],   1            \n\t"
743cabdff1aSopenharmony_ci    "add.d      %[stride_3],     %[stride_2],    %[srcStride] \n\t"
744cabdff1aSopenharmony_ci    "slli.d     %[stride_4],     %[stride_2],    1            \n\t"
745cabdff1aSopenharmony_ci    "slli.d     %[dstride_2],    %[dstStride],   1            \n\t"
746cabdff1aSopenharmony_ci    "add.d      %[dstride_3],    %[dstride_2],   %[dstStride] \n\t"
747cabdff1aSopenharmony_ci    "slli.d     %[dstride_4],    %[dstride_2],   1            \n\t"
748cabdff1aSopenharmony_ci    /* h0~h7 */
749cabdff1aSopenharmony_ci    "vld        $vr0,            %[src],         0            \n\t"
750cabdff1aSopenharmony_ci    "vldx       $vr1,            %[src],         %[srcStride] \n\t"
751cabdff1aSopenharmony_ci    "vldx       $vr2,            %[src],         %[stride_2]  \n\t"
752cabdff1aSopenharmony_ci    "vldx       $vr3,            %[src],         %[stride_3]  \n\t"
753cabdff1aSopenharmony_ci    "add.d      %[src],          %[src],         %[stride_4]  \n\t"
754cabdff1aSopenharmony_ci    "vld        $vr4,            %[src],         0            \n\t"
755cabdff1aSopenharmony_ci    "vldx       $vr5,            %[src],         %[srcStride] \n\t"
756cabdff1aSopenharmony_ci    "vldx       $vr6,            %[src],         %[stride_2]  \n\t"
757cabdff1aSopenharmony_ci    "vldx       $vr7,            %[src],         %[stride_3]  \n\t"
758cabdff1aSopenharmony_ci    "add.d      %[src],          %[src],         %[stride_4]  \n\t"
759cabdff1aSopenharmony_ci
    /* rows 0..7 of half, 16 bytes apart */
760cabdff1aSopenharmony_ci    "vld        $vr8,            %[half],        0x00         \n\t"
761cabdff1aSopenharmony_ci    "vld        $vr9,            %[half],        0x10         \n\t"
762cabdff1aSopenharmony_ci    "vld        $vr10,           %[half],        0x20         \n\t"
763cabdff1aSopenharmony_ci    "vld        $vr11,           %[half],        0x30         \n\t"
764cabdff1aSopenharmony_ci    "vld        $vr12,           %[half],        0x40         \n\t"
765cabdff1aSopenharmony_ci    "vld        $vr13,           %[half],        0x50         \n\t"
766cabdff1aSopenharmony_ci    "vld        $vr14,           %[half],        0x60         \n\t"
767cabdff1aSopenharmony_ci    "vld        $vr15,           %[half],        0x70         \n\t"
768cabdff1aSopenharmony_ci
    /* vr0..vr7 = avg(half row, src row) */
769cabdff1aSopenharmony_ci    "vavgr.bu   $vr0,            $vr8,           $vr0         \n\t"
770cabdff1aSopenharmony_ci    "vavgr.bu   $vr1,            $vr9,           $vr1         \n\t"
771cabdff1aSopenharmony_ci    "vavgr.bu   $vr2,            $vr10,          $vr2         \n\t"
772cabdff1aSopenharmony_ci    "vavgr.bu   $vr3,            $vr11,          $vr3         \n\t"
773cabdff1aSopenharmony_ci    "vavgr.bu   $vr4,            $vr12,          $vr4         \n\t"
774cabdff1aSopenharmony_ci    "vavgr.bu   $vr5,            $vr13,          $vr5         \n\t"
775cabdff1aSopenharmony_ci    "vavgr.bu   $vr6,            $vr14,          $vr6         \n\t"
776cabdff1aSopenharmony_ci    "vavgr.bu   $vr7,            $vr15,          $vr7         \n\t"
777cabdff1aSopenharmony_ci
778cabdff1aSopenharmony_ci    "vst        $vr0,            %[dst],         0            \n\t"
779cabdff1aSopenharmony_ci    "vstx       $vr1,            %[dst],         %[dstStride] \n\t"
780cabdff1aSopenharmony_ci    "vstx       $vr2,            %[dst],         %[dstride_2] \n\t"
781cabdff1aSopenharmony_ci    "vstx       $vr3,            %[dst],         %[dstride_3] \n\t"
782cabdff1aSopenharmony_ci    "add.d      %[dst],          %[dst],         %[dstride_4] \n\t"
783cabdff1aSopenharmony_ci    "vst        $vr4,            %[dst],         0            \n\t"
784cabdff1aSopenharmony_ci    "vstx       $vr5,            %[dst],         %[dstStride] \n\t"
785cabdff1aSopenharmony_ci    "vstx       $vr6,            %[dst],         %[dstride_2] \n\t"
786cabdff1aSopenharmony_ci    "vstx       $vr7,            %[dst],         %[dstride_3] \n\t"
787cabdff1aSopenharmony_ci    "add.d      %[dst],          %[dst],         %[dstride_4] \n\t"
788cabdff1aSopenharmony_ci
789cabdff1aSopenharmony_ci    /* h8~h15: same sequence, with half rows 8..15 at 0x80..0xf0 */
790cabdff1aSopenharmony_ci    "vld        $vr0,            %[src],         0            \n\t"
791cabdff1aSopenharmony_ci    "vldx       $vr1,            %[src],         %[srcStride] \n\t"
792cabdff1aSopenharmony_ci    "vldx       $vr2,            %[src],         %[stride_2]  \n\t"
793cabdff1aSopenharmony_ci    "vldx       $vr3,            %[src],         %[stride_3]  \n\t"
794cabdff1aSopenharmony_ci    "add.d      %[src],          %[src],         %[stride_4]  \n\t"
795cabdff1aSopenharmony_ci    "vld        $vr4,            %[src],         0            \n\t"
796cabdff1aSopenharmony_ci    "vldx       $vr5,            %[src],         %[srcStride] \n\t"
797cabdff1aSopenharmony_ci    "vldx       $vr6,            %[src],         %[stride_2]  \n\t"
798cabdff1aSopenharmony_ci    "vldx       $vr7,            %[src],         %[stride_3]  \n\t"
799cabdff1aSopenharmony_ci
800cabdff1aSopenharmony_ci    "vld        $vr8,            %[half],        0x80         \n\t"
801cabdff1aSopenharmony_ci    "vld        $vr9,            %[half],        0x90         \n\t"
802cabdff1aSopenharmony_ci    "vld        $vr10,           %[half],        0xa0         \n\t"
803cabdff1aSopenharmony_ci    "vld        $vr11,           %[half],        0xb0         \n\t"
804cabdff1aSopenharmony_ci    "vld        $vr12,           %[half],        0xc0         \n\t"
805cabdff1aSopenharmony_ci    "vld        $vr13,           %[half],        0xd0         \n\t"
806cabdff1aSopenharmony_ci    "vld        $vr14,           %[half],        0xe0         \n\t"
807cabdff1aSopenharmony_ci    "vld        $vr15,           %[half],        0xf0         \n\t"
808cabdff1aSopenharmony_ci
809cabdff1aSopenharmony_ci    "vavgr.bu   $vr0,            $vr8,           $vr0         \n\t"
810cabdff1aSopenharmony_ci    "vavgr.bu   $vr1,            $vr9,           $vr1         \n\t"
811cabdff1aSopenharmony_ci    "vavgr.bu   $vr2,            $vr10,          $vr2         \n\t"
812cabdff1aSopenharmony_ci    "vavgr.bu   $vr3,            $vr11,          $vr3         \n\t"
813cabdff1aSopenharmony_ci    "vavgr.bu   $vr4,            $vr12,          $vr4         \n\t"
814cabdff1aSopenharmony_ci    "vavgr.bu   $vr5,            $vr13,          $vr5         \n\t"
815cabdff1aSopenharmony_ci    "vavgr.bu   $vr6,            $vr14,          $vr6         \n\t"
816cabdff1aSopenharmony_ci    "vavgr.bu   $vr7,            $vr15,          $vr7         \n\t"
817cabdff1aSopenharmony_ci
818cabdff1aSopenharmony_ci    "vst        $vr0,            %[dst],         0            \n\t"
819cabdff1aSopenharmony_ci    "vstx       $vr1,            %[dst],         %[dstStride] \n\t"
820cabdff1aSopenharmony_ci    "vstx       $vr2,            %[dst],         %[dstride_2] \n\t"
821cabdff1aSopenharmony_ci    "vstx       $vr3,            %[dst],         %[dstride_3] \n\t"
822cabdff1aSopenharmony_ci    "add.d      %[dst],          %[dst],         %[dstride_4] \n\t"
823cabdff1aSopenharmony_ci    "vst        $vr4,            %[dst],         0            \n\t"
824cabdff1aSopenharmony_ci    "vstx       $vr5,            %[dst],         %[dstStride] \n\t"
825cabdff1aSopenharmony_ci    "vstx       $vr6,            %[dst],         %[dstride_2] \n\t"
826cabdff1aSopenharmony_ci    "vstx       $vr7,            %[dst],         %[dstride_3] \n\t"
827cabdff1aSopenharmony_ci    : [dst]"+&r"(dst), [half]"+&r"(half), [src]"+&r"(src),
828cabdff1aSopenharmony_ci      [stride_2]"=&r"(stride_2),  [stride_3]"=&r"(stride_3),
829cabdff1aSopenharmony_ci      [stride_4]"=&r"(stride_4),  [dstride_2]"=&r"(dstride_2),
830cabdff1aSopenharmony_ci      [dstride_3]"=&r"(dstride_3), [dstride_4]"=&r"(dstride_4)
831cabdff1aSopenharmony_ci    : [dstStride]"r"(dstStride), [srcStride]"r"(srcStride)
    /* $vr0-$vr15 are written above and overlap $f0-$f15 */
832cabdff1aSopenharmony_ci    : "memory",
      "$f0", "$f1", "$f2",  "$f3",  "$f4",  "$f5",  "$f6",  "$f7",
      "$f8", "$f9", "$f10", "$f11", "$f12", "$f13", "$f14", "$f15"
833cabdff1aSopenharmony_ci    );
834cabdff1aSopenharmony_ci}
835cabdff1aSopenharmony_ci
836cabdff1aSopenharmony_ci/* avg_pixels16_8_lsx    : dst = avg(src, dst)
837cabdff1aSopenharmony_ci * put_pixels16_l2_8_lsx: dst = avg(src, half) , half stride is 8.
838cabdff1aSopenharmony_ci * avg_pixels16_l2_8_lsx: dst = avg(avg(src, half), dst) , half stride is 8.*/
839cabdff1aSopenharmony_cistatic av_always_inline void
840cabdff1aSopenharmony_ciavg_pixels16_l2_8_lsx(uint8_t *dst, const uint8_t *src, uint8_t *half,
841cabdff1aSopenharmony_ci                      ptrdiff_t dstStride, ptrdiff_t srcStride)
842cabdff1aSopenharmony_ci{
843cabdff1aSopenharmony_ci    uint8_t *tmp = dst;
844cabdff1aSopenharmony_ci    ptrdiff_t stride_2, stride_3, stride_4;
845cabdff1aSopenharmony_ci    ptrdiff_t dstride_2, dstride_3, dstride_4;
846cabdff1aSopenharmony_ci    __asm__ volatile (
847cabdff1aSopenharmony_ci    "slli.d     %[stride_2],     %[srcStride],   1            \n\t"
848cabdff1aSopenharmony_ci    "add.d      %[stride_3],     %[stride_2],    %[srcStride] \n\t"
849cabdff1aSopenharmony_ci    "slli.d     %[stride_4],     %[stride_2],    1            \n\t"
850cabdff1aSopenharmony_ci    "slli.d     %[dstride_2],    %[dstStride],   1            \n\t"
851cabdff1aSopenharmony_ci    "add.d      %[dstride_3],    %[dstride_2],   %[dstStride] \n\t"
852cabdff1aSopenharmony_ci    "slli.d     %[dstride_4],    %[dstride_2],   1            \n\t"
853cabdff1aSopenharmony_ci    /* h0~h7 */
854cabdff1aSopenharmony_ci    "vld        $vr0,            %[src],         0            \n\t"
855cabdff1aSopenharmony_ci    "vldx       $vr1,            %[src],         %[srcStride] \n\t"
856cabdff1aSopenharmony_ci    "vldx       $vr2,            %[src],         %[stride_2]  \n\t"
857cabdff1aSopenharmony_ci    "vldx       $vr3,            %[src],         %[stride_3]  \n\t"
858cabdff1aSopenharmony_ci    "add.d      %[src],          %[src],         %[stride_4]  \n\t"
859cabdff1aSopenharmony_ci    "vld        $vr4,            %[src],         0            \n\t"
860cabdff1aSopenharmony_ci    "vldx       $vr5,            %[src],         %[srcStride] \n\t"
861cabdff1aSopenharmony_ci    "vldx       $vr6,            %[src],         %[stride_2]  \n\t"
862cabdff1aSopenharmony_ci    "vldx       $vr7,            %[src],         %[stride_3]  \n\t"
863cabdff1aSopenharmony_ci    "add.d      %[src],          %[src],         %[stride_4]  \n\t"
864cabdff1aSopenharmony_ci
865cabdff1aSopenharmony_ci    "vld        $vr8,            %[half],        0x00         \n\t"
866cabdff1aSopenharmony_ci    "vld        $vr9,            %[half],        0x10         \n\t"
867cabdff1aSopenharmony_ci    "vld        $vr10,           %[half],        0x20         \n\t"
868cabdff1aSopenharmony_ci    "vld        $vr11,           %[half],        0x30         \n\t"
869cabdff1aSopenharmony_ci    "vld        $vr12,           %[half],        0x40         \n\t"
870cabdff1aSopenharmony_ci    "vld        $vr13,           %[half],        0x50         \n\t"
871cabdff1aSopenharmony_ci    "vld        $vr14,           %[half],        0x60         \n\t"
872cabdff1aSopenharmony_ci    "vld        $vr15,           %[half],        0x70         \n\t"
873cabdff1aSopenharmony_ci
874cabdff1aSopenharmony_ci    "vavgr.bu   $vr0,            $vr8,           $vr0         \n\t"
875cabdff1aSopenharmony_ci    "vavgr.bu   $vr1,            $vr9,           $vr1         \n\t"
876cabdff1aSopenharmony_ci    "vavgr.bu   $vr2,            $vr10,          $vr2         \n\t"
877cabdff1aSopenharmony_ci    "vavgr.bu   $vr3,            $vr11,          $vr3         \n\t"
878cabdff1aSopenharmony_ci    "vavgr.bu   $vr4,            $vr12,          $vr4         \n\t"
879cabdff1aSopenharmony_ci    "vavgr.bu   $vr5,            $vr13,          $vr5         \n\t"
880cabdff1aSopenharmony_ci    "vavgr.bu   $vr6,            $vr14,          $vr6         \n\t"
881cabdff1aSopenharmony_ci    "vavgr.bu   $vr7,            $vr15,          $vr7         \n\t"
882cabdff1aSopenharmony_ci
883cabdff1aSopenharmony_ci    "vld        $vr8,            %[tmp],         0            \n\t"
884cabdff1aSopenharmony_ci    "vldx       $vr9,            %[tmp],         %[dstStride] \n\t"
885cabdff1aSopenharmony_ci    "vldx       $vr10,           %[tmp],         %[dstride_2] \n\t"
886cabdff1aSopenharmony_ci    "vldx       $vr11,           %[tmp],         %[dstride_3] \n\t"
887cabdff1aSopenharmony_ci    "add.d      %[tmp],          %[tmp],         %[dstride_4] \n\t"
888cabdff1aSopenharmony_ci    "vld        $vr12,           %[tmp],         0            \n\t"
889cabdff1aSopenharmony_ci    "vldx       $vr13,           %[tmp],         %[dstStride] \n\t"
890cabdff1aSopenharmony_ci    "vldx       $vr14,           %[tmp],         %[dstride_2] \n\t"
891cabdff1aSopenharmony_ci    "vldx       $vr15,           %[tmp],         %[dstride_3] \n\t"
892cabdff1aSopenharmony_ci    "add.d      %[tmp],          %[tmp],         %[dstride_4] \n\t"
893cabdff1aSopenharmony_ci
894cabdff1aSopenharmony_ci    "vavgr.bu    $vr0,           $vr8,           $vr0         \n\t"
895cabdff1aSopenharmony_ci    "vavgr.bu    $vr1,           $vr9,           $vr1         \n\t"
896cabdff1aSopenharmony_ci    "vavgr.bu    $vr2,           $vr10,          $vr2         \n\t"
897cabdff1aSopenharmony_ci    "vavgr.bu    $vr3,           $vr11,          $vr3         \n\t"
898cabdff1aSopenharmony_ci    "vavgr.bu    $vr4,           $vr12,          $vr4         \n\t"
899cabdff1aSopenharmony_ci    "vavgr.bu    $vr5,           $vr13,          $vr5         \n\t"
900cabdff1aSopenharmony_ci    "vavgr.bu    $vr6,           $vr14,          $vr6         \n\t"
901cabdff1aSopenharmony_ci    "vavgr.bu    $vr7,           $vr15,          $vr7         \n\t"
902cabdff1aSopenharmony_ci
903cabdff1aSopenharmony_ci    "vst        $vr0,            %[dst],         0            \n\t"
904cabdff1aSopenharmony_ci    "vstx       $vr1,            %[dst],         %[dstStride] \n\t"
905cabdff1aSopenharmony_ci    "vstx       $vr2,            %[dst],         %[dstride_2] \n\t"
906cabdff1aSopenharmony_ci    "vstx       $vr3,            %[dst],         %[dstride_3] \n\t"
907cabdff1aSopenharmony_ci    "add.d      %[dst],          %[dst],         %[dstride_4] \n\t"
908cabdff1aSopenharmony_ci    "vst        $vr4,            %[dst],         0            \n\t"
909cabdff1aSopenharmony_ci    "vstx       $vr5,            %[dst],         %[dstStride] \n\t"
910cabdff1aSopenharmony_ci    "vstx       $vr6,            %[dst],         %[dstride_2] \n\t"
911cabdff1aSopenharmony_ci    "vstx       $vr7,            %[dst],         %[dstride_3] \n\t"
912cabdff1aSopenharmony_ci    "add.d      %[dst],          %[dst],         %[dstride_4] \n\t"
913cabdff1aSopenharmony_ci
914cabdff1aSopenharmony_ci    /* h8~h15    */
915cabdff1aSopenharmony_ci    "vld        $vr0,            %[src],         0            \n\t"
916cabdff1aSopenharmony_ci    "vldx       $vr1,            %[src],         %[srcStride] \n\t"
917cabdff1aSopenharmony_ci    "vldx       $vr2,            %[src],         %[stride_2]  \n\t"
918cabdff1aSopenharmony_ci    "vldx       $vr3,            %[src],         %[stride_3]  \n\t"
919cabdff1aSopenharmony_ci    "add.d      %[src],          %[src],         %[stride_4]  \n\t"
920cabdff1aSopenharmony_ci    "vld        $vr4,            %[src],         0            \n\t"
921cabdff1aSopenharmony_ci    "vldx       $vr5,            %[src],         %[srcStride] \n\t"
922cabdff1aSopenharmony_ci    "vldx       $vr6,            %[src],         %[stride_2]  \n\t"
923cabdff1aSopenharmony_ci    "vldx       $vr7,            %[src],         %[stride_3]  \n\t"
924cabdff1aSopenharmony_ci
925cabdff1aSopenharmony_ci    "vld        $vr8,            %[half],        0x80         \n\t"
926cabdff1aSopenharmony_ci    "vld        $vr9,            %[half],        0x90         \n\t"
927cabdff1aSopenharmony_ci    "vld        $vr10,           %[half],        0xa0         \n\t"
928cabdff1aSopenharmony_ci    "vld        $vr11,           %[half],        0xb0         \n\t"
929cabdff1aSopenharmony_ci    "vld        $vr12,           %[half],        0xc0         \n\t"
930cabdff1aSopenharmony_ci    "vld        $vr13,           %[half],        0xd0         \n\t"
931cabdff1aSopenharmony_ci    "vld        $vr14,           %[half],        0xe0         \n\t"
932cabdff1aSopenharmony_ci    "vld        $vr15,           %[half],        0xf0         \n\t"
933cabdff1aSopenharmony_ci
934cabdff1aSopenharmony_ci    "vavgr.bu    $vr0,           $vr8,           $vr0         \n\t"
935cabdff1aSopenharmony_ci    "vavgr.bu    $vr1,           $vr9,           $vr1         \n\t"
936cabdff1aSopenharmony_ci    "vavgr.bu    $vr2,           $vr10,          $vr2         \n\t"
937cabdff1aSopenharmony_ci    "vavgr.bu    $vr3,           $vr11,          $vr3         \n\t"
938cabdff1aSopenharmony_ci    "vavgr.bu    $vr4,           $vr12,          $vr4         \n\t"
939cabdff1aSopenharmony_ci    "vavgr.bu    $vr5,           $vr13,          $vr5         \n\t"
940cabdff1aSopenharmony_ci    "vavgr.bu    $vr6,           $vr14,          $vr6         \n\t"
941cabdff1aSopenharmony_ci    "vavgr.bu    $vr7,           $vr15,          $vr7         \n\t"
942cabdff1aSopenharmony_ci
943cabdff1aSopenharmony_ci    "vld        $vr8,            %[tmp],         0            \n\t"
944cabdff1aSopenharmony_ci    "vldx       $vr9,            %[tmp],         %[dstStride] \n\t"
945cabdff1aSopenharmony_ci    "vldx       $vr10,           %[tmp],         %[dstride_2] \n\t"
946cabdff1aSopenharmony_ci    "vldx       $vr11,           %[tmp],         %[dstride_3] \n\t"
947cabdff1aSopenharmony_ci    "add.d      %[tmp],          %[tmp],         %[dstride_4] \n\t"
948cabdff1aSopenharmony_ci    "vld        $vr12,           %[tmp],         0            \n\t"
949cabdff1aSopenharmony_ci    "vldx       $vr13,           %[tmp],         %[dstStride] \n\t"
950cabdff1aSopenharmony_ci    "vldx       $vr14,           %[tmp],         %[dstride_2] \n\t"
951cabdff1aSopenharmony_ci    "vldx       $vr15,           %[tmp],         %[dstride_3] \n\t"
952cabdff1aSopenharmony_ci
953cabdff1aSopenharmony_ci    "vavgr.bu    $vr0,           $vr8,           $vr0         \n\t"
954cabdff1aSopenharmony_ci    "vavgr.bu    $vr1,           $vr9,           $vr1         \n\t"
955cabdff1aSopenharmony_ci    "vavgr.bu    $vr2,           $vr10,          $vr2         \n\t"
956cabdff1aSopenharmony_ci    "vavgr.bu    $vr3,           $vr11,          $vr3         \n\t"
957cabdff1aSopenharmony_ci    "vavgr.bu    $vr4,           $vr12,          $vr4         \n\t"
958cabdff1aSopenharmony_ci    "vavgr.bu    $vr5,           $vr13,          $vr5         \n\t"
959cabdff1aSopenharmony_ci    "vavgr.bu    $vr6,           $vr14,          $vr6         \n\t"
960cabdff1aSopenharmony_ci    "vavgr.bu    $vr7,           $vr15,          $vr7         \n\t"
961cabdff1aSopenharmony_ci
962cabdff1aSopenharmony_ci    "vst        $vr0,            %[dst],         0            \n\t"
963cabdff1aSopenharmony_ci    "vstx       $vr1,            %[dst],         %[dstStride] \n\t"
964cabdff1aSopenharmony_ci    "vstx       $vr2,            %[dst],         %[dstride_2] \n\t"
965cabdff1aSopenharmony_ci    "vstx       $vr3,            %[dst],         %[dstride_3] \n\t"
966cabdff1aSopenharmony_ci    "add.d      %[dst],          %[dst],         %[dstride_4] \n\t"
967cabdff1aSopenharmony_ci    "vst        $vr4,            %[dst],         0            \n\t"
968cabdff1aSopenharmony_ci    "vstx       $vr5,            %[dst],         %[dstStride] \n\t"
969cabdff1aSopenharmony_ci    "vstx       $vr6,            %[dst],         %[dstride_2] \n\t"
970cabdff1aSopenharmony_ci    "vstx       $vr7,            %[dst],         %[dstride_3] \n\t"
971cabdff1aSopenharmony_ci    : [dst]"+&r"(dst), [tmp]"+&r"(tmp), [half]"+&r"(half), [src]"+&r"(src),
972cabdff1aSopenharmony_ci      [stride_2]"=&r"(stride_2),  [stride_3]"=&r"(stride_3),
973cabdff1aSopenharmony_ci      [stride_4]"=&r"(stride_4),  [dstride_2]"=&r"(dstride_2),
974cabdff1aSopenharmony_ci      [dstride_3]"=&r"(dstride_3), [dstride_4]"=&r"(dstride_4)
975cabdff1aSopenharmony_ci    : [dstStride]"r"(dstStride), [srcStride]"r"(srcStride)
976cabdff1aSopenharmony_ci    : "memory"
977cabdff1aSopenharmony_ci    );
978cabdff1aSopenharmony_ci}
979cabdff1aSopenharmony_ci
/* Horizontal lowpass over two consecutive rows, each read from src - 2.
 * Operates on the invoking function's variables: src / srcStride (src is
 * advanced by two rows), src00..src05, src10, mask1..mask5, h_20, h_5 and
 * h_16.
 * out_v = ((src02 + src03) * h_20 - (src01 + src04) * h_5
 *          + (src00 + src05) + h_16) >> 5, narrowed to unsigned bytes with
 * saturation (src01..src05 being the mask1..mask5 shuffles of src00).
 * The body is braced, like the other lowpass macros in this file, so the
 * expansion is a single compound statement. */
#define QPEL8_H_LOWPASS(out_v)                                               \
{                                                                            \
    /* fetch two rows and merge them into one 256-bit vector */              \
    src00 = __lasx_xvld(src, - 2);                                           \
    src += srcStride;                                                        \
    src10 = __lasx_xvld(src, - 2);                                           \
    src += srcStride;                                                        \
    src00 = __lasx_xvpermi_q(src00, src10, 0x02);                            \
    /* byte-shifted copies selected by mask1..mask5 */                       \
    src01 = __lasx_xvshuf_b(src00, src00, (__m256i)mask1);                   \
    src02 = __lasx_xvshuf_b(src00, src00, (__m256i)mask2);                   \
    src03 = __lasx_xvshuf_b(src00, src00, (__m256i)mask3);                   \
    src04 = __lasx_xvshuf_b(src00, src00, (__m256i)mask4);                   \
    src05 = __lasx_xvshuf_b(src00, src00, (__m256i)mask5);                   \
    /* widening pair sums, then the weighted combination */                  \
    src02 = __lasx_xvaddwl_h_bu(src02, src03);                               \
    src01 = __lasx_xvaddwl_h_bu(src01, src04);                               \
    src00 = __lasx_xvaddwl_h_bu(src00, src05);                               \
    src02 = __lasx_xvmul_h(src02, h_20);                                     \
    src01 = __lasx_xvmul_h(src01, h_5);                                      \
    src02 = __lasx_xvssub_h(src02, src01);                                   \
    src02 = __lasx_xvsadd_h(src02, src00);                                   \
    src02 = __lasx_xvsadd_h(src02, h_16);                                    \
    out_v = __lasx_xvssrani_bu_h(src02, src02, 5);                           \
}
999cabdff1aSopenharmony_ci
1000cabdff1aSopenharmony_cistatic av_always_inline void
1001cabdff1aSopenharmony_ciput_h264_qpel8_h_lowpass_lasx(uint8_t *dst, const uint8_t *src, int dstStride,
1002cabdff1aSopenharmony_ci                              int srcStride)
1003cabdff1aSopenharmony_ci{
1004cabdff1aSopenharmony_ci    int dstStride_2x = dstStride << 1;
1005cabdff1aSopenharmony_ci    __m256i src00, src01, src02, src03, src04, src05, src10;
1006cabdff1aSopenharmony_ci    __m256i out0, out1, out2, out3;
1007cabdff1aSopenharmony_ci    __m256i h_20 = __lasx_xvldi(0x414);
1008cabdff1aSopenharmony_ci    __m256i h_5  = __lasx_xvldi(0x405);
1009cabdff1aSopenharmony_ci    __m256i h_16 = __lasx_xvldi(0x410);
1010cabdff1aSopenharmony_ci    __m256i mask1 = {0x0807060504030201, 0x0, 0x0807060504030201, 0x0};
1011cabdff1aSopenharmony_ci    __m256i mask2 = {0x0908070605040302, 0x0, 0x0908070605040302, 0x0};
1012cabdff1aSopenharmony_ci    __m256i mask3 = {0x0a09080706050403, 0x0, 0x0a09080706050403, 0x0};
1013cabdff1aSopenharmony_ci    __m256i mask4 = {0x0b0a090807060504, 0x0, 0x0b0a090807060504, 0x0};
1014cabdff1aSopenharmony_ci    __m256i mask5 = {0x0c0b0a0908070605, 0x0, 0x0c0b0a0908070605, 0x0};
1015cabdff1aSopenharmony_ci
1016cabdff1aSopenharmony_ci    QPEL8_H_LOWPASS(out0)
1017cabdff1aSopenharmony_ci    QPEL8_H_LOWPASS(out1)
1018cabdff1aSopenharmony_ci    QPEL8_H_LOWPASS(out2)
1019cabdff1aSopenharmony_ci    QPEL8_H_LOWPASS(out3)
1020cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out0, dst, 0, 0);
1021cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out0, dst + dstStride, 0, 2);
1022cabdff1aSopenharmony_ci    dst += dstStride_2x;
1023cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out1, dst, 0, 0);
1024cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out1, dst + dstStride, 0, 2);
1025cabdff1aSopenharmony_ci    dst += dstStride_2x;
1026cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out2, dst, 0, 0);
1027cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out2, dst + dstStride, 0, 2);
1028cabdff1aSopenharmony_ci    dst += dstStride_2x;
1029cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out3, dst, 0, 0);
1030cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out3, dst + dstStride, 0, 2);
1031cabdff1aSopenharmony_ci}
1032cabdff1aSopenharmony_ci
/* Vertical lowpass producing two output rows from seven input rows.
 * tapN is built from rowN and rowN+1 (xvpermi_q), so each tap vector holds
 * the operand for two adjacent output rows. The result
 *   ((tap2 + tap3) * h_20 - (tap1 + tap4) * h_5 + (tap0 + tap5) + h_16) >> 5
 * is narrowed to unsigned bytes with saturation and left in tap2.
 * h_20, h_5 and h_16 are taken from the invoking function. */
#define QPEL8_V_LOWPASS(row0, row1, row2, row3, row4, row5, row6,       \
                        tap0, tap1, tap2, tap3, tap4, tap5)             \
{                                                                       \
    tap0 = __lasx_xvpermi_q(row0, row1, 0x02);                          \
    tap1 = __lasx_xvpermi_q(row1, row2, 0x02);                          \
    tap2 = __lasx_xvpermi_q(row2, row3, 0x02);                          \
    tap3 = __lasx_xvpermi_q(row3, row4, 0x02);                          \
    tap4 = __lasx_xvpermi_q(row4, row5, 0x02);                          \
    tap5 = __lasx_xvpermi_q(row5, row6, 0x02);                          \
    /* widening pair sums */                                            \
    tap2 = __lasx_xvaddwl_h_bu(tap2, tap3);                             \
    tap1 = __lasx_xvaddwl_h_bu(tap1, tap4);                             \
    tap0 = __lasx_xvaddwl_h_bu(tap0, tap5);                             \
    /* weight, combine, round and narrow */                             \
    tap2 = __lasx_xvmul_h(tap2, h_20);                                  \
    tap1 = __lasx_xvmul_h(tap1, h_5);                                   \
    tap2 = __lasx_xvssub_h(tap2, tap1);                                 \
    tap2 = __lasx_xvsadd_h(tap2, tap0);                                 \
    tap2 = __lasx_xvsadd_h(tap2, h_16);                                 \
    tap2 = __lasx_xvssrani_bu_h(tap2, tap2, 5);                         \
}
1051cabdff1aSopenharmony_ci
1052cabdff1aSopenharmony_cistatic av_always_inline void
1053cabdff1aSopenharmony_ciput_h264_qpel8_v_lowpass_lasx(uint8_t *dst, uint8_t *src, int dstStride,
1054cabdff1aSopenharmony_ci                              int srcStride)
1055cabdff1aSopenharmony_ci{
1056cabdff1aSopenharmony_ci    int srcStride_2x = srcStride << 1;
1057cabdff1aSopenharmony_ci    int dstStride_2x = dstStride << 1;
1058cabdff1aSopenharmony_ci    int srcStride_4x = srcStride << 2;
1059cabdff1aSopenharmony_ci    int srcStride_3x = srcStride_2x + srcStride;
1060cabdff1aSopenharmony_ci    __m256i src00, src01, src02, src03, src04, src05, src06;
1061cabdff1aSopenharmony_ci    __m256i src07, src08, src09, src10, src11, src12;
1062cabdff1aSopenharmony_ci    __m256i tmp00, tmp01, tmp02, tmp03, tmp04, tmp05;
1063cabdff1aSopenharmony_ci    __m256i h_20 = __lasx_xvldi(0x414);
1064cabdff1aSopenharmony_ci    __m256i h_5  = __lasx_xvldi(0x405);
1065cabdff1aSopenharmony_ci    __m256i h_16 = __lasx_xvldi(0x410);
1066cabdff1aSopenharmony_ci
1067cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvld, src - srcStride_2x, 0, src - srcStride, 0,
1068cabdff1aSopenharmony_ci              src00, src01);
1069cabdff1aSopenharmony_ci    src02 = __lasx_xvld(src, 0);
1070cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, src, srcStride, src, srcStride_2x, src,
1071cabdff1aSopenharmony_ci              srcStride_3x, src, srcStride_4x, src03, src04, src05, src06);
1072cabdff1aSopenharmony_ci    src += srcStride_4x;
1073cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, src, srcStride, src, srcStride_2x, src,
1074cabdff1aSopenharmony_ci              srcStride_3x, src, srcStride_4x, src07, src08, src09, src10);
1075cabdff1aSopenharmony_ci    src += srcStride_4x;
1076cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvldx, src, srcStride, src, srcStride_2x, src11, src12);
1077cabdff1aSopenharmony_ci
1078cabdff1aSopenharmony_ci    QPEL8_V_LOWPASS(src00, src01, src02, src03, src04, src05, src06,
1079cabdff1aSopenharmony_ci                    tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
1080cabdff1aSopenharmony_ci    __lasx_xvstelm_d(tmp02, dst, 0, 0);
1081cabdff1aSopenharmony_ci    __lasx_xvstelm_d(tmp02, dst + dstStride, 0, 2);
1082cabdff1aSopenharmony_ci    dst += dstStride_2x;
1083cabdff1aSopenharmony_ci    QPEL8_V_LOWPASS(src02, src03, src04, src05, src06, src07, src08,
1084cabdff1aSopenharmony_ci                    tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
1085cabdff1aSopenharmony_ci    __lasx_xvstelm_d(tmp02, dst, 0, 0);
1086cabdff1aSopenharmony_ci    __lasx_xvstelm_d(tmp02, dst + dstStride, 0, 2);
1087cabdff1aSopenharmony_ci    dst += dstStride_2x;
1088cabdff1aSopenharmony_ci    QPEL8_V_LOWPASS(src04, src05, src06, src07, src08, src09, src10,
1089cabdff1aSopenharmony_ci                    tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
1090cabdff1aSopenharmony_ci    __lasx_xvstelm_d(tmp02, dst, 0, 0);
1091cabdff1aSopenharmony_ci    __lasx_xvstelm_d(tmp02, dst + dstStride, 0, 2);
1092cabdff1aSopenharmony_ci    dst += dstStride_2x;
1093cabdff1aSopenharmony_ci    QPEL8_V_LOWPASS(src06, src07, src08, src09, src10, src11, src12,
1094cabdff1aSopenharmony_ci                    tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
1095cabdff1aSopenharmony_ci    __lasx_xvstelm_d(tmp02, dst, 0, 0);
1096cabdff1aSopenharmony_ci    __lasx_xvstelm_d(tmp02, dst + dstStride, 0, 2);
1097cabdff1aSopenharmony_ci}
1098cabdff1aSopenharmony_ci
1099cabdff1aSopenharmony_cistatic av_always_inline void
1100cabdff1aSopenharmony_ciavg_h264_qpel8_v_lowpass_lasx(uint8_t *dst, uint8_t *src, int dstStride,
1101cabdff1aSopenharmony_ci                              int srcStride)
1102cabdff1aSopenharmony_ci{
1103cabdff1aSopenharmony_ci    int srcStride_2x = srcStride << 1;
1104cabdff1aSopenharmony_ci    int srcStride_4x = srcStride << 2;
1105cabdff1aSopenharmony_ci    int dstStride_2x = dstStride << 1;
1106cabdff1aSopenharmony_ci    int dstStride_4x = dstStride << 2;
1107cabdff1aSopenharmony_ci    int srcStride_3x = srcStride_2x + srcStride;
1108cabdff1aSopenharmony_ci    int dstStride_3x = dstStride_2x + dstStride;
1109cabdff1aSopenharmony_ci    __m256i src00, src01, src02, src03, src04, src05, src06;
1110cabdff1aSopenharmony_ci    __m256i src07, src08, src09, src10, src11, src12, tmp00;
1111cabdff1aSopenharmony_ci    __m256i tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07, tmp08, tmp09;
1112cabdff1aSopenharmony_ci    __m256i h_20 = __lasx_xvldi(0x414);
1113cabdff1aSopenharmony_ci    __m256i h_5  = __lasx_xvldi(0x405);
1114cabdff1aSopenharmony_ci    __m256i h_16 = __lasx_xvldi(0x410);
1115cabdff1aSopenharmony_ci
1116cabdff1aSopenharmony_ci
1117cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvld, src - srcStride_2x, 0, src - srcStride, 0,
1118cabdff1aSopenharmony_ci              src00, src01);
1119cabdff1aSopenharmony_ci    src02 = __lasx_xvld(src, 0);
1120cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, src, srcStride, src, srcStride_2x, src,
1121cabdff1aSopenharmony_ci              srcStride_3x, src, srcStride_4x, src03, src04, src05, src06);
1122cabdff1aSopenharmony_ci    src += srcStride_4x;
1123cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, src, srcStride, src, srcStride_2x, src,
1124cabdff1aSopenharmony_ci              srcStride_3x, src, srcStride_4x, src07, src08, src09, src10);
1125cabdff1aSopenharmony_ci    src += srcStride_4x;
1126cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvldx, src, srcStride, src, srcStride_2x, src11, src12);
1127cabdff1aSopenharmony_ci
1128cabdff1aSopenharmony_ci    tmp06 = __lasx_xvld(dst, 0);
1129cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x,
1130cabdff1aSopenharmony_ci              dst, dstStride_3x, dst, dstStride_4x,
1131cabdff1aSopenharmony_ci              tmp07, tmp02, tmp03, tmp04);
1132cabdff1aSopenharmony_ci    dst += dstStride_4x;
1133cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x,
1134cabdff1aSopenharmony_ci              tmp05, tmp00);
1135cabdff1aSopenharmony_ci    tmp01 = __lasx_xvldx(dst, dstStride_3x);
1136cabdff1aSopenharmony_ci    dst -= dstStride_4x;
1137cabdff1aSopenharmony_ci
1138cabdff1aSopenharmony_ci    tmp06 = __lasx_xvpermi_q(tmp06, tmp07, 0x02);
1139cabdff1aSopenharmony_ci    tmp07 = __lasx_xvpermi_q(tmp02, tmp03, 0x02);
1140cabdff1aSopenharmony_ci    tmp08 = __lasx_xvpermi_q(tmp04, tmp05, 0x02);
1141cabdff1aSopenharmony_ci    tmp09 = __lasx_xvpermi_q(tmp00, tmp01, 0x02);
1142cabdff1aSopenharmony_ci
1143cabdff1aSopenharmony_ci    QPEL8_V_LOWPASS(src00, src01, src02, src03, src04, src05, src06,
1144cabdff1aSopenharmony_ci                    tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
1145cabdff1aSopenharmony_ci    tmp06 = __lasx_xvavgr_bu(tmp06, tmp02);
1146cabdff1aSopenharmony_ci    __lasx_xvstelm_d(tmp06, dst, 0, 0);
1147cabdff1aSopenharmony_ci    __lasx_xvstelm_d(tmp06, dst + dstStride, 0, 2);
1148cabdff1aSopenharmony_ci    dst += dstStride_2x;
1149cabdff1aSopenharmony_ci    QPEL8_V_LOWPASS(src02, src03, src04, src05, src06, src07, src08,
1150cabdff1aSopenharmony_ci                    tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
1151cabdff1aSopenharmony_ci    tmp07 = __lasx_xvavgr_bu(tmp07, tmp02);
1152cabdff1aSopenharmony_ci    __lasx_xvstelm_d(tmp07, dst, 0, 0);
1153cabdff1aSopenharmony_ci    __lasx_xvstelm_d(tmp07, dst + dstStride, 0, 2);
1154cabdff1aSopenharmony_ci    dst += dstStride_2x;
1155cabdff1aSopenharmony_ci    QPEL8_V_LOWPASS(src04, src05, src06, src07, src08, src09, src10,
1156cabdff1aSopenharmony_ci                    tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
1157cabdff1aSopenharmony_ci    tmp08 = __lasx_xvavgr_bu(tmp08, tmp02);
1158cabdff1aSopenharmony_ci    __lasx_xvstelm_d(tmp08, dst, 0, 0);
1159cabdff1aSopenharmony_ci    __lasx_xvstelm_d(tmp08, dst + dstStride, 0, 2);
1160cabdff1aSopenharmony_ci    dst += dstStride_2x;
1161cabdff1aSopenharmony_ci    QPEL8_V_LOWPASS(src06, src07, src08, src09, src10, src11, src12,
1162cabdff1aSopenharmony_ci                    tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
1163cabdff1aSopenharmony_ci    tmp09 = __lasx_xvavgr_bu(tmp09, tmp02);
1164cabdff1aSopenharmony_ci    __lasx_xvstelm_d(tmp09, dst, 0, 0);
1165cabdff1aSopenharmony_ci    __lasx_xvstelm_d(tmp09, dst + dstStride, 0, 2);
1166cabdff1aSopenharmony_ci}
1167cabdff1aSopenharmony_ci
/* Horizontal stage of the combined horizontal + vertical lowpass, for two
 * consecutive rows read from src - 2 (src is advanced by two rows).
 * Uses the invoking function's src, srcStride, src00..src05, src10,
 * mask1..mask5, h_20 and h_5.
 * sum = (src02 + src03) * h_20 - (src01 + src04) * h_5 + (src00 + src05),
 * kept as 16-bit values: no rounding term, shift or narrowing is applied
 * here. */
#define QPEL8_HV_LOWPASS_H(sum)                                              \
{                                                                            \
    /* fetch two rows and merge them into one 256-bit vector */              \
    src00 = __lasx_xvld(src, -2);                                            \
    src += srcStride;                                                        \
    src10 = __lasx_xvld(src, -2);                                            \
    src += srcStride;                                                        \
    src00 = __lasx_xvpermi_q(src00, src10, 0x02);                            \
    /* byte-shifted copies selected by mask1..mask5 */                       \
    src01 = __lasx_xvshuf_b(src00, src00, (__m256i)mask1);                   \
    src02 = __lasx_xvshuf_b(src00, src00, (__m256i)mask2);                   \
    src03 = __lasx_xvshuf_b(src00, src00, (__m256i)mask3);                   \
    src04 = __lasx_xvshuf_b(src00, src00, (__m256i)mask4);                   \
    src05 = __lasx_xvshuf_b(src00, src00, (__m256i)mask5);                   \
    /* widening pair sums, then the weighted combination */                  \
    src02 = __lasx_xvaddwl_h_bu(src02, src03);                               \
    src01 = __lasx_xvaddwl_h_bu(src01, src04);                               \
    src00 = __lasx_xvaddwl_h_bu(src00, src05);                               \
    src02 = __lasx_xvmul_h(src02, h_20);                                     \
    src01 = __lasx_xvmul_h(src01, h_5);                                      \
    src02 = __lasx_xvssub_h(src02, src01);                                   \
    sum  = __lasx_xvsadd_h(src02, src00);                                    \
}
1187cabdff1aSopenharmony_ci
/* Vertical stage of the combined lowpass, working on six vectors of 16-bit
 * values (row0..row5). Sums are widened to 32 bit in two parts (xvaddwl /
 * xvaddwh, the *_l and *_h scratch vectors):
 *   ((row2 + row3) * w_20 - (row1 + row4) * w_5 + (row0 + row5) + w_512) >> 10
 * The two parts are narrowed with unsigned saturation, recombined with
 * xvpackev_d and narrowed once more to bytes into res.
 * w_20, w_5 and w_512 are taken from the invoking function. res is written
 * last, after every read of the row arguments. */
#define QPEL8_HV_LOWPASS_V(row0, row1, row2, row3,                       \
                           row4, row5, cen_l, cen_h,                     \
                           mid_l, mid_h, end_l, end_h,                   \
                           res)                                          \
{                                                                        \
    /* widening pair sums */                                             \
    cen_l = __lasx_xvaddwl_w_h(row2, row3);                              \
    mid_l = __lasx_xvaddwl_w_h(row1, row4);                              \
    cen_h = __lasx_xvaddwh_w_h(row2, row3);                              \
    mid_h = __lasx_xvaddwh_w_h(row1, row4);                              \
    end_l = __lasx_xvaddwl_w_h(row0, row5);                              \
    end_h = __lasx_xvaddwh_w_h(row0, row5);                              \
    /* weight and combine */                                             \
    cen_l = __lasx_xvmul_w(cen_l, w_20);                                 \
    cen_h = __lasx_xvmul_w(cen_h, w_20);                                 \
    mid_l = __lasx_xvmul_w(mid_l, w_5);                                  \
    mid_h = __lasx_xvmul_w(mid_h, w_5);                                  \
    cen_l = __lasx_xvssub_w(cen_l, mid_l);                               \
    cen_h = __lasx_xvssub_w(cen_h, mid_h);                               \
    cen_l = __lasx_xvsadd_w(cen_l, end_l);                               \
    cen_h = __lasx_xvsadd_w(cen_h, end_h);                               \
    /* round, shift and narrow */                                        \
    cen_l = __lasx_xvsadd_w(cen_l, w_512);                               \
    cen_h = __lasx_xvsadd_w(cen_h, w_512);                               \
    cen_l = __lasx_xvssrani_hu_w(cen_l, cen_l, 10);                      \
    cen_h = __lasx_xvssrani_hu_w(cen_h, cen_h, 10);                      \
    cen_l = __lasx_xvpackev_d(cen_h, cen_l);                             \
    res   = __lasx_xvssrani_bu_h(cen_l, cen_l, 0);                       \
}
1212cabdff1aSopenharmony_ci
1213cabdff1aSopenharmony_cistatic av_always_inline void
1214cabdff1aSopenharmony_ciput_h264_qpel8_hv_lowpass_lasx(uint8_t *dst, const uint8_t *src,
1215cabdff1aSopenharmony_ci                               ptrdiff_t dstStride, ptrdiff_t srcStride)
1216cabdff1aSopenharmony_ci{
1217cabdff1aSopenharmony_ci    __m256i src00, src01, src02, src03, src04, src05, src10;
1218cabdff1aSopenharmony_ci    __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
1219cabdff1aSopenharmony_ci    __m256i tmp7, tmp8, tmp9, tmp10, tmp11, tmp12;
1220cabdff1aSopenharmony_ci    __m256i h_20 = __lasx_xvldi(0x414);
1221cabdff1aSopenharmony_ci    __m256i h_5  = __lasx_xvldi(0x405);
1222cabdff1aSopenharmony_ci    __m256i w_20 = __lasx_xvldi(0x814);
1223cabdff1aSopenharmony_ci    __m256i w_5  = __lasx_xvldi(0x805);
1224cabdff1aSopenharmony_ci    __m256i w_512 = {512};
1225cabdff1aSopenharmony_ci    __m256i mask1 = {0x0807060504030201, 0x0, 0x0807060504030201, 0x0};
1226cabdff1aSopenharmony_ci    __m256i mask2 = {0x0908070605040302, 0x0, 0x0908070605040302, 0x0};
1227cabdff1aSopenharmony_ci    __m256i mask3 = {0x0a09080706050403, 0x0, 0x0a09080706050403, 0x0};
1228cabdff1aSopenharmony_ci    __m256i mask4 = {0x0b0a090807060504, 0x0, 0x0b0a090807060504, 0x0};
1229cabdff1aSopenharmony_ci    __m256i mask5 = {0x0c0b0a0908070605, 0x0, 0x0c0b0a0908070605, 0x0};
1230cabdff1aSopenharmony_ci
1231cabdff1aSopenharmony_ci    w_512 = __lasx_xvreplve0_w(w_512);
1232cabdff1aSopenharmony_ci
1233cabdff1aSopenharmony_ci    src -= srcStride << 1;
1234cabdff1aSopenharmony_ci    QPEL8_HV_LOWPASS_H(tmp0)
1235cabdff1aSopenharmony_ci    QPEL8_HV_LOWPASS_H(tmp2)
1236cabdff1aSopenharmony_ci    QPEL8_HV_LOWPASS_H(tmp4)
1237cabdff1aSopenharmony_ci    QPEL8_HV_LOWPASS_H(tmp6)
1238cabdff1aSopenharmony_ci    QPEL8_HV_LOWPASS_H(tmp8)
1239cabdff1aSopenharmony_ci    QPEL8_HV_LOWPASS_H(tmp10)
1240cabdff1aSopenharmony_ci    QPEL8_HV_LOWPASS_H(tmp12)
1241cabdff1aSopenharmony_ci    tmp11 = __lasx_xvpermi_q(tmp12, tmp10, 0x21);
1242cabdff1aSopenharmony_ci    tmp9  = __lasx_xvpermi_q(tmp10, tmp8,  0x21);
1243cabdff1aSopenharmony_ci    tmp7  = __lasx_xvpermi_q(tmp8,  tmp6,  0x21);
1244cabdff1aSopenharmony_ci    tmp5  = __lasx_xvpermi_q(tmp6,  tmp4,  0x21);
1245cabdff1aSopenharmony_ci    tmp3  = __lasx_xvpermi_q(tmp4,  tmp2,  0x21);
1246cabdff1aSopenharmony_ci    tmp1  = __lasx_xvpermi_q(tmp2,  tmp0,  0x21);
1247cabdff1aSopenharmony_ci
1248cabdff1aSopenharmony_ci    QPEL8_HV_LOWPASS_V(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, src00, src01,
1249cabdff1aSopenharmony_ci                       src02, src03, src04, src05, tmp0)
1250cabdff1aSopenharmony_ci    QPEL8_HV_LOWPASS_V(tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src00, src01,
1251cabdff1aSopenharmony_ci                       src02, src03, src04, src05, tmp2)
1252cabdff1aSopenharmony_ci    QPEL8_HV_LOWPASS_V(tmp4, tmp5, tmp6, tmp7, tmp8, tmp9, src00, src01,
1253cabdff1aSopenharmony_ci                       src02, src03, src04, src05, tmp4)
1254cabdff1aSopenharmony_ci    QPEL8_HV_LOWPASS_V(tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, src00, src01,
1255cabdff1aSopenharmony_ci                       src02, src03, src04, src05, tmp6)
1256cabdff1aSopenharmony_ci    __lasx_xvstelm_d(tmp0, dst, 0, 0);
1257cabdff1aSopenharmony_ci    dst += dstStride;
1258cabdff1aSopenharmony_ci    __lasx_xvstelm_d(tmp0, dst, 0, 2);
1259cabdff1aSopenharmony_ci    dst += dstStride;
1260cabdff1aSopenharmony_ci    __lasx_xvstelm_d(tmp2, dst, 0, 0);
1261cabdff1aSopenharmony_ci    dst += dstStride;
1262cabdff1aSopenharmony_ci    __lasx_xvstelm_d(tmp2, dst, 0, 2);
1263cabdff1aSopenharmony_ci    dst += dstStride;
1264cabdff1aSopenharmony_ci    __lasx_xvstelm_d(tmp4, dst, 0, 0);
1265cabdff1aSopenharmony_ci    dst += dstStride;
1266cabdff1aSopenharmony_ci    __lasx_xvstelm_d(tmp4, dst, 0, 2);
1267cabdff1aSopenharmony_ci    dst += dstStride;
1268cabdff1aSopenharmony_ci    __lasx_xvstelm_d(tmp6, dst, 0, 0);
1269cabdff1aSopenharmony_ci    dst += dstStride;
1270cabdff1aSopenharmony_ci    __lasx_xvstelm_d(tmp6, dst, 0, 2);
1271cabdff1aSopenharmony_ci}
1272cabdff1aSopenharmony_ci
1273cabdff1aSopenharmony_cistatic av_always_inline void
1274cabdff1aSopenharmony_ciavg_h264_qpel8_h_lowpass_lasx(uint8_t *dst, const uint8_t *src, int dstStride,
1275cabdff1aSopenharmony_ci                              int srcStride)
1276cabdff1aSopenharmony_ci{
1277cabdff1aSopenharmony_ci    int dstStride_2x = dstStride << 1;
1278cabdff1aSopenharmony_ci    int dstStride_4x = dstStride << 2;
1279cabdff1aSopenharmony_ci    int dstStride_3x = dstStride_2x + dstStride;
1280cabdff1aSopenharmony_ci    __m256i src00, src01, src02, src03, src04, src05, src10;
1281cabdff1aSopenharmony_ci    __m256i dst00, dst01, dst0, dst1, dst2, dst3;
1282cabdff1aSopenharmony_ci    __m256i out0, out1, out2, out3;
1283cabdff1aSopenharmony_ci    __m256i h_20 = __lasx_xvldi(0x414);
1284cabdff1aSopenharmony_ci    __m256i h_5  = __lasx_xvldi(0x405);
1285cabdff1aSopenharmony_ci    __m256i h_16 = __lasx_xvldi(0x410);
1286cabdff1aSopenharmony_ci    __m256i mask1 = {0x0807060504030201, 0x0, 0x0807060504030201, 0x0};
1287cabdff1aSopenharmony_ci    __m256i mask2 = {0x0908070605040302, 0x0, 0x0908070605040302, 0x0};
1288cabdff1aSopenharmony_ci    __m256i mask3 = {0x0a09080706050403, 0x0, 0x0a09080706050403, 0x0};
1289cabdff1aSopenharmony_ci    __m256i mask4 = {0x0b0a090807060504, 0x0, 0x0b0a090807060504, 0x0};
1290cabdff1aSopenharmony_ci    __m256i mask5 = {0x0c0b0a0908070605, 0x0, 0x0c0b0a0908070605, 0x0};
1291cabdff1aSopenharmony_ci
1292cabdff1aSopenharmony_ci    QPEL8_H_LOWPASS(out0)
1293cabdff1aSopenharmony_ci    QPEL8_H_LOWPASS(out1)
1294cabdff1aSopenharmony_ci    QPEL8_H_LOWPASS(out2)
1295cabdff1aSopenharmony_ci    QPEL8_H_LOWPASS(out3)
1296cabdff1aSopenharmony_ci    src00 = __lasx_xvld(dst, 0);
1297cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x, dst,
1298cabdff1aSopenharmony_ci              dstStride_3x, dst, dstStride_4x, src01, src02, src03, src04);
1299cabdff1aSopenharmony_ci    dst += dstStride_4x;
1300cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x, src05, dst00);
1301cabdff1aSopenharmony_ci    dst01 = __lasx_xvldx(dst, dstStride_3x);
1302cabdff1aSopenharmony_ci    dst -= dstStride_4x;
1303cabdff1aSopenharmony_ci    dst0 = __lasx_xvpermi_q(src00, src01, 0x02);
1304cabdff1aSopenharmony_ci    dst1 = __lasx_xvpermi_q(src02, src03, 0x02);
1305cabdff1aSopenharmony_ci    dst2 = __lasx_xvpermi_q(src04, src05, 0x02);
1306cabdff1aSopenharmony_ci    dst3 = __lasx_xvpermi_q(dst00, dst01, 0x02);
1307cabdff1aSopenharmony_ci    dst0 = __lasx_xvavgr_bu(dst0, out0);
1308cabdff1aSopenharmony_ci    dst1 = __lasx_xvavgr_bu(dst1, out1);
1309cabdff1aSopenharmony_ci    dst2 = __lasx_xvavgr_bu(dst2, out2);
1310cabdff1aSopenharmony_ci    dst3 = __lasx_xvavgr_bu(dst3, out3);
1311cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst0, dst, 0, 0);
1312cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst0, dst + dstStride, 0, 2);
1313cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst1, dst + dstStride_2x, 0, 0);
1314cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst1, dst + dstStride_3x, 0, 2);
1315cabdff1aSopenharmony_ci    dst += dstStride_4x;
1316cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst2, dst, 0, 0);
1317cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst2, dst + dstStride, 0, 2);
1318cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst3, dst + dstStride_2x, 0, 0);
1319cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst3, dst + dstStride_3x, 0, 2);
1320cabdff1aSopenharmony_ci}
1321cabdff1aSopenharmony_ci
1322cabdff1aSopenharmony_cistatic av_always_inline void
1323cabdff1aSopenharmony_ciavg_h264_qpel8_hv_lowpass_lasx(uint8_t *dst, const uint8_t *src,
1324cabdff1aSopenharmony_ci                               ptrdiff_t dstStride, ptrdiff_t srcStride)
1325cabdff1aSopenharmony_ci{
1326cabdff1aSopenharmony_ci    __m256i src00, src01, src02, src03, src04, src05, src10;
1327cabdff1aSopenharmony_ci    __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
1328cabdff1aSopenharmony_ci    __m256i tmp7, tmp8, tmp9, tmp10, tmp11, tmp12;
1329cabdff1aSopenharmony_ci    __m256i h_20 = __lasx_xvldi(0x414);
1330cabdff1aSopenharmony_ci    __m256i h_5  = __lasx_xvldi(0x405);
1331cabdff1aSopenharmony_ci    __m256i w_20 = __lasx_xvldi(0x814);
1332cabdff1aSopenharmony_ci    __m256i w_5  = __lasx_xvldi(0x805);
1333cabdff1aSopenharmony_ci    __m256i w_512 = {512};
1334cabdff1aSopenharmony_ci    __m256i mask1 = {0x0807060504030201, 0x0, 0x0807060504030201, 0x0};
1335cabdff1aSopenharmony_ci    __m256i mask2 = {0x0908070605040302, 0x0, 0x0908070605040302, 0x0};
1336cabdff1aSopenharmony_ci    __m256i mask3 = {0x0a09080706050403, 0x0, 0x0a09080706050403, 0x0};
1337cabdff1aSopenharmony_ci    __m256i mask4 = {0x0b0a090807060504, 0x0, 0x0b0a090807060504, 0x0};
1338cabdff1aSopenharmony_ci    __m256i mask5 = {0x0c0b0a0908070605, 0x0, 0x0c0b0a0908070605, 0x0};
1339cabdff1aSopenharmony_ci    ptrdiff_t dstStride_2x = dstStride << 1;
1340cabdff1aSopenharmony_ci    ptrdiff_t dstStride_4x = dstStride << 2;
1341cabdff1aSopenharmony_ci    ptrdiff_t dstStride_3x = dstStride_2x + dstStride;
1342cabdff1aSopenharmony_ci
1343cabdff1aSopenharmony_ci    w_512 = __lasx_xvreplve0_w(w_512);
1344cabdff1aSopenharmony_ci
1345cabdff1aSopenharmony_ci    src -= srcStride << 1;
1346cabdff1aSopenharmony_ci    QPEL8_HV_LOWPASS_H(tmp0)
1347cabdff1aSopenharmony_ci    QPEL8_HV_LOWPASS_H(tmp2)
1348cabdff1aSopenharmony_ci    QPEL8_HV_LOWPASS_H(tmp4)
1349cabdff1aSopenharmony_ci    QPEL8_HV_LOWPASS_H(tmp6)
1350cabdff1aSopenharmony_ci    QPEL8_HV_LOWPASS_H(tmp8)
1351cabdff1aSopenharmony_ci    QPEL8_HV_LOWPASS_H(tmp10)
1352cabdff1aSopenharmony_ci    QPEL8_HV_LOWPASS_H(tmp12)
1353cabdff1aSopenharmony_ci    tmp11 = __lasx_xvpermi_q(tmp12, tmp10, 0x21);
1354cabdff1aSopenharmony_ci    tmp9  = __lasx_xvpermi_q(tmp10, tmp8,  0x21);
1355cabdff1aSopenharmony_ci    tmp7  = __lasx_xvpermi_q(tmp8,  tmp6,  0x21);
1356cabdff1aSopenharmony_ci    tmp5  = __lasx_xvpermi_q(tmp6,  tmp4,  0x21);
1357cabdff1aSopenharmony_ci    tmp3  = __lasx_xvpermi_q(tmp4,  tmp2,  0x21);
1358cabdff1aSopenharmony_ci    tmp1  = __lasx_xvpermi_q(tmp2,  tmp0,  0x21);
1359cabdff1aSopenharmony_ci
1360cabdff1aSopenharmony_ci    QPEL8_HV_LOWPASS_V(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, src00, src01,
1361cabdff1aSopenharmony_ci                       src02, src03, src04, src05, tmp0)
1362cabdff1aSopenharmony_ci    QPEL8_HV_LOWPASS_V(tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src00, src01,
1363cabdff1aSopenharmony_ci                       src02, src03, src04, src05, tmp2)
1364cabdff1aSopenharmony_ci    QPEL8_HV_LOWPASS_V(tmp4, tmp5, tmp6, tmp7, tmp8, tmp9, src00, src01,
1365cabdff1aSopenharmony_ci                       src02, src03, src04, src05, tmp4)
1366cabdff1aSopenharmony_ci    QPEL8_HV_LOWPASS_V(tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, src00, src01,
1367cabdff1aSopenharmony_ci                       src02, src03, src04, src05, tmp6)
1368cabdff1aSopenharmony_ci
1369cabdff1aSopenharmony_ci    src00 = __lasx_xvld(dst, 0);
1370cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x, dst,
1371cabdff1aSopenharmony_ci              dstStride_3x, dst, dstStride_4x, src01, src02, src03, src04);
1372cabdff1aSopenharmony_ci    dst += dstStride_4x;
1373cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x, src05, tmp8);
1374cabdff1aSopenharmony_ci    tmp9 = __lasx_xvldx(dst, dstStride_3x);
1375cabdff1aSopenharmony_ci    dst -= dstStride_4x;
1376cabdff1aSopenharmony_ci    tmp1 = __lasx_xvpermi_q(src00, src01, 0x02);
1377cabdff1aSopenharmony_ci    tmp3 = __lasx_xvpermi_q(src02, src03, 0x02);
1378cabdff1aSopenharmony_ci    tmp5 = __lasx_xvpermi_q(src04, src05, 0x02);
1379cabdff1aSopenharmony_ci    tmp7 = __lasx_xvpermi_q(tmp8,  tmp9,  0x02);
1380cabdff1aSopenharmony_ci    tmp0 = __lasx_xvavgr_bu(tmp0, tmp1);
1381cabdff1aSopenharmony_ci    tmp2 = __lasx_xvavgr_bu(tmp2, tmp3);
1382cabdff1aSopenharmony_ci    tmp4 = __lasx_xvavgr_bu(tmp4, tmp5);
1383cabdff1aSopenharmony_ci    tmp6 = __lasx_xvavgr_bu(tmp6, tmp7);
1384cabdff1aSopenharmony_ci    __lasx_xvstelm_d(tmp0, dst, 0, 0);
1385cabdff1aSopenharmony_ci    dst += dstStride;
1386cabdff1aSopenharmony_ci    __lasx_xvstelm_d(tmp0, dst, 0, 2);
1387cabdff1aSopenharmony_ci    dst += dstStride;
1388cabdff1aSopenharmony_ci    __lasx_xvstelm_d(tmp2, dst, 0, 0);
1389cabdff1aSopenharmony_ci    dst += dstStride;
1390cabdff1aSopenharmony_ci    __lasx_xvstelm_d(tmp2, dst, 0, 2);
1391cabdff1aSopenharmony_ci    dst += dstStride;
1392cabdff1aSopenharmony_ci    __lasx_xvstelm_d(tmp4, dst, 0, 0);
1393cabdff1aSopenharmony_ci    dst += dstStride;
1394cabdff1aSopenharmony_ci    __lasx_xvstelm_d(tmp4, dst, 0, 2);
1395cabdff1aSopenharmony_ci    dst += dstStride;
1396cabdff1aSopenharmony_ci    __lasx_xvstelm_d(tmp6, dst, 0, 0);
1397cabdff1aSopenharmony_ci    dst += dstStride;
1398cabdff1aSopenharmony_ci    __lasx_xvstelm_d(tmp6, dst, 0, 2);
1399cabdff1aSopenharmony_ci}
1400cabdff1aSopenharmony_ci
1401cabdff1aSopenharmony_cistatic av_always_inline void
1402cabdff1aSopenharmony_ciput_h264_qpel16_h_lowpass_lasx(uint8_t *dst, const uint8_t *src,
1403cabdff1aSopenharmony_ci                               int dstStride, int srcStride)
1404cabdff1aSopenharmony_ci{
1405cabdff1aSopenharmony_ci    put_h264_qpel8_h_lowpass_lasx(dst, src, dstStride, srcStride);
1406cabdff1aSopenharmony_ci    put_h264_qpel8_h_lowpass_lasx(dst+8, src+8, dstStride, srcStride);
1407cabdff1aSopenharmony_ci    src += srcStride << 3;
1408cabdff1aSopenharmony_ci    dst += dstStride << 3;
1409cabdff1aSopenharmony_ci    put_h264_qpel8_h_lowpass_lasx(dst, src, dstStride, srcStride);
1410cabdff1aSopenharmony_ci    put_h264_qpel8_h_lowpass_lasx(dst+8, src+8, dstStride, srcStride);
1411cabdff1aSopenharmony_ci}
1412cabdff1aSopenharmony_ci
1413cabdff1aSopenharmony_cistatic av_always_inline void
1414cabdff1aSopenharmony_ciavg_h264_qpel16_h_lowpass_lasx(uint8_t *dst, const uint8_t *src,
1415cabdff1aSopenharmony_ci                               int dstStride, int srcStride)
1416cabdff1aSopenharmony_ci{
1417cabdff1aSopenharmony_ci    avg_h264_qpel8_h_lowpass_lasx(dst, src, dstStride, srcStride);
1418cabdff1aSopenharmony_ci    avg_h264_qpel8_h_lowpass_lasx(dst+8, src+8, dstStride, srcStride);
1419cabdff1aSopenharmony_ci    src += srcStride << 3;
1420cabdff1aSopenharmony_ci    dst += dstStride << 3;
1421cabdff1aSopenharmony_ci    avg_h264_qpel8_h_lowpass_lasx(dst, src, dstStride, srcStride);
1422cabdff1aSopenharmony_ci    avg_h264_qpel8_h_lowpass_lasx(dst+8, src+8, dstStride, srcStride);
1423cabdff1aSopenharmony_ci}
1424cabdff1aSopenharmony_ci
/*
 * 16x16 vertical low-pass "put", built from four calls to the 8x8 kernel:
 * top-left, top-right, bottom-left, bottom-right, in that order.
 * The 8x8 kernel is called with a non-const source pointer, hence the cast.
 */
static void put_h264_qpel16_v_lowpass_lasx(uint8_t *dst, const uint8_t *src,
                                           int dstStride, int srcStride)
{
    uint8_t *top_in  = (uint8_t *)src;
    uint8_t *bot_in  = top_in + 8 * srcStride;
    uint8_t *bot_out = dst + 8 * dstStride;

    put_h264_qpel8_v_lowpass_lasx(dst,         top_in,     dstStride, srcStride);
    put_h264_qpel8_v_lowpass_lasx(dst + 8,     top_in + 8, dstStride, srcStride);
    put_h264_qpel8_v_lowpass_lasx(bot_out,     bot_in,     dstStride, srcStride);
    put_h264_qpel8_v_lowpass_lasx(bot_out + 8, bot_in + 8, dstStride, srcStride);
}
1435cabdff1aSopenharmony_ci
/*
 * 16x16 vertical low-pass "avg", built from four calls to the 8x8 kernel:
 * top-left, top-right, bottom-left, bottom-right, in that order.
 * The 8x8 kernel is called with a non-const source pointer, hence the cast.
 */
static void avg_h264_qpel16_v_lowpass_lasx(uint8_t *dst, const uint8_t *src,
                                           int dstStride, int srcStride)
{
    uint8_t *top_in  = (uint8_t *)src;
    uint8_t *bot_in  = top_in + 8 * srcStride;
    uint8_t *bot_out = dst + 8 * dstStride;

    avg_h264_qpel8_v_lowpass_lasx(dst,         top_in,     dstStride, srcStride);
    avg_h264_qpel8_v_lowpass_lasx(dst + 8,     top_in + 8, dstStride, srcStride);
    avg_h264_qpel8_v_lowpass_lasx(bot_out,     bot_in,     dstStride, srcStride);
    avg_h264_qpel8_v_lowpass_lasx(bot_out + 8, bot_in + 8, dstStride, srcStride);
}
1446cabdff1aSopenharmony_ci
/*
 * 16x16 horizontal + vertical low-pass "put", built from four calls to the
 * 8x8 kernel: top-left, top-right, bottom-left, bottom-right, in that order.
 */
static void put_h264_qpel16_hv_lowpass_lasx(uint8_t *dst, const uint8_t *src,
                                     ptrdiff_t dstStride, ptrdiff_t srcStride)
{
    put_h264_qpel8_hv_lowpass_lasx(dst,     src,     dstStride, srcStride);
    put_h264_qpel8_hv_lowpass_lasx(dst + 8, src + 8, dstStride, srcStride);

    /* Step eight rows down. Multiply instead of shifting: a left shift of a
     * negative stride is undefined behaviour in C. */
    src += 8 * srcStride;
    dst += 8 * dstStride;

    put_h264_qpel8_hv_lowpass_lasx(dst,     src,     dstStride, srcStride);
    put_h264_qpel8_hv_lowpass_lasx(dst + 8, src + 8, dstStride, srcStride);
}
1457cabdff1aSopenharmony_ci
/*
 * 16x16 horizontal + vertical low-pass "avg", built from four calls to the
 * 8x8 kernel: top-left, top-right, bottom-left, bottom-right, in that order.
 */
static void avg_h264_qpel16_hv_lowpass_lasx(uint8_t *dst, const uint8_t *src,
                                     ptrdiff_t dstStride, ptrdiff_t srcStride)
{
    avg_h264_qpel8_hv_lowpass_lasx(dst,     src,     dstStride, srcStride);
    avg_h264_qpel8_hv_lowpass_lasx(dst + 8, src + 8, dstStride, srcStride);

    /* Step eight rows down. Multiply instead of shifting: a left shift of a
     * negative stride is undefined behaviour in C. */
    src += 8 * srcStride;
    dst += 8 * dstStride;

    avg_h264_qpel8_hv_lowpass_lasx(dst,     src,     dstStride, srcStride);
    avg_h264_qpel8_hv_lowpass_lasx(dst + 8, src + 8, dstStride, srcStride);
}
1468cabdff1aSopenharmony_ci
/*
 * put, 8x8, mc00: forwards straight to put_pixels8_8_inline_asm() with the
 * same pitch for source and destination.
 * The MMI implementation uses ff_put_pixels8_8_mmi (hpeldsp_mmi.c) for this
 * case.
 */
void ff_put_h264_qpel8_mc00_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    put_pixels8_8_inline_asm(dst, src, stride);
}
1476cabdff1aSopenharmony_ci
/*
 * put, 8x8, mc10: the horizontal low-pass result goes into a packed 8x8
 * scratch block, then src and that block are handed to
 * put_pixels8_l2_8_lsx(), which writes dst.
 * NOTE(review): put_pixels8_l2_8_lsx() is defined outside this chunk;
 * presumably it blends its two inputs -- confirm there.
 */
void ff_put_h264_qpel8_mc10_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    enum { BLK = 8 };
    uint8_t hpel[BLK * BLK];

    put_h264_qpel8_h_lowpass_lasx(hpel, src, BLK, stride);
    /* For qpel8 the scratch pitch and the block height are both 8. */
    put_pixels8_l2_8_lsx(dst, src, hpel, stride, stride);
}
1486cabdff1aSopenharmony_ci
/*
 * put, 8x8, mc20: horizontal low-pass written directly to dst; source and
 * destination share the same pitch.
 */
void ff_put_h264_qpel8_mc20_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    put_h264_qpel8_h_lowpass_lasx(dst, src,
                                  stride,   /* destination pitch */
                                  stride);  /* source pitch */
}
1492cabdff1aSopenharmony_ci
/*
 * put, 8x8, mc30: the horizontal low-pass result goes into a packed 8x8
 * scratch block, then the source shifted one byte to the right and that
 * block are handed to put_pixels8_l2_8_lsx(), which writes dst.
 * NOTE(review): put_pixels8_l2_8_lsx() is defined outside this chunk;
 * presumably it blends its two inputs -- confirm there.
 */
void ff_put_h264_qpel8_mc30_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    enum { BLK = 8 };
    uint8_t hpel[BLK * BLK];
    const uint8_t *right = src + 1;

    put_h264_qpel8_h_lowpass_lasx(hpel, src, BLK, stride);
    put_pixels8_l2_8_lsx(dst, right, hpel, stride, stride);
}
1501cabdff1aSopenharmony_ci
/*
 * put, 8x8, mc01: the vertical low-pass result goes into a packed 8x8
 * scratch block, then src and that block are handed to
 * put_pixels8_l2_8_lsx(), which writes dst.
 * The vertical kernel is called with a non-const source pointer, hence the
 * cast.
 * NOTE(review): put_pixels8_l2_8_lsx() is defined outside this chunk;
 * presumably it blends its two inputs -- confirm there.
 */
void ff_put_h264_qpel8_mc01_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    enum { BLK = 8 };
    uint8_t vpel[BLK * BLK];

    put_h264_qpel8_v_lowpass_lasx(vpel, (uint8_t *)src, BLK, stride);
    put_pixels8_l2_8_lsx(dst, src, vpel, stride, stride);
}
1510cabdff1aSopenharmony_ci
/*
 * put, 8x8, mc11: horizontal and vertical low-pass results, both computed
 * at src, go into two packed 8x8 scratch blocks that are then handed to
 * put_pixels8_l2_8_lsx(), which writes dst.
 * NOTE(review): put_pixels8_l2_8_lsx() is defined outside this chunk;
 * presumably it blends its two inputs -- confirm there.
 */
void ff_put_h264_qpel8_mc11_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    enum { BLK = 8 };
    uint8_t halfH[BLK * BLK];
    uint8_t halfV[BLK * BLK];

    put_h264_qpel8_h_lowpass_lasx(halfH, src, BLK, stride);
    put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t *)src, BLK, stride);
    put_pixels8_l2_8_lsx(dst, halfH, halfV, stride, BLK);
}
1521cabdff1aSopenharmony_ci
/*
 * put, 8x8, mc21: the horizontal low-pass and the horizontal+vertical
 * low-pass results, both computed at src, go into the two halves of one
 * 128-byte scratch buffer and are then handed to put_pixels8_l2_8_lsx(),
 * which writes dst.
 * NOTE(review): put_pixels8_l2_8_lsx() is defined outside this chunk;
 * presumably it blends its two inputs -- confirm there.
 */
void ff_put_h264_qpel8_mc21_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    enum { BLK = 8, BLK_BYTES = 64 };
    uint8_t scratch[2 * BLK_BYTES];
    uint8_t *const halfH  = &scratch[0];
    uint8_t *const halfHV = &scratch[BLK_BYTES];

    put_h264_qpel8_h_lowpass_lasx(halfH, src, BLK, stride);
    put_h264_qpel8_hv_lowpass_lasx(halfHV, src, BLK, stride);
    put_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, BLK);
}
1533cabdff1aSopenharmony_ci
/*
 * put, 8x8, mc31: horizontal low-pass at src and vertical low-pass at
 * src + 1 go into two packed 8x8 scratch blocks that are then handed to
 * put_pixels8_l2_8_lsx(), which writes dst.
 * NOTE(review): put_pixels8_l2_8_lsx() is defined outside this chunk;
 * presumably it blends its two inputs -- confirm there.
 */
void ff_put_h264_qpel8_mc31_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    enum { BLK = 8 };
    uint8_t halfH[BLK * BLK];
    uint8_t halfV[BLK * BLK];
    uint8_t *right = (uint8_t *)src + 1;

    put_h264_qpel8_h_lowpass_lasx(halfH, src, BLK, stride);
    put_h264_qpel8_v_lowpass_lasx(halfV, right, BLK, stride);
    put_pixels8_l2_8_lsx(dst, halfH, halfV, stride, BLK);
}
1544cabdff1aSopenharmony_ci
/*
 * put, 8x8, mc02: vertical low-pass written directly to dst; source and
 * destination share the same pitch. The vertical kernel is called with a
 * non-const source pointer, hence the cast.
 */
void ff_put_h264_qpel8_mc02_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint8_t *in = (uint8_t *)src;

    put_h264_qpel8_v_lowpass_lasx(dst, in, stride, stride);
}
1550cabdff1aSopenharmony_ci
/*
 * put, 8x8, mc12: the horizontal+vertical low-pass and the vertical
 * low-pass results, both computed at src, go into the two halves of one
 * 128-byte scratch buffer and are then handed to put_pixels8_l2_8_lsx(),
 * which writes dst.
 * NOTE(review): put_pixels8_l2_8_lsx() is defined outside this chunk;
 * presumably it blends its two inputs -- confirm there.
 */
void ff_put_h264_qpel8_mc12_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    enum { BLK = 8, BLK_BYTES = 64 };
    uint8_t scratch[2 * BLK_BYTES];
    uint8_t *const halfHV = &scratch[0];
    /* Second half holds the vertical-only result. */
    uint8_t *const halfV  = &scratch[BLK_BYTES];

    put_h264_qpel8_hv_lowpass_lasx(halfHV, src, BLK, stride);
    put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t *)src, BLK, stride);
    put_pixels8_l2_8_lsx(dst, halfV, halfHV, stride, BLK);
}
1562cabdff1aSopenharmony_ci
/*
 * put, 8x8, mc22: horizontal+vertical low-pass written directly to dst;
 * source and destination share the same pitch.
 */
void ff_put_h264_qpel8_mc22_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    put_h264_qpel8_hv_lowpass_lasx(dst, src,
                                   stride,   /* destination pitch */
                                   stride);  /* source pitch */
}
1568cabdff1aSopenharmony_ci
/*
 * put, 8x8, mc32: the horizontal+vertical low-pass at src and the vertical
 * low-pass at src + 1 go into the two halves of one 128-byte scratch buffer
 * and are then handed to put_pixels8_l2_8_lsx(), which writes dst.
 * NOTE(review): put_pixels8_l2_8_lsx() is defined outside this chunk;
 * presumably it blends its two inputs -- confirm there.
 */
void ff_put_h264_qpel8_mc32_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    enum { BLK = 8, BLK_BYTES = 64 };
    uint8_t scratch[2 * BLK_BYTES];
    uint8_t *const halfHV = &scratch[0];
    /* Second half holds the vertical-only result. */
    uint8_t *const halfV  = &scratch[BLK_BYTES];
    uint8_t *right = (uint8_t *)src + 1;

    put_h264_qpel8_hv_lowpass_lasx(halfHV, src, BLK, stride);
    put_h264_qpel8_v_lowpass_lasx(halfV, right, BLK, stride);
    put_pixels8_l2_8_lsx(dst, halfV, halfHV, stride, BLK);
}
1580cabdff1aSopenharmony_ci
/*
 * put, 8x8, mc03: the vertical low-pass result goes into a packed 8x8
 * scratch block, then the source one row further down and that block are
 * handed to put_pixels8_l2_8_lsx(), which writes dst.
 * NOTE(review): put_pixels8_l2_8_lsx() is defined outside this chunk;
 * presumably it blends its two inputs -- confirm there.
 */
void ff_put_h264_qpel8_mc03_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    enum { BLK = 8 };
    uint8_t vpel[BLK * BLK];
    const uint8_t *below = src + stride;

    put_h264_qpel8_v_lowpass_lasx(vpel, (uint8_t *)src, BLK, stride);
    put_pixels8_l2_8_lsx(dst, below, vpel, stride, stride);
}
1589cabdff1aSopenharmony_ci
/*
 * put, 8x8, mc13: horizontal low-pass one row below src and vertical
 * low-pass at src go into two packed 8x8 scratch blocks that are then
 * handed to put_pixels8_l2_8_lsx(), which writes dst.
 * NOTE(review): put_pixels8_l2_8_lsx() is defined outside this chunk;
 * presumably it blends its two inputs -- confirm there.
 */
void ff_put_h264_qpel8_mc13_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    enum { BLK = 8 };
    uint8_t halfH[BLK * BLK];
    uint8_t halfV[BLK * BLK];
    const uint8_t *below = src + stride;

    put_h264_qpel8_h_lowpass_lasx(halfH, below, BLK, stride);
    put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t *)src, BLK, stride);
    put_pixels8_l2_8_lsx(dst, halfH, halfV, stride, BLK);
}
1600cabdff1aSopenharmony_ci
/*
 * put, 8x8, mc23: the horizontal low-pass one row below src and the
 * horizontal+vertical low-pass at src go into the two halves of one
 * 128-byte scratch buffer and are then handed to put_pixels8_l2_8_lsx(),
 * which writes dst.
 * NOTE(review): put_pixels8_l2_8_lsx() is defined outside this chunk;
 * presumably it blends its two inputs -- confirm there.
 */
void ff_put_h264_qpel8_mc23_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    enum { BLK = 8, BLK_BYTES = 64 };
    uint8_t scratch[2 * BLK_BYTES];
    uint8_t *const halfH  = &scratch[0];
    uint8_t *const halfHV = &scratch[BLK_BYTES];
    const uint8_t *below = src + stride;

    put_h264_qpel8_h_lowpass_lasx(halfH, below, BLK, stride);
    put_h264_qpel8_hv_lowpass_lasx(halfHV, src, BLK, stride);
    put_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, BLK);
}
1612cabdff1aSopenharmony_ci
/*
 * put, 8x8, mc33: horizontal low-pass one row below src and vertical
 * low-pass at src + 1 go into two packed 8x8 scratch blocks that are then
 * handed to put_pixels8_l2_8_lsx(), which writes dst.
 * NOTE(review): put_pixels8_l2_8_lsx() is defined outside this chunk;
 * presumably it blends its two inputs -- confirm there.
 */
void ff_put_h264_qpel8_mc33_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    enum { BLK = 8 };
    uint8_t halfH[BLK * BLK];
    uint8_t halfV[BLK * BLK];
    const uint8_t *below = src + stride;
    uint8_t *right = (uint8_t *)src + 1;

    put_h264_qpel8_h_lowpass_lasx(halfH, below, BLK, stride);
    put_h264_qpel8_v_lowpass_lasx(halfV, right, BLK, stride);
    put_pixels8_l2_8_lsx(dst, halfH, halfV, stride, BLK);
}
1623cabdff1aSopenharmony_ci
/*
 * avg, 8x8, mc00: forwards straight to avg_pixels8_8_lsx() with the same
 * pitch for source and destination.
 * The MMI implementation uses ff_avg_pixels8_8_mmi (hpeldsp_mmi.c) for this
 * case.
 */
void ff_avg_h264_qpel8_mc00_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avg_pixels8_8_lsx(dst, src, stride);
}
1631cabdff1aSopenharmony_ci
/*
 * avg, 8x8, mc10: the horizontal low-pass result goes into a packed 8x8
 * scratch block, then src and that block are handed to
 * avg_pixels8_l2_8_lsx(), which updates dst.
 * NOTE(review): avg_pixels8_l2_8_lsx() is defined outside this chunk;
 * presumably it blends its two inputs and averages the result into dst --
 * confirm there.
 */
void ff_avg_h264_qpel8_mc10_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    enum { BLK = 8 };
    uint8_t hpel[BLK * BLK];

    put_h264_qpel8_h_lowpass_lasx(hpel, src, BLK, stride);
    avg_pixels8_l2_8_lsx(dst, src, hpel, stride, stride);
}
1640cabdff1aSopenharmony_ci
/*
 * avg, 8x8, mc20: horizontal low-pass averaged directly into dst; source
 * and destination share the same pitch.
 */
void ff_avg_h264_qpel8_mc20_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avg_h264_qpel8_h_lowpass_lasx(dst, src,
                                  stride,   /* destination pitch */
                                  stride);  /* source pitch */
}
1646cabdff1aSopenharmony_ci
/*
 * avg, 8x8, mc30: the horizontal low-pass result goes into a packed 8x8
 * scratch block, then the source shifted one byte to the right and that
 * block are handed to avg_pixels8_l2_8_lsx(), which updates dst.
 * NOTE(review): avg_pixels8_l2_8_lsx() is defined outside this chunk;
 * presumably it blends its two inputs and averages the result into dst --
 * confirm there.
 */
void ff_avg_h264_qpel8_mc30_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    enum { BLK = 8 };
    uint8_t hpel[BLK * BLK];
    const uint8_t *right = src + 1;

    put_h264_qpel8_h_lowpass_lasx(hpel, src, BLK, stride);
    avg_pixels8_l2_8_lsx(dst, right, hpel, stride, stride);
}
1655cabdff1aSopenharmony_ci
/*
 * avg, 8x8, mc11: horizontal and vertical low-pass results, both computed
 * at src, go into two packed 8x8 scratch blocks that are then handed to
 * avg_pixels8_l2_8_lsx(), which updates dst.
 * NOTE(review): avg_pixels8_l2_8_lsx() is defined outside this chunk;
 * presumably it blends its two inputs and averages the result into dst --
 * confirm there.
 */
void ff_avg_h264_qpel8_mc11_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    enum { BLK = 8 };
    uint8_t halfH[BLK * BLK];
    uint8_t halfV[BLK * BLK];

    put_h264_qpel8_h_lowpass_lasx(halfH, src, BLK, stride);
    put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t *)src, BLK, stride);
    avg_pixels8_l2_8_lsx(dst, halfH, halfV, stride, BLK);
}
1666cabdff1aSopenharmony_ci
/*
 * avg, 8x8, mc21: the horizontal low-pass and the horizontal+vertical
 * low-pass results, both computed at src, go into the two halves of one
 * 128-byte scratch buffer and are then handed to avg_pixels8_l2_8_lsx(),
 * which updates dst.
 * NOTE(review): avg_pixels8_l2_8_lsx() is defined outside this chunk;
 * presumably it blends its two inputs and averages the result into dst --
 * confirm there.
 */
void ff_avg_h264_qpel8_mc21_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    enum { BLK = 8, BLK_BYTES = 64 };
    uint8_t scratch[2 * BLK_BYTES];
    uint8_t *const halfH  = &scratch[0];
    uint8_t *const halfHV = &scratch[BLK_BYTES];

    put_h264_qpel8_h_lowpass_lasx(halfH, src, BLK, stride);
    put_h264_qpel8_hv_lowpass_lasx(halfHV, src, BLK, stride);
    avg_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, BLK);
}
1678cabdff1aSopenharmony_ci
/*
 * avg, 8x8, mc31: horizontal low-pass at src and vertical low-pass at
 * src + 1 go into two packed 8x8 scratch blocks that are then handed to
 * avg_pixels8_l2_8_lsx(), which updates dst.
 * NOTE(review): avg_pixels8_l2_8_lsx() is defined outside this chunk;
 * presumably it blends its two inputs and averages the result into dst --
 * confirm there.
 */
void ff_avg_h264_qpel8_mc31_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    enum { BLK = 8 };
    uint8_t halfH[BLK * BLK];
    uint8_t halfV[BLK * BLK];
    uint8_t *right = (uint8_t *)src + 1;

    put_h264_qpel8_h_lowpass_lasx(halfH, src, BLK, stride);
    put_h264_qpel8_v_lowpass_lasx(halfV, right, BLK, stride);
    avg_pixels8_l2_8_lsx(dst, halfH, halfV, stride, BLK);
}
1689cabdff1aSopenharmony_ci
/*
 * avg, 8x8, mc02: vertical low-pass averaged directly into dst; source and
 * destination share the same pitch. The vertical kernel is called with a
 * non-const source pointer, hence the cast.
 */
void ff_avg_h264_qpel8_mc02_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint8_t *in = (uint8_t *)src;

    avg_h264_qpel8_v_lowpass_lasx(dst, in, stride, stride);
}
1695cabdff1aSopenharmony_ci
/*
 * avg, 8x8, mc12: the horizontal+vertical low-pass and the vertical
 * low-pass results, both computed at src, go into the two halves of one
 * 128-byte scratch buffer and are then handed to avg_pixels8_l2_8_lsx(),
 * which updates dst.
 * NOTE(review): avg_pixels8_l2_8_lsx() is defined outside this chunk;
 * presumably it blends its two inputs and averages the result into dst --
 * confirm there.
 */
void ff_avg_h264_qpel8_mc12_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    enum { BLK = 8, BLK_BYTES = 64 };
    uint8_t scratch[2 * BLK_BYTES];
    uint8_t *const halfHV = &scratch[0];
    /* Second half holds the vertical-only result. */
    uint8_t *const halfV  = &scratch[BLK_BYTES];

    put_h264_qpel8_hv_lowpass_lasx(halfHV, src, BLK, stride);
    put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t *)src, BLK, stride);
    avg_pixels8_l2_8_lsx(dst, halfV, halfHV, stride, BLK);
}
1707cabdff1aSopenharmony_ci
/*
 * avg / 8x8 / "mc22" entry point.
 *
 * Thin wrapper around avg_h264_qpel8_hv_lowpass_lasx(); the single stride
 * value is passed for both trailing stride arguments.
 */
void ff_avg_h264_qpel8_mc22_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avg_h264_qpel8_hv_lowpass_lasx(dst,
                                   src,
                                   stride,
                                   stride);
}
1713cabdff1aSopenharmony_ci
/*
 * avg / 8x8 / "mc32" entry point.
 *
 * Uses a 128-byte scratch area as two 64-byte halves: the hv lowpass helper
 * (run on src) writes the first half, the vertical lowpass helper (run on
 * src + 1) writes the second.  avg_pixels8_l2_8_lsx() then receives dst and
 * both halves, vertical result first.
 */
void ff_avg_h264_qpel8_mc32_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    const int tmp_pitch = 8;
    uint8_t scratch[128];
    uint8_t *const hv_plane = &scratch[0];
    uint8_t *const v_plane  = &scratch[64];
    /* Vertical pass starts one byte past src. */
    uint8_t *const v_input  = (uint8_t *)src + 1;

    put_h264_qpel8_hv_lowpass_lasx(hv_plane, src, tmp_pitch, stride);
    put_h264_qpel8_v_lowpass_lasx(v_plane, v_input, tmp_pitch, stride);
    avg_pixels8_l2_8_lsx(dst, v_plane, hv_plane, stride, tmp_pitch);
}
1725cabdff1aSopenharmony_ci
/*
 * avg / 8x8 / "mc13" entry point.
 *
 * Fills two 64-byte scratch buffers -- horizontal lowpass helper on
 * src + stride, vertical lowpass helper on src -- and passes them with dst
 * to avg_pixels8_l2_8_lsx().
 */
void ff_avg_h264_qpel8_mc13_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    const int tmp_pitch = 8;
    uint8_t h_plane[64];
    uint8_t v_plane[64];
    /* Horizontal pass starts one row (stride bytes) after src. */
    const uint8_t *const h_input = src + stride;
    uint8_t *const v_input = (uint8_t *)src;

    put_h264_qpel8_h_lowpass_lasx(h_plane, h_input, tmp_pitch, stride);
    put_h264_qpel8_v_lowpass_lasx(v_plane, v_input, tmp_pitch, stride);
    avg_pixels8_l2_8_lsx(dst, h_plane, v_plane, stride, tmp_pitch);
}
1736cabdff1aSopenharmony_ci
/*
 * avg / 8x8 / "mc23" entry point.
 *
 * A 128-byte scratch area is used as two 64-byte halves: the horizontal
 * lowpass helper (run on src + stride) fills the first, the hv lowpass
 * helper (run on src) fills the second.  Both halves go to
 * avg_pixels8_l2_8_lsx() together with dst.
 */
void ff_avg_h264_qpel8_mc23_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    const int tmp_pitch = 8;
    uint8_t scratch[128];
    uint8_t *const h_plane  = &scratch[0];
    uint8_t *const hv_plane = &scratch[64];
    const uint8_t *const h_input = src + stride;

    put_h264_qpel8_h_lowpass_lasx(h_plane, h_input, tmp_pitch, stride);
    put_h264_qpel8_hv_lowpass_lasx(hv_plane, src, tmp_pitch, stride);
    avg_pixels8_l2_8_lsx(dst, h_plane, hv_plane, stride, tmp_pitch);
}
1748cabdff1aSopenharmony_ci
/*
 * avg / 8x8 / "mc33" entry point.
 *
 * Fills two 64-byte scratch buffers -- horizontal lowpass helper on
 * src + stride, vertical lowpass helper on src + 1 -- and passes them with
 * dst to avg_pixels8_l2_8_lsx().
 */
void ff_avg_h264_qpel8_mc33_lasx(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    const int tmp_pitch = 8;
    uint8_t h_plane[64];
    uint8_t v_plane[64];
    const uint8_t *const h_input = src + stride;
    uint8_t *const v_input = (uint8_t *)src + 1;

    put_h264_qpel8_h_lowpass_lasx(h_plane, h_input, tmp_pitch, stride);
    put_h264_qpel8_v_lowpass_lasx(v_plane, v_input, tmp_pitch, stride);
    avg_pixels8_l2_8_lsx(dst, h_plane, v_plane, stride, tmp_pitch);
}
1759cabdff1aSopenharmony_ci
/*
 * put / 16x16 / "mc00" entry point.
 *
 * Delegates directly to put_pixels16_8_lsx().  The MMI optimization uses
 * ff_put_pixels16_8_mmi (implemented in hpeldsp_mmi.c) for this case.
 */
void ff_put_h264_qpel16_mc00_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    put_pixels16_8_lsx(dst,
                       src,
                       stride);
}
1767cabdff1aSopenharmony_ci
/*
 * put / 16x16 / "mc10" entry point.
 *
 * Runs the 16-wide horizontal lowpass helper on src into a 256-byte scratch
 * buffer, then passes dst, src and that buffer to put_pixels16_l2_8_lsx()
 * with stride supplied for both trailing arguments.
 */
void ff_put_h264_qpel16_mc10_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    const int tmp_pitch = 16;
    uint8_t h_plane[256];

    put_h264_qpel16_h_lowpass_lasx(h_plane, src, tmp_pitch, stride);
    put_pixels16_l2_8_lsx(dst, src, h_plane, stride, stride);
}
1776cabdff1aSopenharmony_ci
/*
 * put / 16x16 / "mc20" entry point.
 *
 * Thin wrapper around put_h264_qpel16_h_lowpass_lasx(), writing directly to
 * dst; stride is passed for both trailing stride arguments.
 */
void ff_put_h264_qpel16_mc20_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    put_h264_qpel16_h_lowpass_lasx(dst,
                                   src,
                                   stride,
                                   stride);
}
1782cabdff1aSopenharmony_ci
/*
 * put / 16x16 / "mc30" entry point.
 *
 * Runs the 16-wide horizontal lowpass helper on src into a 256-byte scratch
 * buffer, then passes dst, src + 1 and that buffer to
 * put_pixels16_l2_8_lsx().
 */
void ff_put_h264_qpel16_mc30_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    const int tmp_pitch = 16;
    uint8_t h_plane[256];
    /* Second operand starts one byte past src. */
    const uint8_t *const shifted = src + 1;

    put_h264_qpel16_h_lowpass_lasx(h_plane, src, tmp_pitch, stride);
    put_pixels16_l2_8_lsx(dst, shifted, h_plane, stride, stride);
}
1791cabdff1aSopenharmony_ci
/*
 * put / 16x16 / "mc01" entry point.
 *
 * Runs the 16-wide vertical lowpass helper on src into a 256-byte scratch
 * buffer, then passes dst, src and that buffer to put_pixels16_l2_8_lsx().
 */
void ff_put_h264_qpel16_mc01_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    const int tmp_pitch = 16;
    uint8_t v_plane[256];

    put_h264_qpel16_v_lowpass_lasx(v_plane, src, tmp_pitch, stride);
    put_pixels16_l2_8_lsx(dst, src, v_plane, stride, stride);
}
1800cabdff1aSopenharmony_ci
/*
 * put / 16x16 / "mc11" entry point.
 *
 * Calls avc_luma_hv_qrt_16x16_lasx() with two source pointers derived from
 * src: one moved back 2 bytes, the other moved back 2 * stride bytes.
 * NOTE(review): presumably these feed the horizontal and vertical filter
 * taps respectively; the helper is defined outside this view.
 */
void ff_put_h264_qpel16_mc11_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    uint8_t *const base   = (uint8_t *)src;
    uint8_t *const first  = base - 2;
    uint8_t *const second = base - (stride * 2);

    avc_luma_hv_qrt_16x16_lasx(first, second, dst, stride);
}
1807cabdff1aSopenharmony_ci
/*
 * put / 16x16 / "mc21" entry point.
 *
 * A 512-byte scratch area is used as two 256-byte halves: the horizontal
 * lowpass helper fills the first, the hv lowpass helper the second (both run
 * on src).  put_pixels16_l2_8_lsx() then receives dst and both halves.
 */
void ff_put_h264_qpel16_mc21_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    const int tmp_pitch = 16;
    uint8_t scratch[512];
    uint8_t *const h_plane  = &scratch[0];
    uint8_t *const hv_plane = &scratch[256];

    put_h264_qpel16_h_lowpass_lasx(h_plane, src, tmp_pitch, stride);
    put_h264_qpel16_hv_lowpass_lasx(hv_plane, src, tmp_pitch, stride);
    put_pixels16_l2_8_lsx(dst, h_plane, hv_plane, stride, tmp_pitch);
}
1819cabdff1aSopenharmony_ci
/*
 * put / 16x16 / "mc31" entry point.
 *
 * Calls avc_luma_hv_qrt_16x16_lasx() with two source pointers derived from
 * src: one moved back 2 bytes, the other moved back 2 * stride bytes and
 * then forward 1 byte.
 */
void ff_put_h264_qpel16_mc31_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    uint8_t *const base   = (uint8_t *)src;
    uint8_t *const first  = base - 2;
    uint8_t *const second = base - (stride * 2) + 1;

    avc_luma_hv_qrt_16x16_lasx(first, second, dst, stride);
}
1826cabdff1aSopenharmony_ci
/*
 * put / 16x16 / "mc02" entry point.
 *
 * Thin wrapper around put_h264_qpel16_v_lowpass_lasx(), writing directly to
 * dst; stride is passed for both trailing stride arguments.
 */
void ff_put_h264_qpel16_mc02_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    put_h264_qpel16_v_lowpass_lasx(dst,
                                   src,
                                   stride,
                                   stride);
}
1832cabdff1aSopenharmony_ci
/*
 * put / 16x16 / "mc12" entry point.
 *
 * A 512-byte scratch area is used as two 256-byte halves: the hv lowpass
 * helper fills the first, the vertical lowpass helper the second (both run
 * on src).  put_pixels16_l2_8_lsx() then receives dst and both halves,
 * vertical result first.
 */
void ff_put_h264_qpel16_mc12_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    const int tmp_pitch = 16;
    uint8_t scratch[512];
    uint8_t *const hv_plane = &scratch[0];
    uint8_t *const v_plane  = &scratch[256];

    put_h264_qpel16_hv_lowpass_lasx(hv_plane, src, tmp_pitch, stride);
    put_h264_qpel16_v_lowpass_lasx(v_plane, src, tmp_pitch, stride);
    put_pixels16_l2_8_lsx(dst, v_plane, hv_plane, stride, tmp_pitch);
}
1844cabdff1aSopenharmony_ci
/*
 * put / 16x16 / "mc22" entry point.
 *
 * Thin wrapper around put_h264_qpel16_hv_lowpass_lasx(), writing directly to
 * dst; stride is passed for both trailing stride arguments.
 */
void ff_put_h264_qpel16_mc22_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    put_h264_qpel16_hv_lowpass_lasx(dst,
                                    src,
                                    stride,
                                    stride);
}
1850cabdff1aSopenharmony_ci
/*
 * put / 16x16 / "mc32" entry point.
 *
 * A 512-byte scratch area is used as two 256-byte halves: the hv lowpass
 * helper (run on src) fills the first, the vertical lowpass helper (run on
 * src + 1) fills the second.  put_pixels16_l2_8_lsx() then receives dst and
 * both halves, vertical result first.
 */
void ff_put_h264_qpel16_mc32_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    const int tmp_pitch = 16;
    uint8_t scratch[512];
    uint8_t *const hv_plane = &scratch[0];
    uint8_t *const v_plane  = &scratch[256];
    /* Vertical pass starts one byte past src. */
    const uint8_t *const v_input = src + 1;

    put_h264_qpel16_hv_lowpass_lasx(hv_plane, src, tmp_pitch, stride);
    put_h264_qpel16_v_lowpass_lasx(v_plane, v_input, tmp_pitch, stride);
    put_pixels16_l2_8_lsx(dst, v_plane, hv_plane, stride, tmp_pitch);
}
1862cabdff1aSopenharmony_ci
/*
 * put / 16x16 / "mc03" entry point.
 *
 * Runs the 16-wide vertical lowpass helper on src into a 256-byte scratch
 * buffer, then passes dst, src + stride and that buffer to
 * put_pixels16_l2_8_lsx().
 */
void ff_put_h264_qpel16_mc03_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    const int tmp_pitch = 16;
    uint8_t v_plane[256];
    /* Second operand starts one row (stride bytes) after src. */
    const uint8_t *const next_row = src + stride;

    put_h264_qpel16_v_lowpass_lasx(v_plane, src, tmp_pitch, stride);
    put_pixels16_l2_8_lsx(dst, next_row, v_plane, stride, stride);
}
1871cabdff1aSopenharmony_ci
/*
 * put / 16x16 / "mc13" entry point.
 *
 * Calls avc_luma_hv_qrt_16x16_lasx() with two source pointers derived from
 * src: one advanced by stride and moved back 2 bytes, the other moved back
 * 2 * stride bytes.
 */
void ff_put_h264_qpel16_mc13_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    uint8_t *const base   = (uint8_t *)src;
    uint8_t *const first  = base + stride - 2;
    uint8_t *const second = base - (stride * 2);

    avc_luma_hv_qrt_16x16_lasx(first, second, dst, stride);
}
1878cabdff1aSopenharmony_ci
/*
 * put / 16x16 / "mc23" entry point.
 *
 * A 512-byte scratch area is used as two 256-byte halves: the horizontal
 * lowpass helper (run on src + stride) fills the first, the hv lowpass
 * helper (run on src) fills the second.  put_pixels16_l2_8_lsx() then
 * receives dst and both halves.
 */
void ff_put_h264_qpel16_mc23_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    const int tmp_pitch = 16;
    uint8_t scratch[512];
    uint8_t *const h_plane  = &scratch[0];
    uint8_t *const hv_plane = &scratch[256];
    const uint8_t *const h_input = src + stride;

    put_h264_qpel16_h_lowpass_lasx(h_plane, h_input, tmp_pitch, stride);
    put_h264_qpel16_hv_lowpass_lasx(hv_plane, src, tmp_pitch, stride);
    put_pixels16_l2_8_lsx(dst, h_plane, hv_plane, stride, tmp_pitch);
}
1890cabdff1aSopenharmony_ci
/*
 * put / 16x16 / "mc33" entry point.
 *
 * Calls avc_luma_hv_qrt_16x16_lasx() with two source pointers derived from
 * src: one advanced by stride and moved back 2 bytes, the other moved back
 * 2 * stride bytes and then forward 1 byte.
 */
void ff_put_h264_qpel16_mc33_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    uint8_t *const base   = (uint8_t *)src;
    uint8_t *const first  = base + stride - 2;
    uint8_t *const second = base - (stride * 2) + 1;

    avc_luma_hv_qrt_16x16_lasx(first, second, dst, stride);
}
1897cabdff1aSopenharmony_ci
/*
 * avg / 16x16 / "mc00" entry point.
 *
 * Delegates directly to avg_pixels16_8_lsx().  The MMI optimization uses
 * ff_avg_pixels16_8_mmi (implemented in hpeldsp_mmi.c) for this case.
 */
void ff_avg_h264_qpel16_mc00_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    avg_pixels16_8_lsx(dst,
                       src,
                       stride);
}
1905cabdff1aSopenharmony_ci
/*
 * avg / 16x16 / "mc10" entry point.
 *
 * Runs the 16-wide horizontal lowpass helper on src into a 256-byte scratch
 * buffer, then passes dst, src and that buffer to avg_pixels16_l2_8_lsx()
 * with stride supplied for both trailing arguments.
 */
void ff_avg_h264_qpel16_mc10_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    const int tmp_pitch = 16;
    uint8_t h_plane[256];

    put_h264_qpel16_h_lowpass_lasx(h_plane, src, tmp_pitch, stride);
    avg_pixels16_l2_8_lsx(dst, src, h_plane, stride, stride);
}
1914cabdff1aSopenharmony_ci
/*
 * avg / 16x16 / "mc20" entry point.
 *
 * Thin wrapper around avg_h264_qpel16_h_lowpass_lasx(); stride is passed for
 * both trailing stride arguments.
 */
void ff_avg_h264_qpel16_mc20_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    avg_h264_qpel16_h_lowpass_lasx(dst,
                                   src,
                                   stride,
                                   stride);
}
1920cabdff1aSopenharmony_ci
/*
 * avg / 16x16 / "mc30" entry point.
 *
 * Runs the 16-wide horizontal lowpass helper on src into a 256-byte scratch
 * buffer, then passes dst, src + 1 and that buffer to
 * avg_pixels16_l2_8_lsx().
 */
void ff_avg_h264_qpel16_mc30_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    const int tmp_pitch = 16;
    uint8_t h_plane[256];
    /* Second operand starts one byte past src. */
    const uint8_t *const shifted = src + 1;

    put_h264_qpel16_h_lowpass_lasx(h_plane, src, tmp_pitch, stride);
    avg_pixels16_l2_8_lsx(dst, shifted, h_plane, stride, stride);
}
1929cabdff1aSopenharmony_ci
/*
 * avg / 16x16 / "mc01" entry point.
 *
 * Runs the 16-wide vertical lowpass helper on src into a 256-byte scratch
 * buffer, then passes dst, src and that buffer to avg_pixels16_l2_8_lsx().
 */
void ff_avg_h264_qpel16_mc01_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    const int tmp_pitch = 16;
    uint8_t v_plane[256];

    put_h264_qpel16_v_lowpass_lasx(v_plane, src, tmp_pitch, stride);
    avg_pixels16_l2_8_lsx(dst, src, v_plane, stride, stride);
}
1938cabdff1aSopenharmony_ci
/*
 * avg / 16x16 / "mc11" entry point.
 *
 * Calls avc_luma_hv_qrt_and_aver_dst_16x16_lasx() with two source pointers
 * derived from src: one moved back 2 bytes, the other moved back
 * 2 * stride bytes.
 */
void ff_avg_h264_qpel16_mc11_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    uint8_t *const base   = (uint8_t *)src;
    uint8_t *const first  = base - 2;
    uint8_t *const second = base - (stride * 2);

    avc_luma_hv_qrt_and_aver_dst_16x16_lasx(first, second, dst, stride);
}
1946cabdff1aSopenharmony_ci
/*
 * avg / 16x16 / "mc21" entry point.
 *
 * A 512-byte scratch area is used as two 256-byte halves: the horizontal
 * lowpass helper fills the first, the hv lowpass helper the second (both run
 * on src).  avg_pixels16_l2_8_lsx() then receives dst and both halves.
 */
void ff_avg_h264_qpel16_mc21_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    const int tmp_pitch = 16;
    uint8_t scratch[512];
    uint8_t *const h_plane  = &scratch[0];
    uint8_t *const hv_plane = &scratch[256];

    put_h264_qpel16_h_lowpass_lasx(h_plane, src, tmp_pitch, stride);
    put_h264_qpel16_hv_lowpass_lasx(hv_plane, src, tmp_pitch, stride);
    avg_pixels16_l2_8_lsx(dst, h_plane, hv_plane, stride, tmp_pitch);
}
1958cabdff1aSopenharmony_ci
/*
 * avg / 16x16 / "mc31" entry point.
 *
 * Calls avc_luma_hv_qrt_and_aver_dst_16x16_lasx() with two source pointers
 * derived from src: one moved back 2 bytes, the other moved back
 * 2 * stride bytes and then forward 1 byte.
 */
void ff_avg_h264_qpel16_mc31_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    uint8_t *const base   = (uint8_t *)src;
    uint8_t *const first  = base - 2;
    uint8_t *const second = base - (stride * 2) + 1;

    avc_luma_hv_qrt_and_aver_dst_16x16_lasx(first, second, dst, stride);
}
1966cabdff1aSopenharmony_ci
/*
 * avg / 16x16 / "mc02" entry point.
 *
 * Thin wrapper around avg_h264_qpel16_v_lowpass_lasx(); stride is passed for
 * both trailing stride arguments.
 */
void ff_avg_h264_qpel16_mc02_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    avg_h264_qpel16_v_lowpass_lasx(dst,
                                   src,
                                   stride,
                                   stride);
}
1972cabdff1aSopenharmony_ci
/*
 * avg / 16x16 / "mc12" entry point.
 *
 * A 512-byte scratch area is used as two 256-byte halves: the hv lowpass
 * helper fills the first, the vertical lowpass helper the second (both run
 * on src).  avg_pixels16_l2_8_lsx() then receives dst and both halves,
 * vertical result first.
 */
void ff_avg_h264_qpel16_mc12_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    const int tmp_pitch = 16;
    uint8_t scratch[512];
    uint8_t *const hv_plane = &scratch[0];
    uint8_t *const v_plane  = &scratch[256];

    put_h264_qpel16_hv_lowpass_lasx(hv_plane, src, tmp_pitch, stride);
    put_h264_qpel16_v_lowpass_lasx(v_plane, src, tmp_pitch, stride);
    avg_pixels16_l2_8_lsx(dst, v_plane, hv_plane, stride, tmp_pitch);
}
1984cabdff1aSopenharmony_ci
/*
 * avg / 16x16 / "mc22" entry point.
 *
 * Thin wrapper around avg_h264_qpel16_hv_lowpass_lasx(); stride is passed
 * for both trailing stride arguments.
 */
void ff_avg_h264_qpel16_mc22_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    avg_h264_qpel16_hv_lowpass_lasx(dst,
                                    src,
                                    stride,
                                    stride);
}
1990cabdff1aSopenharmony_ci
/*
 * avg / 16x16 / "mc32" entry point.
 *
 * A 512-byte scratch area is used as two 256-byte halves: the hv lowpass
 * helper (run on src) fills the first, the vertical lowpass helper (run on
 * src + 1) fills the second.  avg_pixels16_l2_8_lsx() then receives dst and
 * both halves, vertical result first.
 */
void ff_avg_h264_qpel16_mc32_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    const int tmp_pitch = 16;
    uint8_t scratch[512];
    uint8_t *const hv_plane = &scratch[0];
    uint8_t *const v_plane  = &scratch[256];
    /* Vertical pass starts one byte past src. */
    const uint8_t *const v_input = src + 1;

    put_h264_qpel16_hv_lowpass_lasx(hv_plane, src, tmp_pitch, stride);
    put_h264_qpel16_v_lowpass_lasx(v_plane, v_input, tmp_pitch, stride);
    avg_pixels16_l2_8_lsx(dst, v_plane, hv_plane, stride, tmp_pitch);
}
2002cabdff1aSopenharmony_ci
/*
 * avg / 16x16 / "mc03" entry point.
 *
 * Runs the 16-wide vertical lowpass helper on src into a 256-byte scratch
 * buffer, then passes dst, src + stride and that buffer to
 * avg_pixels16_l2_8_lsx().
 */
void ff_avg_h264_qpel16_mc03_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    const int tmp_pitch = 16;
    uint8_t v_plane[256];
    /* Second operand starts one row (stride bytes) after src. */
    const uint8_t *const next_row = src + stride;

    put_h264_qpel16_v_lowpass_lasx(v_plane, src, tmp_pitch, stride);
    avg_pixels16_l2_8_lsx(dst, next_row, v_plane, stride, stride);
}
2011cabdff1aSopenharmony_ci
/*
 * avg / 16x16 / "mc13" entry point.
 *
 * Calls avc_luma_hv_qrt_and_aver_dst_16x16_lasx() with two source pointers
 * derived from src: one advanced by stride and moved back 2 bytes, the other
 * moved back 2 * stride bytes.
 */
void ff_avg_h264_qpel16_mc13_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    uint8_t *const base   = (uint8_t *)src;
    uint8_t *const first  = base + stride - 2;
    uint8_t *const second = base - (stride * 2);

    avc_luma_hv_qrt_and_aver_dst_16x16_lasx(first, second, dst, stride);
}
2019cabdff1aSopenharmony_ci
/*
 * avg / 16x16 / "mc23" entry point.
 *
 * A 512-byte scratch area is used as two 256-byte halves: the horizontal
 * lowpass helper (run on src + stride) fills the first, the hv lowpass
 * helper (run on src) fills the second.  avg_pixels16_l2_8_lsx() then
 * receives dst and both halves.
 */
void ff_avg_h264_qpel16_mc23_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    const int tmp_pitch = 16;
    uint8_t scratch[512];
    uint8_t *const h_plane  = &scratch[0];
    uint8_t *const hv_plane = &scratch[256];
    const uint8_t *const h_input = src + stride;

    put_h264_qpel16_h_lowpass_lasx(h_plane, h_input, tmp_pitch, stride);
    put_h264_qpel16_hv_lowpass_lasx(hv_plane, src, tmp_pitch, stride);
    avg_pixels16_l2_8_lsx(dst, h_plane, hv_plane, stride, tmp_pitch);
}
2031cabdff1aSopenharmony_ci
/*
 * avg / 16x16 / "mc33" entry point.
 *
 * Calls avc_luma_hv_qrt_and_aver_dst_16x16_lasx() with two source pointers
 * derived from src: one advanced by stride and moved back 2 bytes, the other
 * moved back 2 * stride bytes and then forward 1 byte.
 */
void ff_avg_h264_qpel16_mc33_lasx(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
{
    uint8_t *const base   = (uint8_t *)src;
    uint8_t *const first  = base + stride - 2;
    uint8_t *const second = base - (stride * 2) + 1;

    avc_luma_hv_qrt_and_aver_dst_16x16_lasx(first, second, dst, stride);
}
2039