/*
 * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
20cabdff1aSopenharmony_ci
21cabdff1aSopenharmony_ci#include "libavutil/mips/generic_macros_msa.h"
22cabdff1aSopenharmony_ci#include "libavcodec/mips/hpeldsp_mips.h"
23cabdff1aSopenharmony_ci
/*
 * Pack the even-indexed bytes of in0 and in1 into one vector
 * (__msa_pckev_b), take the unsigned rounding average of that vector with
 * dst (__msa_aver_u_b) and store the 16-byte result at pdst.
 *
 * Every argument is parenthesized before it is cast, so callers may pass
 * arbitrary expressions and not only plain identifiers.
 */
#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst)                      \
{                                                                 \
    v16u8 tmp_m;                                                  \
                                                                  \
    tmp_m = (v16u8) __msa_pckev_b((v16i8) (in0), (v16i8) (in1));  \
    tmp_m = __msa_aver_u_b(tmp_m, (v16u8) (dst));                 \
    ST_UB(tmp_m, (pdst));                                         \
}
32cabdff1aSopenharmony_ci
/*
 * Pack four vector pairs with PCKEV_B4_SB and write the four packed
 * vectors to consecutive rows starting at pdst, 'stride' bytes apart
 * (ST_SB4). pdst is evaluated once into a local byte pointer.
 */
#define PCKEV_ST_SB4(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                           \
    uint8_t *out_ptr_m = (uint8_t *) (pdst);                                \
    v16i8 row0_m, row1_m, row2_m, row3_m;                                   \
                                                                            \
    PCKEV_B4_SB(in0, in1, in2, in3, in4, in5, in6, in7,                     \
                row0_m, row1_m, row2_m, row3_m);                            \
    ST_SB4(row0_m, row1_m, row2_m, row3_m, out_ptr_m, stride);              \
}
42cabdff1aSopenharmony_ci
/*
 * Pack the even bytes of (in2, in1) and (in4, in3) with PCKEV_B2_UB, pack
 * the doublewords of (dst1, dst0) and (dst3, dst2) with PCKEV_D2_UB,
 * combine the two packed pairs through AVER_UB2_UB and hand the two
 * resulting vectors to ST_D4 (doubleword indices 0, 1, 0, 1) for storing
 * at pdst with the given stride.
 * pdst is evaluated once into a local byte pointer.
 */
#define PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3,  \
                           pdst, stride)                                \
{                                                                       \
    uint8_t *out_ptr_m = (uint8_t *) (pdst);                            \
    v16u8 pix0_m, pix1_m, ref0_m, ref1_m;                               \
                                                                        \
    PCKEV_B2_UB(in2, in1, in4, in3, pix0_m, pix1_m);                    \
    PCKEV_D2_UB(dst1, dst0, dst3, dst2, ref0_m, ref1_m);                \
    AVER_UB2_UB(pix0_m, ref0_m, pix1_m, ref1_m, pix0_m, pix1_m);        \
    ST_D4(pix0_m, pix1_m, 0, 1, 0, 1, out_ptr_m, stride);               \
}
54cabdff1aSopenharmony_ci
55cabdff1aSopenharmony_cistatic void common_hz_bil_4w_msa(const uint8_t *src, int32_t src_stride,
56cabdff1aSopenharmony_ci                                 uint8_t *dst, int32_t dst_stride,
57cabdff1aSopenharmony_ci                                 uint8_t height)
58cabdff1aSopenharmony_ci{
59cabdff1aSopenharmony_ci    uint8_t loop_cnt;
60cabdff1aSopenharmony_ci    uint32_t out0, out1;
61cabdff1aSopenharmony_ci    v16u8 src0, src1, src0_sld1, src1_sld1, res0, res1;
62cabdff1aSopenharmony_ci    v16i8 zeros = { 0 };
63cabdff1aSopenharmony_ci
64cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 1); loop_cnt--;) {
65cabdff1aSopenharmony_ci        LD_UB2(src, src_stride, src0, src1);
66cabdff1aSopenharmony_ci        src += (2 * src_stride);
67cabdff1aSopenharmony_ci
68cabdff1aSopenharmony_ci        SLDI_B2_UB(zeros, src0, zeros, src1, 1, src0_sld1, src1_sld1);
69cabdff1aSopenharmony_ci        AVER_UB2_UB(src0_sld1, src0, src1_sld1, src1, res0, res1);
70cabdff1aSopenharmony_ci
71cabdff1aSopenharmony_ci        out0 = __msa_copy_u_w((v4i32) res0, 0);
72cabdff1aSopenharmony_ci        out1 = __msa_copy_u_w((v4i32) res1, 0);
73cabdff1aSopenharmony_ci        SW(out0, dst);
74cabdff1aSopenharmony_ci        dst += dst_stride;
75cabdff1aSopenharmony_ci        SW(out1, dst);
76cabdff1aSopenharmony_ci        dst += dst_stride;
77cabdff1aSopenharmony_ci    }
78cabdff1aSopenharmony_ci}
79cabdff1aSopenharmony_ci
80cabdff1aSopenharmony_cistatic void common_hz_bil_8w_msa(const uint8_t *src, int32_t src_stride,
81cabdff1aSopenharmony_ci                                 uint8_t *dst, int32_t dst_stride,
82cabdff1aSopenharmony_ci                                 uint8_t height)
83cabdff1aSopenharmony_ci{
84cabdff1aSopenharmony_ci    uint8_t loop_cnt;
85cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src0_sld1, src1_sld1, src2_sld1, src3_sld1;
86cabdff1aSopenharmony_ci    v16i8 zeros = { 0 };
87cabdff1aSopenharmony_ci
88cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
89cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src0, src1, src2, src3);
90cabdff1aSopenharmony_ci        src += (4 * src_stride);
91cabdff1aSopenharmony_ci
92cabdff1aSopenharmony_ci        SLDI_B4_SB(zeros, src0, zeros, src1, zeros, src2, zeros, src3, 1,
93cabdff1aSopenharmony_ci                   src0_sld1, src1_sld1, src2_sld1, src3_sld1);
94cabdff1aSopenharmony_ci        AVER_ST8x4_UB(src0, src0_sld1, src1, src1_sld1,
95cabdff1aSopenharmony_ci                      src2, src2_sld1, src3, src3_sld1, dst, dst_stride);
96cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
97cabdff1aSopenharmony_ci    }
98cabdff1aSopenharmony_ci}
99cabdff1aSopenharmony_ci
100cabdff1aSopenharmony_cistatic void common_hz_bil_16w_msa(const uint8_t *src, int32_t src_stride,
101cabdff1aSopenharmony_ci                                  uint8_t *dst, int32_t dst_stride,
102cabdff1aSopenharmony_ci                                  uint8_t height)
103cabdff1aSopenharmony_ci{
104cabdff1aSopenharmony_ci    uint8_t loop_cnt;
105cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
106cabdff1aSopenharmony_ci    v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
107cabdff1aSopenharmony_ci
108cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 3); loop_cnt--;) {
109cabdff1aSopenharmony_ci        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
110cabdff1aSopenharmony_ci        LD_UB8((src + 1), src_stride,
111cabdff1aSopenharmony_ci               src8, src9, src10, src11, src12, src13, src14, src15);
112cabdff1aSopenharmony_ci        src += (8 * src_stride);
113cabdff1aSopenharmony_ci
114cabdff1aSopenharmony_ci        AVER_ST16x4_UB(src0, src8, src1, src9, src2, src10, src3, src11,
115cabdff1aSopenharmony_ci                       dst, dst_stride);
116cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
117cabdff1aSopenharmony_ci
118cabdff1aSopenharmony_ci        AVER_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
119cabdff1aSopenharmony_ci                       dst, dst_stride);
120cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
121cabdff1aSopenharmony_ci    }
122cabdff1aSopenharmony_ci}
123cabdff1aSopenharmony_ci
124cabdff1aSopenharmony_cistatic void common_hz_bil_no_rnd_8x8_msa(const uint8_t *src, int32_t src_stride,
125cabdff1aSopenharmony_ci                                         uint8_t *dst, int32_t dst_stride)
126cabdff1aSopenharmony_ci{
127cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
128cabdff1aSopenharmony_ci    v16i8 src0_sld1, src1_sld1, src2_sld1, src3_sld1;
129cabdff1aSopenharmony_ci    v16i8 src4_sld1, src5_sld1, src6_sld1, src7_sld1;
130cabdff1aSopenharmony_ci    v16i8 zeros = { 0 };
131cabdff1aSopenharmony_ci
132cabdff1aSopenharmony_ci    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
133cabdff1aSopenharmony_ci    src += (8 * src_stride);
134cabdff1aSopenharmony_ci
135cabdff1aSopenharmony_ci    SLDI_B4_SB(zeros, src0, zeros, src1, zeros, src2, zeros, src3, 1,
136cabdff1aSopenharmony_ci               src0_sld1, src1_sld1, src2_sld1, src3_sld1);
137cabdff1aSopenharmony_ci    SLDI_B4_SB(zeros, src4, zeros, src5, zeros, src6, zeros, src7, 1,
138cabdff1aSopenharmony_ci               src4_sld1, src5_sld1, src6_sld1, src7_sld1);
139cabdff1aSopenharmony_ci
140cabdff1aSopenharmony_ci    AVE_ST8x4_UB(src0, src0_sld1, src1, src1_sld1,
141cabdff1aSopenharmony_ci                 src2, src2_sld1, src3, src3_sld1, dst, dst_stride);
142cabdff1aSopenharmony_ci    dst += (4 * dst_stride);
143cabdff1aSopenharmony_ci    AVE_ST8x4_UB(src4, src4_sld1, src5, src5_sld1,
144cabdff1aSopenharmony_ci                 src6, src6_sld1, src7, src7_sld1, dst, dst_stride);
145cabdff1aSopenharmony_ci}
146cabdff1aSopenharmony_ci
147cabdff1aSopenharmony_cistatic void common_hz_bil_no_rnd_4x8_msa(const uint8_t *src, int32_t src_stride,
148cabdff1aSopenharmony_ci                                         uint8_t *dst, int32_t dst_stride)
149cabdff1aSopenharmony_ci{
150cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src0_sld1, src1_sld1, src2_sld1, src3_sld1;
151cabdff1aSopenharmony_ci    v16i8 zeros = { 0 };
152cabdff1aSopenharmony_ci
153cabdff1aSopenharmony_ci    LD_SB4(src, src_stride, src0, src1, src2, src3);
154cabdff1aSopenharmony_ci    SLDI_B4_SB(zeros, src0, zeros, src1, zeros, src2, zeros, src3, 1,
155cabdff1aSopenharmony_ci               src0_sld1, src1_sld1, src2_sld1, src3_sld1);
156cabdff1aSopenharmony_ci    AVE_ST8x4_UB(src0, src0_sld1, src1, src1_sld1,
157cabdff1aSopenharmony_ci                 src2, src2_sld1, src3, src3_sld1, dst, dst_stride);
158cabdff1aSopenharmony_ci}
159cabdff1aSopenharmony_ci
160cabdff1aSopenharmony_cistatic void common_hz_bil_no_rnd_16x16_msa(const uint8_t *src,
161cabdff1aSopenharmony_ci                                           int32_t src_stride,
162cabdff1aSopenharmony_ci                                           uint8_t *dst, int32_t dst_stride)
163cabdff1aSopenharmony_ci{
164cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
165cabdff1aSopenharmony_ci    v16u8 src9, src10, src11, src12, src13, src14, src15;
166cabdff1aSopenharmony_ci
167cabdff1aSopenharmony_ci    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
168cabdff1aSopenharmony_ci    LD_UB8((src + 1), src_stride,
169cabdff1aSopenharmony_ci           src8, src9, src10, src11, src12, src13, src14, src15);
170cabdff1aSopenharmony_ci    src += (8 * src_stride);
171cabdff1aSopenharmony_ci
172cabdff1aSopenharmony_ci    AVE_ST16x4_UB(src0, src8, src1, src9, src2, src10, src3, src11,
173cabdff1aSopenharmony_ci                  dst, dst_stride);
174cabdff1aSopenharmony_ci    dst += (4 * dst_stride);
175cabdff1aSopenharmony_ci
176cabdff1aSopenharmony_ci    LD_UB4(src, src_stride, src0, src1, src2, src3);
177cabdff1aSopenharmony_ci    LD_UB4((src + 1), src_stride, src8, src9, src10, src11);
178cabdff1aSopenharmony_ci    src += (4 * src_stride);
179cabdff1aSopenharmony_ci
180cabdff1aSopenharmony_ci    AVE_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
181cabdff1aSopenharmony_ci                  dst, dst_stride);
182cabdff1aSopenharmony_ci    dst += (4 * dst_stride);
183cabdff1aSopenharmony_ci
184cabdff1aSopenharmony_ci    LD_UB4(src, src_stride, src4, src5, src6, src7);
185cabdff1aSopenharmony_ci    LD_UB4((src + 1), src_stride, src12, src13, src14, src15);
186cabdff1aSopenharmony_ci    src += (4 * src_stride);
187cabdff1aSopenharmony_ci
188cabdff1aSopenharmony_ci    AVE_ST16x4_UB(src0, src8, src1, src9, src2, src10, src3, src11,
189cabdff1aSopenharmony_ci                  dst, dst_stride);
190cabdff1aSopenharmony_ci    dst += (4 * dst_stride);
191cabdff1aSopenharmony_ci    AVE_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
192cabdff1aSopenharmony_ci                  dst, dst_stride);
193cabdff1aSopenharmony_ci}
194cabdff1aSopenharmony_ci
195cabdff1aSopenharmony_cistatic void common_hz_bil_no_rnd_8x16_msa(const uint8_t *src,
196cabdff1aSopenharmony_ci                                          int32_t src_stride,
197cabdff1aSopenharmony_ci                                          uint8_t *dst, int32_t dst_stride)
198cabdff1aSopenharmony_ci{
199cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
200cabdff1aSopenharmony_ci    v16u8 src9, src10, src11, src12, src13, src14, src15;
201cabdff1aSopenharmony_ci
202cabdff1aSopenharmony_ci    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
203cabdff1aSopenharmony_ci    LD_UB8((src + 1), src_stride,
204cabdff1aSopenharmony_ci           src8, src9, src10, src11, src12, src13, src14, src15);
205cabdff1aSopenharmony_ci
206cabdff1aSopenharmony_ci    AVE_ST16x4_UB(src0, src8, src1, src9, src2, src10, src3, src11,
207cabdff1aSopenharmony_ci                  dst, dst_stride);
208cabdff1aSopenharmony_ci    dst += (4 * dst_stride);
209cabdff1aSopenharmony_ci    AVE_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
210cabdff1aSopenharmony_ci                  dst, dst_stride);
211cabdff1aSopenharmony_ci}
212cabdff1aSopenharmony_ci
213cabdff1aSopenharmony_cistatic void common_hz_bil_and_aver_dst_4w_msa(const uint8_t *src,
214cabdff1aSopenharmony_ci                                              int32_t src_stride,
215cabdff1aSopenharmony_ci                                              uint8_t *dst, int32_t dst_stride,
216cabdff1aSopenharmony_ci                                              uint8_t height)
217cabdff1aSopenharmony_ci{
218cabdff1aSopenharmony_ci    uint8_t loop_cnt;
219cabdff1aSopenharmony_ci    uint32_t dst0, dst1, out0, out1;
220cabdff1aSopenharmony_ci    v16u8 src0, src1, src0_sld1, src1_sld1, res0, res1;
221cabdff1aSopenharmony_ci    v16u8 tmp0 = { 0 };
222cabdff1aSopenharmony_ci    v16u8 tmp1 = { 0 };
223cabdff1aSopenharmony_ci    v16i8 zeros = { 0 };
224cabdff1aSopenharmony_ci
225cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 1); loop_cnt--;) {
226cabdff1aSopenharmony_ci        LD_UB2(src, src_stride, src0, src1);
227cabdff1aSopenharmony_ci        src += (2 * src_stride);
228cabdff1aSopenharmony_ci
229cabdff1aSopenharmony_ci        SLDI_B2_UB(zeros, src0, zeros, src1, 1, src0_sld1, src1_sld1);
230cabdff1aSopenharmony_ci
231cabdff1aSopenharmony_ci        dst0 = LW(dst);
232cabdff1aSopenharmony_ci        dst1 = LW(dst + dst_stride);
233cabdff1aSopenharmony_ci        tmp0 = (v16u8) __msa_insert_w((v4i32) tmp0, 0, dst0);
234cabdff1aSopenharmony_ci        tmp1 = (v16u8) __msa_insert_w((v4i32) tmp1, 0, dst1);
235cabdff1aSopenharmony_ci
236cabdff1aSopenharmony_ci        AVER_UB2_UB(src0_sld1, src0, src1_sld1, src1, res0, res1);
237cabdff1aSopenharmony_ci        AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
238cabdff1aSopenharmony_ci
239cabdff1aSopenharmony_ci        out0 = __msa_copy_u_w((v4i32) res0, 0);
240cabdff1aSopenharmony_ci        out1 = __msa_copy_u_w((v4i32) res1, 0);
241cabdff1aSopenharmony_ci        SW(out0, dst);
242cabdff1aSopenharmony_ci        dst += dst_stride;
243cabdff1aSopenharmony_ci        SW(out1, dst);
244cabdff1aSopenharmony_ci        dst += dst_stride;
245cabdff1aSopenharmony_ci    }
246cabdff1aSopenharmony_ci}
247cabdff1aSopenharmony_ci
248cabdff1aSopenharmony_cistatic void common_hz_bil_and_aver_dst_8w_msa(const uint8_t *src,
249cabdff1aSopenharmony_ci                                              int32_t src_stride,
250cabdff1aSopenharmony_ci                                              uint8_t *dst, int32_t dst_stride,
251cabdff1aSopenharmony_ci                                              uint8_t height)
252cabdff1aSopenharmony_ci{
253cabdff1aSopenharmony_ci    uint8_t loop_cnt;
254cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src0_sld1, src1_sld1, src2_sld1, src3_sld1;
255cabdff1aSopenharmony_ci    v16i8 zeros = { 0 };
256cabdff1aSopenharmony_ci
257cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
258cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src0, src1, src2, src3);
259cabdff1aSopenharmony_ci        src += (4 * src_stride);
260cabdff1aSopenharmony_ci
261cabdff1aSopenharmony_ci        SLDI_B4_SB(zeros, src0, zeros, src1, zeros, src2, zeros, src3, 1,
262cabdff1aSopenharmony_ci                   src0_sld1, src1_sld1, src2_sld1, src3_sld1);
263cabdff1aSopenharmony_ci
264cabdff1aSopenharmony_ci        AVER_DST_ST8x4_UB(src0, src0_sld1, src1, src1_sld1, src2, src2_sld1,
265cabdff1aSopenharmony_ci                          src3, src3_sld1, dst, dst_stride);
266cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
267cabdff1aSopenharmony_ci    }
268cabdff1aSopenharmony_ci}
269cabdff1aSopenharmony_ci
270cabdff1aSopenharmony_cistatic void common_hz_bil_and_aver_dst_16w_msa(const uint8_t *src,
271cabdff1aSopenharmony_ci                                               int32_t src_stride,
272cabdff1aSopenharmony_ci                                               uint8_t *dst, int32_t dst_stride,
273cabdff1aSopenharmony_ci                                               uint8_t height)
274cabdff1aSopenharmony_ci{
275cabdff1aSopenharmony_ci    uint8_t loop_cnt;
276cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
277cabdff1aSopenharmony_ci    v16u8 src9, src10, src11, src12, src13, src14, src15;
278cabdff1aSopenharmony_ci
279cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 3); loop_cnt--;) {
280cabdff1aSopenharmony_ci        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
281cabdff1aSopenharmony_ci        LD_UB8((src + 1), src_stride,
282cabdff1aSopenharmony_ci               src8, src9, src10, src11, src12, src13, src14, src15);
283cabdff1aSopenharmony_ci        src += (8 * src_stride);
284cabdff1aSopenharmony_ci
285cabdff1aSopenharmony_ci        AVER_DST_ST16x4_UB(src0, src8, src1, src9, src2, src10, src3, src11,
286cabdff1aSopenharmony_ci                           dst, dst_stride);
287cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
288cabdff1aSopenharmony_ci        AVER_DST_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
289cabdff1aSopenharmony_ci                           dst, dst_stride);
290cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
291cabdff1aSopenharmony_ci    }
292cabdff1aSopenharmony_ci}
293cabdff1aSopenharmony_ci
294cabdff1aSopenharmony_cistatic void common_vt_bil_4w_msa(const uint8_t *src, int32_t src_stride,
295cabdff1aSopenharmony_ci                                 uint8_t *dst, int32_t dst_stride,
296cabdff1aSopenharmony_ci                                 uint8_t height)
297cabdff1aSopenharmony_ci{
298cabdff1aSopenharmony_ci    uint8_t loop_cnt;
299cabdff1aSopenharmony_ci    uint32_t out0, out1;
300cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, res0, res1;
301cabdff1aSopenharmony_ci
302cabdff1aSopenharmony_ci    src0 = LD_UB(src);
303cabdff1aSopenharmony_ci    src += src_stride;
304cabdff1aSopenharmony_ci
305cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 1); loop_cnt--;) {
306cabdff1aSopenharmony_ci        LD_UB2(src, src_stride, src1, src2);
307cabdff1aSopenharmony_ci        src += (2 * src_stride);
308cabdff1aSopenharmony_ci
309cabdff1aSopenharmony_ci        AVER_UB2_UB(src0, src1, src1, src2, res0, res1);
310cabdff1aSopenharmony_ci
311cabdff1aSopenharmony_ci        out0 = __msa_copy_u_w((v4i32) res0, 0);
312cabdff1aSopenharmony_ci        out1 = __msa_copy_u_w((v4i32) res1, 0);
313cabdff1aSopenharmony_ci        SW(out0, dst);
314cabdff1aSopenharmony_ci        dst += dst_stride;
315cabdff1aSopenharmony_ci        SW(out1, dst);
316cabdff1aSopenharmony_ci        dst += dst_stride;
317cabdff1aSopenharmony_ci
318cabdff1aSopenharmony_ci        src0 = src2;
319cabdff1aSopenharmony_ci    }
320cabdff1aSopenharmony_ci}
321cabdff1aSopenharmony_ci
322cabdff1aSopenharmony_cistatic void common_vt_bil_8w_msa(const uint8_t *src, int32_t src_stride,
323cabdff1aSopenharmony_ci                                 uint8_t *dst, int32_t dst_stride,
324cabdff1aSopenharmony_ci                                 uint8_t height)
325cabdff1aSopenharmony_ci{
326cabdff1aSopenharmony_ci    uint8_t loop_cnt;
327cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4;
328cabdff1aSopenharmony_ci
329cabdff1aSopenharmony_ci    src0 = LD_UB(src);
330cabdff1aSopenharmony_ci    src += src_stride;
331cabdff1aSopenharmony_ci
332cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
333cabdff1aSopenharmony_ci        LD_UB4(src, src_stride, src1, src2, src3, src4);
334cabdff1aSopenharmony_ci        src += (4 * src_stride);
335cabdff1aSopenharmony_ci
336cabdff1aSopenharmony_ci        AVER_ST8x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
337cabdff1aSopenharmony_ci                      dst, dst_stride);
338cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
339cabdff1aSopenharmony_ci
340cabdff1aSopenharmony_ci        src0 = src4;
341cabdff1aSopenharmony_ci    }
342cabdff1aSopenharmony_ci}
343cabdff1aSopenharmony_ci
344cabdff1aSopenharmony_cistatic void common_vt_bil_16w_msa(const uint8_t *src, int32_t src_stride,
345cabdff1aSopenharmony_ci                                  uint8_t *dst, int32_t dst_stride,
346cabdff1aSopenharmony_ci                                  uint8_t height)
347cabdff1aSopenharmony_ci{
348cabdff1aSopenharmony_ci    uint8_t loop_cnt;
349cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
350cabdff1aSopenharmony_ci
351cabdff1aSopenharmony_ci    src0 = LD_UB(src);
352cabdff1aSopenharmony_ci    src += src_stride;
353cabdff1aSopenharmony_ci
354cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 3); loop_cnt--;) {
355cabdff1aSopenharmony_ci        LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
356cabdff1aSopenharmony_ci        src += (8 * src_stride);
357cabdff1aSopenharmony_ci
358cabdff1aSopenharmony_ci        AVER_ST16x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
359cabdff1aSopenharmony_ci                       dst, dst_stride);
360cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
361cabdff1aSopenharmony_ci        AVER_ST16x4_UB(src4, src5, src5, src6, src6, src7, src7, src8,
362cabdff1aSopenharmony_ci                       dst, dst_stride);
363cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
364cabdff1aSopenharmony_ci
365cabdff1aSopenharmony_ci        src0 = src8;
366cabdff1aSopenharmony_ci    }
367cabdff1aSopenharmony_ci}
368cabdff1aSopenharmony_ci
369cabdff1aSopenharmony_cistatic void common_vt_bil_no_rnd_8x8_msa(const uint8_t *src, int32_t src_stride,
370cabdff1aSopenharmony_ci                                         uint8_t *dst, int32_t dst_stride)
371cabdff1aSopenharmony_ci{
372cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
373cabdff1aSopenharmony_ci
374cabdff1aSopenharmony_ci    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
375cabdff1aSopenharmony_ci    src += (8 * src_stride);
376cabdff1aSopenharmony_ci    src8 = LD_UB(src);
377cabdff1aSopenharmony_ci
378cabdff1aSopenharmony_ci    AVE_ST8x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
379cabdff1aSopenharmony_ci                 dst, dst_stride);
380cabdff1aSopenharmony_ci    dst += (4 * dst_stride);
381cabdff1aSopenharmony_ci
382cabdff1aSopenharmony_ci    AVE_ST8x4_UB(src4, src5, src5, src6, src6, src7, src7, src8,
383cabdff1aSopenharmony_ci                 dst, dst_stride);
384cabdff1aSopenharmony_ci}
385cabdff1aSopenharmony_ci
386cabdff1aSopenharmony_cistatic void common_vt_bil_no_rnd_4x8_msa(const uint8_t *src, int32_t src_stride,
387cabdff1aSopenharmony_ci                                         uint8_t *dst, int32_t dst_stride)
388cabdff1aSopenharmony_ci{
389cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4;
390cabdff1aSopenharmony_ci
391cabdff1aSopenharmony_ci    LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
392cabdff1aSopenharmony_ci    AVE_ST8x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
393cabdff1aSopenharmony_ci                 dst, dst_stride);
394cabdff1aSopenharmony_ci}
395cabdff1aSopenharmony_ci
396cabdff1aSopenharmony_cistatic void common_vt_bil_no_rnd_16x16_msa(const uint8_t *src,
397cabdff1aSopenharmony_ci                                           int32_t src_stride,
398cabdff1aSopenharmony_ci                                           uint8_t *dst, int32_t dst_stride)
399cabdff1aSopenharmony_ci{
400cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
401cabdff1aSopenharmony_ci    v16u8 src9, src10, src11, src12, src13, src14, src15, src16;
402cabdff1aSopenharmony_ci
403cabdff1aSopenharmony_ci    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
404cabdff1aSopenharmony_ci    src += (8 * src_stride);
405cabdff1aSopenharmony_ci    LD_UB8(src, src_stride,
406cabdff1aSopenharmony_ci           src8, src9, src10, src11, src12, src13, src14, src15);
407cabdff1aSopenharmony_ci    src += (8 * src_stride);
408cabdff1aSopenharmony_ci    src16 = LD_UB(src);
409cabdff1aSopenharmony_ci
410cabdff1aSopenharmony_ci    AVE_ST16x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
411cabdff1aSopenharmony_ci                  dst, dst_stride);
412cabdff1aSopenharmony_ci    dst += (4 * dst_stride);
413cabdff1aSopenharmony_ci    AVE_ST16x4_UB(src4, src5, src5, src6, src6, src7, src7, src8,
414cabdff1aSopenharmony_ci                  dst, dst_stride);
415cabdff1aSopenharmony_ci    dst += (4 * dst_stride);
416cabdff1aSopenharmony_ci    AVE_ST16x4_UB(src8, src9, src9, src10, src10, src11, src11, src12,
417cabdff1aSopenharmony_ci                  dst, dst_stride);
418cabdff1aSopenharmony_ci    dst += (4 * dst_stride);
419cabdff1aSopenharmony_ci    AVE_ST16x4_UB(src12, src13, src13, src14,
420cabdff1aSopenharmony_ci                  src14, src15, src15, src16, dst, dst_stride);
421cabdff1aSopenharmony_ci}
422cabdff1aSopenharmony_ci
423cabdff1aSopenharmony_cistatic void common_vt_bil_no_rnd_8x16_msa(const uint8_t *src,
424cabdff1aSopenharmony_ci                                          int32_t src_stride,
425cabdff1aSopenharmony_ci                                          uint8_t *dst, int32_t dst_stride)
426cabdff1aSopenharmony_ci{
427cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
428cabdff1aSopenharmony_ci
429cabdff1aSopenharmony_ci    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
430cabdff1aSopenharmony_ci    src += (8 * src_stride);
431cabdff1aSopenharmony_ci    src8 = LD_UB(src);
432cabdff1aSopenharmony_ci
433cabdff1aSopenharmony_ci    AVE_ST16x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
434cabdff1aSopenharmony_ci                  dst, dst_stride);
435cabdff1aSopenharmony_ci    dst += (4 * dst_stride);
436cabdff1aSopenharmony_ci    AVE_ST16x4_UB(src4, src5, src5, src6, src6, src7, src7, src8,
437cabdff1aSopenharmony_ci                  dst, dst_stride);
438cabdff1aSopenharmony_ci}
439cabdff1aSopenharmony_ci
440cabdff1aSopenharmony_cistatic void common_vt_bil_and_aver_dst_4w_msa(const uint8_t *src,
441cabdff1aSopenharmony_ci                                              int32_t src_stride,
442cabdff1aSopenharmony_ci                                              uint8_t *dst, int32_t dst_stride,
443cabdff1aSopenharmony_ci                                              uint8_t height)
444cabdff1aSopenharmony_ci{
445cabdff1aSopenharmony_ci    uint8_t loop_cnt;
446cabdff1aSopenharmony_ci    uint32_t out0, out1, dst0, dst1;
447cabdff1aSopenharmony_ci    v16u8 src0, src1, src2;
448cabdff1aSopenharmony_ci    v16u8 tmp0 = { 0 };
449cabdff1aSopenharmony_ci    v16u8 tmp1 = { 0 };
450cabdff1aSopenharmony_ci    v16u8 res0, res1;
451cabdff1aSopenharmony_ci
452cabdff1aSopenharmony_ci    src0 = LD_UB(src);
453cabdff1aSopenharmony_ci    src += src_stride;
454cabdff1aSopenharmony_ci
455cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 1); loop_cnt--;) {
456cabdff1aSopenharmony_ci        LD_UB2(src, src_stride, src1, src2);
457cabdff1aSopenharmony_ci        src += (2 * src_stride);
458cabdff1aSopenharmony_ci        dst0 = LW(dst);
459cabdff1aSopenharmony_ci        dst1 = LW(dst + dst_stride);
460cabdff1aSopenharmony_ci        tmp0 = (v16u8) __msa_insert_w((v4i32) tmp0, 0, dst0);
461cabdff1aSopenharmony_ci        tmp1 = (v16u8) __msa_insert_w((v4i32) tmp1, 0, dst1);
462cabdff1aSopenharmony_ci        AVER_UB2_UB(src0, src1, src1, src2, res0, res1);
463cabdff1aSopenharmony_ci        AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
464cabdff1aSopenharmony_ci        out0 = __msa_copy_u_w((v4i32) res0, 0);
465cabdff1aSopenharmony_ci        out1 = __msa_copy_u_w((v4i32) res1, 0);
466cabdff1aSopenharmony_ci        SW(out0, dst);
467cabdff1aSopenharmony_ci        dst += dst_stride;
468cabdff1aSopenharmony_ci        SW(out1, dst);
469cabdff1aSopenharmony_ci        dst += dst_stride;
470cabdff1aSopenharmony_ci        src0 = src2;
471cabdff1aSopenharmony_ci    }
472cabdff1aSopenharmony_ci}
473cabdff1aSopenharmony_ci
474cabdff1aSopenharmony_cistatic void common_vt_bil_and_aver_dst_8w_msa(const uint8_t *src,
475cabdff1aSopenharmony_ci                                              int32_t src_stride,
476cabdff1aSopenharmony_ci                                              uint8_t *dst, int32_t dst_stride,
477cabdff1aSopenharmony_ci                                              uint8_t height)
478cabdff1aSopenharmony_ci{
479cabdff1aSopenharmony_ci    uint8_t loop_cnt;
480cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4;
481cabdff1aSopenharmony_ci
482cabdff1aSopenharmony_ci    src0 = LD_UB(src);
483cabdff1aSopenharmony_ci    src += src_stride;
484cabdff1aSopenharmony_ci
485cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
486cabdff1aSopenharmony_ci        LD_UB4(src, src_stride, src1, src2, src3, src4);
487cabdff1aSopenharmony_ci        src += (4 * src_stride);
488cabdff1aSopenharmony_ci
489cabdff1aSopenharmony_ci        AVER_DST_ST8x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
490cabdff1aSopenharmony_ci                          dst, dst_stride);
491cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
492cabdff1aSopenharmony_ci        src0 = src4;
493cabdff1aSopenharmony_ci    }
494cabdff1aSopenharmony_ci}
495cabdff1aSopenharmony_ci
496cabdff1aSopenharmony_cistatic void common_vt_bil_and_aver_dst_16w_msa(const uint8_t *src,
497cabdff1aSopenharmony_ci                                               int32_t src_stride,
498cabdff1aSopenharmony_ci                                               uint8_t *dst, int32_t dst_stride,
499cabdff1aSopenharmony_ci                                               uint8_t height)
500cabdff1aSopenharmony_ci{
501cabdff1aSopenharmony_ci    uint8_t loop_cnt;
502cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
503cabdff1aSopenharmony_ci    v16u8 res0, res1, res2, res3, res4, res5, res6, res7;
504cabdff1aSopenharmony_ci    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
505cabdff1aSopenharmony_ci
506cabdff1aSopenharmony_ci    src0 = LD_UB(src);
507cabdff1aSopenharmony_ci    src += src_stride;
508cabdff1aSopenharmony_ci
509cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 3); loop_cnt--;) {
510cabdff1aSopenharmony_ci        LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
511cabdff1aSopenharmony_ci        src += (8 * src_stride);
512cabdff1aSopenharmony_ci        AVER_UB4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
513cabdff1aSopenharmony_ci                    res0, res1, res2, res3);
514cabdff1aSopenharmony_ci        AVER_UB4_UB(src4, src5, src5, src6, src6, src7, src7, src8,
515cabdff1aSopenharmony_ci                    res4, res5, res6, res7);
516cabdff1aSopenharmony_ci
517cabdff1aSopenharmony_ci        LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
518cabdff1aSopenharmony_ci        AVER_UB4_UB(dst0, res0, dst1, res1, dst2, res2, dst3, res3,
519cabdff1aSopenharmony_ci                    res0, res1, res2, res3);
520cabdff1aSopenharmony_ci        AVER_UB4_UB(dst4, res4, dst5, res5, dst6, res6, dst7, res7,
521cabdff1aSopenharmony_ci                    res4, res5, res6, res7);
522cabdff1aSopenharmony_ci        ST_UB8(res0, res1, res2, res3, res4, res5, res6, res7, dst, dst_stride);
523cabdff1aSopenharmony_ci        dst += (8 * dst_stride);
524cabdff1aSopenharmony_ci
525cabdff1aSopenharmony_ci        src0 = src8;
526cabdff1aSopenharmony_ci    }
527cabdff1aSopenharmony_ci}
528cabdff1aSopenharmony_ci
529cabdff1aSopenharmony_cistatic void common_hv_bil_4w_msa(const uint8_t *src, int32_t src_stride,
530cabdff1aSopenharmony_ci                                 uint8_t *dst, int32_t dst_stride,
531cabdff1aSopenharmony_ci                                 uint8_t height)
532cabdff1aSopenharmony_ci{
533cabdff1aSopenharmony_ci    uint8_t loop_cnt;
534cabdff1aSopenharmony_ci    uint32_t res0, res1;
535cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src0_sld1, src1_sld1, src2_sld1;
536cabdff1aSopenharmony_ci    v16u8 src0_r, src1_r, src2_r, res;
537cabdff1aSopenharmony_ci    v8u16 add0, add1, add2, sum0, sum1;
538cabdff1aSopenharmony_ci    v16i8 zeros = { 0 };
539cabdff1aSopenharmony_ci
540cabdff1aSopenharmony_ci    src0 = LD_SB(src);
541cabdff1aSopenharmony_ci    src += src_stride;
542cabdff1aSopenharmony_ci
543cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 1); loop_cnt--;) {
544cabdff1aSopenharmony_ci        LD_SB2(src, src_stride, src1, src2);
545cabdff1aSopenharmony_ci        src += (2 * src_stride);
546cabdff1aSopenharmony_ci
547cabdff1aSopenharmony_ci        SLDI_B3_SB(zeros, src0, zeros, src1, zeros, src2, 1, src0_sld1,
548cabdff1aSopenharmony_ci                   src1_sld1, src2_sld1);
549cabdff1aSopenharmony_ci        ILVR_B3_UB(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2,
550cabdff1aSopenharmony_ci                   src0_r, src1_r, src2_r);
551cabdff1aSopenharmony_ci        HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
552cabdff1aSopenharmony_ci        ADD2(add0, add1, add1, add2, sum0, sum1);
553cabdff1aSopenharmony_ci        SRARI_H2_UH(sum0, sum1, 2);
554cabdff1aSopenharmony_ci        res = (v16u8) __msa_pckev_b((v16i8) sum1, (v16i8) sum0);
555cabdff1aSopenharmony_ci        res0 = __msa_copy_u_w((v4i32) res, 0);
556cabdff1aSopenharmony_ci        res1 = __msa_copy_u_w((v4i32) res, 2);
557cabdff1aSopenharmony_ci        SW(res0, dst);
558cabdff1aSopenharmony_ci        dst += dst_stride;
559cabdff1aSopenharmony_ci        SW(res1, dst);
560cabdff1aSopenharmony_ci        dst += dst_stride;
561cabdff1aSopenharmony_ci
562cabdff1aSopenharmony_ci        src0 = src2;
563cabdff1aSopenharmony_ci    }
564cabdff1aSopenharmony_ci}
565cabdff1aSopenharmony_ci
566cabdff1aSopenharmony_cistatic void common_hv_bil_8w_msa(const uint8_t *src, int32_t src_stride,
567cabdff1aSopenharmony_ci                                 uint8_t *dst, int32_t dst_stride,
568cabdff1aSopenharmony_ci                                 uint8_t height)
569cabdff1aSopenharmony_ci{
570cabdff1aSopenharmony_ci    uint8_t loop_cnt;
571cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4;
572cabdff1aSopenharmony_ci    v16i8 src0_sld1, src1_sld1, src2_sld1, src3_sld1, src4_sld1;
573cabdff1aSopenharmony_ci    v16u8 src0_r, src1_r, src2_r, src3_r, src4_r;
574cabdff1aSopenharmony_ci    v8u16 add0, add1, add2, add3, add4;
575cabdff1aSopenharmony_ci    v8u16 sum0, sum1, sum2, sum3;
576cabdff1aSopenharmony_ci    v16i8 zeros = { 0 };
577cabdff1aSopenharmony_ci
578cabdff1aSopenharmony_ci    src0 = LD_SB(src);
579cabdff1aSopenharmony_ci    src += src_stride;
580cabdff1aSopenharmony_ci
581cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
582cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src1, src2, src3, src4);
583cabdff1aSopenharmony_ci        src += (4 * src_stride);
584cabdff1aSopenharmony_ci
585cabdff1aSopenharmony_ci        SLDI_B3_SB(zeros, src0, zeros, src1, zeros, src2, 1, src0_sld1,
586cabdff1aSopenharmony_ci                   src1_sld1, src2_sld1);
587cabdff1aSopenharmony_ci        SLDI_B2_SB(zeros, src3, zeros, src4, 1, src3_sld1, src4_sld1);
588cabdff1aSopenharmony_ci        ILVR_B3_UB(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src0_r,
589cabdff1aSopenharmony_ci                   src1_r, src2_r);
590cabdff1aSopenharmony_ci        ILVR_B2_UB(src3_sld1, src3, src4_sld1, src4, src3_r, src4_r);
591cabdff1aSopenharmony_ci        HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
592cabdff1aSopenharmony_ci        HADD_UB2_UH(src3_r, src4_r, add3, add4);
593cabdff1aSopenharmony_ci        ADD4(add0, add1, add1, add2, add2, add3, add3, add4,
594cabdff1aSopenharmony_ci             sum0, sum1, sum2, sum3);
595cabdff1aSopenharmony_ci        SRARI_H4_UH(sum0, sum1, sum2, sum3, 2);
596cabdff1aSopenharmony_ci        PCKEV_B2_SB(sum1, sum0, sum3, sum2, src0, src1);
597cabdff1aSopenharmony_ci        ST_D4(src0, src1, 0, 1, 0, 1, dst, dst_stride);
598cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
599cabdff1aSopenharmony_ci        src0 = src4;
600cabdff1aSopenharmony_ci    }
601cabdff1aSopenharmony_ci}
602cabdff1aSopenharmony_ci
603cabdff1aSopenharmony_cistatic void common_hv_bil_16w_msa(const uint8_t *src, int32_t src_stride,
604cabdff1aSopenharmony_ci                                  uint8_t *dst, int32_t dst_stride,
605cabdff1aSopenharmony_ci                                  uint8_t height)
606cabdff1aSopenharmony_ci{
607cabdff1aSopenharmony_ci    uint8_t loop_cnt;
608cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
609cabdff1aSopenharmony_ci    v16u8 src10, src11, src12, src13, src14, src15, src16, src17;
610cabdff1aSopenharmony_ci    v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
611cabdff1aSopenharmony_ci    v8u16 src8_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l;
612cabdff1aSopenharmony_ci    v8u16 src7_l, src8_l;
613cabdff1aSopenharmony_ci    v8u16 sum0_r, sum1_r, sum2_r, sum3_r, sum4_r, sum5_r, sum6_r, sum7_r;
614cabdff1aSopenharmony_ci    v8u16 sum0_l, sum1_l, sum2_l, sum3_l, sum4_l, sum5_l, sum6_l, sum7_l;
615cabdff1aSopenharmony_ci
616cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 3); loop_cnt--;) {
617cabdff1aSopenharmony_ci        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
618cabdff1aSopenharmony_ci        LD_UB8((src + 1), src_stride,
619cabdff1aSopenharmony_ci               src9, src10, src11, src12, src13, src14, src15, src16);
620cabdff1aSopenharmony_ci        src += (8 * src_stride);
621cabdff1aSopenharmony_ci
622cabdff1aSopenharmony_ci        src8 = LD_UB(src);
623cabdff1aSopenharmony_ci        src17 = LD_UB(src + 1);
624cabdff1aSopenharmony_ci
625cabdff1aSopenharmony_ci        ILVRL_B2_UH(src9, src0, src0_r, src0_l);
626cabdff1aSopenharmony_ci        ILVRL_B2_UH(src10, src1, src1_r, src1_l);
627cabdff1aSopenharmony_ci        ILVRL_B2_UH(src11, src2, src2_r, src2_l);
628cabdff1aSopenharmony_ci        ILVRL_B2_UH(src12, src3, src3_r, src3_l);
629cabdff1aSopenharmony_ci        ILVRL_B2_UH(src13, src4, src4_r, src4_l);
630cabdff1aSopenharmony_ci        ILVRL_B2_UH(src14, src5, src5_r, src5_l);
631cabdff1aSopenharmony_ci        ILVRL_B2_UH(src15, src6, src6_r, src6_l);
632cabdff1aSopenharmony_ci        ILVRL_B2_UH(src16, src7, src7_r, src7_l);
633cabdff1aSopenharmony_ci        ILVRL_B2_UH(src17, src8, src8_r, src8_l);
634cabdff1aSopenharmony_ci        HADD_UB3_UH(src0_r, src1_r, src2_r, src0_r, src1_r, src2_r);
635cabdff1aSopenharmony_ci        HADD_UB3_UH(src3_r, src4_r, src5_r, src3_r, src4_r, src5_r);
636cabdff1aSopenharmony_ci        HADD_UB3_UH(src6_r, src7_r, src8_r, src6_r, src7_r, src8_r);
637cabdff1aSopenharmony_ci        HADD_UB3_UH(src0_l, src1_l, src2_l, src0_l, src1_l, src2_l);
638cabdff1aSopenharmony_ci        HADD_UB3_UH(src3_l, src4_l, src5_l, src3_l, src4_l, src5_l);
639cabdff1aSopenharmony_ci        HADD_UB3_UH(src6_l, src7_l, src8_l, src6_l, src7_l, src8_l);
640cabdff1aSopenharmony_ci        ADD4(src0_r, src1_r, src1_r, src2_r, src2_r, src3_r, src3_r, src4_r,
641cabdff1aSopenharmony_ci             sum0_r, sum1_r, sum2_r, sum3_r);
642cabdff1aSopenharmony_ci        ADD4(src4_r, src5_r, src5_r, src6_r, src6_r, src7_r, src7_r, src8_r,
643cabdff1aSopenharmony_ci             sum4_r, sum5_r, sum6_r, sum7_r);
644cabdff1aSopenharmony_ci        ADD4(src0_l, src1_l, src1_l, src2_l, src2_l, src3_l, src3_l, src4_l,
645cabdff1aSopenharmony_ci             sum0_l, sum1_l, sum2_l, sum3_l);
646cabdff1aSopenharmony_ci        ADD4(src4_l, src5_l, src5_l, src6_l, src6_l, src7_l, src7_l, src8_l,
647cabdff1aSopenharmony_ci             sum4_l, sum5_l, sum6_l, sum7_l);
648cabdff1aSopenharmony_ci        SRARI_H4_UH(sum0_r, sum1_r, sum2_r, sum3_r, 2);
649cabdff1aSopenharmony_ci        SRARI_H4_UH(sum4_r, sum5_r, sum6_r, sum7_r, 2);
650cabdff1aSopenharmony_ci        SRARI_H4_UH(sum0_l, sum1_l, sum2_l, sum3_l, 2);
651cabdff1aSopenharmony_ci        SRARI_H4_UH(sum4_l, sum5_l, sum6_l, sum7_l, 2);
652cabdff1aSopenharmony_ci        PCKEV_ST_SB4(sum0_l, sum0_r, sum1_l, sum1_r, sum2_l, sum2_r,
653cabdff1aSopenharmony_ci                     sum3_l, sum3_r, dst, dst_stride);
654cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
655cabdff1aSopenharmony_ci        PCKEV_ST_SB4(sum4_l, sum4_r, sum5_l, sum5_r, sum6_l, sum6_r,
656cabdff1aSopenharmony_ci                     sum7_l, sum7_r, dst, dst_stride);
657cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
658cabdff1aSopenharmony_ci    }
659cabdff1aSopenharmony_ci}
660cabdff1aSopenharmony_ci
661cabdff1aSopenharmony_cistatic void common_hv_bil_no_rnd_8x8_msa(const uint8_t *src, int32_t src_stride,
662cabdff1aSopenharmony_ci                                         uint8_t *dst, int32_t dst_stride)
663cabdff1aSopenharmony_ci{
664cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
665cabdff1aSopenharmony_ci    v16u8 src0_sld1, src1_sld1, src2_sld1, src3_sld1;
666cabdff1aSopenharmony_ci    v16u8 src4_sld1, src5_sld1, src6_sld1, src7_sld1, src8_sld1;
667cabdff1aSopenharmony_ci    v8u16 src0_r, src1_r, src2_r, src3_r;
668cabdff1aSopenharmony_ci    v8u16 src4_r, src5_r, src6_r, src7_r, src8_r;
669cabdff1aSopenharmony_ci    v8u16 add0, add1, add2, add3, add4, add5, add6, add7, add8;
670cabdff1aSopenharmony_ci    v8u16 sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
671cabdff1aSopenharmony_ci    v16i8 out0, out1;
672cabdff1aSopenharmony_ci    v16i8 zeros = { 0 };
673cabdff1aSopenharmony_ci
674cabdff1aSopenharmony_ci    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
675cabdff1aSopenharmony_ci    src += (8 * src_stride);
676cabdff1aSopenharmony_ci    src8 = LD_UB(src);
677cabdff1aSopenharmony_ci
678cabdff1aSopenharmony_ci    SLDI_B4_UB(zeros, src0, zeros, src1, zeros, src2, zeros, src3, 1,
679cabdff1aSopenharmony_ci               src0_sld1, src1_sld1, src2_sld1, src3_sld1);
680cabdff1aSopenharmony_ci    SLDI_B3_UB(zeros, src4, zeros, src5, zeros, src6, 1, src4_sld1,
681cabdff1aSopenharmony_ci               src5_sld1, src6_sld1);
682cabdff1aSopenharmony_ci    SLDI_B2_UB(zeros, src7, zeros, src8, 1, src7_sld1, src8_sld1);
683cabdff1aSopenharmony_ci    ILVR_B4_UH(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src3_sld1,
684cabdff1aSopenharmony_ci               src3, src0_r, src1_r, src2_r, src3_r);
685cabdff1aSopenharmony_ci    ILVR_B3_UH(src4_sld1, src4, src5_sld1, src5, src6_sld1, src6, src4_r,
686cabdff1aSopenharmony_ci               src5_r, src6_r);
687cabdff1aSopenharmony_ci    ILVR_B2_UH(src7_sld1, src7, src8_sld1, src8, src7_r, src8_r);
688cabdff1aSopenharmony_ci    HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
689cabdff1aSopenharmony_ci    HADD_UB3_UH(src3_r, src4_r, src5_r, add3, add4, add5);
690cabdff1aSopenharmony_ci    HADD_UB3_UH(src6_r, src7_r, src8_r, add6, add7, add8);
691cabdff1aSopenharmony_ci
692cabdff1aSopenharmony_ci    sum0 = add0 + add1 + 1;
693cabdff1aSopenharmony_ci    sum1 = add1 + add2 + 1;
694cabdff1aSopenharmony_ci    sum2 = add2 + add3 + 1;
695cabdff1aSopenharmony_ci    sum3 = add3 + add4 + 1;
696cabdff1aSopenharmony_ci    sum4 = add4 + add5 + 1;
697cabdff1aSopenharmony_ci    sum5 = add5 + add6 + 1;
698cabdff1aSopenharmony_ci    sum6 = add6 + add7 + 1;
699cabdff1aSopenharmony_ci    sum7 = add7 + add8 + 1;
700cabdff1aSopenharmony_ci
701cabdff1aSopenharmony_ci    SRA_4V(sum0, sum1, sum2, sum3, 2);
702cabdff1aSopenharmony_ci    SRA_4V(sum4, sum5, sum6, sum7, 2);
703cabdff1aSopenharmony_ci    PCKEV_B2_SB(sum1, sum0, sum3, sum2, out0, out1);
704cabdff1aSopenharmony_ci    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
705cabdff1aSopenharmony_ci    PCKEV_B2_SB(sum5, sum4, sum7, sum6, out0, out1);
706cabdff1aSopenharmony_ci    ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
707cabdff1aSopenharmony_ci}
708cabdff1aSopenharmony_ci
709cabdff1aSopenharmony_cistatic void common_hv_bil_no_rnd_4x8_msa(const uint8_t *src, int32_t src_stride,
710cabdff1aSopenharmony_ci                                         uint8_t *dst, int32_t dst_stride)
711cabdff1aSopenharmony_ci{
712cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4;
713cabdff1aSopenharmony_ci    v16i8 src0_sld1, src1_sld1, src2_sld1, src3_sld1, src4_sld1;
714cabdff1aSopenharmony_ci    v8u16 src0_r, src1_r, src2_r, src3_r, src4_r;
715cabdff1aSopenharmony_ci    v8u16 add0, add1, add2, add3, add4;
716cabdff1aSopenharmony_ci    v8u16 sum0, sum1, sum2, sum3;
717cabdff1aSopenharmony_ci    v16i8 out0, out1;
718cabdff1aSopenharmony_ci    v16i8 zeros = { 0 };
719cabdff1aSopenharmony_ci
720cabdff1aSopenharmony_ci    LD_SB4(src, src_stride, src0, src1, src2, src3);
721cabdff1aSopenharmony_ci    src += (4 * src_stride);
722cabdff1aSopenharmony_ci    src4 = LD_SB(src);
723cabdff1aSopenharmony_ci
724cabdff1aSopenharmony_ci    SLDI_B3_SB(zeros, src0, zeros, src1, zeros, src2, 1, src0_sld1,
725cabdff1aSopenharmony_ci               src1_sld1, src2_sld1);
726cabdff1aSopenharmony_ci    SLDI_B2_SB(zeros, src3, zeros, src4, 1, src3_sld1, src4_sld1);
727cabdff1aSopenharmony_ci    ILVR_B3_UH(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src0_r,
728cabdff1aSopenharmony_ci               src1_r, src2_r);
729cabdff1aSopenharmony_ci    ILVR_B2_UH(src3_sld1, src3, src4_sld1, src4, src3_r, src4_r);
730cabdff1aSopenharmony_ci    HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
731cabdff1aSopenharmony_ci    HADD_UB2_UH(src3_r, src4_r, add3, add4);
732cabdff1aSopenharmony_ci
733cabdff1aSopenharmony_ci    sum0 = add0 + add1 + 1;
734cabdff1aSopenharmony_ci    sum1 = add1 + add2 + 1;
735cabdff1aSopenharmony_ci    sum2 = add2 + add3 + 1;
736cabdff1aSopenharmony_ci    sum3 = add3 + add4 + 1;
737cabdff1aSopenharmony_ci
738cabdff1aSopenharmony_ci    SRA_4V(sum0, sum1, sum2, sum3, 2);
739cabdff1aSopenharmony_ci    PCKEV_B2_SB(sum1, sum0, sum3, sum2, out0, out1);
740cabdff1aSopenharmony_ci    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
741cabdff1aSopenharmony_ci}
742cabdff1aSopenharmony_ci
743cabdff1aSopenharmony_cistatic void common_hv_bil_no_rnd_16x16_msa(const uint8_t *src,
744cabdff1aSopenharmony_ci                                           int32_t src_stride,
745cabdff1aSopenharmony_ci                                           uint8_t *dst, int32_t dst_stride)
746cabdff1aSopenharmony_ci{
747cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
748cabdff1aSopenharmony_ci    v16u8 src10, src11, src12, src13, src14, src15, src16, src17;
749cabdff1aSopenharmony_ci    v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
750cabdff1aSopenharmony_ci    v8u16 src8_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l;
751cabdff1aSopenharmony_ci    v8u16 src7_l, src8_l;
752cabdff1aSopenharmony_ci    v8u16 sum0_r, sum1_r, sum2_r, sum3_r, sum4_r, sum5_r, sum6_r, sum7_r;
753cabdff1aSopenharmony_ci    v8u16 sum0_l, sum1_l, sum2_l, sum3_l, sum4_l, sum5_l, sum6_l, sum7_l;
754cabdff1aSopenharmony_ci
755cabdff1aSopenharmony_ci    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
756cabdff1aSopenharmony_ci    LD_UB8((src + 1), src_stride,
757cabdff1aSopenharmony_ci           src9, src10, src11, src12, src13, src14, src15, src16);
758cabdff1aSopenharmony_ci    src += (8 * src_stride);
759cabdff1aSopenharmony_ci    src8 = LD_UB(src);
760cabdff1aSopenharmony_ci    src17 = LD_UB(src + 1);
761cabdff1aSopenharmony_ci
762cabdff1aSopenharmony_ci    ILVRL_B2_UH(src9, src0, src0_r, src0_l);
763cabdff1aSopenharmony_ci    ILVRL_B2_UH(src10, src1, src1_r, src1_l);
764cabdff1aSopenharmony_ci    ILVRL_B2_UH(src11, src2, src2_r, src2_l);
765cabdff1aSopenharmony_ci    ILVRL_B2_UH(src12, src3, src3_r, src3_l);
766cabdff1aSopenharmony_ci    ILVRL_B2_UH(src13, src4, src4_r, src4_l);
767cabdff1aSopenharmony_ci    ILVRL_B2_UH(src14, src5, src5_r, src5_l);
768cabdff1aSopenharmony_ci    ILVRL_B2_UH(src15, src6, src6_r, src6_l);
769cabdff1aSopenharmony_ci    ILVRL_B2_UH(src16, src7, src7_r, src7_l);
770cabdff1aSopenharmony_ci    ILVRL_B2_UH(src17, src8, src8_r, src8_l);
771cabdff1aSopenharmony_ci
772cabdff1aSopenharmony_ci    HADD_UB3_UH(src0_r, src1_r, src2_r, src0_r, src1_r, src2_r);
773cabdff1aSopenharmony_ci    HADD_UB3_UH(src3_r, src4_r, src5_r, src3_r, src4_r, src5_r);
774cabdff1aSopenharmony_ci    HADD_UB3_UH(src6_r, src7_r, src8_r, src6_r, src7_r, src8_r);
775cabdff1aSopenharmony_ci    HADD_UB3_UH(src0_l, src1_l, src2_l, src0_l, src1_l, src2_l);
776cabdff1aSopenharmony_ci    HADD_UB3_UH(src3_l, src4_l, src5_l, src3_l, src4_l, src5_l);
777cabdff1aSopenharmony_ci    HADD_UB3_UH(src6_l, src7_l, src8_l, src6_l, src7_l, src8_l);
778cabdff1aSopenharmony_ci
779cabdff1aSopenharmony_ci    sum0_r = src0_r + src1_r + 1;
780cabdff1aSopenharmony_ci    sum1_r = src1_r + src2_r + 1;
781cabdff1aSopenharmony_ci    sum2_r = src2_r + src3_r + 1;
782cabdff1aSopenharmony_ci    sum3_r = src3_r + src4_r + 1;
783cabdff1aSopenharmony_ci    sum4_r = src4_r + src5_r + 1;
784cabdff1aSopenharmony_ci    sum5_r = src5_r + src6_r + 1;
785cabdff1aSopenharmony_ci    sum6_r = src6_r + src7_r + 1;
786cabdff1aSopenharmony_ci    sum7_r = src7_r + src8_r + 1;
787cabdff1aSopenharmony_ci    sum0_l = src0_l + src1_l + 1;
788cabdff1aSopenharmony_ci    sum1_l = src1_l + src2_l + 1;
789cabdff1aSopenharmony_ci    sum2_l = src2_l + src3_l + 1;
790cabdff1aSopenharmony_ci    sum3_l = src3_l + src4_l + 1;
791cabdff1aSopenharmony_ci    sum4_l = src4_l + src5_l + 1;
792cabdff1aSopenharmony_ci    sum5_l = src5_l + src6_l + 1;
793cabdff1aSopenharmony_ci    sum6_l = src6_l + src7_l + 1;
794cabdff1aSopenharmony_ci    sum7_l = src7_l + src8_l + 1;
795cabdff1aSopenharmony_ci
796cabdff1aSopenharmony_ci    SRA_4V(sum0_r, sum1_r, sum2_r, sum3_r, 2);
797cabdff1aSopenharmony_ci    SRA_4V(sum4_r, sum5_r, sum6_r, sum7_r, 2);
798cabdff1aSopenharmony_ci    SRA_4V(sum0_l, sum1_l, sum2_l, sum3_l, 2);
799cabdff1aSopenharmony_ci    SRA_4V(sum4_l, sum5_l, sum6_l, sum7_l, 2);
800cabdff1aSopenharmony_ci    PCKEV_ST_SB4(sum0_l, sum0_r, sum1_l, sum1_r,
801cabdff1aSopenharmony_ci                 sum2_l, sum2_r, sum3_l, sum3_r, dst, dst_stride);
802cabdff1aSopenharmony_ci    dst += (4 * dst_stride);
803cabdff1aSopenharmony_ci
804cabdff1aSopenharmony_ci    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
805cabdff1aSopenharmony_ci    LD_UB8((src + 1), src_stride,
806cabdff1aSopenharmony_ci           src9, src10, src11, src12, src13, src14, src15, src16);
807cabdff1aSopenharmony_ci    src += (8 * src_stride);
808cabdff1aSopenharmony_ci    src8 = LD_UB(src);
809cabdff1aSopenharmony_ci    src17 = LD_UB(src + 1);
810cabdff1aSopenharmony_ci
811cabdff1aSopenharmony_ci    PCKEV_ST_SB4(sum4_l, sum4_r, sum5_l, sum5_r,
812cabdff1aSopenharmony_ci                 sum6_l, sum6_r, sum7_l, sum7_r, dst, dst_stride);
813cabdff1aSopenharmony_ci    dst += (4 * dst_stride);
814cabdff1aSopenharmony_ci
815cabdff1aSopenharmony_ci    ILVRL_B2_UH(src9, src0, src0_r, src0_l);
816cabdff1aSopenharmony_ci    ILVRL_B2_UH(src10, src1, src1_r, src1_l);
817cabdff1aSopenharmony_ci    ILVRL_B2_UH(src11, src2, src2_r, src2_l);
818cabdff1aSopenharmony_ci    ILVRL_B2_UH(src12, src3, src3_r, src3_l);
819cabdff1aSopenharmony_ci    ILVRL_B2_UH(src13, src4, src4_r, src4_l);
820cabdff1aSopenharmony_ci    ILVRL_B2_UH(src14, src5, src5_r, src5_l);
821cabdff1aSopenharmony_ci    ILVRL_B2_UH(src15, src6, src6_r, src6_l);
822cabdff1aSopenharmony_ci    ILVRL_B2_UH(src16, src7, src7_r, src7_l);
823cabdff1aSopenharmony_ci    ILVRL_B2_UH(src17, src8, src8_r, src8_l);
824cabdff1aSopenharmony_ci
825cabdff1aSopenharmony_ci    HADD_UB3_UH(src0_r, src1_r, src2_r, src0_r, src1_r, src2_r);
826cabdff1aSopenharmony_ci    HADD_UB3_UH(src3_r, src4_r, src5_r, src3_r, src4_r, src5_r);
827cabdff1aSopenharmony_ci    HADD_UB3_UH(src6_r, src7_r, src8_r, src6_r, src7_r, src8_r);
828cabdff1aSopenharmony_ci    HADD_UB3_UH(src0_l, src1_l, src2_l, src0_l, src1_l, src2_l);
829cabdff1aSopenharmony_ci    HADD_UB3_UH(src3_l, src4_l, src5_l, src3_l, src4_l, src5_l);
830cabdff1aSopenharmony_ci    HADD_UB3_UH(src6_l, src7_l, src8_l, src6_l, src7_l, src8_l);
831cabdff1aSopenharmony_ci
832cabdff1aSopenharmony_ci    sum0_r = src0_r + src1_r + 1;
833cabdff1aSopenharmony_ci    sum1_r = src1_r + src2_r + 1;
834cabdff1aSopenharmony_ci    sum2_r = src2_r + src3_r + 1;
835cabdff1aSopenharmony_ci    sum3_r = src3_r + src4_r + 1;
836cabdff1aSopenharmony_ci    sum4_r = src4_r + src5_r + 1;
837cabdff1aSopenharmony_ci    sum5_r = src5_r + src6_r + 1;
838cabdff1aSopenharmony_ci    sum6_r = src6_r + src7_r + 1;
839cabdff1aSopenharmony_ci    sum7_r = src7_r + src8_r + 1;
840cabdff1aSopenharmony_ci    sum0_l = src0_l + src1_l + 1;
841cabdff1aSopenharmony_ci    sum1_l = src1_l + src2_l + 1;
842cabdff1aSopenharmony_ci    sum2_l = src2_l + src3_l + 1;
843cabdff1aSopenharmony_ci    sum3_l = src3_l + src4_l + 1;
844cabdff1aSopenharmony_ci    sum4_l = src4_l + src5_l + 1;
845cabdff1aSopenharmony_ci    sum5_l = src5_l + src6_l + 1;
846cabdff1aSopenharmony_ci    sum6_l = src6_l + src7_l + 1;
847cabdff1aSopenharmony_ci    sum7_l = src7_l + src8_l + 1;
848cabdff1aSopenharmony_ci
849cabdff1aSopenharmony_ci    SRA_4V(sum0_r, sum1_r, sum2_r, sum3_r, 2);
850cabdff1aSopenharmony_ci    SRA_4V(sum4_r, sum5_r, sum6_r, sum7_r, 2);
851cabdff1aSopenharmony_ci    SRA_4V(sum0_l, sum1_l, sum2_l, sum3_l, 2);
852cabdff1aSopenharmony_ci    SRA_4V(sum4_l, sum5_l, sum6_l, sum7_l, 2);
853cabdff1aSopenharmony_ci    PCKEV_ST_SB4(sum0_l, sum0_r, sum1_l, sum1_r,
854cabdff1aSopenharmony_ci                 sum2_l, sum2_r, sum3_l, sum3_r, dst, dst_stride);
855cabdff1aSopenharmony_ci    dst += (4 * dst_stride);
856cabdff1aSopenharmony_ci    PCKEV_ST_SB4(sum4_l, sum4_r, sum5_l, sum5_r,
857cabdff1aSopenharmony_ci                 sum6_l, sum6_r, sum7_l, sum7_r, dst, dst_stride);
858cabdff1aSopenharmony_ci}
859cabdff1aSopenharmony_ci
860cabdff1aSopenharmony_cistatic void common_hv_bil_no_rnd_8x16_msa(const uint8_t *src,
861cabdff1aSopenharmony_ci                                          int32_t src_stride,
862cabdff1aSopenharmony_ci                                          uint8_t *dst, int32_t dst_stride)
863cabdff1aSopenharmony_ci{
864cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
865cabdff1aSopenharmony_ci    v16u8 src10, src11, src12, src13, src14, src15, src16, src17;
866cabdff1aSopenharmony_ci    v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
867cabdff1aSopenharmony_ci    v8u16 src8_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l;
868cabdff1aSopenharmony_ci    v8u16 src7_l, src8_l;
869cabdff1aSopenharmony_ci    v8u16 sum0_r, sum1_r, sum2_r, sum3_r, sum4_r, sum5_r, sum6_r, sum7_r;
870cabdff1aSopenharmony_ci    v8u16 sum0_l, sum1_l, sum2_l, sum3_l, sum4_l, sum5_l, sum6_l, sum7_l;
871cabdff1aSopenharmony_ci
872cabdff1aSopenharmony_ci    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
873cabdff1aSopenharmony_ci    LD_UB8((src + 1), src_stride,
874cabdff1aSopenharmony_ci           src9, src10, src11, src12, src13, src14, src15, src16);
875cabdff1aSopenharmony_ci    src += (8 * src_stride);
876cabdff1aSopenharmony_ci    src8 = LD_UB(src);
877cabdff1aSopenharmony_ci    src17 = LD_UB(src + 1);
878cabdff1aSopenharmony_ci
879cabdff1aSopenharmony_ci    ILVRL_B2_UH(src9, src0, src0_r, src0_l);
880cabdff1aSopenharmony_ci    ILVRL_B2_UH(src10, src1, src1_r, src1_l);
881cabdff1aSopenharmony_ci    ILVRL_B2_UH(src11, src2, src2_r, src2_l);
882cabdff1aSopenharmony_ci    ILVRL_B2_UH(src12, src3, src3_r, src3_l);
883cabdff1aSopenharmony_ci    ILVRL_B2_UH(src13, src4, src4_r, src4_l);
884cabdff1aSopenharmony_ci    ILVRL_B2_UH(src14, src5, src5_r, src5_l);
885cabdff1aSopenharmony_ci    ILVRL_B2_UH(src15, src6, src6_r, src6_l);
886cabdff1aSopenharmony_ci    ILVRL_B2_UH(src16, src7, src7_r, src7_l);
887cabdff1aSopenharmony_ci    ILVRL_B2_UH(src17, src8, src8_r, src8_l);
888cabdff1aSopenharmony_ci
889cabdff1aSopenharmony_ci    HADD_UB3_UH(src0_r, src1_r, src2_r, src0_r, src1_r, src2_r);
890cabdff1aSopenharmony_ci    HADD_UB3_UH(src3_r, src4_r, src5_r, src3_r, src4_r, src5_r);
891cabdff1aSopenharmony_ci    HADD_UB3_UH(src6_r, src7_r, src8_r, src6_r, src7_r, src8_r);
892cabdff1aSopenharmony_ci    HADD_UB3_UH(src0_l, src1_l, src2_l, src0_l, src1_l, src2_l);
893cabdff1aSopenharmony_ci    HADD_UB3_UH(src3_l, src4_l, src5_l, src3_l, src4_l, src5_l);
894cabdff1aSopenharmony_ci    HADD_UB3_UH(src6_l, src7_l, src8_l, src6_l, src7_l, src8_l);
895cabdff1aSopenharmony_ci
896cabdff1aSopenharmony_ci    sum0_r = src0_r + src1_r + 1;
897cabdff1aSopenharmony_ci    sum1_r = src1_r + src2_r + 1;
898cabdff1aSopenharmony_ci    sum2_r = src2_r + src3_r + 1;
899cabdff1aSopenharmony_ci    sum3_r = src3_r + src4_r + 1;
900cabdff1aSopenharmony_ci    sum4_r = src4_r + src5_r + 1;
901cabdff1aSopenharmony_ci    sum5_r = src5_r + src6_r + 1;
902cabdff1aSopenharmony_ci    sum6_r = src6_r + src7_r + 1;
903cabdff1aSopenharmony_ci    sum7_r = src7_r + src8_r + 1;
904cabdff1aSopenharmony_ci    sum0_l = src0_l + src1_l + 1;
905cabdff1aSopenharmony_ci    sum1_l = src1_l + src2_l + 1;
906cabdff1aSopenharmony_ci    sum2_l = src2_l + src3_l + 1;
907cabdff1aSopenharmony_ci    sum3_l = src3_l + src4_l + 1;
908cabdff1aSopenharmony_ci    sum4_l = src4_l + src5_l + 1;
909cabdff1aSopenharmony_ci    sum5_l = src5_l + src6_l + 1;
910cabdff1aSopenharmony_ci    sum6_l = src6_l + src7_l + 1;
911cabdff1aSopenharmony_ci    sum7_l = src7_l + src8_l + 1;
912cabdff1aSopenharmony_ci
913cabdff1aSopenharmony_ci    SRA_4V(sum0_r, sum1_r, sum2_r, sum3_r, 2);
914cabdff1aSopenharmony_ci    SRA_4V(sum4_r, sum5_r, sum6_r, sum7_r, 2);
915cabdff1aSopenharmony_ci    SRA_4V(sum0_l, sum1_l, sum2_l, sum3_l, 2);
916cabdff1aSopenharmony_ci    SRA_4V(sum4_l, sum5_l, sum6_l, sum7_l, 2);
917cabdff1aSopenharmony_ci    PCKEV_ST_SB4(sum0_l, sum0_r, sum1_l, sum1_r,
918cabdff1aSopenharmony_ci                 sum2_l, sum2_r, sum3_l, sum3_r, dst, dst_stride);
919cabdff1aSopenharmony_ci    dst += (4 * dst_stride);
920cabdff1aSopenharmony_ci    PCKEV_ST_SB4(sum4_l, sum4_r, sum5_l, sum5_r,
921cabdff1aSopenharmony_ci                 sum6_l, sum6_r, sum7_l, sum7_r, dst, dst_stride);
922cabdff1aSopenharmony_ci}
923cabdff1aSopenharmony_ci
924cabdff1aSopenharmony_cistatic void common_hv_bil_and_aver_dst_4w_msa(const uint8_t *src,
925cabdff1aSopenharmony_ci                                              int32_t src_stride,
926cabdff1aSopenharmony_ci                                              uint8_t *dst, int32_t dst_stride,
927cabdff1aSopenharmony_ci                                              uint8_t height)
928cabdff1aSopenharmony_ci{
929cabdff1aSopenharmony_ci    uint8_t loop_cnt;
930cabdff1aSopenharmony_ci    uint32_t out0, out1;
931cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src0_sld1, src1_sld1, src2_sld1;
932cabdff1aSopenharmony_ci    v16u8 src0_r, src1_r, src2_r;
933cabdff1aSopenharmony_ci    v8u16 add0, add1, add2, sum0, sum1;
934cabdff1aSopenharmony_ci    v16u8 dst0, dst1, res0, res1;
935cabdff1aSopenharmony_ci    v16i8 zeros = { 0 };
936cabdff1aSopenharmony_ci
937cabdff1aSopenharmony_ci    src0 = LD_SB(src);
938cabdff1aSopenharmony_ci    src += src_stride;
939cabdff1aSopenharmony_ci
940cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 1); loop_cnt--;) {
941cabdff1aSopenharmony_ci        LD_SB2(src, src_stride, src1, src2);
942cabdff1aSopenharmony_ci        src += (2 * src_stride);
943cabdff1aSopenharmony_ci
944cabdff1aSopenharmony_ci        LD_UB2(dst, dst_stride, dst0, dst1);
945cabdff1aSopenharmony_ci        SLDI_B3_SB(zeros, src0, zeros, src1, zeros, src2, 1, src0_sld1,
946cabdff1aSopenharmony_ci                   src1_sld1, src2_sld1);
947cabdff1aSopenharmony_ci        ILVR_B3_UB(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src0_r,
948cabdff1aSopenharmony_ci                   src1_r, src2_r);
949cabdff1aSopenharmony_ci        HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
950cabdff1aSopenharmony_ci        ADD2(add0, add1, add1, add2, sum0, sum1);
951cabdff1aSopenharmony_ci        SRARI_H2_UH(sum0, sum1, 2);
952cabdff1aSopenharmony_ci        PCKEV_B2_UB(sum0, sum0, sum1, sum1, res0, res1);
953cabdff1aSopenharmony_ci        AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
954cabdff1aSopenharmony_ci
955cabdff1aSopenharmony_ci        out0 = __msa_copy_u_w((v4i32) res0, 0);
956cabdff1aSopenharmony_ci        out1 = __msa_copy_u_w((v4i32) res1, 0);
957cabdff1aSopenharmony_ci        SW(out0, dst);
958cabdff1aSopenharmony_ci        dst += dst_stride;
959cabdff1aSopenharmony_ci        SW(out1, dst);
960cabdff1aSopenharmony_ci        dst += dst_stride;
961cabdff1aSopenharmony_ci
962cabdff1aSopenharmony_ci        src0 = src2;
963cabdff1aSopenharmony_ci    }
964cabdff1aSopenharmony_ci}
965cabdff1aSopenharmony_ci
966cabdff1aSopenharmony_cistatic void common_hv_bil_and_aver_dst_8w_msa(const uint8_t *src,
967cabdff1aSopenharmony_ci                                              int32_t src_stride,
968cabdff1aSopenharmony_ci                                              uint8_t *dst, int32_t dst_stride,
969cabdff1aSopenharmony_ci                                              uint8_t height)
970cabdff1aSopenharmony_ci{
971cabdff1aSopenharmony_ci    uint8_t loop_cnt;
972cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4;
973cabdff1aSopenharmony_ci    v16i8 src0_sld1, src1_sld1, src2_sld1, src3_sld1, src4_sld1;
974cabdff1aSopenharmony_ci    v16u8 dst0, dst1, dst2, dst3;
975cabdff1aSopenharmony_ci    v16u8 src0_r, src1_r, src2_r, src3_r, src4_r;
976cabdff1aSopenharmony_ci    v8u16 add0, add1, add2, add3, add4;
977cabdff1aSopenharmony_ci    v8u16 sum0, sum1, sum2, sum3;
978cabdff1aSopenharmony_ci    v16i8 zeros = { 0 };
979cabdff1aSopenharmony_ci
980cabdff1aSopenharmony_ci    src0 = LD_SB(src);
981cabdff1aSopenharmony_ci    src += src_stride;
982cabdff1aSopenharmony_ci
983cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
984cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src1, src2, src3, src4);
985cabdff1aSopenharmony_ci        src += (4 * src_stride);
986cabdff1aSopenharmony_ci
987cabdff1aSopenharmony_ci        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
988cabdff1aSopenharmony_ci        SLDI_B3_SB(zeros, src0, zeros, src1, zeros, src2, 1, src0_sld1,
989cabdff1aSopenharmony_ci                   src1_sld1, src2_sld1);
990cabdff1aSopenharmony_ci        SLDI_B2_SB(zeros, src3, zeros, src4, 1, src3_sld1, src4_sld1);
991cabdff1aSopenharmony_ci        ILVR_B3_UB(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src0_r,
992cabdff1aSopenharmony_ci                   src1_r, src2_r);
993cabdff1aSopenharmony_ci        ILVR_B2_UB(src3_sld1, src3, src4_sld1, src4, src3_r, src4_r);
994cabdff1aSopenharmony_ci        HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
995cabdff1aSopenharmony_ci        HADD_UB2_UH(src3_r, src4_r, add3, add4);
996cabdff1aSopenharmony_ci        ADD4(add0, add1, add1, add2, add2, add3, add3, add4,
997cabdff1aSopenharmony_ci             sum0, sum1, sum2, sum3);
998cabdff1aSopenharmony_ci        SRARI_H4_UH(sum0, sum1, sum2, sum3, 2);
999cabdff1aSopenharmony_ci        PCKEV_AVG_ST8x4_UB(sum0, dst0, sum1, dst1,
1000cabdff1aSopenharmony_ci                           sum2, dst2, sum3, dst3, dst, dst_stride);
1001cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
1002cabdff1aSopenharmony_ci        src0 = src4;
1003cabdff1aSopenharmony_ci    }
1004cabdff1aSopenharmony_ci}
1005cabdff1aSopenharmony_ci
1006cabdff1aSopenharmony_cistatic void common_hv_bil_and_aver_dst_16w_msa(const uint8_t *src,
1007cabdff1aSopenharmony_ci                                               int32_t src_stride,
1008cabdff1aSopenharmony_ci                                               uint8_t *dst, int32_t dst_stride,
1009cabdff1aSopenharmony_ci                                               uint8_t height)
1010cabdff1aSopenharmony_ci{
1011cabdff1aSopenharmony_ci    uint8_t loop_cnt;
1012cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1013cabdff1aSopenharmony_ci    v16u8 src11, src12, src13, src14, src15, src16, src17;
1014cabdff1aSopenharmony_ci    v16u8 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
1015cabdff1aSopenharmony_ci    v16u8 src8_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l;
1016cabdff1aSopenharmony_ci    v16u8 src7_l, src8_l;
1017cabdff1aSopenharmony_ci    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1018cabdff1aSopenharmony_ci    v8u16 sum0_r, sum1_r, sum2_r, sum3_r, sum4_r, sum5_r, sum6_r, sum7_r;
1019cabdff1aSopenharmony_ci    v8u16 sum0_l, sum1_l, sum2_l, sum3_l, sum4_l, sum5_l, sum6_l, sum7_l;
1020cabdff1aSopenharmony_ci    v8u16 add0, add1, add2, add3, add4, add5, add6, add7, add8;
1021cabdff1aSopenharmony_ci
1022cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 3); loop_cnt--;) {
1023cabdff1aSopenharmony_ci        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
1024cabdff1aSopenharmony_ci        LD_UB8((src + 1), src_stride,
1025cabdff1aSopenharmony_ci               src9, src10, src11, src12, src13, src14, src15, src16);
1026cabdff1aSopenharmony_ci        src += (8 * src_stride);
1027cabdff1aSopenharmony_ci
1028cabdff1aSopenharmony_ci        src8 = LD_UB(src);
1029cabdff1aSopenharmony_ci        src17 = LD_UB(src + 1);
1030cabdff1aSopenharmony_ci
1031cabdff1aSopenharmony_ci        ILVRL_B2_UB(src9, src0, src0_r, src0_l);
1032cabdff1aSopenharmony_ci        ILVRL_B2_UB(src10, src1, src1_r, src1_l);
1033cabdff1aSopenharmony_ci        ILVRL_B2_UB(src11, src2, src2_r, src2_l);
1034cabdff1aSopenharmony_ci        ILVRL_B2_UB(src12, src3, src3_r, src3_l);
1035cabdff1aSopenharmony_ci        ILVRL_B2_UB(src13, src4, src4_r, src4_l);
1036cabdff1aSopenharmony_ci        ILVRL_B2_UB(src14, src5, src5_r, src5_l);
1037cabdff1aSopenharmony_ci        ILVRL_B2_UB(src15, src6, src6_r, src6_l);
1038cabdff1aSopenharmony_ci        ILVRL_B2_UB(src16, src7, src7_r, src7_l);
1039cabdff1aSopenharmony_ci        ILVRL_B2_UB(src17, src8, src8_r, src8_l);
1040cabdff1aSopenharmony_ci        HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
1041cabdff1aSopenharmony_ci        HADD_UB3_UH(src3_r, src4_r, src5_r, add3, add4, add5);
1042cabdff1aSopenharmony_ci        HADD_UB3_UH(src6_r, src7_r, src8_r, add6, add7, add8);
1043cabdff1aSopenharmony_ci        ADD4(add0, add1, add1, add2, add2, add3, add3, add4, sum0_r, sum1_r,
1044cabdff1aSopenharmony_ci             sum2_r, sum3_r);
1045cabdff1aSopenharmony_ci        ADD4(add4, add5, add5, add6, add6, add7, add7, add8, sum4_r, sum5_r,
1046cabdff1aSopenharmony_ci             sum6_r, sum7_r);
1047cabdff1aSopenharmony_ci        HADD_UB3_UH(src0_l, src1_l, src2_l, add0, add1, add2);
1048cabdff1aSopenharmony_ci        HADD_UB3_UH(src3_l, src4_l, src5_l, add3, add4, add5);
1049cabdff1aSopenharmony_ci        HADD_UB3_UH(src6_l, src7_l, src8_l, add6, add7, add8);
1050cabdff1aSopenharmony_ci        ADD4(add0, add1, add1, add2, add2, add3, add3, add4, sum0_l, sum1_l,
1051cabdff1aSopenharmony_ci             sum2_l, sum3_l);
1052cabdff1aSopenharmony_ci        ADD4(add4, add5, add5, add6, add6, add7, add7, add8, sum4_l, sum5_l,
1053cabdff1aSopenharmony_ci             sum6_l, sum7_l);
1054cabdff1aSopenharmony_ci        SRARI_H4_UH(sum0_r, sum1_r, sum2_r, sum3_r, 2);
1055cabdff1aSopenharmony_ci        SRARI_H4_UH(sum4_r, sum5_r, sum6_r, sum7_r, 2);
1056cabdff1aSopenharmony_ci        SRARI_H4_UH(sum0_l, sum1_l, sum2_l, sum3_l, 2);
1057cabdff1aSopenharmony_ci        SRARI_H4_UH(sum4_l, sum5_l, sum6_l, sum7_l, 2);
1058cabdff1aSopenharmony_ci        LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
1059cabdff1aSopenharmony_ci        PCKEV_AVG_ST_UB(sum0_l, sum0_r, dst0, dst);
1060cabdff1aSopenharmony_ci        dst += dst_stride;
1061cabdff1aSopenharmony_ci        PCKEV_AVG_ST_UB(sum1_l, sum1_r, dst1, dst);
1062cabdff1aSopenharmony_ci        dst += dst_stride;
1063cabdff1aSopenharmony_ci        PCKEV_AVG_ST_UB(sum2_l, sum2_r, dst2, dst);
1064cabdff1aSopenharmony_ci        dst += dst_stride;
1065cabdff1aSopenharmony_ci        PCKEV_AVG_ST_UB(sum3_l, sum3_r, dst3, dst);
1066cabdff1aSopenharmony_ci        dst += dst_stride;
1067cabdff1aSopenharmony_ci        PCKEV_AVG_ST_UB(sum4_l, sum4_r, dst4, dst);
1068cabdff1aSopenharmony_ci        dst += dst_stride;
1069cabdff1aSopenharmony_ci        PCKEV_AVG_ST_UB(sum5_l, sum5_r, dst5, dst);
1070cabdff1aSopenharmony_ci        dst += dst_stride;
1071cabdff1aSopenharmony_ci        PCKEV_AVG_ST_UB(sum6_l, sum6_r, dst6, dst);
1072cabdff1aSopenharmony_ci        dst += dst_stride;
1073cabdff1aSopenharmony_ci        PCKEV_AVG_ST_UB(sum7_l, sum7_r, dst7, dst);
1074cabdff1aSopenharmony_ci        dst += dst_stride;
1075cabdff1aSopenharmony_ci    }
1076cabdff1aSopenharmony_ci}
1077cabdff1aSopenharmony_ci
1078cabdff1aSopenharmony_cistatic void copy_width8_msa(const uint8_t *src, int32_t src_stride,
1079cabdff1aSopenharmony_ci                            uint8_t *dst, int32_t dst_stride,
1080cabdff1aSopenharmony_ci                            int32_t height)
1081cabdff1aSopenharmony_ci{
1082cabdff1aSopenharmony_ci    int32_t cnt;
1083cabdff1aSopenharmony_ci    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
1084cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
1085cabdff1aSopenharmony_ci
1086cabdff1aSopenharmony_ci    if (0 == height % 12) {
1087cabdff1aSopenharmony_ci        for (cnt = (height / 12); cnt--;) {
1088cabdff1aSopenharmony_ci            LD_UB8(src, src_stride,
1089cabdff1aSopenharmony_ci                   src0, src1, src2, src3, src4, src5, src6, src7);
1090cabdff1aSopenharmony_ci            src += (8 * src_stride);
1091cabdff1aSopenharmony_ci
1092cabdff1aSopenharmony_ci            out0 = __msa_copy_u_d((v2i64) src0, 0);
1093cabdff1aSopenharmony_ci            out1 = __msa_copy_u_d((v2i64) src1, 0);
1094cabdff1aSopenharmony_ci            out2 = __msa_copy_u_d((v2i64) src2, 0);
1095cabdff1aSopenharmony_ci            out3 = __msa_copy_u_d((v2i64) src3, 0);
1096cabdff1aSopenharmony_ci            out4 = __msa_copy_u_d((v2i64) src4, 0);
1097cabdff1aSopenharmony_ci            out5 = __msa_copy_u_d((v2i64) src5, 0);
1098cabdff1aSopenharmony_ci            out6 = __msa_copy_u_d((v2i64) src6, 0);
1099cabdff1aSopenharmony_ci            out7 = __msa_copy_u_d((v2i64) src7, 0);
1100cabdff1aSopenharmony_ci
1101cabdff1aSopenharmony_ci            SD4(out0, out1, out2, out3, dst, dst_stride);
1102cabdff1aSopenharmony_ci            dst += (4 * dst_stride);
1103cabdff1aSopenharmony_ci            SD4(out4, out5, out6, out7, dst, dst_stride);
1104cabdff1aSopenharmony_ci            dst += (4 * dst_stride);
1105cabdff1aSopenharmony_ci
1106cabdff1aSopenharmony_ci            LD_UB4(src, src_stride, src0, src1, src2, src3);
1107cabdff1aSopenharmony_ci            src += (4 * src_stride);
1108cabdff1aSopenharmony_ci
1109cabdff1aSopenharmony_ci            out0 = __msa_copy_u_d((v2i64) src0, 0);
1110cabdff1aSopenharmony_ci            out1 = __msa_copy_u_d((v2i64) src1, 0);
1111cabdff1aSopenharmony_ci            out2 = __msa_copy_u_d((v2i64) src2, 0);
1112cabdff1aSopenharmony_ci            out3 = __msa_copy_u_d((v2i64) src3, 0);
1113cabdff1aSopenharmony_ci
1114cabdff1aSopenharmony_ci            SD4(out0, out1, out2, out3, dst, dst_stride);
1115cabdff1aSopenharmony_ci            dst += (4 * dst_stride);
1116cabdff1aSopenharmony_ci        }
1117cabdff1aSopenharmony_ci    } else if (0 == height % 8) {
1118cabdff1aSopenharmony_ci        for (cnt = height >> 3; cnt--;) {
1119cabdff1aSopenharmony_ci            LD_UB8(src, src_stride,
1120cabdff1aSopenharmony_ci                   src0, src1, src2, src3, src4, src5, src6, src7);
1121cabdff1aSopenharmony_ci            src += (8 * src_stride);
1122cabdff1aSopenharmony_ci
1123cabdff1aSopenharmony_ci            out0 = __msa_copy_u_d((v2i64) src0, 0);
1124cabdff1aSopenharmony_ci            out1 = __msa_copy_u_d((v2i64) src1, 0);
1125cabdff1aSopenharmony_ci            out2 = __msa_copy_u_d((v2i64) src2, 0);
1126cabdff1aSopenharmony_ci            out3 = __msa_copy_u_d((v2i64) src3, 0);
1127cabdff1aSopenharmony_ci            out4 = __msa_copy_u_d((v2i64) src4, 0);
1128cabdff1aSopenharmony_ci            out5 = __msa_copy_u_d((v2i64) src5, 0);
1129cabdff1aSopenharmony_ci            out6 = __msa_copy_u_d((v2i64) src6, 0);
1130cabdff1aSopenharmony_ci            out7 = __msa_copy_u_d((v2i64) src7, 0);
1131cabdff1aSopenharmony_ci
1132cabdff1aSopenharmony_ci            SD4(out0, out1, out2, out3, dst, dst_stride);
1133cabdff1aSopenharmony_ci            dst += (4 * dst_stride);
1134cabdff1aSopenharmony_ci            SD4(out4, out5, out6, out7, dst, dst_stride);
1135cabdff1aSopenharmony_ci            dst += (4 * dst_stride);
1136cabdff1aSopenharmony_ci        }
1137cabdff1aSopenharmony_ci    } else if (0 == height % 4) {
1138cabdff1aSopenharmony_ci        for (cnt = (height / 4); cnt--;) {
1139cabdff1aSopenharmony_ci            LD_UB4(src, src_stride, src0, src1, src2, src3);
1140cabdff1aSopenharmony_ci            src += (4 * src_stride);
1141cabdff1aSopenharmony_ci            out0 = __msa_copy_u_d((v2i64) src0, 0);
1142cabdff1aSopenharmony_ci            out1 = __msa_copy_u_d((v2i64) src1, 0);
1143cabdff1aSopenharmony_ci            out2 = __msa_copy_u_d((v2i64) src2, 0);
1144cabdff1aSopenharmony_ci            out3 = __msa_copy_u_d((v2i64) src3, 0);
1145cabdff1aSopenharmony_ci
1146cabdff1aSopenharmony_ci            SD4(out0, out1, out2, out3, dst, dst_stride);
1147cabdff1aSopenharmony_ci            dst += (4 * dst_stride);
1148cabdff1aSopenharmony_ci        }
1149cabdff1aSopenharmony_ci    } else if (0 == height % 2) {
1150cabdff1aSopenharmony_ci        for (cnt = (height / 2); cnt--;) {
1151cabdff1aSopenharmony_ci            LD_UB2(src, src_stride, src0, src1);
1152cabdff1aSopenharmony_ci            src += (2 * src_stride);
1153cabdff1aSopenharmony_ci            out0 = __msa_copy_u_d((v2i64) src0, 0);
1154cabdff1aSopenharmony_ci            out1 = __msa_copy_u_d((v2i64) src1, 0);
1155cabdff1aSopenharmony_ci
1156cabdff1aSopenharmony_ci            SD(out0, dst);
1157cabdff1aSopenharmony_ci            dst += dst_stride;
1158cabdff1aSopenharmony_ci            SD(out1, dst);
1159cabdff1aSopenharmony_ci            dst += dst_stride;
1160cabdff1aSopenharmony_ci        }
1161cabdff1aSopenharmony_ci    }
1162cabdff1aSopenharmony_ci}
1163cabdff1aSopenharmony_ci
1164cabdff1aSopenharmony_cistatic void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride,
1165cabdff1aSopenharmony_ci                                  uint8_t *dst, int32_t dst_stride,
1166cabdff1aSopenharmony_ci                                  int32_t height, int32_t width)
1167cabdff1aSopenharmony_ci{
1168cabdff1aSopenharmony_ci    int32_t cnt, loop_cnt;
1169cabdff1aSopenharmony_ci    const uint8_t *src_tmp;
1170cabdff1aSopenharmony_ci    uint8_t *dst_tmp;
1171cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
1172cabdff1aSopenharmony_ci
1173cabdff1aSopenharmony_ci    for (cnt = (width >> 4); cnt--;) {
1174cabdff1aSopenharmony_ci        src_tmp = src;
1175cabdff1aSopenharmony_ci        dst_tmp = dst;
1176cabdff1aSopenharmony_ci
1177cabdff1aSopenharmony_ci        for (loop_cnt = (height >> 3); loop_cnt--;) {
1178cabdff1aSopenharmony_ci            LD_UB8(src_tmp, src_stride,
1179cabdff1aSopenharmony_ci                   src0, src1, src2, src3, src4, src5, src6, src7);
1180cabdff1aSopenharmony_ci            src_tmp += (8 * src_stride);
1181cabdff1aSopenharmony_ci
1182cabdff1aSopenharmony_ci            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
1183cabdff1aSopenharmony_ci                   dst_tmp, dst_stride);
1184cabdff1aSopenharmony_ci            dst_tmp += (8 * dst_stride);
1185cabdff1aSopenharmony_ci        }
1186cabdff1aSopenharmony_ci
1187cabdff1aSopenharmony_ci        src += 16;
1188cabdff1aSopenharmony_ci        dst += 16;
1189cabdff1aSopenharmony_ci    }
1190cabdff1aSopenharmony_ci}
1191cabdff1aSopenharmony_ci
1192cabdff1aSopenharmony_cistatic void copy_width16_msa(const uint8_t *src, int32_t src_stride,
1193cabdff1aSopenharmony_ci                             uint8_t *dst, int32_t dst_stride,
1194cabdff1aSopenharmony_ci                             int32_t height)
1195cabdff1aSopenharmony_ci{
1196cabdff1aSopenharmony_ci    int32_t cnt;
1197cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
1198cabdff1aSopenharmony_ci
1199cabdff1aSopenharmony_ci    if (0 == height % 12) {
1200cabdff1aSopenharmony_ci        for (cnt = (height / 12); cnt--;) {
1201cabdff1aSopenharmony_ci            LD_UB8(src, src_stride,
1202cabdff1aSopenharmony_ci                   src0, src1, src2, src3, src4, src5, src6, src7);
1203cabdff1aSopenharmony_ci            src += (8 * src_stride);
1204cabdff1aSopenharmony_ci            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
1205cabdff1aSopenharmony_ci                   dst, dst_stride);
1206cabdff1aSopenharmony_ci            dst += (8 * dst_stride);
1207cabdff1aSopenharmony_ci
1208cabdff1aSopenharmony_ci            LD_UB4(src, src_stride, src0, src1, src2, src3);
1209cabdff1aSopenharmony_ci            src += (4 * src_stride);
1210cabdff1aSopenharmony_ci            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
1211cabdff1aSopenharmony_ci            dst += (4 * dst_stride);
1212cabdff1aSopenharmony_ci        }
1213cabdff1aSopenharmony_ci    } else if (0 == height % 8) {
1214cabdff1aSopenharmony_ci        copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
1215cabdff1aSopenharmony_ci    } else if (0 == height % 4) {
1216cabdff1aSopenharmony_ci        for (cnt = (height >> 2); cnt--;) {
1217cabdff1aSopenharmony_ci            LD_UB4(src, src_stride, src0, src1, src2, src3);
1218cabdff1aSopenharmony_ci            src += (4 * src_stride);
1219cabdff1aSopenharmony_ci
1220cabdff1aSopenharmony_ci            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
1221cabdff1aSopenharmony_ci            dst += (4 * dst_stride);
1222cabdff1aSopenharmony_ci        }
1223cabdff1aSopenharmony_ci    }
1224cabdff1aSopenharmony_ci}
1225cabdff1aSopenharmony_ci
1226cabdff1aSopenharmony_cistatic void avg_width4_msa(const uint8_t *src, int32_t src_stride,
1227cabdff1aSopenharmony_ci                           uint8_t *dst, int32_t dst_stride,
1228cabdff1aSopenharmony_ci                           int32_t height)
1229cabdff1aSopenharmony_ci{
1230cabdff1aSopenharmony_ci    int32_t cnt;
1231cabdff1aSopenharmony_ci    uint32_t out0, out1, out2, out3;
1232cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3;
1233cabdff1aSopenharmony_ci    v16u8 dst0, dst1, dst2, dst3;
1234cabdff1aSopenharmony_ci
1235cabdff1aSopenharmony_ci    if (0 == (height % 4)) {
1236cabdff1aSopenharmony_ci        for (cnt = (height / 4); cnt--;) {
1237cabdff1aSopenharmony_ci            LD_UB4(src, src_stride, src0, src1, src2, src3);
1238cabdff1aSopenharmony_ci            src += (4 * src_stride);
1239cabdff1aSopenharmony_ci
1240cabdff1aSopenharmony_ci            LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
1241cabdff1aSopenharmony_ci
1242cabdff1aSopenharmony_ci            AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
1243cabdff1aSopenharmony_ci                        dst0, dst1, dst2, dst3);
1244cabdff1aSopenharmony_ci
1245cabdff1aSopenharmony_ci            out0 = __msa_copy_u_w((v4i32) dst0, 0);
1246cabdff1aSopenharmony_ci            out1 = __msa_copy_u_w((v4i32) dst1, 0);
1247cabdff1aSopenharmony_ci            out2 = __msa_copy_u_w((v4i32) dst2, 0);
1248cabdff1aSopenharmony_ci            out3 = __msa_copy_u_w((v4i32) dst3, 0);
1249cabdff1aSopenharmony_ci            SW4(out0, out1, out2, out3, dst, dst_stride);
1250cabdff1aSopenharmony_ci            dst += (4 * dst_stride);
1251cabdff1aSopenharmony_ci        }
1252cabdff1aSopenharmony_ci    } else if (0 == (height % 2)) {
1253cabdff1aSopenharmony_ci        for (cnt = (height / 2); cnt--;) {
1254cabdff1aSopenharmony_ci            LD_UB2(src, src_stride, src0, src1);
1255cabdff1aSopenharmony_ci            src += (2 * src_stride);
1256cabdff1aSopenharmony_ci
1257cabdff1aSopenharmony_ci            LD_UB2(dst, dst_stride, dst0, dst1);
1258cabdff1aSopenharmony_ci
1259cabdff1aSopenharmony_ci            AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
1260cabdff1aSopenharmony_ci
1261cabdff1aSopenharmony_ci            out0 = __msa_copy_u_w((v4i32) dst0, 0);
1262cabdff1aSopenharmony_ci            out1 = __msa_copy_u_w((v4i32) dst1, 0);
1263cabdff1aSopenharmony_ci            SW(out0, dst);
1264cabdff1aSopenharmony_ci            dst += dst_stride;
1265cabdff1aSopenharmony_ci            SW(out1, dst);
1266cabdff1aSopenharmony_ci            dst += dst_stride;
1267cabdff1aSopenharmony_ci        }
1268cabdff1aSopenharmony_ci    }
1269cabdff1aSopenharmony_ci}
1270cabdff1aSopenharmony_ci
1271cabdff1aSopenharmony_cistatic void avg_width8_msa(const uint8_t *src, int32_t src_stride,
1272cabdff1aSopenharmony_ci                           uint8_t *dst, int32_t dst_stride,
1273cabdff1aSopenharmony_ci                           int32_t height)
1274cabdff1aSopenharmony_ci{
1275cabdff1aSopenharmony_ci    int32_t cnt;
1276cabdff1aSopenharmony_ci    uint64_t out0, out1, out2, out3;
1277cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3;
1278cabdff1aSopenharmony_ci    v16u8 dst0, dst1, dst2, dst3;
1279cabdff1aSopenharmony_ci
1280cabdff1aSopenharmony_ci    for (cnt = (height / 4); cnt--;) {
1281cabdff1aSopenharmony_ci        LD_UB4(src, src_stride, src0, src1, src2, src3);
1282cabdff1aSopenharmony_ci        src += (4 * src_stride);
1283cabdff1aSopenharmony_ci        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
1284cabdff1aSopenharmony_ci
1285cabdff1aSopenharmony_ci        AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
1286cabdff1aSopenharmony_ci                    dst0, dst1, dst2, dst3);
1287cabdff1aSopenharmony_ci
1288cabdff1aSopenharmony_ci        out0 = __msa_copy_u_d((v2i64) dst0, 0);
1289cabdff1aSopenharmony_ci        out1 = __msa_copy_u_d((v2i64) dst1, 0);
1290cabdff1aSopenharmony_ci        out2 = __msa_copy_u_d((v2i64) dst2, 0);
1291cabdff1aSopenharmony_ci        out3 = __msa_copy_u_d((v2i64) dst3, 0);
1292cabdff1aSopenharmony_ci        SD4(out0, out1, out2, out3, dst, dst_stride);
1293cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
1294cabdff1aSopenharmony_ci    }
1295cabdff1aSopenharmony_ci}
1296cabdff1aSopenharmony_ci
1297cabdff1aSopenharmony_cistatic void avg_width16_msa(const uint8_t *src, int32_t src_stride,
1298cabdff1aSopenharmony_ci                            uint8_t *dst, int32_t dst_stride,
1299cabdff1aSopenharmony_ci                            int32_t height)
1300cabdff1aSopenharmony_ci{
1301cabdff1aSopenharmony_ci    int32_t cnt;
1302cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
1303cabdff1aSopenharmony_ci    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1304cabdff1aSopenharmony_ci
1305cabdff1aSopenharmony_ci    for (cnt = (height / 8); cnt--;) {
1306cabdff1aSopenharmony_ci        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
1307cabdff1aSopenharmony_ci        src += (8 * src_stride);
1308cabdff1aSopenharmony_ci        LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
1309cabdff1aSopenharmony_ci
1310cabdff1aSopenharmony_ci        AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
1311cabdff1aSopenharmony_ci                    dst0, dst1, dst2, dst3);
1312cabdff1aSopenharmony_ci        AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
1313cabdff1aSopenharmony_ci                    dst4, dst5, dst6, dst7);
1314cabdff1aSopenharmony_ci        ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride);
1315cabdff1aSopenharmony_ci        dst += (8 * dst_stride);
1316cabdff1aSopenharmony_ci    }
1317cabdff1aSopenharmony_ci}
1318cabdff1aSopenharmony_ci
/**
 * Copy h rows from pixels to block via copy_width16_msa().
 *
 * @param block     destination
 * @param pixels    source
 * @param line_size row stride in bytes, used for both buffers
 * @param h         number of rows
 */
void ff_put_pixels16_msa(uint8_t *block,
                         const uint8_t *pixels,
                         ptrdiff_t line_size,
                         int h)
{
    /* Source and destination share one stride. */
    const ptrdiff_t stride = line_size;

    copy_width16_msa(pixels, stride, block, stride, h);
}
1324cabdff1aSopenharmony_ci
/**
 * Forward to common_hz_bil_16w_msa(), writing h rows into block.
 * NOTE(review): the helper is defined outside this chunk; by its name it
 * is the 16-wide horizontal bilinear filter -- confirm.
 *
 * @param block     destination
 * @param pixels    source
 * @param line_size row stride in bytes, used for both buffers
 * @param h         number of rows
 */
void ff_put_pixels16_x2_msa(uint8_t *block,
                            const uint8_t *pixels,
                            ptrdiff_t line_size,
                            int h)
{
    /* Source and destination share one stride. */
    const ptrdiff_t stride = line_size;

    common_hz_bil_16w_msa(pixels, stride, block, stride, h);
}
1330cabdff1aSopenharmony_ci
/**
 * Forward to common_vt_bil_16w_msa(), writing h rows into block.
 * NOTE(review): the helper is defined outside this chunk; by its name it
 * is the 16-wide vertical bilinear filter -- confirm.
 *
 * @param block     destination
 * @param pixels    source
 * @param line_size row stride in bytes, used for both buffers
 * @param h         number of rows
 */
void ff_put_pixels16_y2_msa(uint8_t *block,
                            const uint8_t *pixels,
                            ptrdiff_t line_size,
                            int h)
{
    /* Source and destination share one stride. */
    const ptrdiff_t stride = line_size;

    common_vt_bil_16w_msa(pixels, stride, block, stride, h);
}
1336cabdff1aSopenharmony_ci
/**
 * Forward to common_hv_bil_16w_msa(), writing h rows into block.
 * NOTE(review): the helper is defined outside this chunk; by its name it
 * is the 16-wide horizontal+vertical bilinear filter -- confirm.
 *
 * @param block     destination
 * @param pixels    source
 * @param line_size row stride in bytes, used for both buffers
 * @param h         number of rows
 */
void ff_put_pixels16_xy2_msa(uint8_t *block,
                             const uint8_t *pixels,
                             ptrdiff_t line_size,
                             int h)
{
    /* Source and destination share one stride. */
    const ptrdiff_t stride = line_size;

    common_hv_bil_16w_msa(pixels, stride, block, stride, h);
}
1342cabdff1aSopenharmony_ci
/**
 * Copy h rows of 8 bytes from pixels to block via copy_width8_msa().
 *
 * @param block     destination
 * @param pixels    source
 * @param line_size row stride in bytes, used for both buffers
 * @param h         number of rows
 */
void ff_put_pixels8_msa(uint8_t *block,
                        const uint8_t *pixels,
                        ptrdiff_t line_size,
                        int h)
{
    /* Source and destination share one stride. */
    const ptrdiff_t stride = line_size;

    copy_width8_msa(pixels, stride, block, stride, h);
}
1348cabdff1aSopenharmony_ci
/**
 * Forward to common_hz_bil_8w_msa(), writing h rows into block.
 * NOTE(review): the helper is defined outside this chunk; by its name it
 * is the 8-wide horizontal bilinear filter -- confirm.
 *
 * @param block     destination
 * @param pixels    source
 * @param line_size row stride in bytes, used for both buffers
 * @param h         number of rows
 */
void ff_put_pixels8_x2_msa(uint8_t *block,
                           const uint8_t *pixels,
                           ptrdiff_t line_size,
                           int h)
{
    /* Source and destination share one stride. */
    const ptrdiff_t stride = line_size;

    common_hz_bil_8w_msa(pixels, stride, block, stride, h);
}
1354cabdff1aSopenharmony_ci
/**
 * Forward to common_vt_bil_8w_msa(), writing h rows into block.
 * NOTE(review): the helper is defined outside this chunk; by its name it
 * is the 8-wide vertical bilinear filter -- confirm.
 *
 * @param block     destination
 * @param pixels    source
 * @param line_size row stride in bytes, used for both buffers
 * @param h         number of rows
 */
void ff_put_pixels8_y2_msa(uint8_t *block,
                           const uint8_t *pixels,
                           ptrdiff_t line_size,
                           int h)
{
    /* Source and destination share one stride. */
    const ptrdiff_t stride = line_size;

    common_vt_bil_8w_msa(pixels, stride, block, stride, h);
}
1360cabdff1aSopenharmony_ci
/**
 * Forward to common_hv_bil_8w_msa(), writing h rows into block.
 * NOTE(review): the helper is defined outside this chunk; by its name it
 * is the 8-wide horizontal+vertical bilinear filter -- confirm.
 *
 * @param block     destination
 * @param pixels    source
 * @param line_size row stride in bytes, used for both buffers
 * @param h         number of rows
 */
void ff_put_pixels8_xy2_msa(uint8_t *block,
                            const uint8_t *pixels,
                            ptrdiff_t line_size,
                            int h)
{
    /* Source and destination share one stride. */
    const ptrdiff_t stride = line_size;

    common_hv_bil_8w_msa(pixels, stride, block, stride, h);
}
1366cabdff1aSopenharmony_ci
/**
 * Forward to common_hz_bil_4w_msa(), writing h rows into block.
 * NOTE(review): the helper is defined outside this chunk; by its name it
 * is the 4-wide horizontal bilinear filter -- confirm.
 *
 * @param block     destination
 * @param pixels    source
 * @param line_size row stride in bytes, used for both buffers
 * @param h         number of rows
 */
void ff_put_pixels4_x2_msa(uint8_t *block,
                           const uint8_t *pixels,
                           ptrdiff_t line_size,
                           int h)
{
    /* Source and destination share one stride. */
    const ptrdiff_t stride = line_size;

    common_hz_bil_4w_msa(pixels, stride, block, stride, h);
}
1372cabdff1aSopenharmony_ci
/**
 * Forward to common_vt_bil_4w_msa(), writing h rows into block.
 * NOTE(review): the helper is defined outside this chunk; by its name it
 * is the 4-wide vertical bilinear filter -- confirm.
 *
 * @param block     destination
 * @param pixels    source
 * @param line_size row stride in bytes, used for both buffers
 * @param h         number of rows
 */
void ff_put_pixels4_y2_msa(uint8_t *block,
                           const uint8_t *pixels,
                           ptrdiff_t line_size,
                           int h)
{
    /* Source and destination share one stride. */
    const ptrdiff_t stride = line_size;

    common_vt_bil_4w_msa(pixels, stride, block, stride, h);
}
1378cabdff1aSopenharmony_ci
/**
 * Forward to common_hv_bil_4w_msa(), writing h rows into block.
 * NOTE(review): the helper is defined outside this chunk; by its name it
 * is the 4-wide horizontal+vertical bilinear filter -- confirm.
 *
 * @param block     destination
 * @param pixels    source
 * @param line_size row stride in bytes, used for both buffers
 * @param h         number of rows
 */
void ff_put_pixels4_xy2_msa(uint8_t *block,
                            const uint8_t *pixels,
                            ptrdiff_t line_size,
                            int h)
{
    /* Source and destination share one stride. */
    const ptrdiff_t stride = line_size;

    common_hv_bil_4w_msa(pixels, stride, block, stride, h);
}
1384cabdff1aSopenharmony_ci
/**
 * Dispatch on h to a fixed-size common_hz_bil_no_rnd_*_msa() helper.
 *
 * h == 16 selects the 16x16 helper and h == 8 the 8x16 helper; for any
 * other h no helper is called and block is left unmodified.
 * NOTE(review): the helpers are defined outside this chunk -- confirm
 * their block geometry against their definitions.
 *
 * @param block     destination
 * @param pixels    source
 * @param line_size row stride in bytes, used for both buffers
 * @param h         number of rows (16 or 8)
 */
void ff_put_no_rnd_pixels16_x2_msa(uint8_t *block,
                                   const uint8_t *pixels,
                                   ptrdiff_t line_size,
                                   int h)
{
    switch (h) {
    case 16:
        common_hz_bil_no_rnd_16x16_msa(pixels, line_size, block, line_size);
        break;
    case 8:
        common_hz_bil_no_rnd_8x16_msa(pixels, line_size, block, line_size);
        break;
    default:
        /* Unsupported height: nothing is written. */
        break;
    }
}
1394cabdff1aSopenharmony_ci
/**
 * Dispatch on h to a fixed-size common_vt_bil_no_rnd_*_msa() helper.
 *
 * h == 16 selects the 16x16 helper and h == 8 the 8x16 helper; for any
 * other h no helper is called and block is left unmodified.
 * NOTE(review): the helpers are defined outside this chunk -- confirm
 * their block geometry against their definitions.
 *
 * @param block     destination
 * @param pixels    source
 * @param line_size row stride in bytes, used for both buffers
 * @param h         number of rows (16 or 8)
 */
void ff_put_no_rnd_pixels16_y2_msa(uint8_t *block,
                                   const uint8_t *pixels,
                                   ptrdiff_t line_size,
                                   int h)
{
    switch (h) {
    case 16:
        common_vt_bil_no_rnd_16x16_msa(pixels, line_size, block, line_size);
        break;
    case 8:
        common_vt_bil_no_rnd_8x16_msa(pixels, line_size, block, line_size);
        break;
    default:
        /* Unsupported height: nothing is written. */
        break;
    }
}
1404cabdff1aSopenharmony_ci
/**
 * Dispatch on h to a fixed-size common_hv_bil_no_rnd_*_msa() helper.
 *
 * h == 16 selects the 16x16 helper and h == 8 the 8x16 helper; for any
 * other h no helper is called and block is left unmodified.
 * NOTE(review): the helpers are defined outside this chunk -- confirm
 * their block geometry against their definitions.
 *
 * @param block     destination
 * @param pixels    source
 * @param line_size row stride in bytes, used for both buffers
 * @param h         number of rows (16 or 8)
 */
void ff_put_no_rnd_pixels16_xy2_msa(uint8_t *block,
                                    const uint8_t *pixels,
                                    ptrdiff_t line_size,
                                    int h)
{
    switch (h) {
    case 16:
        common_hv_bil_no_rnd_16x16_msa(pixels, line_size, block, line_size);
        break;
    case 8:
        common_hv_bil_no_rnd_8x16_msa(pixels, line_size, block, line_size);
        break;
    default:
        /* Unsupported height: nothing is written. */
        break;
    }
}
1415cabdff1aSopenharmony_ci
/**
 * Dispatch on h to a fixed-size common_hz_bil_no_rnd_*_msa() helper.
 *
 * h == 8 selects the 8x8 helper and h == 4 the 4x8 helper; for any other
 * h no helper is called and block is left unmodified.
 * NOTE(review): the helpers are defined outside this chunk -- confirm
 * their block geometry against their definitions.
 *
 * @param block     destination
 * @param pixels    source
 * @param line_size row stride in bytes, used for both buffers
 * @param h         number of rows (8 or 4)
 */
void ff_put_no_rnd_pixels8_x2_msa(uint8_t *block,
                                  const uint8_t *pixels,
                                  ptrdiff_t line_size,
                                  int h)
{
    switch (h) {
    case 8:
        common_hz_bil_no_rnd_8x8_msa(pixels, line_size, block, line_size);
        break;
    case 4:
        common_hz_bil_no_rnd_4x8_msa(pixels, line_size, block, line_size);
        break;
    default:
        /* Unsupported height: nothing is written. */
        break;
    }
}
1425cabdff1aSopenharmony_ci
/**
 * Dispatch on h to a fixed-size common_vt_bil_no_rnd_*_msa() helper.
 *
 * h == 8 selects the 8x8 helper and h == 4 the 4x8 helper; for any other
 * h no helper is called and block is left unmodified.
 * NOTE(review): the helpers are defined outside this chunk -- confirm
 * their block geometry against their definitions.
 *
 * @param block     destination
 * @param pixels    source
 * @param line_size row stride in bytes, used for both buffers
 * @param h         number of rows (8 or 4)
 */
void ff_put_no_rnd_pixels8_y2_msa(uint8_t *block,
                                  const uint8_t *pixels,
                                  ptrdiff_t line_size,
                                  int h)
{
    switch (h) {
    case 8:
        common_vt_bil_no_rnd_8x8_msa(pixels, line_size, block, line_size);
        break;
    case 4:
        common_vt_bil_no_rnd_4x8_msa(pixels, line_size, block, line_size);
        break;
    default:
        /* Unsupported height: nothing is written. */
        break;
    }
}
1435cabdff1aSopenharmony_ci
/**
 * Dispatch on h to a fixed-size common_hv_bil_no_rnd_*_msa() helper.
 *
 * h == 8 selects the 8x8 helper and h == 4 the 4x8 helper; for any other
 * h no helper is called and block is left unmodified.
 * NOTE(review): the helpers are defined outside this chunk -- confirm
 * their block geometry against their definitions.
 *
 * @param block     destination
 * @param pixels    source
 * @param line_size row stride in bytes, used for both buffers
 * @param h         number of rows (8 or 4)
 */
void ff_put_no_rnd_pixels8_xy2_msa(uint8_t *block,
                                   const uint8_t *pixels,
                                   ptrdiff_t line_size,
                                   int h)
{
    switch (h) {
    case 8:
        common_hv_bil_no_rnd_8x8_msa(pixels, line_size, block, line_size);
        break;
    case 4:
        common_hv_bil_no_rnd_4x8_msa(pixels, line_size, block, line_size);
        break;
    default:
        /* Unsupported height: nothing is written. */
        break;
    }
}
1445cabdff1aSopenharmony_ci
/**
 * Average h rows of pixels into block via avg_width16_msa().
 *
 * @param block     destination (read and rewritten)
 * @param pixels    source
 * @param line_size row stride in bytes, used for both buffers
 * @param h         number of rows
 */
void ff_avg_pixels16_msa(uint8_t *block,
                         const uint8_t *pixels,
                         ptrdiff_t line_size,
                         int h)
{
    /* Source and destination share one stride. */
    const ptrdiff_t stride = line_size;

    avg_width16_msa(pixels, stride, block, stride, h);
}
1451cabdff1aSopenharmony_ci
/**
 * Forward to common_hz_bil_and_aver_dst_16w_msa() for h rows.
 * NOTE(review): the helper is defined outside this chunk; by its name it
 * is the 16-wide horizontal bilinear filter averaged with the existing
 * destination -- confirm.
 *
 * @param block     destination
 * @param pixels    source
 * @param line_size row stride in bytes, used for both buffers
 * @param h         number of rows
 */
void ff_avg_pixels16_x2_msa(uint8_t *block,
                            const uint8_t *pixels,
                            ptrdiff_t line_size,
                            int h)
{
    /* Source and destination share one stride. */
    const ptrdiff_t stride = line_size;

    common_hz_bil_and_aver_dst_16w_msa(pixels, stride, block, stride, h);
}
1457cabdff1aSopenharmony_ci
/**
 * Forward to common_vt_bil_and_aver_dst_16w_msa() for h rows.
 * NOTE(review): the helper is defined outside this chunk; by its name it
 * is the 16-wide vertical bilinear filter averaged with the existing
 * destination -- confirm.
 *
 * @param block     destination
 * @param pixels    source
 * @param line_size row stride in bytes, used for both buffers
 * @param h         number of rows
 */
void ff_avg_pixels16_y2_msa(uint8_t *block,
                            const uint8_t *pixels,
                            ptrdiff_t line_size,
                            int h)
{
    /* Source and destination share one stride. */
    const ptrdiff_t stride = line_size;

    common_vt_bil_and_aver_dst_16w_msa(pixels, stride, block, stride, h);
}
1463cabdff1aSopenharmony_ci
/**
 * Run common_hv_bil_and_aver_dst_16w_msa() over h rows: the helper
 * interpolates pixels horizontally and vertically and averages the
 * result into block. It works in groups of eight rows (h >> 3).
 *
 * @param block     destination (read and rewritten)
 * @param pixels    source
 * @param line_size row stride in bytes, used for both buffers
 * @param h         number of rows
 */
void ff_avg_pixels16_xy2_msa(uint8_t *block,
                             const uint8_t *pixels,
                             ptrdiff_t line_size,
                             int h)
{
    /* Source and destination share one stride. */
    const ptrdiff_t stride = line_size;

    common_hv_bil_and_aver_dst_16w_msa(pixels, stride, block, stride, h);
}
1469cabdff1aSopenharmony_ci
/**
 * Average h rows of 8 bytes from pixels into block via avg_width8_msa().
 *
 * @param block     destination (read and rewritten)
 * @param pixels    source
 * @param line_size row stride in bytes, used for both buffers
 * @param h         number of rows
 */
void ff_avg_pixels8_msa(uint8_t *block,
                        const uint8_t *pixels,
                        ptrdiff_t line_size,
                        int h)
{
    /* Source and destination share one stride. */
    const ptrdiff_t stride = line_size;

    avg_width8_msa(pixels, stride, block, stride, h);
}
1475cabdff1aSopenharmony_ci
/**
 * Forward to common_hz_bil_and_aver_dst_8w_msa() for h rows.
 * NOTE(review): the helper is defined outside this chunk; by its name it
 * is the 8-wide horizontal bilinear filter averaged with the existing
 * destination -- confirm.
 *
 * @param block     destination
 * @param pixels    source
 * @param line_size row stride in bytes, used for both buffers
 * @param h         number of rows
 */
void ff_avg_pixels8_x2_msa(uint8_t *block,
                           const uint8_t *pixels,
                           ptrdiff_t line_size,
                           int h)
{
    /* Source and destination share one stride. */
    const ptrdiff_t stride = line_size;

    common_hz_bil_and_aver_dst_8w_msa(pixels, stride, block, stride, h);
}
1481cabdff1aSopenharmony_ci
/**
 * Forward to common_vt_bil_and_aver_dst_8w_msa() for h rows.
 * NOTE(review): the helper is defined outside this chunk; by its name it
 * is the 8-wide vertical bilinear filter averaged with the existing
 * destination -- confirm.
 *
 * @param block     destination
 * @param pixels    source
 * @param line_size row stride in bytes, used for both buffers
 * @param h         number of rows
 */
void ff_avg_pixels8_y2_msa(uint8_t *block,
                           const uint8_t *pixels,
                           ptrdiff_t line_size,
                           int h)
{
    /* Source and destination share one stride. */
    const ptrdiff_t stride = line_size;

    common_vt_bil_and_aver_dst_8w_msa(pixels, stride, block, stride, h);
}
1487cabdff1aSopenharmony_ci
/**
 * Run common_hv_bil_and_aver_dst_8w_msa() over h rows: the helper
 * interpolates pixels horizontally and vertically and averages the
 * result into block. It works in groups of four rows (h >> 2).
 *
 * @param block     destination (read and rewritten)
 * @param pixels    source
 * @param line_size row stride in bytes, used for both buffers
 * @param h         number of rows
 */
void ff_avg_pixels8_xy2_msa(uint8_t *block,
                            const uint8_t *pixels,
                            ptrdiff_t line_size,
                            int h)
{
    /* Source and destination share one stride. */
    const ptrdiff_t stride = line_size;

    common_hv_bil_and_aver_dst_8w_msa(pixels, stride, block, stride, h);
}
1493cabdff1aSopenharmony_ci
/**
 * Average h rows of 4 bytes from pixels into block via avg_width4_msa().
 * That helper does nothing when h is odd.
 *
 * @param block     destination (read and rewritten)
 * @param pixels    source
 * @param line_size row stride in bytes, used for both buffers
 * @param h         number of rows
 */
void ff_avg_pixels4_msa(uint8_t *block,
                        const uint8_t *pixels,
                        ptrdiff_t line_size,
                        int h)
{
    /* Source and destination share one stride. */
    const ptrdiff_t stride = line_size;

    avg_width4_msa(pixels, stride, block, stride, h);
}
1499cabdff1aSopenharmony_ci
/**
 * Forward to common_hz_bil_and_aver_dst_4w_msa() for h rows.
 * NOTE(review): the helper is defined outside this chunk; by its name it
 * is the 4-wide horizontal bilinear filter averaged with the existing
 * destination -- confirm.
 *
 * @param block     destination
 * @param pixels    source
 * @param line_size row stride in bytes, used for both buffers
 * @param h         number of rows
 */
void ff_avg_pixels4_x2_msa(uint8_t *block,
                           const uint8_t *pixels,
                           ptrdiff_t line_size,
                           int h)
{
    /* Source and destination share one stride. */
    const ptrdiff_t stride = line_size;

    common_hz_bil_and_aver_dst_4w_msa(pixels, stride, block, stride, h);
}
1505cabdff1aSopenharmony_ci
/**
 * Forward to common_vt_bil_and_aver_dst_4w_msa() for h rows.
 * NOTE(review): the helper is defined outside this chunk; by its name it
 * is the 4-wide vertical bilinear filter averaged with the existing
 * destination -- confirm.
 *
 * @param block     destination
 * @param pixels    source
 * @param line_size row stride in bytes, used for both buffers
 * @param h         number of rows
 */
void ff_avg_pixels4_y2_msa(uint8_t *block,
                           const uint8_t *pixels,
                           ptrdiff_t line_size,
                           int h)
{
    /* Source and destination share one stride. */
    const ptrdiff_t stride = line_size;

    common_vt_bil_and_aver_dst_4w_msa(pixels, stride, block, stride, h);
}
1511cabdff1aSopenharmony_ci
/**
 * Forward to common_hv_bil_and_aver_dst_4w_msa() for h rows.
 * NOTE(review): only the tail of that helper is visible in this chunk;
 * by its name it is the 4-wide horizontal+vertical bilinear filter
 * averaged with the existing destination -- confirm.
 *
 * @param block     destination
 * @param pixels    source
 * @param line_size row stride in bytes, used for both buffers
 * @param h         number of rows
 */
void ff_avg_pixels4_xy2_msa(uint8_t *block,
                            const uint8_t *pixels,
                            ptrdiff_t line_size,
                            int h)
{
    /* Source and destination share one stride. */
    const ptrdiff_t stride = line_size;

    common_hv_bil_and_aver_dst_4w_msa(pixels, stride, block, stride, h);
}
1517