1cabdff1aSopenharmony_ci/*
2cabdff1aSopenharmony_ci * Copyright (c) 2015 - 2017 Shivraj Patil (Shivraj.Patil@imgtec.com)
3cabdff1aSopenharmony_ci *
4cabdff1aSopenharmony_ci * This file is part of FFmpeg.
5cabdff1aSopenharmony_ci *
6cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or
7cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public
8cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either
9cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version.
10cabdff1aSopenharmony_ci *
11cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful,
12cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of
13cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14cabdff1aSopenharmony_ci * Lesser General Public License for more details.
15cabdff1aSopenharmony_ci *
16cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public
17cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software
18cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19cabdff1aSopenharmony_ci */
20cabdff1aSopenharmony_ci
21cabdff1aSopenharmony_ci#include "libavutil/mips/generic_macros_msa.h"
22cabdff1aSopenharmony_ci#include "h264chroma_mips.h"
23cabdff1aSopenharmony_ci
/*
 * Byte-shuffle control vectors for the chroma kernels, 16 bytes per mask.
 * Indices 0..15 and 16..31 address the two source vectors handed to the
 * shuffle; each consecutive index pair (n, n + 1) selects two horizontally
 * adjacent pixels that are then fed to a 2-tap dot product.
 */
static const uint8_t chroma_mask_arr[16 * 5] = {
    /* offset 0: four pixel pairs from one vector, four from the other */
     0,  1,  1,  2,  2,  3,  3,  4,
    16, 17, 17, 18, 18, 19, 19, 20,
    /* offset 16: not referenced in the visible part of this file */
     0,  2,  2,  4,  4,  6,  6,  8,
    16, 18, 18, 20, 20, 22, 22, 24,
    /* offset 32: eight pixel pairs from a single vector (8-wide rows) */
     0,  1,  1,  2,  2,  3,  3,  4,
     4,  5,  5,  6,  6,  7,  7,  8,
    /* offset 48: not referenced in the visible part of this file */
     0,  1,  1,  2, 16, 17, 17, 18,
     4,  5,  5,  6,  6,  7,  7,  8,
    /* offset 64: two pixel pairs per vector in the low 8 bytes (2-wide rows) */
     0,  1,  1,  2, 16, 17, 17, 18,
    16, 17, 17, 18, 18, 19, 19, 20
};
31cabdff1aSopenharmony_ci
32cabdff1aSopenharmony_cistatic void avc_chroma_hz_2x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
33cabdff1aSopenharmony_ci                                  uint32_t coeff0, uint32_t coeff1)
34cabdff1aSopenharmony_ci{
35cabdff1aSopenharmony_ci    uint16_t out0, out1;
36cabdff1aSopenharmony_ci    v16i8 src0, src1;
37cabdff1aSopenharmony_ci    v8u16 res_r;
38cabdff1aSopenharmony_ci    v8i16 res;
39cabdff1aSopenharmony_ci    v16i8 mask;
40cabdff1aSopenharmony_ci    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
41cabdff1aSopenharmony_ci    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
42cabdff1aSopenharmony_ci    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
43cabdff1aSopenharmony_ci
44cabdff1aSopenharmony_ci    mask = LD_SB(&chroma_mask_arr[0]);
45cabdff1aSopenharmony_ci
46cabdff1aSopenharmony_ci    LD_SB2(src, stride, src0, src1);
47cabdff1aSopenharmony_ci
48cabdff1aSopenharmony_ci    src0 = __msa_vshf_b(mask, src1, src0);
49cabdff1aSopenharmony_ci    res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
50cabdff1aSopenharmony_ci    res_r <<= 3;
51cabdff1aSopenharmony_ci    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
52cabdff1aSopenharmony_ci    res_r = __msa_sat_u_h(res_r, 7);
53cabdff1aSopenharmony_ci    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
54cabdff1aSopenharmony_ci
55cabdff1aSopenharmony_ci    out0 = __msa_copy_u_h(res, 0);
56cabdff1aSopenharmony_ci    out1 = __msa_copy_u_h(res, 2);
57cabdff1aSopenharmony_ci
58cabdff1aSopenharmony_ci    SH(out0, dst);
59cabdff1aSopenharmony_ci    dst += stride;
60cabdff1aSopenharmony_ci    SH(out1, dst);
61cabdff1aSopenharmony_ci}
62cabdff1aSopenharmony_ci
63cabdff1aSopenharmony_cistatic void avc_chroma_hz_2x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
64cabdff1aSopenharmony_ci                                  uint32_t coeff0, uint32_t coeff1)
65cabdff1aSopenharmony_ci{
66cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3;
67cabdff1aSopenharmony_ci    v8u16 res_r;
68cabdff1aSopenharmony_ci    v8i16 res;
69cabdff1aSopenharmony_ci    v16i8 mask;
70cabdff1aSopenharmony_ci    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
71cabdff1aSopenharmony_ci    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
72cabdff1aSopenharmony_ci    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
73cabdff1aSopenharmony_ci
74cabdff1aSopenharmony_ci    mask = LD_SB(&chroma_mask_arr[64]);
75cabdff1aSopenharmony_ci
76cabdff1aSopenharmony_ci    LD_UB4(src, stride, src0, src1, src2, src3);
77cabdff1aSopenharmony_ci
78cabdff1aSopenharmony_ci    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
79cabdff1aSopenharmony_ci
80cabdff1aSopenharmony_ci    src0 = (v16u8) __msa_ilvr_d((v2i64) src2, (v2i64) src0);
81cabdff1aSopenharmony_ci
82cabdff1aSopenharmony_ci    res_r = __msa_dotp_u_h(src0, coeff_vec);
83cabdff1aSopenharmony_ci    res_r <<= 3;
84cabdff1aSopenharmony_ci    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
85cabdff1aSopenharmony_ci    res_r = __msa_sat_u_h(res_r, 7);
86cabdff1aSopenharmony_ci    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
87cabdff1aSopenharmony_ci
88cabdff1aSopenharmony_ci    ST_H4(res, 0, 1, 2, 3, dst, stride);
89cabdff1aSopenharmony_ci}
90cabdff1aSopenharmony_ci
/*
 * Dispatch the 2-pixel-wide horizontal chroma filter on block height.
 * Only heights 2 and 4 are handled; any other height writes nothing.
 */
static void avc_chroma_hz_2w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                 uint32_t coeff0, uint32_t coeff1,
                                 int32_t height)
{
    switch (height) {
    case 2:
        avc_chroma_hz_2x2_msa(src, dst, stride, coeff0, coeff1);
        break;
    case 4:
        avc_chroma_hz_2x4_msa(src, dst, stride, coeff0, coeff1);
        break;
    default:
        /* unsupported height: leave dst untouched */
        break;
    }
}
101cabdff1aSopenharmony_ci
102cabdff1aSopenharmony_cistatic void avc_chroma_hz_4x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
103cabdff1aSopenharmony_ci                                  uint32_t coeff0, uint32_t coeff1)
104cabdff1aSopenharmony_ci{
105cabdff1aSopenharmony_ci    v16i8 src0, src1;
106cabdff1aSopenharmony_ci    v8u16 res_r;
107cabdff1aSopenharmony_ci    v4i32 res;
108cabdff1aSopenharmony_ci    v16i8 mask;
109cabdff1aSopenharmony_ci    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
110cabdff1aSopenharmony_ci    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
111cabdff1aSopenharmony_ci    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
112cabdff1aSopenharmony_ci
113cabdff1aSopenharmony_ci    mask = LD_SB(&chroma_mask_arr[0]);
114cabdff1aSopenharmony_ci
115cabdff1aSopenharmony_ci    LD_SB2(src, stride, src0, src1);
116cabdff1aSopenharmony_ci
117cabdff1aSopenharmony_ci    src0 = __msa_vshf_b(mask, src1, src0);
118cabdff1aSopenharmony_ci    res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
119cabdff1aSopenharmony_ci    res_r <<= 3;
120cabdff1aSopenharmony_ci    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
121cabdff1aSopenharmony_ci    res_r = __msa_sat_u_h(res_r, 7);
122cabdff1aSopenharmony_ci    res = (v4i32) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
123cabdff1aSopenharmony_ci
124cabdff1aSopenharmony_ci    ST_W2(res, 0, 1, dst, stride);
125cabdff1aSopenharmony_ci}
126cabdff1aSopenharmony_ci
127cabdff1aSopenharmony_cistatic void avc_chroma_hz_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
128cabdff1aSopenharmony_ci                                  uint32_t coeff0, uint32_t coeff1)
129cabdff1aSopenharmony_ci{
130cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, out;
131cabdff1aSopenharmony_ci    v8u16 res0_r, res1_r;
132cabdff1aSopenharmony_ci    v16i8 mask;
133cabdff1aSopenharmony_ci    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
134cabdff1aSopenharmony_ci    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
135cabdff1aSopenharmony_ci    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
136cabdff1aSopenharmony_ci
137cabdff1aSopenharmony_ci    mask = LD_SB(&chroma_mask_arr[0]);
138cabdff1aSopenharmony_ci
139cabdff1aSopenharmony_ci    LD_UB4(src, stride, src0, src1, src2, src3);
140cabdff1aSopenharmony_ci    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
141cabdff1aSopenharmony_ci    DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0_r, res1_r);
142cabdff1aSopenharmony_ci    res0_r <<= 3;
143cabdff1aSopenharmony_ci    res1_r <<= 3;
144cabdff1aSopenharmony_ci    SRARI_H2_UH(res0_r, res1_r, 6);
145cabdff1aSopenharmony_ci    SAT_UH2_UH(res0_r, res1_r, 7);
146cabdff1aSopenharmony_ci    out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
147cabdff1aSopenharmony_ci    ST_W4(out, 0, 1, 2, 3, dst, stride);
148cabdff1aSopenharmony_ci}
149cabdff1aSopenharmony_ci
150cabdff1aSopenharmony_cistatic void avc_chroma_hz_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
151cabdff1aSopenharmony_ci                                  uint32_t coeff0, uint32_t coeff1)
152cabdff1aSopenharmony_ci{
153cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, out0, out1;
154cabdff1aSopenharmony_ci    v16i8 mask;
155cabdff1aSopenharmony_ci    v8u16 res0, res1, res2, res3;
156cabdff1aSopenharmony_ci    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
157cabdff1aSopenharmony_ci    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
158cabdff1aSopenharmony_ci    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
159cabdff1aSopenharmony_ci
160cabdff1aSopenharmony_ci    mask = LD_SB(&chroma_mask_arr[0]);
161cabdff1aSopenharmony_ci
162cabdff1aSopenharmony_ci    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
163cabdff1aSopenharmony_ci    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
164cabdff1aSopenharmony_ci    VSHF_B2_UB(src4, src5, src6, src7, mask, mask, src4, src6);
165cabdff1aSopenharmony_ci    DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0, res1);
166cabdff1aSopenharmony_ci    DOTP_UB2_UH(src4, src6, coeff_vec, coeff_vec, res2, res3);
167cabdff1aSopenharmony_ci    SLLI_4V(res0, res1, res2, res3, 3);
168cabdff1aSopenharmony_ci    SRARI_H4_UH(res0, res1, res2, res3, 6);
169cabdff1aSopenharmony_ci    SAT_UH4_UH(res0, res1, res2, res3, 7);
170cabdff1aSopenharmony_ci    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
171cabdff1aSopenharmony_ci    ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
172cabdff1aSopenharmony_ci}
173cabdff1aSopenharmony_ci
/*
 * Dispatch the 4-pixel-wide horizontal chroma filter on block height.
 * Heights 2, 4 and 8 are handled; any other height writes nothing.
 */
static void avc_chroma_hz_4w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                 uint32_t coeff0, uint32_t coeff1,
                                 int32_t height)
{
    switch (height) {
    case 2:
        avc_chroma_hz_4x2_msa(src, dst, stride, coeff0, coeff1);
        break;
    case 4:
        avc_chroma_hz_4x4_msa(src, dst, stride, coeff0, coeff1);
        break;
    case 8:
        avc_chroma_hz_4x8_msa(src, dst, stride, coeff0, coeff1);
        break;
    default:
        /* unsupported height: leave dst untouched */
        break;
    }
}
186cabdff1aSopenharmony_ci
187cabdff1aSopenharmony_cistatic void avc_chroma_hz_8x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
188cabdff1aSopenharmony_ci                                  uint32_t coeff0, uint32_t coeff1)
189cabdff1aSopenharmony_ci{
190cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, out0, out1;
191cabdff1aSopenharmony_ci    v8u16 res0, res1, res2, res3;
192cabdff1aSopenharmony_ci    v16i8 mask;
193cabdff1aSopenharmony_ci    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
194cabdff1aSopenharmony_ci    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
195cabdff1aSopenharmony_ci    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
196cabdff1aSopenharmony_ci
197cabdff1aSopenharmony_ci    mask = LD_SB(&chroma_mask_arr[32]);
198cabdff1aSopenharmony_ci    LD_UB4(src, stride, src0, src1, src2, src3);
199cabdff1aSopenharmony_ci    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
200cabdff1aSopenharmony_ci    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
201cabdff1aSopenharmony_ci    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
202cabdff1aSopenharmony_ci                coeff_vec, res0, res1, res2, res3);
203cabdff1aSopenharmony_ci    SLLI_4V(res0, res1, res2, res3, 3);
204cabdff1aSopenharmony_ci    SRARI_H4_UH(res0, res1, res2, res3, 6);
205cabdff1aSopenharmony_ci    SAT_UH4_UH(res0, res1, res2, res3, 7);
206cabdff1aSopenharmony_ci    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
207cabdff1aSopenharmony_ci    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
208cabdff1aSopenharmony_ci}
209cabdff1aSopenharmony_ci
210cabdff1aSopenharmony_cistatic void avc_chroma_hz_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
211cabdff1aSopenharmony_ci                                  uint32_t coeff0, uint32_t coeff1)
212cabdff1aSopenharmony_ci{
213cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
214cabdff1aSopenharmony_ci    v16u8 out0, out1, out2, out3;
215cabdff1aSopenharmony_ci    v8u16 res0, res1, res2, res3, res4, res5, res6, res7;
216cabdff1aSopenharmony_ci    v16i8 mask;
217cabdff1aSopenharmony_ci    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
218cabdff1aSopenharmony_ci    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
219cabdff1aSopenharmony_ci    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
220cabdff1aSopenharmony_ci
221cabdff1aSopenharmony_ci    mask = LD_SB(&chroma_mask_arr[32]);
222cabdff1aSopenharmony_ci
223cabdff1aSopenharmony_ci    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
224cabdff1aSopenharmony_ci    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
225cabdff1aSopenharmony_ci    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
226cabdff1aSopenharmony_ci    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, src4, src5);
227cabdff1aSopenharmony_ci    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, src6, src7);
228cabdff1aSopenharmony_ci    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
229cabdff1aSopenharmony_ci                coeff_vec, res0, res1, res2, res3);
230cabdff1aSopenharmony_ci    DOTP_UB4_UH(src4, src5, src6, src7, coeff_vec, coeff_vec, coeff_vec,
231cabdff1aSopenharmony_ci                coeff_vec, res4, res5, res6, res7);
232cabdff1aSopenharmony_ci    SLLI_4V(res0, res1, res2, res3, 3);
233cabdff1aSopenharmony_ci    SLLI_4V(res4, res5, res6, res7, 3);
234cabdff1aSopenharmony_ci    SRARI_H4_UH(res0, res1, res2, res3, 6);
235cabdff1aSopenharmony_ci    SRARI_H4_UH(res4, res5, res6, res7, 6);
236cabdff1aSopenharmony_ci    SAT_UH4_UH(res0, res1, res2, res3, 7);
237cabdff1aSopenharmony_ci    SAT_UH4_UH(res4, res5, res6, res7, 7);
238cabdff1aSopenharmony_ci    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
239cabdff1aSopenharmony_ci    PCKEV_B2_UB(res5, res4, res7, res6, out2, out3);
240cabdff1aSopenharmony_ci    ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
241cabdff1aSopenharmony_ci}
242cabdff1aSopenharmony_ci
243cabdff1aSopenharmony_cistatic void avc_chroma_hz_nonmult_msa(uint8_t *src, uint8_t *dst,
244cabdff1aSopenharmony_ci                                      int32_t stride, uint32_t coeff0,
245cabdff1aSopenharmony_ci                                      uint32_t coeff1, int32_t height)
246cabdff1aSopenharmony_ci{
247cabdff1aSopenharmony_ci    uint32_t row;
248cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, out0, out1;
249cabdff1aSopenharmony_ci    v8u16 res0, res1, res2, res3;
250cabdff1aSopenharmony_ci    v16i8 mask;
251cabdff1aSopenharmony_ci    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
252cabdff1aSopenharmony_ci    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
253cabdff1aSopenharmony_ci    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
254cabdff1aSopenharmony_ci
255cabdff1aSopenharmony_ci    mask = LD_SB(&chroma_mask_arr[32]);
256cabdff1aSopenharmony_ci
257cabdff1aSopenharmony_ci    for (row = height >> 2; row--;) {
258cabdff1aSopenharmony_ci        LD_UB4(src, stride, src0, src1, src2, src3);
259cabdff1aSopenharmony_ci        src += (4 * stride);
260cabdff1aSopenharmony_ci
261cabdff1aSopenharmony_ci        VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
262cabdff1aSopenharmony_ci        VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
263cabdff1aSopenharmony_ci        DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
264cabdff1aSopenharmony_ci                    coeff_vec, res0, res1, res2, res3);
265cabdff1aSopenharmony_ci        SLLI_4V(res0, res1, res2, res3, 3);
266cabdff1aSopenharmony_ci        SRARI_H4_UH(res0, res1, res2, res3, 6);
267cabdff1aSopenharmony_ci        SAT_UH4_UH(res0, res1, res2, res3, 7);
268cabdff1aSopenharmony_ci        PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
269cabdff1aSopenharmony_ci        ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
270cabdff1aSopenharmony_ci        dst += (4 * stride);
271cabdff1aSopenharmony_ci    }
272cabdff1aSopenharmony_ci
273cabdff1aSopenharmony_ci    if (0 != (height % 4)) {
274cabdff1aSopenharmony_ci        for (row = (height % 4); row--;) {
275cabdff1aSopenharmony_ci            src0 = LD_UB(src);
276cabdff1aSopenharmony_ci            src += stride;
277cabdff1aSopenharmony_ci
278cabdff1aSopenharmony_ci            src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
279cabdff1aSopenharmony_ci
280cabdff1aSopenharmony_ci            res0 = __msa_dotp_u_h(src0, coeff_vec);
281cabdff1aSopenharmony_ci            res0 <<= 3;
282cabdff1aSopenharmony_ci            res0 = (v8u16) __msa_srari_h((v8i16) res0, 6);
283cabdff1aSopenharmony_ci            res0 = __msa_sat_u_h(res0, 7);
284cabdff1aSopenharmony_ci            res0 = (v8u16) __msa_pckev_b((v16i8) res0, (v16i8) res0);
285cabdff1aSopenharmony_ci
286cabdff1aSopenharmony_ci            ST_D1(res0, 0, dst);
287cabdff1aSopenharmony_ci            dst += stride;
288cabdff1aSopenharmony_ci        }
289cabdff1aSopenharmony_ci    }
290cabdff1aSopenharmony_ci}
291cabdff1aSopenharmony_ci
/*
 * Dispatch the 8-pixel-wide horizontal chroma filter on block height.
 * Heights 4 and 8 use the unrolled kernels; every other height goes
 * through the generic row-loop implementation.
 */
static void avc_chroma_hz_8w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                 uint32_t coeff0, uint32_t coeff1,
                                 int32_t height)
{
    switch (height) {
    case 4:
        avc_chroma_hz_8x4_msa(src, dst, stride, coeff0, coeff1);
        break;
    case 8:
        avc_chroma_hz_8x8_msa(src, dst, stride, coeff0, coeff1);
        break;
    default:
        avc_chroma_hz_nonmult_msa(src, dst, stride, coeff0, coeff1, height);
        break;
    }
}
304cabdff1aSopenharmony_ci
305cabdff1aSopenharmony_cistatic void avc_chroma_vt_2x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
306cabdff1aSopenharmony_ci                                  uint32_t coeff0, uint32_t coeff1)
307cabdff1aSopenharmony_ci{
308cabdff1aSopenharmony_ci    uint16_t out0, out1;
309cabdff1aSopenharmony_ci    v16i8 src0, src1, src2;
310cabdff1aSopenharmony_ci    v16u8 tmp0, tmp1;
311cabdff1aSopenharmony_ci    v8i16 res;
312cabdff1aSopenharmony_ci    v8u16 res_r;
313cabdff1aSopenharmony_ci    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
314cabdff1aSopenharmony_ci    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
315cabdff1aSopenharmony_ci    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
316cabdff1aSopenharmony_ci
317cabdff1aSopenharmony_ci    LD_SB3(src, stride, src0, src1, src2);
318cabdff1aSopenharmony_ci
319cabdff1aSopenharmony_ci    ILVR_B2_UB(src1, src0, src2, src1, tmp0, tmp1);
320cabdff1aSopenharmony_ci
321cabdff1aSopenharmony_ci    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
322cabdff1aSopenharmony_ci
323cabdff1aSopenharmony_ci    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
324cabdff1aSopenharmony_ci    res_r <<= 3;
325cabdff1aSopenharmony_ci    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
326cabdff1aSopenharmony_ci    res_r = __msa_sat_u_h(res_r, 7);
327cabdff1aSopenharmony_ci    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
328cabdff1aSopenharmony_ci
329cabdff1aSopenharmony_ci    out0 = __msa_copy_u_h(res, 0);
330cabdff1aSopenharmony_ci    out1 = __msa_copy_u_h(res, 2);
331cabdff1aSopenharmony_ci
332cabdff1aSopenharmony_ci    SH(out0, dst);
333cabdff1aSopenharmony_ci    dst += stride;
334cabdff1aSopenharmony_ci    SH(out1, dst);
335cabdff1aSopenharmony_ci}
336cabdff1aSopenharmony_ci
337cabdff1aSopenharmony_cistatic void avc_chroma_vt_2x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
338cabdff1aSopenharmony_ci                                  uint32_t coeff0, uint32_t coeff1)
339cabdff1aSopenharmony_ci{
340cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4;
341cabdff1aSopenharmony_ci    v16u8 tmp0, tmp1, tmp2, tmp3;
342cabdff1aSopenharmony_ci    v8i16 res;
343cabdff1aSopenharmony_ci    v8u16 res_r;
344cabdff1aSopenharmony_ci    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
345cabdff1aSopenharmony_ci    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
346cabdff1aSopenharmony_ci    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
347cabdff1aSopenharmony_ci
348cabdff1aSopenharmony_ci    LD_UB5(src, stride, src0, src1, src2, src3, src4);
349cabdff1aSopenharmony_ci    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
350cabdff1aSopenharmony_ci               tmp0, tmp1, tmp2, tmp3);
351cabdff1aSopenharmony_ci    ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
352cabdff1aSopenharmony_ci
353cabdff1aSopenharmony_ci    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);
354cabdff1aSopenharmony_ci
355cabdff1aSopenharmony_ci    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
356cabdff1aSopenharmony_ci    res_r <<= 3;
357cabdff1aSopenharmony_ci    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
358cabdff1aSopenharmony_ci    res_r = __msa_sat_u_h(res_r, 7);
359cabdff1aSopenharmony_ci
360cabdff1aSopenharmony_ci    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
361cabdff1aSopenharmony_ci
362cabdff1aSopenharmony_ci    ST_H4(res, 0, 1, 2, 3, dst, stride);
363cabdff1aSopenharmony_ci}
364cabdff1aSopenharmony_ci
/*
 * Dispatch the 2-pixel-wide vertical chroma filter on block height.
 * Only heights 2 and 4 are handled; any other height writes nothing.
 */
static void avc_chroma_vt_2w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                 uint32_t coeff0, uint32_t coeff1,
                                 int32_t height)
{
    switch (height) {
    case 2:
        avc_chroma_vt_2x2_msa(src, dst, stride, coeff0, coeff1);
        break;
    case 4:
        avc_chroma_vt_2x4_msa(src, dst, stride, coeff0, coeff1);
        break;
    default:
        /* unsupported height: leave dst untouched */
        break;
    }
}
375cabdff1aSopenharmony_ci
376cabdff1aSopenharmony_cistatic void avc_chroma_vt_4x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
377cabdff1aSopenharmony_ci                                  uint32_t coeff0, uint32_t coeff1)
378cabdff1aSopenharmony_ci{
379cabdff1aSopenharmony_ci    v16u8 src0, src1, src2;
380cabdff1aSopenharmony_ci    v16u8 tmp0, tmp1;
381cabdff1aSopenharmony_ci    v4i32 res;
382cabdff1aSopenharmony_ci    v8u16 res_r;
383cabdff1aSopenharmony_ci    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
384cabdff1aSopenharmony_ci    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
385cabdff1aSopenharmony_ci    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
386cabdff1aSopenharmony_ci
387cabdff1aSopenharmony_ci    LD_UB3(src, stride, src0, src1, src2);
388cabdff1aSopenharmony_ci    ILVR_B2_UB(src1, src0, src2, src1, tmp0, tmp1);
389cabdff1aSopenharmony_ci
390cabdff1aSopenharmony_ci    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
391cabdff1aSopenharmony_ci    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
392cabdff1aSopenharmony_ci    res_r <<= 3;
393cabdff1aSopenharmony_ci    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
394cabdff1aSopenharmony_ci    res_r = __msa_sat_u_h(res_r, 7);
395cabdff1aSopenharmony_ci    res = (v4i32) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
396cabdff1aSopenharmony_ci
397cabdff1aSopenharmony_ci    ST_W2(res, 0, 1, dst, stride);
398cabdff1aSopenharmony_ci}
399cabdff1aSopenharmony_ci
400cabdff1aSopenharmony_cistatic void avc_chroma_vt_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
401cabdff1aSopenharmony_ci                                  uint32_t coeff0, uint32_t coeff1)
402cabdff1aSopenharmony_ci{
403cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4;
404cabdff1aSopenharmony_ci    v16u8 tmp0, tmp1, tmp2, tmp3;
405cabdff1aSopenharmony_ci    v16u8 out;
406cabdff1aSopenharmony_ci    v8u16 res0_r, res1_r;
407cabdff1aSopenharmony_ci    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
408cabdff1aSopenharmony_ci    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
409cabdff1aSopenharmony_ci    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
410cabdff1aSopenharmony_ci
411cabdff1aSopenharmony_ci    LD_UB5(src, stride, src0, src1, src2, src3, src4);
412cabdff1aSopenharmony_ci    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, tmp0, tmp1, tmp2,
413cabdff1aSopenharmony_ci               tmp3);
414cabdff1aSopenharmony_ci    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
415cabdff1aSopenharmony_ci    DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0_r, res1_r);
416cabdff1aSopenharmony_ci    res0_r <<= 3;
417cabdff1aSopenharmony_ci    res1_r <<= 3;
418cabdff1aSopenharmony_ci    SRARI_H2_UH(res0_r, res1_r, 6);
419cabdff1aSopenharmony_ci    SAT_UH2_UH(res0_r, res1_r, 7);
420cabdff1aSopenharmony_ci    out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
421cabdff1aSopenharmony_ci    ST_W4(out, 0, 1, 2, 3, dst, stride);
422cabdff1aSopenharmony_ci}
423cabdff1aSopenharmony_ci
424cabdff1aSopenharmony_cistatic void avc_chroma_vt_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
425cabdff1aSopenharmony_ci                                  uint32_t coeff0, uint32_t coeff1)
426cabdff1aSopenharmony_ci{
427cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
428cabdff1aSopenharmony_ci    v16u8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, out0, out1;
429cabdff1aSopenharmony_ci    v8u16 res0, res1, res2, res3;
430cabdff1aSopenharmony_ci    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
431cabdff1aSopenharmony_ci    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
432cabdff1aSopenharmony_ci    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
433cabdff1aSopenharmony_ci
434cabdff1aSopenharmony_ci    LD_UB5(src, stride, src0, src1, src2, src3, src4);
435cabdff1aSopenharmony_ci    src += (5 * stride);
436cabdff1aSopenharmony_ci    LD_UB4(src, stride, src5, src6, src7, src8);
437cabdff1aSopenharmony_ci    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, tmp0, tmp1, tmp2,
438cabdff1aSopenharmony_ci               tmp3);
439cabdff1aSopenharmony_ci    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, tmp4, tmp5, tmp6,
440cabdff1aSopenharmony_ci               tmp7);
441cabdff1aSopenharmony_ci    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
442cabdff1aSopenharmony_ci    ILVR_D2_UB(tmp5, tmp4, tmp7, tmp6, tmp4, tmp6);
443cabdff1aSopenharmony_ci    DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0, res1);
444cabdff1aSopenharmony_ci    DOTP_UB2_UH(tmp4, tmp6, coeff_vec, coeff_vec, res2, res3);
445cabdff1aSopenharmony_ci    SLLI_4V(res0, res1, res2, res3, 3);
446cabdff1aSopenharmony_ci    SRARI_H4_UH(res0, res1, res2, res3, 6);
447cabdff1aSopenharmony_ci    SAT_UH4_UH(res0, res1, res2, res3, 7);
448cabdff1aSopenharmony_ci    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
449cabdff1aSopenharmony_ci    ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
450cabdff1aSopenharmony_ci}
451cabdff1aSopenharmony_ci
/*
 * Dispatch the 4-pixel-wide vertical chroma filter on block height.
 * Heights 2, 4 and 8 are handled; any other height writes nothing.
 */
static void avc_chroma_vt_4w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                 uint32_t coeff0, uint32_t coeff1,
                                 int32_t height)
{
    switch (height) {
    case 2:
        avc_chroma_vt_4x2_msa(src, dst, stride, coeff0, coeff1);
        break;
    case 4:
        avc_chroma_vt_4x4_msa(src, dst, stride, coeff0, coeff1);
        break;
    case 8:
        avc_chroma_vt_4x8_msa(src, dst, stride, coeff0, coeff1);
        break;
    default:
        /* unsupported height: leave dst untouched */
        break;
    }
}
464cabdff1aSopenharmony_ci
465cabdff1aSopenharmony_cistatic void avc_chroma_vt_8x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
466cabdff1aSopenharmony_ci                                  uint32_t coeff0, uint32_t coeff1)
467cabdff1aSopenharmony_ci{
468cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, out0, out1;
469cabdff1aSopenharmony_ci    v8u16 res0, res1, res2, res3;
470cabdff1aSopenharmony_ci    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
471cabdff1aSopenharmony_ci    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
472cabdff1aSopenharmony_ci    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
473cabdff1aSopenharmony_ci
474cabdff1aSopenharmony_ci    LD_UB5(src, stride, src0, src1, src2, src3, src4);
475cabdff1aSopenharmony_ci    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src0, src1, src2,
476cabdff1aSopenharmony_ci               src3);
477cabdff1aSopenharmony_ci    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
478cabdff1aSopenharmony_ci                coeff_vec, res0, res1, res2, res3);
479cabdff1aSopenharmony_ci    SLLI_4V(res0, res1, res2, res3, 3);
480cabdff1aSopenharmony_ci    SRARI_H4_UH(res0, res1, res2, res3, 6);
481cabdff1aSopenharmony_ci    SAT_UH4_UH(res0, res1, res2, res3, 7);
482cabdff1aSopenharmony_ci    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
483cabdff1aSopenharmony_ci    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
484cabdff1aSopenharmony_ci}
485cabdff1aSopenharmony_ci
486cabdff1aSopenharmony_cistatic void avc_chroma_vt_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
487cabdff1aSopenharmony_ci                                  uint32_t coeff0, uint32_t coeff1)
488cabdff1aSopenharmony_ci{
489cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
490cabdff1aSopenharmony_ci    v16u8 out0, out1, out2, out3;
491cabdff1aSopenharmony_ci    v8u16 res0, res1, res2, res3, res4, res5, res6, res7;
492cabdff1aSopenharmony_ci    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
493cabdff1aSopenharmony_ci    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
494cabdff1aSopenharmony_ci    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
495cabdff1aSopenharmony_ci
496cabdff1aSopenharmony_ci    LD_UB5(src, stride, src0, src1, src2, src3, src4);
497cabdff1aSopenharmony_ci    src += (5 * stride);
498cabdff1aSopenharmony_ci    LD_UB4(src, stride, src5, src6, src7, src8);
499cabdff1aSopenharmony_ci    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src0, src1, src2,
500cabdff1aSopenharmony_ci               src3);
501cabdff1aSopenharmony_ci    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, src4, src5, src6,
502cabdff1aSopenharmony_ci               src7);
503cabdff1aSopenharmony_ci    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
504cabdff1aSopenharmony_ci                coeff_vec, res0, res1, res2, res3);
505cabdff1aSopenharmony_ci    DOTP_UB4_UH(src4, src5, src6, src7, coeff_vec, coeff_vec, coeff_vec,
506cabdff1aSopenharmony_ci                coeff_vec, res4, res5, res6, res7);
507cabdff1aSopenharmony_ci    SLLI_4V(res0, res1, res2, res3, 3);
508cabdff1aSopenharmony_ci    SLLI_4V(res4, res5, res6, res7, 3);
509cabdff1aSopenharmony_ci    SRARI_H4_UH(res0, res1, res2, res3, 6);
510cabdff1aSopenharmony_ci    SRARI_H4_UH(res4, res5, res6, res7, 6);
511cabdff1aSopenharmony_ci    SAT_UH4_UH(res0, res1, res2, res3, 7);
512cabdff1aSopenharmony_ci    SAT_UH4_UH(res0, res1, res2, res3, 7);
513cabdff1aSopenharmony_ci    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
514cabdff1aSopenharmony_ci    PCKEV_B2_UB(res5, res4, res7, res6, out2, out3);
515cabdff1aSopenharmony_ci    ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
516cabdff1aSopenharmony_ci}
517cabdff1aSopenharmony_ci
518cabdff1aSopenharmony_cistatic void avc_chroma_vt_8w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
519cabdff1aSopenharmony_ci                                 uint32_t coeff0, uint32_t coeff1,
520cabdff1aSopenharmony_ci                                 int32_t height)
521cabdff1aSopenharmony_ci{
522cabdff1aSopenharmony_ci    if (4 == height) {
523cabdff1aSopenharmony_ci        avc_chroma_vt_8x4_msa(src, dst, stride, coeff0, coeff1);
524cabdff1aSopenharmony_ci    } else if (8 == height) {
525cabdff1aSopenharmony_ci        avc_chroma_vt_8x8_msa(src, dst, stride, coeff0, coeff1);
526cabdff1aSopenharmony_ci    }
527cabdff1aSopenharmony_ci}
528cabdff1aSopenharmony_ci
529cabdff1aSopenharmony_cistatic void avc_chroma_hv_2x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
530cabdff1aSopenharmony_ci                                  uint32_t coef_hor0, uint32_t coef_hor1,
531cabdff1aSopenharmony_ci                                  uint32_t coef_ver0, uint32_t coef_ver1)
532cabdff1aSopenharmony_ci{
533cabdff1aSopenharmony_ci    uint16_t out0, out1;
534cabdff1aSopenharmony_ci    v16u8 src0, src1, src2;
535cabdff1aSopenharmony_ci    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
536cabdff1aSopenharmony_ci    v8i16 res_vert;
537cabdff1aSopenharmony_ci    v16i8 mask;
538cabdff1aSopenharmony_ci    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
539cabdff1aSopenharmony_ci    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
540cabdff1aSopenharmony_ci    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
541cabdff1aSopenharmony_ci    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
542cabdff1aSopenharmony_ci    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
543cabdff1aSopenharmony_ci
544cabdff1aSopenharmony_ci    mask = LD_SB(&chroma_mask_arr[48]);
545cabdff1aSopenharmony_ci
546cabdff1aSopenharmony_ci    LD_UB3(src, stride, src0, src1, src2);
547cabdff1aSopenharmony_ci    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
548cabdff1aSopenharmony_ci    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
549cabdff1aSopenharmony_ci    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
550cabdff1aSopenharmony_ci
551cabdff1aSopenharmony_ci    res_vt0 += res_vt1;
552cabdff1aSopenharmony_ci    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
553cabdff1aSopenharmony_ci    res_vt0 = __msa_sat_u_h(res_vt0, 7);
554cabdff1aSopenharmony_ci    res_vert = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
555cabdff1aSopenharmony_ci
556cabdff1aSopenharmony_ci    out0 = __msa_copy_u_h(res_vert, 0);
557cabdff1aSopenharmony_ci    out1 = __msa_copy_u_h(res_vert, 1);
558cabdff1aSopenharmony_ci
559cabdff1aSopenharmony_ci    SH(out0, dst);
560cabdff1aSopenharmony_ci    dst += stride;
561cabdff1aSopenharmony_ci    SH(out1, dst);
562cabdff1aSopenharmony_ci}
563cabdff1aSopenharmony_ci
564cabdff1aSopenharmony_cistatic void avc_chroma_hv_2x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
565cabdff1aSopenharmony_ci                                  uint32_t coef_hor0, uint32_t coef_hor1,
566cabdff1aSopenharmony_ci                                  uint32_t coef_ver0, uint32_t coef_ver1)
567cabdff1aSopenharmony_ci{
568cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4;
569cabdff1aSopenharmony_ci    v16u8 tmp0, tmp1, tmp2, tmp3;
570cabdff1aSopenharmony_ci    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
571cabdff1aSopenharmony_ci    v8i16 res;
572cabdff1aSopenharmony_ci    v16i8 mask;
573cabdff1aSopenharmony_ci    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
574cabdff1aSopenharmony_ci    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
575cabdff1aSopenharmony_ci    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
576cabdff1aSopenharmony_ci    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
577cabdff1aSopenharmony_ci    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
578cabdff1aSopenharmony_ci
579cabdff1aSopenharmony_ci    mask = LD_SB(&chroma_mask_arr[48]);
580cabdff1aSopenharmony_ci
581cabdff1aSopenharmony_ci    LD_UB5(src, stride, src0, src1, src2, src3, src4);
582cabdff1aSopenharmony_ci
583cabdff1aSopenharmony_ci    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
584cabdff1aSopenharmony_ci    VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
585cabdff1aSopenharmony_ci    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
586cabdff1aSopenharmony_ci    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
587cabdff1aSopenharmony_ci    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
588cabdff1aSopenharmony_ci
589cabdff1aSopenharmony_ci    res_vt0 += res_vt1;
590cabdff1aSopenharmony_ci    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
591cabdff1aSopenharmony_ci    res_vt0 = __msa_sat_u_h(res_vt0, 7);
592cabdff1aSopenharmony_ci
593cabdff1aSopenharmony_ci    res = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
594cabdff1aSopenharmony_ci
595cabdff1aSopenharmony_ci    ST_H4(res, 0, 1, 2, 3, dst, stride);
596cabdff1aSopenharmony_ci}
597cabdff1aSopenharmony_ci
598cabdff1aSopenharmony_cistatic void avc_chroma_hv_2w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
599cabdff1aSopenharmony_ci                                 uint32_t coef_hor0, uint32_t coef_hor1,
600cabdff1aSopenharmony_ci                                 uint32_t coef_ver0, uint32_t coef_ver1,
601cabdff1aSopenharmony_ci                                 int32_t height)
602cabdff1aSopenharmony_ci{
603cabdff1aSopenharmony_ci    if (2 == height) {
604cabdff1aSopenharmony_ci        avc_chroma_hv_2x2_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
605cabdff1aSopenharmony_ci                              coef_ver1);
606cabdff1aSopenharmony_ci    } else if (4 == height) {
607cabdff1aSopenharmony_ci        avc_chroma_hv_2x4_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
608cabdff1aSopenharmony_ci                              coef_ver1);
609cabdff1aSopenharmony_ci    }
610cabdff1aSopenharmony_ci}
611cabdff1aSopenharmony_ci
612cabdff1aSopenharmony_cistatic void avc_chroma_hv_4x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
613cabdff1aSopenharmony_ci                                  uint32_t coef_hor0, uint32_t coef_hor1,
614cabdff1aSopenharmony_ci                                  uint32_t coef_ver0, uint32_t coef_ver1)
615cabdff1aSopenharmony_ci{
616cabdff1aSopenharmony_ci    v16u8 src0, src1, src2;
617cabdff1aSopenharmony_ci    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
618cabdff1aSopenharmony_ci    v16i8 mask;
619cabdff1aSopenharmony_ci    v4i32 res;
620cabdff1aSopenharmony_ci    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
621cabdff1aSopenharmony_ci    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
622cabdff1aSopenharmony_ci    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
623cabdff1aSopenharmony_ci    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
624cabdff1aSopenharmony_ci    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
625cabdff1aSopenharmony_ci
626cabdff1aSopenharmony_ci    mask = LD_SB(&chroma_mask_arr[0]);
627cabdff1aSopenharmony_ci    LD_UB3(src, stride, src0, src1, src2);
628cabdff1aSopenharmony_ci    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
629cabdff1aSopenharmony_ci    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
630cabdff1aSopenharmony_ci    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
631cabdff1aSopenharmony_ci
632cabdff1aSopenharmony_ci    res_vt0 += res_vt1;
633cabdff1aSopenharmony_ci    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
634cabdff1aSopenharmony_ci    res_vt0 = __msa_sat_u_h(res_vt0, 7);
635cabdff1aSopenharmony_ci    res = (v4i32) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
636cabdff1aSopenharmony_ci
637cabdff1aSopenharmony_ci    ST_W2(res, 0, 1, dst, stride);
638cabdff1aSopenharmony_ci}
639cabdff1aSopenharmony_ci
640cabdff1aSopenharmony_cistatic void avc_chroma_hv_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
641cabdff1aSopenharmony_ci                                  uint32_t coef_hor0, uint32_t coef_hor1,
642cabdff1aSopenharmony_ci                                  uint32_t coef_ver0, uint32_t coef_ver1)
643cabdff1aSopenharmony_ci{
644cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4;
645cabdff1aSopenharmony_ci    v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
646cabdff1aSopenharmony_ci    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
647cabdff1aSopenharmony_ci    v16i8 mask;
648cabdff1aSopenharmony_ci    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
649cabdff1aSopenharmony_ci    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
650cabdff1aSopenharmony_ci    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
651cabdff1aSopenharmony_ci    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
652cabdff1aSopenharmony_ci    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
653cabdff1aSopenharmony_ci    v4i32 res0, res1;
654cabdff1aSopenharmony_ci
655cabdff1aSopenharmony_ci    mask = LD_SB(&chroma_mask_arr[0]);
656cabdff1aSopenharmony_ci
657cabdff1aSopenharmony_ci    LD_UB5(src, stride, src0, src1, src2, src3, src4);
658cabdff1aSopenharmony_ci    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
659cabdff1aSopenharmony_ci    VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
660cabdff1aSopenharmony_ci    DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
661cabdff1aSopenharmony_ci                coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
662cabdff1aSopenharmony_ci                res_hz3);
663cabdff1aSopenharmony_ci    MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1,
664cabdff1aSopenharmony_ci         res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
665cabdff1aSopenharmony_ci    ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
666cabdff1aSopenharmony_ci    SRARI_H2_UH(res_vt0, res_vt1, 6);
667cabdff1aSopenharmony_ci    SAT_UH2_UH(res_vt0, res_vt1, 7);
668cabdff1aSopenharmony_ci    PCKEV_B2_SW(res_vt0, res_vt0, res_vt1, res_vt1, res0, res1);
669cabdff1aSopenharmony_ci    ST_W2(res0, 0, 1, dst, stride);
670cabdff1aSopenharmony_ci    ST_W2(res1, 0, 1, dst + 2 * stride, stride);
671cabdff1aSopenharmony_ci}
672cabdff1aSopenharmony_ci
673cabdff1aSopenharmony_cistatic void avc_chroma_hv_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
674cabdff1aSopenharmony_ci                                  uint32_t coef_hor0, uint32_t coef_hor1,
675cabdff1aSopenharmony_ci                                  uint32_t coef_ver0, uint32_t coef_ver1)
676cabdff1aSopenharmony_ci{
677cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, res0, res1;
678cabdff1aSopenharmony_ci    v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4, res_hz5, res_hz6, res_hz7;
679cabdff1aSopenharmony_ci    v8u16 res_vt0, res_vt1, res_vt2, res_vt3, res_vt4, res_vt5, res_vt6, res_vt7;
680cabdff1aSopenharmony_ci    v16i8 mask;
681cabdff1aSopenharmony_ci    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
682cabdff1aSopenharmony_ci    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
683cabdff1aSopenharmony_ci    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
684cabdff1aSopenharmony_ci    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
685cabdff1aSopenharmony_ci    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
686cabdff1aSopenharmony_ci
687cabdff1aSopenharmony_ci    mask = LD_SB(&chroma_mask_arr[0]);
688cabdff1aSopenharmony_ci
689cabdff1aSopenharmony_ci    LD_UB5(src, stride, src0, src1, src2, src3, src4);
690cabdff1aSopenharmony_ci    src += (5 * stride);
691cabdff1aSopenharmony_ci    LD_UB4(src, stride, src5, src6, src7, src8);
692cabdff1aSopenharmony_ci
693cabdff1aSopenharmony_ci    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
694cabdff1aSopenharmony_ci    VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
695cabdff1aSopenharmony_ci    VSHF_B2_UB(src4, src5, src5, src6, mask, mask, src4, src5);
696cabdff1aSopenharmony_ci    VSHF_B2_UB(src6, src7, src7, src8, mask, mask, src6, src7);
697cabdff1aSopenharmony_ci    DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
698cabdff1aSopenharmony_ci                coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2, res_hz3);
699cabdff1aSopenharmony_ci    DOTP_UB4_UH(src4, src5, src6, src7, coeff_hz_vec, coeff_hz_vec,
700cabdff1aSopenharmony_ci                coeff_hz_vec, coeff_hz_vec, res_hz4, res_hz5, res_hz6, res_hz7);
701cabdff1aSopenharmony_ci    MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1,
702cabdff1aSopenharmony_ci         res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
703cabdff1aSopenharmony_ci    MUL4(res_hz4, coeff_vt_vec1, res_hz5, coeff_vt_vec0, res_hz6, coeff_vt_vec1,
704cabdff1aSopenharmony_ci         res_hz7, coeff_vt_vec0, res_vt4, res_vt5, res_vt6, res_vt7);
705cabdff1aSopenharmony_ci    ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
706cabdff1aSopenharmony_ci    ADD2(res_vt4, res_vt5, res_vt6, res_vt7, res_vt2, res_vt3);
707cabdff1aSopenharmony_ci    SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
708cabdff1aSopenharmony_ci    SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
709cabdff1aSopenharmony_ci    PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, res0, res1);
710cabdff1aSopenharmony_ci    ST_W8(res0, res1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
711cabdff1aSopenharmony_ci}
712cabdff1aSopenharmony_ci
713cabdff1aSopenharmony_cistatic void avc_chroma_hv_4w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
714cabdff1aSopenharmony_ci                                 uint32_t coef_hor0, uint32_t coef_hor1,
715cabdff1aSopenharmony_ci                                 uint32_t coef_ver0, uint32_t coef_ver1,
716cabdff1aSopenharmony_ci                                 int32_t height)
717cabdff1aSopenharmony_ci{
718cabdff1aSopenharmony_ci    if (2 == height) {
719cabdff1aSopenharmony_ci        avc_chroma_hv_4x2_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
720cabdff1aSopenharmony_ci                              coef_ver1);
721cabdff1aSopenharmony_ci    } else if (4 == height) {
722cabdff1aSopenharmony_ci        avc_chroma_hv_4x4_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
723cabdff1aSopenharmony_ci                              coef_ver1);
724cabdff1aSopenharmony_ci    } else if (8 == height) {
725cabdff1aSopenharmony_ci        avc_chroma_hv_4x8_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
726cabdff1aSopenharmony_ci                              coef_ver1);
727cabdff1aSopenharmony_ci    }
728cabdff1aSopenharmony_ci}
729cabdff1aSopenharmony_ci
730cabdff1aSopenharmony_cistatic void avc_chroma_hv_8x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
731cabdff1aSopenharmony_ci                                  uint32_t coef_hor0, uint32_t coef_hor1,
732cabdff1aSopenharmony_ci                                  uint32_t coef_ver0, uint32_t coef_ver1)
733cabdff1aSopenharmony_ci{
734cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, out0, out1;
735cabdff1aSopenharmony_ci    v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4;
736cabdff1aSopenharmony_ci    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
737cabdff1aSopenharmony_ci    v16i8 mask;
738cabdff1aSopenharmony_ci    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
739cabdff1aSopenharmony_ci    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
740cabdff1aSopenharmony_ci    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
741cabdff1aSopenharmony_ci    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
742cabdff1aSopenharmony_ci    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
743cabdff1aSopenharmony_ci
744cabdff1aSopenharmony_ci    mask = LD_SB(&chroma_mask_arr[32]);
745cabdff1aSopenharmony_ci
746cabdff1aSopenharmony_ci    src0 = LD_UB(src);
747cabdff1aSopenharmony_ci    src += stride;
748cabdff1aSopenharmony_ci
749cabdff1aSopenharmony_ci    src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
750cabdff1aSopenharmony_ci    res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec);
751cabdff1aSopenharmony_ci
752cabdff1aSopenharmony_ci    LD_UB4(src, stride, src1, src2, src3, src4);
753cabdff1aSopenharmony_ci    src += (4 * stride);
754cabdff1aSopenharmony_ci
755cabdff1aSopenharmony_ci    VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
756cabdff1aSopenharmony_ci    VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
757cabdff1aSopenharmony_ci    DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
758cabdff1aSopenharmony_ci                coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3, res_hz4);
759cabdff1aSopenharmony_ci    MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3, coeff_vt_vec0,
760cabdff1aSopenharmony_ci         res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
761cabdff1aSopenharmony_ci
762cabdff1aSopenharmony_ci    res_vt0 += (res_hz0 * coeff_vt_vec1);
763cabdff1aSopenharmony_ci    res_vt1 += (res_hz1 * coeff_vt_vec1);
764cabdff1aSopenharmony_ci    res_vt2 += (res_hz2 * coeff_vt_vec1);
765cabdff1aSopenharmony_ci    res_vt3 += (res_hz3 * coeff_vt_vec1);
766cabdff1aSopenharmony_ci
767cabdff1aSopenharmony_ci    SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
768cabdff1aSopenharmony_ci    SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
769cabdff1aSopenharmony_ci    PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
770cabdff1aSopenharmony_ci    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
771cabdff1aSopenharmony_ci}
772cabdff1aSopenharmony_ci
773cabdff1aSopenharmony_cistatic void avc_chroma_hv_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
774cabdff1aSopenharmony_ci                                  uint32_t coef_hor0, uint32_t coef_hor1,
775cabdff1aSopenharmony_ci                                  uint32_t coef_ver0, uint32_t coef_ver1)
776cabdff1aSopenharmony_ci{
777cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
778cabdff1aSopenharmony_ci    v16u8 out0, out1, out2, out3;
779cabdff1aSopenharmony_ci    v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4;
780cabdff1aSopenharmony_ci    v8u16 res_hz5, res_hz6, res_hz7, res_hz8;
781cabdff1aSopenharmony_ci    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
782cabdff1aSopenharmony_ci    v8u16 res_vt4, res_vt5, res_vt6, res_vt7;
783cabdff1aSopenharmony_ci    v16i8 mask;
784cabdff1aSopenharmony_ci    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
785cabdff1aSopenharmony_ci    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
786cabdff1aSopenharmony_ci    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
787cabdff1aSopenharmony_ci    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
788cabdff1aSopenharmony_ci    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
789cabdff1aSopenharmony_ci
790cabdff1aSopenharmony_ci    mask = LD_SB(&chroma_mask_arr[32]);
791cabdff1aSopenharmony_ci
792cabdff1aSopenharmony_ci    LD_UB5(src, stride, src0, src1, src2, src3, src4);
793cabdff1aSopenharmony_ci    src += (5 * stride);
794cabdff1aSopenharmony_ci    LD_UB4(src, stride, src5, src6, src7, src8);
795cabdff1aSopenharmony_ci    src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
796cabdff1aSopenharmony_ci    VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
797cabdff1aSopenharmony_ci    VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
798cabdff1aSopenharmony_ci    VSHF_B2_UB(src5, src5, src6, src6, mask, mask, src5, src6);
799cabdff1aSopenharmony_ci    VSHF_B2_UB(src7, src7, src8, src8, mask, mask, src7, src8);
800cabdff1aSopenharmony_ci    res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec);
801cabdff1aSopenharmony_ci    DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
802cabdff1aSopenharmony_ci                coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3,
803cabdff1aSopenharmony_ci                res_hz4);
804cabdff1aSopenharmony_ci    DOTP_UB4_UH(src5, src6, src7, src8, coeff_hz_vec, coeff_hz_vec,
805cabdff1aSopenharmony_ci                coeff_hz_vec, coeff_hz_vec, res_hz5, res_hz6, res_hz7, res_hz8);
806cabdff1aSopenharmony_ci    MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3,
807cabdff1aSopenharmony_ci         coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
808cabdff1aSopenharmony_ci         res_vt3);
809cabdff1aSopenharmony_ci    MUL4(res_hz5, coeff_vt_vec0, res_hz6, coeff_vt_vec0, res_hz7,
810cabdff1aSopenharmony_ci         coeff_vt_vec0, res_hz8, coeff_vt_vec0, res_vt4, res_vt5, res_vt6,
811cabdff1aSopenharmony_ci         res_vt7);
812cabdff1aSopenharmony_ci    res_vt0 += (res_hz0 * coeff_vt_vec1);
813cabdff1aSopenharmony_ci    res_vt1 += (res_hz1 * coeff_vt_vec1);
814cabdff1aSopenharmony_ci    res_vt2 += (res_hz2 * coeff_vt_vec1);
815cabdff1aSopenharmony_ci    res_vt3 += (res_hz3 * coeff_vt_vec1);
816cabdff1aSopenharmony_ci    res_vt4 += (res_hz4 * coeff_vt_vec1);
817cabdff1aSopenharmony_ci    res_vt5 += (res_hz5 * coeff_vt_vec1);
818cabdff1aSopenharmony_ci    res_vt6 += (res_hz6 * coeff_vt_vec1);
819cabdff1aSopenharmony_ci    res_vt7 += (res_hz7 * coeff_vt_vec1);
820cabdff1aSopenharmony_ci    SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
821cabdff1aSopenharmony_ci    SRARI_H4_UH(res_vt4, res_vt5, res_vt6, res_vt7, 6);
822cabdff1aSopenharmony_ci    SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
823cabdff1aSopenharmony_ci    SAT_UH4_UH(res_vt4, res_vt5, res_vt6, res_vt7, 7);
824cabdff1aSopenharmony_ci    PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
825cabdff1aSopenharmony_ci    PCKEV_B2_UB(res_vt5, res_vt4, res_vt7, res_vt6, out2, out3);
826cabdff1aSopenharmony_ci    ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
827cabdff1aSopenharmony_ci}
828cabdff1aSopenharmony_ci
829cabdff1aSopenharmony_cistatic void avc_chroma_hv_8w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
830cabdff1aSopenharmony_ci                                 uint32_t coef_hor0, uint32_t coef_hor1,
831cabdff1aSopenharmony_ci                                 uint32_t coef_ver0, uint32_t coef_ver1,
832cabdff1aSopenharmony_ci                                 int32_t height)
833cabdff1aSopenharmony_ci{
834cabdff1aSopenharmony_ci    if (4 == height) {
835cabdff1aSopenharmony_ci        avc_chroma_hv_8x4_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
836cabdff1aSopenharmony_ci                              coef_ver1);
837cabdff1aSopenharmony_ci    } else if (8 == height) {
838cabdff1aSopenharmony_ci        avc_chroma_hv_8x8_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
839cabdff1aSopenharmony_ci                              coef_ver1);
840cabdff1aSopenharmony_ci    }
841cabdff1aSopenharmony_ci}
842cabdff1aSopenharmony_ci
843cabdff1aSopenharmony_cistatic void avc_chroma_hz_and_aver_dst_2x2_msa(uint8_t *src, uint8_t *dst,
844cabdff1aSopenharmony_ci                                               int32_t stride, uint32_t coeff0,
845cabdff1aSopenharmony_ci                                               uint32_t coeff1)
846cabdff1aSopenharmony_ci{
847cabdff1aSopenharmony_ci    uint16_t out0, out1;
848cabdff1aSopenharmony_ci    v16i8 src0, src1;
849cabdff1aSopenharmony_ci    v16u8 dst_data = { 0 };
850cabdff1aSopenharmony_ci    v8u16 res_r;
851cabdff1aSopenharmony_ci    v16u8 res;
852cabdff1aSopenharmony_ci    v16i8 mask;
853cabdff1aSopenharmony_ci    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
854cabdff1aSopenharmony_ci    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
855cabdff1aSopenharmony_ci    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
856cabdff1aSopenharmony_ci
857cabdff1aSopenharmony_ci    mask = LD_SB(&chroma_mask_arr[0]);
858cabdff1aSopenharmony_ci
859cabdff1aSopenharmony_ci    LD_SB2(src, stride, src0, src1);
860cabdff1aSopenharmony_ci
861cabdff1aSopenharmony_ci    out0 = LH(dst);
862cabdff1aSopenharmony_ci    out1 = LH(dst + stride);
863cabdff1aSopenharmony_ci
864cabdff1aSopenharmony_ci    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, out0);
865cabdff1aSopenharmony_ci    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, out1);
866cabdff1aSopenharmony_ci
867cabdff1aSopenharmony_ci    src0 = __msa_vshf_b(mask, src1, src0);
868cabdff1aSopenharmony_ci
869cabdff1aSopenharmony_ci    res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
870cabdff1aSopenharmony_ci    res_r <<= 3;
871cabdff1aSopenharmony_ci    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
872cabdff1aSopenharmony_ci    res_r = __msa_sat_u_h(res_r, 7);
873cabdff1aSopenharmony_ci
874cabdff1aSopenharmony_ci    res = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
875cabdff1aSopenharmony_ci    dst_data = __msa_aver_u_b(res, dst_data);
876cabdff1aSopenharmony_ci
877cabdff1aSopenharmony_ci    out0 = __msa_copy_u_h((v8i16) dst_data, 0);
878cabdff1aSopenharmony_ci    out1 = __msa_copy_u_h((v8i16) dst_data, 2);
879cabdff1aSopenharmony_ci
880cabdff1aSopenharmony_ci    SH(out0, dst);
881cabdff1aSopenharmony_ci    dst += stride;
882cabdff1aSopenharmony_ci    SH(out1, dst);
883cabdff1aSopenharmony_ci}
884cabdff1aSopenharmony_ci
885cabdff1aSopenharmony_cistatic void avc_chroma_hz_and_aver_dst_2x4_msa(uint8_t *src, uint8_t *dst,
886cabdff1aSopenharmony_ci                                               int32_t stride, uint32_t coeff0,
887cabdff1aSopenharmony_ci                                               uint32_t coeff1)
888cabdff1aSopenharmony_ci{
889cabdff1aSopenharmony_ci    uint16_t tp0, tp1, tp2, tp3;
890cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3;
891cabdff1aSopenharmony_ci    v16u8 dst0, dst_data = { 0 };
892cabdff1aSopenharmony_ci    v8u16 res_r;
893cabdff1aSopenharmony_ci    v16i8 mask;
894cabdff1aSopenharmony_ci    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
895cabdff1aSopenharmony_ci    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
896cabdff1aSopenharmony_ci    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
897cabdff1aSopenharmony_ci
898cabdff1aSopenharmony_ci    mask = LD_SB(&chroma_mask_arr[64]);
899cabdff1aSopenharmony_ci
900cabdff1aSopenharmony_ci    LD_UB4(src, stride, src0, src1, src2, src3);
901cabdff1aSopenharmony_ci    tp0 = LH(dst);
902cabdff1aSopenharmony_ci    tp1 = LH(dst + stride);
903cabdff1aSopenharmony_ci    tp2 = LH(dst + 2 * stride);
904cabdff1aSopenharmony_ci    tp3 = LH(dst + 3 * stride);
905cabdff1aSopenharmony_ci    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, tp0);
906cabdff1aSopenharmony_ci    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 1, tp1);
907cabdff1aSopenharmony_ci    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, tp2);
908cabdff1aSopenharmony_ci    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 3, tp3);
909cabdff1aSopenharmony_ci
910cabdff1aSopenharmony_ci    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
911cabdff1aSopenharmony_ci
912cabdff1aSopenharmony_ci    src0 = (v16u8) __msa_ilvr_d((v2i64) src2, (v2i64) src0);
913cabdff1aSopenharmony_ci
914cabdff1aSopenharmony_ci    res_r = __msa_dotp_u_h(src0, coeff_vec);
915cabdff1aSopenharmony_ci    res_r <<= 3;
916cabdff1aSopenharmony_ci    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
917cabdff1aSopenharmony_ci    res_r = __msa_sat_u_h(res_r, 7);
918cabdff1aSopenharmony_ci
919cabdff1aSopenharmony_ci    dst0 = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
920cabdff1aSopenharmony_ci    dst0 = __msa_aver_u_b(dst0, dst_data);
921cabdff1aSopenharmony_ci
922cabdff1aSopenharmony_ci    ST_H4(dst0, 0, 1, 2, 3, dst, stride);
923cabdff1aSopenharmony_ci}
924cabdff1aSopenharmony_ci
925cabdff1aSopenharmony_cistatic void avc_chroma_hz_and_aver_dst_2w_msa(uint8_t *src, uint8_t *dst,
926cabdff1aSopenharmony_ci                                              int32_t stride, uint32_t coeff0,
927cabdff1aSopenharmony_ci                                              uint32_t coeff1, int32_t height)
928cabdff1aSopenharmony_ci{
929cabdff1aSopenharmony_ci    if (2 == height) {
930cabdff1aSopenharmony_ci        avc_chroma_hz_and_aver_dst_2x2_msa(src, dst, stride, coeff0, coeff1);
931cabdff1aSopenharmony_ci    } else if (4 == height) {
932cabdff1aSopenharmony_ci        avc_chroma_hz_and_aver_dst_2x4_msa(src, dst, stride, coeff0, coeff1);
933cabdff1aSopenharmony_ci    }
934cabdff1aSopenharmony_ci}
935cabdff1aSopenharmony_ci
936cabdff1aSopenharmony_cistatic void avc_chroma_hz_and_aver_dst_4x2_msa(uint8_t *src, uint8_t *dst,
937cabdff1aSopenharmony_ci                                               int32_t stride, uint32_t coeff0,
938cabdff1aSopenharmony_ci                                               uint32_t coeff1)
939cabdff1aSopenharmony_ci{
940cabdff1aSopenharmony_ci    uint32_t load0, load1;
941cabdff1aSopenharmony_ci    v16i8 src0, src1;
942cabdff1aSopenharmony_ci    v16u8 dst_data = { 0 };
943cabdff1aSopenharmony_ci    v8u16 res_r;
944cabdff1aSopenharmony_ci    v16i8 res, mask;
945cabdff1aSopenharmony_ci    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
946cabdff1aSopenharmony_ci    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
947cabdff1aSopenharmony_ci    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
948cabdff1aSopenharmony_ci
949cabdff1aSopenharmony_ci    mask = LD_SB(&chroma_mask_arr[0]);
950cabdff1aSopenharmony_ci
951cabdff1aSopenharmony_ci    LD_SB2(src, stride, src0, src1);
952cabdff1aSopenharmony_ci
953cabdff1aSopenharmony_ci    LW2(dst, stride, load0, load1);
954cabdff1aSopenharmony_ci
955cabdff1aSopenharmony_ci    INSERT_W2_UB(load0, load1, dst_data);
956cabdff1aSopenharmony_ci
957cabdff1aSopenharmony_ci    src0 = __msa_vshf_b(mask, src1, src0);
958cabdff1aSopenharmony_ci
959cabdff1aSopenharmony_ci    res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
960cabdff1aSopenharmony_ci    res_r <<= 3;
961cabdff1aSopenharmony_ci    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
962cabdff1aSopenharmony_ci    res_r = __msa_sat_u_h(res_r, 7);
963cabdff1aSopenharmony_ci    res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
964cabdff1aSopenharmony_ci    dst_data = __msa_aver_u_b((v16u8) res, dst_data);
965cabdff1aSopenharmony_ci
966cabdff1aSopenharmony_ci    ST_W2(dst_data, 0, 1, dst, stride);
967cabdff1aSopenharmony_ci}
968cabdff1aSopenharmony_ci
969cabdff1aSopenharmony_cistatic void avc_chroma_hz_and_aver_dst_4x4_msa(uint8_t *src, uint8_t *dst,
970cabdff1aSopenharmony_ci                                               int32_t stride, uint32_t coeff0,
971cabdff1aSopenharmony_ci                                               uint32_t coeff1)
972cabdff1aSopenharmony_ci{
973cabdff1aSopenharmony_ci    uint32_t tp0, tp1, tp2, tp3;
974cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3;
975cabdff1aSopenharmony_ci    v16u8 out, dst_data = { 0 };
976cabdff1aSopenharmony_ci    v16i8 mask;
977cabdff1aSopenharmony_ci    v8u16 res0_r, res1_r;
978cabdff1aSopenharmony_ci    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
979cabdff1aSopenharmony_ci    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
980cabdff1aSopenharmony_ci    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
981cabdff1aSopenharmony_ci
982cabdff1aSopenharmony_ci    mask = LD_SB(&chroma_mask_arr[0]);
983cabdff1aSopenharmony_ci
984cabdff1aSopenharmony_ci    LD_UB4(src, stride, src0, src1, src2, src3);
985cabdff1aSopenharmony_ci    LW4(dst, stride, tp0, tp1, tp2, tp3);
986cabdff1aSopenharmony_ci    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst_data);
987cabdff1aSopenharmony_ci    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
988cabdff1aSopenharmony_ci    DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0_r, res1_r);
989cabdff1aSopenharmony_ci    res0_r <<= 3;
990cabdff1aSopenharmony_ci    res1_r <<= 3;
991cabdff1aSopenharmony_ci    SRARI_H2_UH(res0_r, res1_r, 6);
992cabdff1aSopenharmony_ci    SAT_UH2_UH(res0_r, res1_r, 7);
993cabdff1aSopenharmony_ci    out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
994cabdff1aSopenharmony_ci    out = __msa_aver_u_b(out, dst_data);
995cabdff1aSopenharmony_ci    ST_W4(out, 0, 1, 2, 3, dst, stride);
996cabdff1aSopenharmony_ci}
997cabdff1aSopenharmony_ci
998cabdff1aSopenharmony_cistatic void avc_chroma_hz_and_aver_dst_4x8_msa(uint8_t *src, uint8_t *dst,
999cabdff1aSopenharmony_ci                                               int32_t stride, uint32_t coeff0,
1000cabdff1aSopenharmony_ci                                               uint32_t coeff1)
1001cabdff1aSopenharmony_ci{
1002cabdff1aSopenharmony_ci    uint32_t tp0, tp1, tp2, tp3;
1003cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, out0, out1;
1004cabdff1aSopenharmony_ci    v16u8 dst0 = { 0 }, dst1 = { 0 };
1005cabdff1aSopenharmony_ci    v16i8 mask;
1006cabdff1aSopenharmony_ci    v8u16 res0, res1, res2, res3;
1007cabdff1aSopenharmony_ci    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
1008cabdff1aSopenharmony_ci    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
1009cabdff1aSopenharmony_ci    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
1010cabdff1aSopenharmony_ci
1011cabdff1aSopenharmony_ci    mask = LD_SB(&chroma_mask_arr[0]);
1012cabdff1aSopenharmony_ci
1013cabdff1aSopenharmony_ci    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
1014cabdff1aSopenharmony_ci    LW4(dst, stride, tp0, tp1, tp2, tp3);
1015cabdff1aSopenharmony_ci    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
1016cabdff1aSopenharmony_ci    LW4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
1017cabdff1aSopenharmony_ci    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
1018cabdff1aSopenharmony_ci    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
1019cabdff1aSopenharmony_ci    VSHF_B2_UB(src4, src5, src6, src7, mask, mask, src4, src6);
1020cabdff1aSopenharmony_ci    DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0, res1);
1021cabdff1aSopenharmony_ci    DOTP_UB2_UH(src4, src6, coeff_vec, coeff_vec, res2, res3);
1022cabdff1aSopenharmony_ci    SLLI_4V(res0, res1, res2, res3, 3);
1023cabdff1aSopenharmony_ci    SRARI_H4_UH(res0, res1, res2, res3, 6);
1024cabdff1aSopenharmony_ci    SAT_UH4_UH(res0, res1, res2, res3, 7);
1025cabdff1aSopenharmony_ci    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
1026cabdff1aSopenharmony_ci    AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
1027cabdff1aSopenharmony_ci    ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
1028cabdff1aSopenharmony_ci}
1029cabdff1aSopenharmony_ci
/*
 * Height dispatcher for the 4-pixel-wide horizontal filter-and-average path.
 * Heights 2, 4 and 8 are forwarded to the matching fixed-size kernel; any
 * other height leaves dst untouched.
 */
static void avc_chroma_hz_and_aver_dst_4w_msa(uint8_t *src, uint8_t *dst,
                                              int32_t stride, uint32_t coeff0,
                                              uint32_t coeff1, int32_t height)
{
    switch (height) {
    case 2:
        avc_chroma_hz_and_aver_dst_4x2_msa(src, dst, stride, coeff0, coeff1);
        break;
    case 4:
        avc_chroma_hz_and_aver_dst_4x4_msa(src, dst, stride, coeff0, coeff1);
        break;
    case 8:
        avc_chroma_hz_and_aver_dst_4x8_msa(src, dst, stride, coeff0, coeff1);
        break;
    default:
        /* No kernel for this height: nothing is written. */
        break;
    }
}
1042cabdff1aSopenharmony_ci
1043cabdff1aSopenharmony_cistatic void avc_chroma_hz_and_aver_dst_8x4_msa(uint8_t *src, uint8_t *dst,
1044cabdff1aSopenharmony_ci                                               int32_t stride, uint32_t coeff0,
1045cabdff1aSopenharmony_ci                                               uint32_t coeff1)
1046cabdff1aSopenharmony_ci{
1047cabdff1aSopenharmony_ci    uint64_t tp0, tp1, tp2, tp3;
1048cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, out0, out1;
1049cabdff1aSopenharmony_ci    v16u8 dst0 = { 0 }, dst1 = { 0 };
1050cabdff1aSopenharmony_ci    v8u16 res0, res1, res2, res3;
1051cabdff1aSopenharmony_ci    v16i8 mask;
1052cabdff1aSopenharmony_ci    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
1053cabdff1aSopenharmony_ci    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
1054cabdff1aSopenharmony_ci    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
1055cabdff1aSopenharmony_ci
1056cabdff1aSopenharmony_ci    mask = LD_SB(&chroma_mask_arr[32]);
1057cabdff1aSopenharmony_ci    LD_UB4(src, stride, src0, src1, src2, src3);
1058cabdff1aSopenharmony_ci    LD4(dst, stride, tp0, tp1, tp2, tp3);
1059cabdff1aSopenharmony_ci    INSERT_D2_UB(tp0, tp1, dst0);
1060cabdff1aSopenharmony_ci    INSERT_D2_UB(tp2, tp3, dst1);
1061cabdff1aSopenharmony_ci    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
1062cabdff1aSopenharmony_ci    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
1063cabdff1aSopenharmony_ci    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
1064cabdff1aSopenharmony_ci                coeff_vec, res0, res1, res2, res3);
1065cabdff1aSopenharmony_ci    SLLI_4V(res0, res1, res2, res3, 3);
1066cabdff1aSopenharmony_ci    SRARI_H4_UH(res0, res1, res2, res3, 6);
1067cabdff1aSopenharmony_ci    SAT_UH4_UH(res0, res1, res2, res3, 7);
1068cabdff1aSopenharmony_ci    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
1069cabdff1aSopenharmony_ci    AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
1070cabdff1aSopenharmony_ci    ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
1071cabdff1aSopenharmony_ci}
1072cabdff1aSopenharmony_ci
1073cabdff1aSopenharmony_cistatic void avc_chroma_hz_and_aver_dst_8x8_msa(uint8_t *src, uint8_t *dst,
1074cabdff1aSopenharmony_ci                                               int32_t stride, uint32_t coeff0,
1075cabdff1aSopenharmony_ci                                               uint32_t coeff1)
1076cabdff1aSopenharmony_ci{
1077cabdff1aSopenharmony_ci    uint64_t tp0, tp1, tp2, tp3;
1078cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
1079cabdff1aSopenharmony_ci    v16u8 out0, out1, out2, out3;
1080cabdff1aSopenharmony_ci    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
1081cabdff1aSopenharmony_ci    v8u16 res0, res1, res2, res3, res4, res5, res6, res7;
1082cabdff1aSopenharmony_ci    v16i8 mask;
1083cabdff1aSopenharmony_ci    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
1084cabdff1aSopenharmony_ci    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
1085cabdff1aSopenharmony_ci    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
1086cabdff1aSopenharmony_ci
1087cabdff1aSopenharmony_ci    mask = LD_SB(&chroma_mask_arr[32]);
1088cabdff1aSopenharmony_ci
1089cabdff1aSopenharmony_ci    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
1090cabdff1aSopenharmony_ci    LD4(dst, stride, tp0, tp1, tp2, tp3);
1091cabdff1aSopenharmony_ci    INSERT_D2_UB(tp0, tp1, dst0);
1092cabdff1aSopenharmony_ci    INSERT_D2_UB(tp2, tp3, dst1);
1093cabdff1aSopenharmony_ci    LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
1094cabdff1aSopenharmony_ci    INSERT_D2_UB(tp0, tp1, dst2);
1095cabdff1aSopenharmony_ci    INSERT_D2_UB(tp2, tp3, dst3);
1096cabdff1aSopenharmony_ci    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
1097cabdff1aSopenharmony_ci    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
1098cabdff1aSopenharmony_ci    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, src4, src5);
1099cabdff1aSopenharmony_ci    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, src6, src7);
1100cabdff1aSopenharmony_ci    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
1101cabdff1aSopenharmony_ci                coeff_vec, res0, res1, res2, res3);
1102cabdff1aSopenharmony_ci    DOTP_UB4_UH(src4, src5, src6, src7, coeff_vec, coeff_vec, coeff_vec,
1103cabdff1aSopenharmony_ci                coeff_vec, res4, res5, res6, res7);
1104cabdff1aSopenharmony_ci    SLLI_4V(res0, res1, res2, res3, 3);
1105cabdff1aSopenharmony_ci    SLLI_4V(res4, res5, res6, res7, 3);
1106cabdff1aSopenharmony_ci    SRARI_H4_UH(res0, res1, res2, res3, 6);
1107cabdff1aSopenharmony_ci    SRARI_H4_UH(res4, res5, res6, res7, 6);
1108cabdff1aSopenharmony_ci    SAT_UH4_UH(res0, res1, res2, res3, 7);
1109cabdff1aSopenharmony_ci    SAT_UH4_UH(res4, res5, res6, res7, 7);
1110cabdff1aSopenharmony_ci    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
1111cabdff1aSopenharmony_ci    PCKEV_B2_UB(res5, res4, res7, res6, out2, out3);
1112cabdff1aSopenharmony_ci    AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
1113cabdff1aSopenharmony_ci    AVER_UB2_UB(out2, dst2, out3, dst3, out2, out3);
1114cabdff1aSopenharmony_ci    ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
1115cabdff1aSopenharmony_ci}
1116cabdff1aSopenharmony_ci
/*
 * Height dispatcher for the 8-pixel-wide horizontal filter-and-average path.
 * Heights 4 and 8 are forwarded to the matching fixed-size kernel; any other
 * height leaves dst untouched.
 */
static void avc_chroma_hz_and_aver_dst_8w_msa(uint8_t *src, uint8_t *dst,
                                              int32_t stride, uint32_t coeff0,
                                              uint32_t coeff1, int32_t height)
{
    switch (height) {
    case 4:
        avc_chroma_hz_and_aver_dst_8x4_msa(src, dst, stride, coeff0, coeff1);
        break;
    case 8:
        avc_chroma_hz_and_aver_dst_8x8_msa(src, dst, stride, coeff0, coeff1);
        break;
    default:
        /* No kernel for this height: nothing is written. */
        break;
    }
}
1127cabdff1aSopenharmony_ci
1128cabdff1aSopenharmony_cistatic void avc_chroma_vt_and_aver_dst_2x2_msa(uint8_t *src, uint8_t *dst,
1129cabdff1aSopenharmony_ci                                               int32_t stride, uint32_t coeff0,
1130cabdff1aSopenharmony_ci                                               uint32_t coeff1)
1131cabdff1aSopenharmony_ci{
1132cabdff1aSopenharmony_ci    uint16_t out0, out1;
1133cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, tmp0, tmp1, res;
1134cabdff1aSopenharmony_ci    v16u8 dst_data = { 0 };
1135cabdff1aSopenharmony_ci    v8i16 out;
1136cabdff1aSopenharmony_ci    v8u16 res_r;
1137cabdff1aSopenharmony_ci    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
1138cabdff1aSopenharmony_ci    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
1139cabdff1aSopenharmony_ci    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
1140cabdff1aSopenharmony_ci
1141cabdff1aSopenharmony_ci    LD_SB3(src, stride, src0, src1, src2);
1142cabdff1aSopenharmony_ci    out0 = LH(dst);
1143cabdff1aSopenharmony_ci    out1 = LH(dst + stride);
1144cabdff1aSopenharmony_ci
1145cabdff1aSopenharmony_ci    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, out0);
1146cabdff1aSopenharmony_ci    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, out1);
1147cabdff1aSopenharmony_ci
1148cabdff1aSopenharmony_ci    ILVR_B2_SB(src1, src0, src2, src1, tmp0, tmp1);
1149cabdff1aSopenharmony_ci
1150cabdff1aSopenharmony_ci    tmp0 = (v16i8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
1151cabdff1aSopenharmony_ci    res_r = __msa_dotp_u_h((v16u8) tmp0, coeff_vec);
1152cabdff1aSopenharmony_ci    res_r <<= 3;
1153cabdff1aSopenharmony_ci    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
1154cabdff1aSopenharmony_ci    res_r = __msa_sat_u_h(res_r, 7);
1155cabdff1aSopenharmony_ci    res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
1156cabdff1aSopenharmony_ci    out = (v8i16) __msa_aver_u_b((v16u8) res, dst_data);
1157cabdff1aSopenharmony_ci    out0 = __msa_copy_u_h(out, 0);
1158cabdff1aSopenharmony_ci    out1 = __msa_copy_u_h(out, 2);
1159cabdff1aSopenharmony_ci
1160cabdff1aSopenharmony_ci    SH(out0, dst);
1161cabdff1aSopenharmony_ci    dst += stride;
1162cabdff1aSopenharmony_ci    SH(out1, dst);
1163cabdff1aSopenharmony_ci}
1164cabdff1aSopenharmony_ci
1165cabdff1aSopenharmony_cistatic void avc_chroma_vt_and_aver_dst_2x4_msa(uint8_t *src, uint8_t *dst,
1166cabdff1aSopenharmony_ci                                               int32_t stride, uint32_t coeff0,
1167cabdff1aSopenharmony_ci                                               uint32_t coeff1)
1168cabdff1aSopenharmony_ci{
1169cabdff1aSopenharmony_ci    uint16_t tp0, tp1, tp2, tp3;
1170cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4;
1171cabdff1aSopenharmony_ci    v16u8 tmp0, tmp1, tmp2, tmp3;
1172cabdff1aSopenharmony_ci    v8u16 res_r;
1173cabdff1aSopenharmony_ci    v8i16 res;
1174cabdff1aSopenharmony_ci    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
1175cabdff1aSopenharmony_ci    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
1176cabdff1aSopenharmony_ci    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
1177cabdff1aSopenharmony_ci    v16u8 dst_data = { 0 };
1178cabdff1aSopenharmony_ci
1179cabdff1aSopenharmony_ci    LD_SB5(src, stride, src0, src1, src2, src3, src4);
1180cabdff1aSopenharmony_ci
1181cabdff1aSopenharmony_ci    tp0 = LH(dst);
1182cabdff1aSopenharmony_ci    tp1 = LH(dst + stride);
1183cabdff1aSopenharmony_ci    tp2 = LH(dst + 2 * stride);
1184cabdff1aSopenharmony_ci    tp3 = LH(dst + 3 * stride);
1185cabdff1aSopenharmony_ci    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, tp0);
1186cabdff1aSopenharmony_ci    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 1, tp1);
1187cabdff1aSopenharmony_ci    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, tp2);
1188cabdff1aSopenharmony_ci    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 3, tp3);
1189cabdff1aSopenharmony_ci
1190cabdff1aSopenharmony_ci    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
1191cabdff1aSopenharmony_ci               tmp0, tmp1, tmp2, tmp3);
1192cabdff1aSopenharmony_ci    ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
1193cabdff1aSopenharmony_ci
1194cabdff1aSopenharmony_ci    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);
1195cabdff1aSopenharmony_ci
1196cabdff1aSopenharmony_ci    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
1197cabdff1aSopenharmony_ci    res_r <<= 3;
1198cabdff1aSopenharmony_ci    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
1199cabdff1aSopenharmony_ci    res_r = __msa_sat_u_h(res_r, 7);
1200cabdff1aSopenharmony_ci
1201cabdff1aSopenharmony_ci    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
1202cabdff1aSopenharmony_ci    res = (v8i16) __msa_aver_u_b((v16u8) res, dst_data);
1203cabdff1aSopenharmony_ci
1204cabdff1aSopenharmony_ci    ST_H4(res, 0, 1, 2, 3, dst, stride);
1205cabdff1aSopenharmony_ci}
1206cabdff1aSopenharmony_ci
/*
 * Height dispatcher for the 2-pixel-wide vertical filter-and-average path.
 * Heights 2 and 4 are forwarded to the matching fixed-size kernel; any other
 * height leaves dst untouched.
 */
static void avc_chroma_vt_and_aver_dst_2w_msa(uint8_t *src, uint8_t *dst,
                                              int32_t stride, uint32_t coeff0,
                                              uint32_t coeff1, int32_t height)
{
    switch (height) {
    case 2:
        avc_chroma_vt_and_aver_dst_2x2_msa(src, dst, stride, coeff0, coeff1);
        break;
    case 4:
        avc_chroma_vt_and_aver_dst_2x4_msa(src, dst, stride, coeff0, coeff1);
        break;
    default:
        /* No kernel for this height: nothing is written. */
        break;
    }
}
1217cabdff1aSopenharmony_ci
1218cabdff1aSopenharmony_cistatic void avc_chroma_vt_and_aver_dst_4x2_msa(uint8_t *src, uint8_t *dst,
1219cabdff1aSopenharmony_ci                                               int32_t stride, uint32_t coeff0,
1220cabdff1aSopenharmony_ci                                               uint32_t coeff1)
1221cabdff1aSopenharmony_ci{
1222cabdff1aSopenharmony_ci    uint32_t load0, load1;
1223cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, tmp0, tmp1;
1224cabdff1aSopenharmony_ci    v16u8 dst_data = { 0 };
1225cabdff1aSopenharmony_ci    v8u16 res_r;
1226cabdff1aSopenharmony_ci    v16u8 res;
1227cabdff1aSopenharmony_ci    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
1228cabdff1aSopenharmony_ci    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
1229cabdff1aSopenharmony_ci    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
1230cabdff1aSopenharmony_ci
1231cabdff1aSopenharmony_ci    LD_UB3(src, stride, src0, src1, src2);
1232cabdff1aSopenharmony_ci
1233cabdff1aSopenharmony_ci    LW2(dst, stride, load0, load1);
1234cabdff1aSopenharmony_ci
1235cabdff1aSopenharmony_ci    INSERT_W2_UB(load0, load1, dst_data);
1236cabdff1aSopenharmony_ci    ILVR_B2_UB(src1, src0, src2, src1, tmp0, tmp1);
1237cabdff1aSopenharmony_ci
1238cabdff1aSopenharmony_ci    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
1239cabdff1aSopenharmony_ci
1240cabdff1aSopenharmony_ci    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
1241cabdff1aSopenharmony_ci    res_r <<= 3;
1242cabdff1aSopenharmony_ci    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
1243cabdff1aSopenharmony_ci    res_r = __msa_sat_u_h(res_r, 7);
1244cabdff1aSopenharmony_ci    res = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
1245cabdff1aSopenharmony_ci    res = __msa_aver_u_b(res, dst_data);
1246cabdff1aSopenharmony_ci
1247cabdff1aSopenharmony_ci    ST_W2(res, 0, 1, dst, stride);
1248cabdff1aSopenharmony_ci}
1249cabdff1aSopenharmony_ci
1250cabdff1aSopenharmony_cistatic void avc_chroma_vt_and_aver_dst_4x4_msa(uint8_t *src, uint8_t *dst,
1251cabdff1aSopenharmony_ci                                               int32_t stride, uint32_t coeff0,
1252cabdff1aSopenharmony_ci                                               uint32_t coeff1)
1253cabdff1aSopenharmony_ci{
1254cabdff1aSopenharmony_ci    uint32_t tp0, tp1, tp2, tp3;
1255cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4;
1256cabdff1aSopenharmony_ci    v16u8 tmp0, tmp1, tmp2, tmp3;
1257cabdff1aSopenharmony_ci    v16u8 dst0 = { 0 };
1258cabdff1aSopenharmony_ci    v8u16 res0_r, res1_r;
1259cabdff1aSopenharmony_ci    v16u8 out;
1260cabdff1aSopenharmony_ci    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
1261cabdff1aSopenharmony_ci    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
1262cabdff1aSopenharmony_ci    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
1263cabdff1aSopenharmony_ci
1264cabdff1aSopenharmony_ci    LD_UB5(src, stride, src0, src1, src2, src3, src4);
1265cabdff1aSopenharmony_ci    LW4(dst, stride, tp0, tp1, tp2, tp3);
1266cabdff1aSopenharmony_ci    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
1267cabdff1aSopenharmony_ci    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, tmp0, tmp1, tmp2,
1268cabdff1aSopenharmony_ci               tmp3);
1269cabdff1aSopenharmony_ci    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
1270cabdff1aSopenharmony_ci    DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0_r, res1_r);
1271cabdff1aSopenharmony_ci    res0_r <<= 3;
1272cabdff1aSopenharmony_ci    res1_r <<= 3;
1273cabdff1aSopenharmony_ci    SRARI_H2_UH(res0_r, res1_r, 6);
1274cabdff1aSopenharmony_ci    SAT_UH2_UH(res0_r, res1_r, 7);
1275cabdff1aSopenharmony_ci    out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
1276cabdff1aSopenharmony_ci    out = __msa_aver_u_b(out, dst0);
1277cabdff1aSopenharmony_ci    ST_W4(out, 0, 1, 2, 3, dst, stride);
1278cabdff1aSopenharmony_ci}
1279cabdff1aSopenharmony_ci
1280cabdff1aSopenharmony_cistatic void avc_chroma_vt_and_aver_dst_4x8_msa(uint8_t *src, uint8_t *dst,
1281cabdff1aSopenharmony_ci                                               int32_t stride, uint32_t coeff0,
1282cabdff1aSopenharmony_ci                                               uint32_t coeff1)
1283cabdff1aSopenharmony_ci{
1284cabdff1aSopenharmony_ci    uint32_t tp0, tp1, tp2, tp3;
1285cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1286cabdff1aSopenharmony_ci    v16u8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, out0, out1;
1287cabdff1aSopenharmony_ci    v16u8 dst0 = { 0 }, dst1 = { 0 };
1288cabdff1aSopenharmony_ci    v8u16 res0, res1, res2, res3;
1289cabdff1aSopenharmony_ci    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
1290cabdff1aSopenharmony_ci    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
1291cabdff1aSopenharmony_ci    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
1292cabdff1aSopenharmony_ci
1293cabdff1aSopenharmony_ci    LD_UB5(src, stride, src0, src1, src2, src3, src4);
1294cabdff1aSopenharmony_ci    src += (5 * stride);
1295cabdff1aSopenharmony_ci    LD_UB4(src, stride, src5, src6, src7, src8);
1296cabdff1aSopenharmony_ci    LW4(dst, stride, tp0, tp1, tp2, tp3);
1297cabdff1aSopenharmony_ci    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
1298cabdff1aSopenharmony_ci    LW4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
1299cabdff1aSopenharmony_ci    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
1300cabdff1aSopenharmony_ci    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, tmp0, tmp1, tmp2,
1301cabdff1aSopenharmony_ci               tmp3);
1302cabdff1aSopenharmony_ci    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, tmp4, tmp5, tmp6,
1303cabdff1aSopenharmony_ci               tmp7);
1304cabdff1aSopenharmony_ci    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
1305cabdff1aSopenharmony_ci    ILVR_D2_UB(tmp5, tmp4, tmp7, tmp6, tmp4, tmp6);
1306cabdff1aSopenharmony_ci    DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0, res1);
1307cabdff1aSopenharmony_ci    DOTP_UB2_UH(tmp4, tmp6, coeff_vec, coeff_vec, res2, res3);
1308cabdff1aSopenharmony_ci    SLLI_4V(res0, res1, res2, res3, 3);
1309cabdff1aSopenharmony_ci    SRARI_H4_UH(res0, res1, res2, res3, 6);
1310cabdff1aSopenharmony_ci    SAT_UH4_UH(res0, res1, res2, res3, 7);
1311cabdff1aSopenharmony_ci    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
1312cabdff1aSopenharmony_ci    AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
1313cabdff1aSopenharmony_ci    ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
1314cabdff1aSopenharmony_ci}
1315cabdff1aSopenharmony_ci
/*
 * Height dispatcher for the 4-pixel-wide vertical filter-and-average path.
 * Heights 2, 4 and 8 are forwarded to the matching fixed-size kernel; any
 * other height leaves dst untouched.
 */
static void avc_chroma_vt_and_aver_dst_4w_msa(uint8_t *src, uint8_t *dst,
                                              int32_t stride, uint32_t coeff0,
                                              uint32_t coeff1, int32_t height)
{
    switch (height) {
    case 2:
        avc_chroma_vt_and_aver_dst_4x2_msa(src, dst, stride, coeff0, coeff1);
        break;
    case 4:
        avc_chroma_vt_and_aver_dst_4x4_msa(src, dst, stride, coeff0, coeff1);
        break;
    case 8:
        avc_chroma_vt_and_aver_dst_4x8_msa(src, dst, stride, coeff0, coeff1);
        break;
    default:
        /* No kernel for this height: nothing is written. */
        break;
    }
}
1328cabdff1aSopenharmony_ci
1329cabdff1aSopenharmony_cistatic void avc_chroma_vt_and_aver_dst_8x4_msa(uint8_t *src, uint8_t *dst,
1330cabdff1aSopenharmony_ci                                               int32_t stride, uint32_t coeff0,
1331cabdff1aSopenharmony_ci                                               uint32_t coeff1)
1332cabdff1aSopenharmony_ci{
1333cabdff1aSopenharmony_ci    uint64_t tp0, tp1, tp2, tp3;
1334cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4;
1335cabdff1aSopenharmony_ci    v16u8 out0, out1;
1336cabdff1aSopenharmony_ci    v8u16 res0, res1, res2, res3;
1337cabdff1aSopenharmony_ci    v16u8 dst0 = { 0 }, dst1 = { 0 };
1338cabdff1aSopenharmony_ci    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
1339cabdff1aSopenharmony_ci    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
1340cabdff1aSopenharmony_ci    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
1341cabdff1aSopenharmony_ci
1342cabdff1aSopenharmony_ci    LD_UB5(src, stride, src0, src1, src2, src3, src4);
1343cabdff1aSopenharmony_ci    LD4(dst, stride, tp0, tp1, tp2, tp3);
1344cabdff1aSopenharmony_ci    INSERT_D2_UB(tp0, tp1, dst0);
1345cabdff1aSopenharmony_ci    INSERT_D2_UB(tp2, tp3, dst1);
1346cabdff1aSopenharmony_ci    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
1347cabdff1aSopenharmony_ci               src0, src1, src2, src3);
1348cabdff1aSopenharmony_ci    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
1349cabdff1aSopenharmony_ci                coeff_vec, res0, res1, res2, res3);
1350cabdff1aSopenharmony_ci    SLLI_4V(res0, res1, res2, res3, 3);
1351cabdff1aSopenharmony_ci    SRARI_H4_UH(res0, res1, res2, res3, 6);
1352cabdff1aSopenharmony_ci    SAT_UH4_UH(res0, res1, res2, res3, 7);
1353cabdff1aSopenharmony_ci    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
1354cabdff1aSopenharmony_ci    AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
1355cabdff1aSopenharmony_ci    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
1356cabdff1aSopenharmony_ci}
1357cabdff1aSopenharmony_ci
1358cabdff1aSopenharmony_cistatic void avc_chroma_vt_and_aver_dst_8x8_msa(uint8_t *src, uint8_t *dst,
1359cabdff1aSopenharmony_ci                                               int32_t stride, uint32_t coeff0,
1360cabdff1aSopenharmony_ci                                               uint32_t coeff1)
1361cabdff1aSopenharmony_ci{
1362cabdff1aSopenharmony_ci    uint64_t tp0, tp1, tp2, tp3;
1363cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1364cabdff1aSopenharmony_ci    v16u8 out0, out1, out2, out3;
1365cabdff1aSopenharmony_ci    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
1366cabdff1aSopenharmony_ci    v8u16 res0, res1, res2, res3, res4, res5, res6, res7;
1367cabdff1aSopenharmony_ci    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
1368cabdff1aSopenharmony_ci    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
1369cabdff1aSopenharmony_ci    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
1370cabdff1aSopenharmony_ci
1371cabdff1aSopenharmony_ci    LD_UB5(src, stride, src0, src1, src2, src3, src4);
1372cabdff1aSopenharmony_ci    src += (5 * stride);
1373cabdff1aSopenharmony_ci    LD_UB4(src, stride, src5, src6, src7, src8);
1374cabdff1aSopenharmony_ci    LD4(dst, stride, tp0, tp1, tp2, tp3);
1375cabdff1aSopenharmony_ci    INSERT_D2_UB(tp0, tp1, dst0);
1376cabdff1aSopenharmony_ci    INSERT_D2_UB(tp2, tp3, dst1);
1377cabdff1aSopenharmony_ci    LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
1378cabdff1aSopenharmony_ci    INSERT_D2_UB(tp0, tp1, dst2);
1379cabdff1aSopenharmony_ci    INSERT_D2_UB(tp2, tp3, dst3);
1380cabdff1aSopenharmony_ci    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
1381cabdff1aSopenharmony_ci               src0, src1, src2, src3);
1382cabdff1aSopenharmony_ci    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
1383cabdff1aSopenharmony_ci               src4, src5, src6, src7);
1384cabdff1aSopenharmony_ci    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
1385cabdff1aSopenharmony_ci                coeff_vec, res0, res1, res2, res3);
1386cabdff1aSopenharmony_ci    DOTP_UB4_UH(src4, src5, src6, src7, coeff_vec, coeff_vec, coeff_vec,
1387cabdff1aSopenharmony_ci                coeff_vec, res4, res5, res6, res7);
1388cabdff1aSopenharmony_ci    SLLI_4V(res0, res1, res2, res3, 3);
1389cabdff1aSopenharmony_ci    SLLI_4V(res4, res5, res6, res7, 3);
1390cabdff1aSopenharmony_ci    SRARI_H4_UH(res0, res1, res2, res3, 6);
1391cabdff1aSopenharmony_ci    SRARI_H4_UH(res4, res5, res6, res7, 6);
1392cabdff1aSopenharmony_ci    SAT_UH4_UH(res0, res1, res2, res3, 7);
1393cabdff1aSopenharmony_ci    SAT_UH4_UH(res0, res1, res2, res3, 7);
1394cabdff1aSopenharmony_ci    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
1395cabdff1aSopenharmony_ci    PCKEV_B2_UB(res5, res4, res7, res6, out2, out3);
1396cabdff1aSopenharmony_ci    AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
1397cabdff1aSopenharmony_ci    AVER_UB2_UB(out2, dst2, out3, dst3, out2, out3);
1398cabdff1aSopenharmony_ci    ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
1399cabdff1aSopenharmony_ci}
1400cabdff1aSopenharmony_ci
/*
 * Height dispatcher for the 8-pixel-wide vertical filter-and-average path.
 * Heights 4 and 8 are forwarded to the matching fixed-size kernel; any other
 * height leaves dst untouched.
 */
static void avc_chroma_vt_and_aver_dst_8w_msa(uint8_t *src, uint8_t *dst,
                                              int32_t stride, uint32_t coeff0,
                                              uint32_t coeff1, int32_t height)
{
    switch (height) {
    case 4:
        avc_chroma_vt_and_aver_dst_8x4_msa(src, dst, stride, coeff0, coeff1);
        break;
    case 8:
        avc_chroma_vt_and_aver_dst_8x8_msa(src, dst, stride, coeff0, coeff1);
        break;
    default:
        /* No kernel for this height: nothing is written. */
        break;
    }
}
1411cabdff1aSopenharmony_ci
1412cabdff1aSopenharmony_cistatic void avc_chroma_hv_and_aver_dst_2x2_msa(uint8_t *src, uint8_t *dst,
1413cabdff1aSopenharmony_ci                                               int32_t stride,
1414cabdff1aSopenharmony_ci                                               uint32_t coef_hor0,
1415cabdff1aSopenharmony_ci                                               uint32_t coef_hor1,
1416cabdff1aSopenharmony_ci                                               uint32_t coef_ver0,
1417cabdff1aSopenharmony_ci                                               uint32_t coef_ver1)
1418cabdff1aSopenharmony_ci{
1419cabdff1aSopenharmony_ci    uint16_t out0, out1;
1420cabdff1aSopenharmony_ci    v16u8 dst0 = { 0 };
1421cabdff1aSopenharmony_ci    v16u8 src0, src1, src2;
1422cabdff1aSopenharmony_ci    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
1423cabdff1aSopenharmony_ci    v16i8 res, mask;
1424cabdff1aSopenharmony_ci    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
1425cabdff1aSopenharmony_ci    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
1426cabdff1aSopenharmony_ci    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
1427cabdff1aSopenharmony_ci    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
1428cabdff1aSopenharmony_ci    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
1429cabdff1aSopenharmony_ci
1430cabdff1aSopenharmony_ci    mask = LD_SB(&chroma_mask_arr[48]);
1431cabdff1aSopenharmony_ci
1432cabdff1aSopenharmony_ci    LD_UB3(src, stride, src0, src1, src2);
1433cabdff1aSopenharmony_ci    out0 = LH(dst);
1434cabdff1aSopenharmony_ci    out1 = LH(dst + stride);
1435cabdff1aSopenharmony_ci    dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 0, out0);
1436cabdff1aSopenharmony_ci    dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 1, out1);
1437cabdff1aSopenharmony_ci    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
1438cabdff1aSopenharmony_ci    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
1439cabdff1aSopenharmony_ci    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
1440cabdff1aSopenharmony_ci
1441cabdff1aSopenharmony_ci    res_vt0 += res_vt1;
1442cabdff1aSopenharmony_ci    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
1443cabdff1aSopenharmony_ci    res_vt0 = __msa_sat_u_h(res_vt0, 7);
1444cabdff1aSopenharmony_ci    res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
1445cabdff1aSopenharmony_ci    dst0 = __msa_aver_u_b((v16u8) res, dst0);
1446cabdff1aSopenharmony_ci    out0 = __msa_copy_u_h((v8i16) dst0, 0);
1447cabdff1aSopenharmony_ci    out1 = __msa_copy_u_h((v8i16) dst0, 1);
1448cabdff1aSopenharmony_ci
1449cabdff1aSopenharmony_ci    SH(out0, dst);
1450cabdff1aSopenharmony_ci    dst += stride;
1451cabdff1aSopenharmony_ci    SH(out1, dst);
1452cabdff1aSopenharmony_ci}
1453cabdff1aSopenharmony_ci
1454cabdff1aSopenharmony_cistatic void avc_chroma_hv_and_aver_dst_2x4_msa(uint8_t *src, uint8_t *dst,
1455cabdff1aSopenharmony_ci                                               int32_t stride,
1456cabdff1aSopenharmony_ci                                               uint32_t coef_hor0,
1457cabdff1aSopenharmony_ci                                               uint32_t coef_hor1,
1458cabdff1aSopenharmony_ci                                               uint32_t coef_ver0,
1459cabdff1aSopenharmony_ci                                               uint32_t coef_ver1)
1460cabdff1aSopenharmony_ci{
1461cabdff1aSopenharmony_ci    uint16_t tp0, tp1, tp2, tp3;
1462cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4;
1463cabdff1aSopenharmony_ci    v16u8 tmp0, tmp1, tmp2, tmp3;
1464cabdff1aSopenharmony_ci    v16u8 dst0 = { 0 };
1465cabdff1aSopenharmony_ci    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
1466cabdff1aSopenharmony_ci    v16i8 res, mask;
1467cabdff1aSopenharmony_ci    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
1468cabdff1aSopenharmony_ci    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
1469cabdff1aSopenharmony_ci    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
1470cabdff1aSopenharmony_ci    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
1471cabdff1aSopenharmony_ci    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
1472cabdff1aSopenharmony_ci
1473cabdff1aSopenharmony_ci    mask = LD_SB(&chroma_mask_arr[48]);
1474cabdff1aSopenharmony_ci
1475cabdff1aSopenharmony_ci    LD_UB5(src, stride, src0, src1, src2, src3, src4);
1476cabdff1aSopenharmony_ci    tp0 = LH(dst);
1477cabdff1aSopenharmony_ci    tp1 = LH(dst + stride);
1478cabdff1aSopenharmony_ci    tp2 = LH(dst + 2 * stride);
1479cabdff1aSopenharmony_ci    tp3 = LH(dst + 3 * stride);
1480cabdff1aSopenharmony_ci    dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 0, tp0);
1481cabdff1aSopenharmony_ci    dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 1, tp1);
1482cabdff1aSopenharmony_ci    dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 2, tp2);
1483cabdff1aSopenharmony_ci    dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 3, tp3);
1484cabdff1aSopenharmony_ci    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
1485cabdff1aSopenharmony_ci    VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
1486cabdff1aSopenharmony_ci    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
1487cabdff1aSopenharmony_ci    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
1488cabdff1aSopenharmony_ci    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
1489cabdff1aSopenharmony_ci
1490cabdff1aSopenharmony_ci    res_vt0 += res_vt1;
1491cabdff1aSopenharmony_ci    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
1492cabdff1aSopenharmony_ci    res_vt0 = __msa_sat_u_h(res_vt0, 7);
1493cabdff1aSopenharmony_ci    res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
1494cabdff1aSopenharmony_ci    dst0 = __msa_aver_u_b((v16u8) res, dst0);
1495cabdff1aSopenharmony_ci
1496cabdff1aSopenharmony_ci    ST_H4(dst0, 0, 1, 2, 3, dst, stride);
1497cabdff1aSopenharmony_ci}
1498cabdff1aSopenharmony_ci
/**
 * Height dispatcher for the 2-pixel-wide "interpolate and average with dst"
 * kernels. Only heights 2 and 4 are handled; any other height leaves dst
 * untouched.
 */
static void avc_chroma_hv_and_aver_dst_2w_msa(uint8_t *src, uint8_t *dst,
                                              int32_t stride,
                                              uint32_t coef_hor0,
                                              uint32_t coef_hor1,
                                              uint32_t coef_ver0,
                                              uint32_t coef_ver1,
                                              int32_t height)
{
    switch (height) {
    case 2:
        avc_chroma_hv_and_aver_dst_2x2_msa(src, dst, stride, coef_hor0,
                                           coef_hor1, coef_ver0, coef_ver1);
        break;
    case 4:
        avc_chroma_hv_and_aver_dst_2x4_msa(src, dst, stride, coef_hor0,
                                           coef_hor1, coef_ver0, coef_ver1);
        break;
    default:
        /* Unsupported height: nothing to do. */
        break;
    }
}
1515cabdff1aSopenharmony_ci
1516cabdff1aSopenharmony_cistatic void avc_chroma_hv_and_aver_dst_4x2_msa(uint8_t *src, uint8_t *dst,
1517cabdff1aSopenharmony_ci                                               int32_t stride,
1518cabdff1aSopenharmony_ci                                               uint32_t coef_hor0,
1519cabdff1aSopenharmony_ci                                               uint32_t coef_hor1,
1520cabdff1aSopenharmony_ci                                               uint32_t coef_ver0,
1521cabdff1aSopenharmony_ci                                               uint32_t coef_ver1)
1522cabdff1aSopenharmony_ci{
1523cabdff1aSopenharmony_ci    uint32_t tp0, tp1;
1524cabdff1aSopenharmony_ci    v16u8 src0, src1, src2;
1525cabdff1aSopenharmony_ci    v16u8 dst0, dst_data = { 0 };
1526cabdff1aSopenharmony_ci    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
1527cabdff1aSopenharmony_ci    v16i8 mask;
1528cabdff1aSopenharmony_ci    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
1529cabdff1aSopenharmony_ci    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
1530cabdff1aSopenharmony_ci    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
1531cabdff1aSopenharmony_ci    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
1532cabdff1aSopenharmony_ci    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
1533cabdff1aSopenharmony_ci
1534cabdff1aSopenharmony_ci    mask = LD_SB(&chroma_mask_arr[0]);
1535cabdff1aSopenharmony_ci
1536cabdff1aSopenharmony_ci    LD_UB3(src, stride, src0, src1, src2);
1537cabdff1aSopenharmony_ci    LW2(dst, stride, tp0, tp1);
1538cabdff1aSopenharmony_ci    INSERT_W2_UB(tp0, tp1, dst_data);
1539cabdff1aSopenharmony_ci    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
1540cabdff1aSopenharmony_ci    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
1541cabdff1aSopenharmony_ci    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
1542cabdff1aSopenharmony_ci
1543cabdff1aSopenharmony_ci    res_vt0 += res_vt1;
1544cabdff1aSopenharmony_ci    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
1545cabdff1aSopenharmony_ci    res_vt0 = __msa_sat_u_h(res_vt0, 7);
1546cabdff1aSopenharmony_ci    dst0 = (v16u8) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
1547cabdff1aSopenharmony_ci    dst0 = __msa_aver_u_b(dst0, dst_data);
1548cabdff1aSopenharmony_ci
1549cabdff1aSopenharmony_ci    ST_W2(dst0, 0, 1, dst, stride);
1550cabdff1aSopenharmony_ci}
1551cabdff1aSopenharmony_ci
1552cabdff1aSopenharmony_cistatic void avc_chroma_hv_and_aver_dst_4x4_msa(uint8_t *src, uint8_t *dst,
1553cabdff1aSopenharmony_ci                                               int32_t stride,
1554cabdff1aSopenharmony_ci                                               uint32_t coef_hor0,
1555cabdff1aSopenharmony_ci                                               uint32_t coef_hor1,
1556cabdff1aSopenharmony_ci                                               uint32_t coef_ver0,
1557cabdff1aSopenharmony_ci                                               uint32_t coef_ver1)
1558cabdff1aSopenharmony_ci{
1559cabdff1aSopenharmony_ci    uint32_t tp0, tp1, tp2, tp3;
1560cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4;
1561cabdff1aSopenharmony_ci    v16u8 out, dst_data = { 0 };
1562cabdff1aSopenharmony_ci    v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
1563cabdff1aSopenharmony_ci    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
1564cabdff1aSopenharmony_ci    v16i8 mask;
1565cabdff1aSopenharmony_ci    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
1566cabdff1aSopenharmony_ci    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
1567cabdff1aSopenharmony_ci    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
1568cabdff1aSopenharmony_ci    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
1569cabdff1aSopenharmony_ci    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
1570cabdff1aSopenharmony_ci
1571cabdff1aSopenharmony_ci    mask = LD_SB(&chroma_mask_arr[0]);
1572cabdff1aSopenharmony_ci
1573cabdff1aSopenharmony_ci    LD_UB5(src, stride, src0, src1, src2, src3, src4);
1574cabdff1aSopenharmony_ci    LW4(dst, stride, tp0, tp1, tp2, tp3);
1575cabdff1aSopenharmony_ci    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst_data);
1576cabdff1aSopenharmony_ci    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
1577cabdff1aSopenharmony_ci    VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
1578cabdff1aSopenharmony_ci    DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
1579cabdff1aSopenharmony_ci                coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
1580cabdff1aSopenharmony_ci                res_hz3);
1581cabdff1aSopenharmony_ci    MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1,
1582cabdff1aSopenharmony_ci         res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
1583cabdff1aSopenharmony_ci    ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
1584cabdff1aSopenharmony_ci    SRARI_H2_UH(res_vt0, res_vt1, 6);
1585cabdff1aSopenharmony_ci    SAT_UH2_UH(res_vt0, res_vt1, 7);
1586cabdff1aSopenharmony_ci    out = (v16u8) __msa_pckev_b((v16i8) res_vt1, (v16i8) res_vt0);
1587cabdff1aSopenharmony_ci    out = __msa_aver_u_b(out, dst_data);
1588cabdff1aSopenharmony_ci    ST_W4(out, 0, 1, 2, 3, dst, stride);
1589cabdff1aSopenharmony_ci}
1590cabdff1aSopenharmony_ci
1591cabdff1aSopenharmony_cistatic void avc_chroma_hv_and_aver_dst_4x8_msa(uint8_t *src, uint8_t *dst,
1592cabdff1aSopenharmony_ci                                               int32_t stride,
1593cabdff1aSopenharmony_ci                                               uint32_t coef_hor0,
1594cabdff1aSopenharmony_ci                                               uint32_t coef_hor1,
1595cabdff1aSopenharmony_ci                                               uint32_t coef_ver0,
1596cabdff1aSopenharmony_ci                                               uint32_t coef_ver1)
1597cabdff1aSopenharmony_ci{
1598cabdff1aSopenharmony_ci    uint32_t tp0, tp1, tp2, tp3;
1599cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, res0, res1;
1600cabdff1aSopenharmony_ci    v16u8 dst0 = { 0 }, dst1 = { 0 };
1601cabdff1aSopenharmony_ci    v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4, res_hz5, res_hz6, res_hz7;
1602cabdff1aSopenharmony_ci    v8u16 res_vt0, res_vt1, res_vt2, res_vt3, res_vt4, res_vt5, res_vt6, res_vt7;
1603cabdff1aSopenharmony_ci    v16i8 mask;
1604cabdff1aSopenharmony_ci    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
1605cabdff1aSopenharmony_ci    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
1606cabdff1aSopenharmony_ci    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
1607cabdff1aSopenharmony_ci    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
1608cabdff1aSopenharmony_ci    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
1609cabdff1aSopenharmony_ci
1610cabdff1aSopenharmony_ci    mask = LD_SB(&chroma_mask_arr[0]);
1611cabdff1aSopenharmony_ci
1612cabdff1aSopenharmony_ci    LD_UB5(src, stride, src0, src1, src2, src3, src4);
1613cabdff1aSopenharmony_ci    src += (5 * stride);
1614cabdff1aSopenharmony_ci    LD_UB4(src, stride, src5, src6, src7, src8);
1615cabdff1aSopenharmony_ci    LW4(dst, stride, tp0, tp1, tp2, tp3);
1616cabdff1aSopenharmony_ci    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
1617cabdff1aSopenharmony_ci    LW4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
1618cabdff1aSopenharmony_ci    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
1619cabdff1aSopenharmony_ci    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
1620cabdff1aSopenharmony_ci    VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
1621cabdff1aSopenharmony_ci    VSHF_B2_UB(src4, src5, src5, src6, mask, mask, src4, src5);
1622cabdff1aSopenharmony_ci    VSHF_B2_UB(src6, src7, src7, src8, mask, mask, src6, src7);
1623cabdff1aSopenharmony_ci    DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
1624cabdff1aSopenharmony_ci                coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2, res_hz3);
1625cabdff1aSopenharmony_ci    DOTP_UB4_UH(src4, src5, src6, src7, coeff_hz_vec, coeff_hz_vec,
1626cabdff1aSopenharmony_ci                coeff_hz_vec, coeff_hz_vec, res_hz4, res_hz5, res_hz6, res_hz7);
1627cabdff1aSopenharmony_ci    MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1,
1628cabdff1aSopenharmony_ci         res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
1629cabdff1aSopenharmony_ci    MUL4(res_hz4, coeff_vt_vec1, res_hz5, coeff_vt_vec0, res_hz6, coeff_vt_vec1,
1630cabdff1aSopenharmony_ci         res_hz7, coeff_vt_vec0, res_vt4, res_vt5, res_vt6, res_vt7);
1631cabdff1aSopenharmony_ci    ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
1632cabdff1aSopenharmony_ci    ADD2(res_vt4, res_vt5, res_vt6, res_vt7, res_vt2, res_vt3);
1633cabdff1aSopenharmony_ci    SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
1634cabdff1aSopenharmony_ci    SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
1635cabdff1aSopenharmony_ci    PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, res0, res1);
1636cabdff1aSopenharmony_ci    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
1637cabdff1aSopenharmony_ci    ST_W8(res0, res1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
1638cabdff1aSopenharmony_ci}
1639cabdff1aSopenharmony_ci
/**
 * Height dispatcher for the 4-pixel-wide "interpolate and average with dst"
 * kernels. Heights 2, 4 and 8 are handled; any other height leaves dst
 * untouched.
 */
static void avc_chroma_hv_and_aver_dst_4w_msa(uint8_t *src, uint8_t *dst,
                                              int32_t stride,
                                              uint32_t coef_hor0,
                                              uint32_t coef_hor1,
                                              uint32_t coef_ver0,
                                              uint32_t coef_ver1,
                                              int32_t height)
{
    switch (height) {
    case 2:
        avc_chroma_hv_and_aver_dst_4x2_msa(src, dst, stride, coef_hor0,
                                           coef_hor1, coef_ver0, coef_ver1);
        break;
    case 4:
        avc_chroma_hv_and_aver_dst_4x4_msa(src, dst, stride, coef_hor0,
                                           coef_hor1, coef_ver0, coef_ver1);
        break;
    case 8:
        avc_chroma_hv_and_aver_dst_4x8_msa(src, dst, stride, coef_hor0,
                                           coef_hor1, coef_ver0, coef_ver1);
        break;
    default:
        /* Unsupported height: nothing to do. */
        break;
    }
}
1659cabdff1aSopenharmony_ci
1660cabdff1aSopenharmony_cistatic void avc_chroma_hv_and_aver_dst_8x4_msa(uint8_t *src, uint8_t *dst,
1661cabdff1aSopenharmony_ci                                               int32_t stride,
1662cabdff1aSopenharmony_ci                                               uint32_t coef_hor0,
1663cabdff1aSopenharmony_ci                                               uint32_t coef_hor1,
1664cabdff1aSopenharmony_ci                                               uint32_t coef_ver0,
1665cabdff1aSopenharmony_ci                                               uint32_t coef_ver1)
1666cabdff1aSopenharmony_ci{
1667cabdff1aSopenharmony_ci    uint64_t tp0, tp1, tp2, tp3;
1668cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, out0, out1;
1669cabdff1aSopenharmony_ci    v8u16 res_hz0, res_hz1, res_hz2;
1670cabdff1aSopenharmony_ci    v8u16 res_hz3, res_hz4;
1671cabdff1aSopenharmony_ci    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
1672cabdff1aSopenharmony_ci    v16u8 dst0 = { 0 }, dst1 = { 0 };
1673cabdff1aSopenharmony_ci    v16i8 mask;
1674cabdff1aSopenharmony_ci    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
1675cabdff1aSopenharmony_ci    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
1676cabdff1aSopenharmony_ci    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
1677cabdff1aSopenharmony_ci    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
1678cabdff1aSopenharmony_ci    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
1679cabdff1aSopenharmony_ci
1680cabdff1aSopenharmony_ci    mask = LD_SB(&chroma_mask_arr[32]);
1681cabdff1aSopenharmony_ci
1682cabdff1aSopenharmony_ci    src0 = LD_UB(src);
1683cabdff1aSopenharmony_ci    src += stride;
1684cabdff1aSopenharmony_ci    src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
1685cabdff1aSopenharmony_ci    res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec);
1686cabdff1aSopenharmony_ci    LD_UB4(src, stride, src1, src2, src3, src4);
1687cabdff1aSopenharmony_ci    src += (4 * stride);
1688cabdff1aSopenharmony_ci    LD4(dst, stride, tp0, tp1, tp2, tp3);
1689cabdff1aSopenharmony_ci    INSERT_D2_UB(tp0, tp1, dst0);
1690cabdff1aSopenharmony_ci    INSERT_D2_UB(tp2, tp3, dst1);
1691cabdff1aSopenharmony_ci    VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
1692cabdff1aSopenharmony_ci    VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
1693cabdff1aSopenharmony_ci    DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
1694cabdff1aSopenharmony_ci                coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3, res_hz4);
1695cabdff1aSopenharmony_ci    MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3, coeff_vt_vec0,
1696cabdff1aSopenharmony_ci         res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
1697cabdff1aSopenharmony_ci    res_vt0 += (res_hz0 * coeff_vt_vec1);
1698cabdff1aSopenharmony_ci    res_vt1 += (res_hz1 * coeff_vt_vec1);
1699cabdff1aSopenharmony_ci    res_vt2 += (res_hz2 * coeff_vt_vec1);
1700cabdff1aSopenharmony_ci    res_vt3 += (res_hz3 * coeff_vt_vec1);
1701cabdff1aSopenharmony_ci    SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
1702cabdff1aSopenharmony_ci    SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
1703cabdff1aSopenharmony_ci    PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
1704cabdff1aSopenharmony_ci    AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
1705cabdff1aSopenharmony_ci    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
1706cabdff1aSopenharmony_ci}
1707cabdff1aSopenharmony_ci
1708cabdff1aSopenharmony_cistatic void avc_chroma_hv_and_aver_dst_8x8_msa(uint8_t *src, uint8_t *dst,
1709cabdff1aSopenharmony_ci                                               int32_t stride,
1710cabdff1aSopenharmony_ci                                               uint32_t coef_hor0,
1711cabdff1aSopenharmony_ci                                               uint32_t coef_hor1,
1712cabdff1aSopenharmony_ci                                               uint32_t coef_ver0,
1713cabdff1aSopenharmony_ci                                               uint32_t coef_ver1)
1714cabdff1aSopenharmony_ci{
1715cabdff1aSopenharmony_ci    uint64_t tp0, tp1, tp2, tp3;
1716cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1717cabdff1aSopenharmony_ci    v16u8 out0, out1, out2, out3;
1718cabdff1aSopenharmony_ci    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
1719cabdff1aSopenharmony_ci    v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4;
1720cabdff1aSopenharmony_ci    v8u16 res_hz5, res_hz6, res_hz7, res_hz8;
1721cabdff1aSopenharmony_ci    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
1722cabdff1aSopenharmony_ci    v8u16 res_vt4, res_vt5, res_vt6, res_vt7;
1723cabdff1aSopenharmony_ci    v16i8 mask;
1724cabdff1aSopenharmony_ci    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
1725cabdff1aSopenharmony_ci    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
1726cabdff1aSopenharmony_ci    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
1727cabdff1aSopenharmony_ci    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
1728cabdff1aSopenharmony_ci    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
1729cabdff1aSopenharmony_ci
1730cabdff1aSopenharmony_ci    mask = LD_SB(&chroma_mask_arr[32]);
1731cabdff1aSopenharmony_ci
1732cabdff1aSopenharmony_ci    LD_UB5(src, stride, src0, src1, src2, src3, src4);
1733cabdff1aSopenharmony_ci    src += (5 * stride);
1734cabdff1aSopenharmony_ci    LD_UB4(src, stride, src5, src6, src7, src8);
1735cabdff1aSopenharmony_ci    src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
1736cabdff1aSopenharmony_ci    VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
1737cabdff1aSopenharmony_ci    VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
1738cabdff1aSopenharmony_ci    VSHF_B2_UB(src5, src5, src6, src6, mask, mask, src5, src6);
1739cabdff1aSopenharmony_ci    VSHF_B2_UB(src7, src7, src8, src8, mask, mask, src7, src8);
1740cabdff1aSopenharmony_ci    res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec);
1741cabdff1aSopenharmony_ci    DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
1742cabdff1aSopenharmony_ci                coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3,
1743cabdff1aSopenharmony_ci                res_hz4);
1744cabdff1aSopenharmony_ci    DOTP_UB4_UH(src5, src6, src7, src8, coeff_hz_vec, coeff_hz_vec,
1745cabdff1aSopenharmony_ci                coeff_hz_vec, coeff_hz_vec, res_hz5, res_hz6, res_hz7, res_hz8);
1746cabdff1aSopenharmony_ci    MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3,
1747cabdff1aSopenharmony_ci         coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
1748cabdff1aSopenharmony_ci         res_vt3);
1749cabdff1aSopenharmony_ci    MUL4(res_hz5, coeff_vt_vec0, res_hz6, coeff_vt_vec0, res_hz7,
1750cabdff1aSopenharmony_ci         coeff_vt_vec0, res_hz8, coeff_vt_vec0, res_vt4, res_vt5, res_vt6,
1751cabdff1aSopenharmony_ci         res_vt7);
1752cabdff1aSopenharmony_ci    LD4(dst, stride, tp0, tp1, tp2, tp3);
1753cabdff1aSopenharmony_ci    INSERT_D2_UB(tp0, tp1, dst0);
1754cabdff1aSopenharmony_ci    INSERT_D2_UB(tp2, tp3, dst1);
1755cabdff1aSopenharmony_ci    LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
1756cabdff1aSopenharmony_ci    INSERT_D2_UB(tp0, tp1, dst2);
1757cabdff1aSopenharmony_ci    INSERT_D2_UB(tp2, tp3, dst3);
1758cabdff1aSopenharmony_ci    res_vt0 += (res_hz0 * coeff_vt_vec1);
1759cabdff1aSopenharmony_ci    res_vt1 += (res_hz1 * coeff_vt_vec1);
1760cabdff1aSopenharmony_ci    res_vt2 += (res_hz2 * coeff_vt_vec1);
1761cabdff1aSopenharmony_ci    res_vt3 += (res_hz3 * coeff_vt_vec1);
1762cabdff1aSopenharmony_ci    res_vt4 += (res_hz4 * coeff_vt_vec1);
1763cabdff1aSopenharmony_ci    res_vt5 += (res_hz5 * coeff_vt_vec1);
1764cabdff1aSopenharmony_ci    res_vt6 += (res_hz6 * coeff_vt_vec1);
1765cabdff1aSopenharmony_ci    res_vt7 += (res_hz7 * coeff_vt_vec1);
1766cabdff1aSopenharmony_ci    SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
1767cabdff1aSopenharmony_ci    SRARI_H4_UH(res_vt4, res_vt5, res_vt6, res_vt7, 6);
1768cabdff1aSopenharmony_ci    SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
1769cabdff1aSopenharmony_ci    SAT_UH4_UH(res_vt4, res_vt5, res_vt6, res_vt7, 7);
1770cabdff1aSopenharmony_ci    PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
1771cabdff1aSopenharmony_ci    PCKEV_B2_UB(res_vt5, res_vt4, res_vt7, res_vt6, out2, out3);
1772cabdff1aSopenharmony_ci    AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
1773cabdff1aSopenharmony_ci    AVER_UB2_UB(out2, dst2, out3, dst3, out2, out3);
1774cabdff1aSopenharmony_ci    ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
1775cabdff1aSopenharmony_ci}
1776cabdff1aSopenharmony_ci
/**
 * Height dispatcher for the 8-pixel-wide "interpolate and average with dst"
 * kernels. Heights 4 and 8 are handled; any other height leaves dst
 * untouched.
 */
static void avc_chroma_hv_and_aver_dst_8w_msa(uint8_t *src, uint8_t *dst,
                                              int32_t stride,
                                              uint32_t coef_hor0,
                                              uint32_t coef_hor1,
                                              uint32_t coef_ver0,
                                              uint32_t coef_ver1,
                                              int32_t height)
{
    switch (height) {
    case 4:
        avc_chroma_hv_and_aver_dst_8x4_msa(src, dst, stride, coef_hor0,
                                           coef_hor1, coef_ver0, coef_ver1);
        break;
    case 8:
        avc_chroma_hv_and_aver_dst_8x8_msa(src, dst, stride, coef_hor0,
                                           coef_hor1, coef_ver0, coef_ver1);
        break;
    default:
        /* Unsupported height: nothing to do. */
        break;
    }
}
1793cabdff1aSopenharmony_ci
/**
 * Copy a block of 4-byte-wide rows from src to dst.
 *
 * Heights 8, 4 and 2 are supported; any other height copies nothing. In each
 * case all rows are read before the first row is written.
 *
 * @param src     first source row
 * @param dst     first destination row
 * @param stride  row pitch in bytes, used for both src and dst
 * @param height  number of rows to copy
 */
static void copy_width4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                            int32_t height)
{
    uint32_t w0, w1, w2, w3, w4, w5, w6, w7;

    switch (height) {
    case 8:
        LW4(src, stride, w0, w1, w2, w3);
        LW4(src + 4 * stride, stride, w4, w5, w6, w7);
        SW4(w0, w1, w2, w3, dst, stride);
        dst += 4 * stride;
        SW4(w4, w5, w6, w7, dst, stride);
        break;
    case 4:
        LW4(src, stride, w0, w1, w2, w3);
        SW4(w0, w1, w2, w3, dst, stride);
        break;
    case 2:
        LW2(src, stride, w0, w1);
        SW(w0, dst);
        dst += stride;
        SW(w1, dst);
        break;
    default:
        /* Unsupported height: nothing to do. */
        break;
    }
}
1816cabdff1aSopenharmony_ci
/**
 * Copy a block of 8-byte-wide rows from src to dst.
 *
 * Heights 8 and 4 are supported; any other height copies nothing. In each
 * case all rows are read before the first row is written.
 *
 * @param src     first source row
 * @param dst     first destination row
 * @param stride  row pitch in bytes, used for both src and dst
 * @param height  number of rows to copy
 */
static void copy_width8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                            int32_t height)
{
    uint64_t d0, d1, d2, d3, d4, d5, d6, d7;

    switch (height) {
    case 8:
        LD4(src, stride, d0, d1, d2, d3);
        LD4(src + 4 * stride, stride, d4, d5, d6, d7);
        SD4(d0, d1, d2, d3, dst, stride);
        dst += 4 * stride;
        SD4(d4, d5, d6, d7, dst, stride);
        break;
    case 4:
        LD4(src, stride, d0, d1, d2, d3);
        SD4(d0, d1, d2, d3, dst, stride);
        break;
    default:
        /* Unsupported height: nothing to do. */
        break;
    }
}
1834cabdff1aSopenharmony_ci
1835cabdff1aSopenharmony_cistatic void avg_width4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
1836cabdff1aSopenharmony_ci                           int32_t height)
1837cabdff1aSopenharmony_ci{
1838cabdff1aSopenharmony_ci    uint32_t tp0, tp1, tp2, tp3;
1839cabdff1aSopenharmony_ci    v16u8 src0 = { 0 }, src1 = { 0 }, dst0 = { 0 }, dst1 = { 0 };
1840cabdff1aSopenharmony_ci
1841cabdff1aSopenharmony_ci    if (8 == height) {
1842cabdff1aSopenharmony_ci        LW4(src, stride, tp0, tp1, tp2, tp3);
1843cabdff1aSopenharmony_ci        src += 4 * stride;
1844cabdff1aSopenharmony_ci        INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
1845cabdff1aSopenharmony_ci        LW4(src, stride, tp0, tp1, tp2, tp3);
1846cabdff1aSopenharmony_ci        INSERT_W4_UB(tp0, tp1, tp2, tp3, src1);
1847cabdff1aSopenharmony_ci        LW4(dst, stride, tp0, tp1, tp2, tp3);
1848cabdff1aSopenharmony_ci        INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
1849cabdff1aSopenharmony_ci        LW4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
1850cabdff1aSopenharmony_ci        INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
1851cabdff1aSopenharmony_ci        AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
1852cabdff1aSopenharmony_ci        ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
1853cabdff1aSopenharmony_ci    } else if (4 == height) {
1854cabdff1aSopenharmony_ci        LW4(src, stride, tp0, tp1, tp2, tp3);
1855cabdff1aSopenharmony_ci        INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
1856cabdff1aSopenharmony_ci        LW4(dst, stride, tp0, tp1, tp2, tp3);
1857cabdff1aSopenharmony_ci        INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
1858cabdff1aSopenharmony_ci        dst0 = __msa_aver_u_b(src0, dst0);
1859cabdff1aSopenharmony_ci        ST_W4(dst0, 0, 1, 2, 3, dst, stride);
1860cabdff1aSopenharmony_ci    } else if (2 == height) {
1861cabdff1aSopenharmony_ci        LW2(src, stride, tp0, tp1);
1862cabdff1aSopenharmony_ci        INSERT_W2_UB(tp0, tp1, src0);
1863cabdff1aSopenharmony_ci        LW2(dst, stride, tp0, tp1);
1864cabdff1aSopenharmony_ci        INSERT_W2_UB(tp0, tp1, dst0);
1865cabdff1aSopenharmony_ci        dst0 = __msa_aver_u_b(src0, dst0);
1866cabdff1aSopenharmony_ci        ST_W2(dst0, 0, 1, dst, stride);
1867cabdff1aSopenharmony_ci    }
1868cabdff1aSopenharmony_ci}
1869cabdff1aSopenharmony_ci
1870cabdff1aSopenharmony_cistatic void avg_width8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
1871cabdff1aSopenharmony_ci                           int32_t height)
1872cabdff1aSopenharmony_ci{
1873cabdff1aSopenharmony_ci    uint64_t tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
1874cabdff1aSopenharmony_ci    v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
1875cabdff1aSopenharmony_ci    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
1876cabdff1aSopenharmony_ci
1877cabdff1aSopenharmony_ci    if (8 == height) {
1878cabdff1aSopenharmony_ci        LD4(src, stride, tp0, tp1, tp2, tp3);
1879cabdff1aSopenharmony_ci        src += 4 * stride;
1880cabdff1aSopenharmony_ci        LD4(src, stride, tp4, tp5, tp6, tp7);
1881cabdff1aSopenharmony_ci        INSERT_D2_UB(tp0, tp1, src0);
1882cabdff1aSopenharmony_ci        INSERT_D2_UB(tp2, tp3, src1);
1883cabdff1aSopenharmony_ci        INSERT_D2_UB(tp4, tp5, src2);
1884cabdff1aSopenharmony_ci        INSERT_D2_UB(tp6, tp7, src3);
1885cabdff1aSopenharmony_ci        LD4(dst, stride, tp0, tp1, tp2, tp3);
1886cabdff1aSopenharmony_ci        LD4(dst + 4 * stride, stride, tp4, tp5, tp6, tp7);
1887cabdff1aSopenharmony_ci        INSERT_D2_UB(tp0, tp1, dst0);
1888cabdff1aSopenharmony_ci        INSERT_D2_UB(tp2, tp3, dst1);
1889cabdff1aSopenharmony_ci        INSERT_D2_UB(tp4, tp5, dst2);
1890cabdff1aSopenharmony_ci        INSERT_D2_UB(tp6, tp7, dst3);
1891cabdff1aSopenharmony_ci        AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
1892cabdff1aSopenharmony_ci                    dst2, dst3);
1893cabdff1aSopenharmony_ci        ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
1894cabdff1aSopenharmony_ci    } else if (4 == height) {
1895cabdff1aSopenharmony_ci        LD4(src, stride, tp0, tp1, tp2, tp3);
1896cabdff1aSopenharmony_ci        INSERT_D2_UB(tp0, tp1, src0);
1897cabdff1aSopenharmony_ci        INSERT_D2_UB(tp2, tp3, src1);
1898cabdff1aSopenharmony_ci        LD4(dst, stride, tp0, tp1, tp2, tp3);
1899cabdff1aSopenharmony_ci        INSERT_D2_UB(tp0, tp1, dst0);
1900cabdff1aSopenharmony_ci        INSERT_D2_UB(tp2, tp3, dst1);
1901cabdff1aSopenharmony_ci        AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
1902cabdff1aSopenharmony_ci        ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
1903cabdff1aSopenharmony_ci    }
1904cabdff1aSopenharmony_ci}
1905cabdff1aSopenharmony_ci
/*
 * 8-pixel-wide "put" entry point.
 *
 * Selects one MSA helper according to which of x / y are non-zero and lets
 * it produce the block in dst.
 *
 * dst, src : destination / source pixel pointers (the helpers take them in
 *            the opposite order: src first, dst second)
 * stride   : forwarded unchanged to the selected helper
 * height   : forwarded unchanged to the selected helper
 * x, y     : offsets asserted to lie in 0..7 (presumably the eighth-sample
 *            chroma offsets -- confirm against the caller); a non-zero
 *            offset is handed over together with its complement to 8
 */
void ff_put_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src,
                                ptrdiff_t stride, int height, int x, int y)
{
    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    if (!x && !y) {
        /* Both offsets are zero: handled by the width-8 copy helper. */
        copy_width8_msa(src, dst, stride, height);
        return;
    }

    if (!y) {
        /* Only x is non-zero. */
        avc_chroma_hz_8w_msa(src, dst, stride, x, 8 - x, height);
        return;
    }

    if (!x) {
        /* Only y is non-zero. */
        avc_chroma_vt_8w_msa(src, dst, stride, y, 8 - y, height);
        return;
    }

    /* Both offsets are non-zero. */
    avc_chroma_hv_8w_msa(src, dst, stride, x, 8 - x, y, 8 - y, height);
}
1921cabdff1aSopenharmony_ci
/*
 * 4-pixel-wide "put" entry point.
 *
 * Selects one MSA helper according to which of x / y are non-zero and lets
 * it produce the block in dst.
 *
 * dst, src : destination / source pixel pointers (the helpers take them in
 *            the opposite order: src first, dst second)
 * stride   : forwarded unchanged to the selected helper
 * height   : forwarded unchanged to the selected helper
 * x, y     : offsets asserted to lie in 0..7; a non-zero offset is handed
 *            over together with its complement to 8
 */
void ff_put_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src,
                                ptrdiff_t stride, int height, int x, int y)
{
    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    if (!x && !y) {
        /* Both offsets are zero: handled by the width-4 copy helper. */
        copy_width4_msa(src, dst, stride, height);
        return;
    }

    if (!y) {
        /* Only x is non-zero. */
        avc_chroma_hz_4w_msa(src, dst, stride, x, 8 - x, height);
        return;
    }

    if (!x) {
        /* Only y is non-zero. */
        avc_chroma_vt_4w_msa(src, dst, stride, y, 8 - y, height);
        return;
    }

    /* Both offsets are non-zero. */
    avc_chroma_hv_4w_msa(src, dst, stride, x, 8 - x, y, 8 - y, height);
}
1937cabdff1aSopenharmony_ci
/*
 * 2-pixel-wide "put" entry point.
 *
 * Selects one MSA helper according to which of x / y are non-zero; when both
 * are zero the two pixels of every row are copied from src to dst directly.
 *
 * dst, src : destination / source pixel pointers (the helpers take them in
 *            the opposite order: src first, dst second)
 * stride   : byte distance between consecutive rows, applied to both
 *            pointers in the plain-copy path and forwarded to the helpers
 * height   : number of rows handled by the plain-copy path; forwarded to
 *            the helpers otherwise
 * x, y     : offsets asserted to lie in 0..7; a non-zero offset is handed
 *            over together with its complement to 8
 */
void ff_put_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src,
                                ptrdiff_t stride, int height, int x, int y)
{
    int32_t cnt;

    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    if (x && y) {
        avc_chroma_hv_2w_msa(src, dst, stride, x, (8 - x), y, (8 - y), height);
    } else if (x) {
        avc_chroma_hz_2w_msa(src, dst, stride, x, (8 - x), height);
    } else if (y) {
        avc_chroma_vt_2w_msa(src, dst, stride, y, (8 - y), height);
    } else {
        for (cnt = height; cnt--;) {
            /*
             * Copy the row one byte at a time instead of through a
             * uint16_t cast: nothing guarantees that src or dst is 2-byte
             * aligned, and accessing uint8_t storage through a uint16_t
             * lvalue breaks C's effective-type (strict aliasing) rules.
             * Both bytes are read before either is stored so the row is
             * still transferred as one unit.
             */
            const uint8_t px0 = src[0];
            const uint8_t px1 = src[1];

            dst[0] = px0;
            dst[1] = px1;

            src += stride;
            dst += stride;
        }
    }
}
1960cabdff1aSopenharmony_ci
/*
 * 8-pixel-wide "avg" entry point.
 *
 * Selects one of the *_and_aver_dst MSA helpers according to which of
 * x / y are non-zero; with both zero, avg_width8_msa is used instead.
 *
 * dst, src : destination / source pixel pointers (the helpers take them in
 *            the opposite order: src first, dst second)
 * stride   : forwarded unchanged to the selected helper
 * height   : forwarded unchanged to the selected helper
 * x, y     : offsets asserted to lie in 0..7; a non-zero offset is handed
 *            over together with its complement to 8
 */
void ff_avg_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src,
                                ptrdiff_t stride, int height, int x, int y)
{
    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    if (!x && !y) {
        /* Both offsets are zero: handled by the width-8 averaging helper. */
        avg_width8_msa(src, dst, stride, height);
        return;
    }

    if (!y) {
        /* Only x is non-zero. */
        avc_chroma_hz_and_aver_dst_8w_msa(src, dst, stride, x, 8 - x, height);
        return;
    }

    if (!x) {
        /* Only y is non-zero. */
        avc_chroma_vt_and_aver_dst_8w_msa(src, dst, stride, y, 8 - y, height);
        return;
    }

    /* Both offsets are non-zero. */
    avc_chroma_hv_and_aver_dst_8w_msa(src, dst, stride, x, 8 - x, y, 8 - y,
                                      height);
}
1978cabdff1aSopenharmony_ci
/*
 * 4-pixel-wide "avg" entry point.
 *
 * Selects one of the *_and_aver_dst MSA helpers according to which of
 * x / y are non-zero; with both zero, avg_width4_msa is used instead.
 *
 * dst, src : destination / source pixel pointers (the helpers take them in
 *            the opposite order: src first, dst second)
 * stride   : forwarded unchanged to the selected helper
 * height   : forwarded unchanged to the selected helper
 * x, y     : offsets asserted to lie in 0..7; a non-zero offset is handed
 *            over together with its complement to 8
 */
void ff_avg_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src,
                                ptrdiff_t stride, int height, int x, int y)
{
    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    if (!x && !y) {
        /* Both offsets are zero: handled by the width-4 averaging helper. */
        avg_width4_msa(src, dst, stride, height);
        return;
    }

    if (!y) {
        /* Only x is non-zero. */
        avc_chroma_hz_and_aver_dst_4w_msa(src, dst, stride, x, 8 - x, height);
        return;
    }

    if (!x) {
        /* Only y is non-zero. */
        avc_chroma_vt_and_aver_dst_4w_msa(src, dst, stride, y, 8 - y, height);
        return;
    }

    /* Both offsets are non-zero. */
    avc_chroma_hv_and_aver_dst_4w_msa(src, dst, stride, x, 8 - x, y, 8 - y,
                                      height);
}
1995cabdff1aSopenharmony_ci
/*
 * 2-pixel-wide "avg" entry point.
 *
 * Selects one of the *_and_aver_dst MSA helpers according to which of
 * x / y are non-zero. With both zero, each of the two pixels in every row
 * of dst is replaced by the rounded-up mean of itself and the matching src
 * pixel: (dst + src + 1) >> 1.
 *
 * dst, src : destination / source pixel pointers (the helpers take them in
 *            the opposite order: src first, dst second)
 * stride   : byte distance between consecutive rows, applied to both
 *            pointers in the scalar path and forwarded to the helpers
 * height   : number of rows handled by the scalar path; forwarded to the
 *            helpers otherwise
 * x, y     : offsets asserted to lie in 0..7; a non-zero offset is handed
 *            over together with its complement to 8
 */
void ff_avg_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src,
                                ptrdiff_t stride, int height, int x, int y)
{
    int32_t rows_left;
    int col;

    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    if (x && y) {
        avc_chroma_hv_and_aver_dst_2w_msa(src, dst, stride, x, 8 - x, y,
                                          8 - y, height);
        return;
    }

    if (x) {
        avc_chroma_hz_and_aver_dst_2w_msa(src, dst, stride, x, 8 - x, height);
        return;
    }

    if (y) {
        avc_chroma_vt_and_aver_dst_2w_msa(src, dst, stride, y, 8 - y, height);
        return;
    }

    /* Neither offset is set: scalar rounding average, two pixels per row. */
    rows_left = height;
    while (rows_left--) {
        for (col = 0; col < 2; col++) {
            dst[col] = (dst[col] + src[col] + 1) >> 1;
        }

        src += stride;
        dst += stride;
    }
}
2020