1cabdff1aSopenharmony_ci/*
2cabdff1aSopenharmony_ci * Copyright (c) 2015 -2017 Parag Salasakar (Parag.Salasakar@imgtec.com)
3cabdff1aSopenharmony_ci *
4cabdff1aSopenharmony_ci * This file is part of FFmpeg.
5cabdff1aSopenharmony_ci *
6cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or
7cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public
8cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either
9cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version.
10cabdff1aSopenharmony_ci *
11cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful,
12cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of
13cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14cabdff1aSopenharmony_ci * Lesser General Public License for more details.
15cabdff1aSopenharmony_ci *
16cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public
17cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software
18cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19cabdff1aSopenharmony_ci */
20cabdff1aSopenharmony_ci
21cabdff1aSopenharmony_ci#include "libavutil/mips/generic_macros_msa.h"
22cabdff1aSopenharmony_ci#include "h264dsp_mips.h"
23cabdff1aSopenharmony_ci
24cabdff1aSopenharmony_cistatic const uint8_t luma_mask_arr[16 * 6] __attribute__((aligned(0x40))) = {
25cabdff1aSopenharmony_ci    /* 8 width cases */
26cabdff1aSopenharmony_ci    0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12,
27cabdff1aSopenharmony_ci    1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11,
28cabdff1aSopenharmony_ci    2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
29cabdff1aSopenharmony_ci
30cabdff1aSopenharmony_ci    /* 4 width cases */
31cabdff1aSopenharmony_ci    0, 5, 1, 6, 2, 7, 3, 8, 16, 21, 17, 22, 18, 23, 19, 24,
32cabdff1aSopenharmony_ci    1, 4, 2, 5, 3, 6, 4, 7, 17, 20, 18, 21, 19, 22, 20, 23,
33cabdff1aSopenharmony_ci    2, 3, 3, 4, 4, 5, 5, 6, 18, 19, 19, 20, 20, 21, 21, 22,
34cabdff1aSopenharmony_ci};
35cabdff1aSopenharmony_ci
/*
 * Apply the (1, -5, 20, 20, -5, 1) kernel across six byte vectors.
 *
 * vec0..vec5 are the six filter inputs in tap order.  Inputs that share a
 * weight are interleaved so one pairwise operation handles both of them;
 * the 16-bit sums for the two interleave halves are written to out1 and
 * out2.  The ILVRL/HADD/DPADD helpers come from generic_macros_msa.h.
 */
#define AVC_CALC_DPADD_B_6PIX_2COEFF_SH(vec0, vec1, vec2, vec3, vec4, vec5,  \
                                        out1, out2)                          \
{                                                                            \
    v16i8 coef_m5_m  = __msa_ldi_b(-5);                                      \
    v16i8 coef_p20_m = __msa_ldi_b(20);                                      \
    v16i8 pair_a_m, pair_b_m;                                                \
                                                                             \
    /* taps 0 and 5: summed without a multiplier */                          \
    ILVRL_B2_SB(vec5, vec0, pair_a_m, pair_b_m);                             \
    HADD_SB2_SH(pair_a_m, pair_b_m, out1, out2);                             \
                                                                             \
    /* taps 1 and 4: accumulated with weight -5 */                           \
    ILVRL_B2_SB(vec4, vec1, pair_a_m, pair_b_m);                             \
    DPADD_SB2_SH(pair_a_m, pair_b_m, coef_m5_m, coef_m5_m, out1, out2);      \
                                                                             \
    /* taps 2 and 3: accumulated with weight 20 */                           \
    ILVRL_B2_SB(vec3, vec2, pair_a_m, pair_b_m);                             \
    DPADD_SB2_SH(pair_a_m, pair_b_m, coef_p20_m, coef_p20_m, out1, out2);    \
}
50cabdff1aSopenharmony_ci
/*
 * Horizontal 6-tap (1, -5, 20, 20, -5, 1) filter, evaluated as an expression.
 *
 * in0 and in1 are the byte vectors the shuffles read from; mask0, mask1 and
 * mask2 gather the tap pairs with weights 1, -5 and 20 respectively.  The
 * value of the macro is the v8i16 vector of unrounded, unsaturated sums.
 */
#define AVC_HORZ_FILTER_SH(in0, in1, mask0, mask1, mask2)           \
( {                                                                 \
    v16i8 outer_m, inner_m, center_m;                               \
    v8i16 acc_m;                                                    \
                                                                    \
    outer_m  = __msa_vshf_b((v16i8) mask0, in1, in0);               \
    inner_m  = __msa_vshf_b((v16i8) mask1, in1, in0);               \
    center_m = __msa_vshf_b((v16i8) mask2, in1, in0);               \
                                                                    \
    /* weight-1 pairs need only an add of neighbouring bytes */     \
    acc_m = __msa_hadd_s_h(outer_m, outer_m);                       \
    acc_m = __msa_dpadd_s_h(acc_m, __msa_ldi_b(-5), inner_m);       \
    acc_m = __msa_dpadd_s_h(acc_m, __msa_ldi_b(20), center_m);      \
                                                                    \
    acc_m;                                                          \
} )
69cabdff1aSopenharmony_ci
/*
 * Sum of three signed byte dot products, yielding a v8i16 vector:
 *   dot(in0, coeff0) + dot(in1, coeff1) + dot(in2, coeff2)
 * No rounding or saturation is applied here.
 */
#define AVC_DOT_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2)            \
(                                                                        \
    __msa_dpadd_s_h(                                                     \
        __msa_dpadd_s_h(__msa_dotp_s_h((v16i8) in0, (v16i8) coeff0),     \
                        (v16i8) in1, (v16i8) coeff1),                    \
        (v16i8) in2, (v16i8) coeff2)                                     \
)
80cabdff1aSopenharmony_ci
/*
 * Sum of three signed halfword dot products, yielding a v4i32 vector.
 * The sum is shifted right by 10 with rounding and then saturated with
 * __msa_sat_s_w(..., 7).
 */
#define AVC_DOT_SW3_SW(in0, in1, in2, coeff0, coeff1, coeff2)        \
( {                                                                  \
    v4i32 sum_m = __msa_dotp_s_w((v8i16) in0, (v8i16) coeff0);       \
                                                                     \
    sum_m = __msa_dpadd_s_w(sum_m, (v8i16) in1, (v8i16) coeff1);     \
    sum_m = __msa_dpadd_s_w(sum_m, (v8i16) in2, (v8i16) coeff2);     \
                                                                     \
    __msa_sat_s_w(__msa_srari_w(sum_m, 10), 7);                      \
} )
92cabdff1aSopenharmony_ci
93cabdff1aSopenharmony_cistatic void avc_luma_hv_qrt_4x4_msa(const uint8_t *src_x, const uint8_t *src_y,
94cabdff1aSopenharmony_ci                                    uint8_t *dst, int32_t stride)
95cabdff1aSopenharmony_ci{
96cabdff1aSopenharmony_ci    const int16_t filt_const0 = 0xfb01;
97cabdff1aSopenharmony_ci    const int16_t filt_const1 = 0x1414;
98cabdff1aSopenharmony_ci    const int16_t filt_const2 = 0x1fb;
99cabdff1aSopenharmony_ci    v16u8 out;
100cabdff1aSopenharmony_ci    v16i8 src_hz0, src_hz1, src_hz2, src_hz3, src_vt7, src_vt8;
101cabdff1aSopenharmony_ci    v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
102cabdff1aSopenharmony_ci    v16i8 src_vt10_r, src_vt32_r, src_vt54_r, src_vt76_r;
103cabdff1aSopenharmony_ci    v16i8 mask0, mask1, mask2, filt0, filt1, filt2;
104cabdff1aSopenharmony_ci    v8i16 hz_out0, hz_out1, vt_out0, vt_out1, out0, out1;
105cabdff1aSopenharmony_ci
106cabdff1aSopenharmony_ci    filt0 = (v16i8) __msa_fill_h(filt_const0);
107cabdff1aSopenharmony_ci    filt1 = (v16i8) __msa_fill_h(filt_const1);
108cabdff1aSopenharmony_ci    filt2 = (v16i8) __msa_fill_h(filt_const2);
109cabdff1aSopenharmony_ci
110cabdff1aSopenharmony_ci    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
111cabdff1aSopenharmony_ci
112cabdff1aSopenharmony_ci    LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
113cabdff1aSopenharmony_ci    src_y += (5 * stride);
114cabdff1aSopenharmony_ci
115cabdff1aSopenharmony_ci    src_vt0 = (v16i8) __msa_insve_w((v4i32) src_vt0, 1, (v4i32) src_vt1);
116cabdff1aSopenharmony_ci    src_vt1 = (v16i8) __msa_insve_w((v4i32) src_vt1, 1, (v4i32) src_vt2);
117cabdff1aSopenharmony_ci    src_vt2 = (v16i8) __msa_insve_w((v4i32) src_vt2, 1, (v4i32) src_vt3);
118cabdff1aSopenharmony_ci    src_vt3 = (v16i8) __msa_insve_w((v4i32) src_vt3, 1, (v4i32) src_vt4);
119cabdff1aSopenharmony_ci
120cabdff1aSopenharmony_ci    XORI_B4_128_SB(src_vt0, src_vt1, src_vt2, src_vt3);
121cabdff1aSopenharmony_ci
122cabdff1aSopenharmony_ci    LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
123cabdff1aSopenharmony_ci    XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
124cabdff1aSopenharmony_ci    hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz1, mask0, mask1, mask2);
125cabdff1aSopenharmony_ci    hz_out1 = AVC_HORZ_FILTER_SH(src_hz2, src_hz3, mask0, mask1, mask2);
126cabdff1aSopenharmony_ci
127cabdff1aSopenharmony_ci    SRARI_H2_SH(hz_out0, hz_out1, 5);
128cabdff1aSopenharmony_ci    SAT_SH2_SH(hz_out0, hz_out1, 7);
129cabdff1aSopenharmony_ci
130cabdff1aSopenharmony_ci    LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8);
131cabdff1aSopenharmony_ci
132cabdff1aSopenharmony_ci    src_vt4 = (v16i8) __msa_insve_w((v4i32) src_vt4, 1, (v4i32) src_vt5);
133cabdff1aSopenharmony_ci    src_vt5 = (v16i8) __msa_insve_w((v4i32) src_vt5, 1, (v4i32) src_vt6);
134cabdff1aSopenharmony_ci    src_vt6 = (v16i8) __msa_insve_w((v4i32) src_vt6, 1, (v4i32) src_vt7);
135cabdff1aSopenharmony_ci    src_vt7 = (v16i8) __msa_insve_w((v4i32) src_vt7, 1, (v4i32) src_vt8);
136cabdff1aSopenharmony_ci
137cabdff1aSopenharmony_ci    XORI_B4_128_SB(src_vt4, src_vt5, src_vt6, src_vt7);
138cabdff1aSopenharmony_ci    ILVR_B2_SB(src_vt1, src_vt0, src_vt3, src_vt2, src_vt10_r, src_vt32_r);
139cabdff1aSopenharmony_ci    ILVR_B2_SB(src_vt5, src_vt4, src_vt7, src_vt6, src_vt54_r, src_vt76_r);
140cabdff1aSopenharmony_ci    vt_out0 = AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0, filt1,
141cabdff1aSopenharmony_ci                             filt2);
142cabdff1aSopenharmony_ci    vt_out1 = AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0, filt1,
143cabdff1aSopenharmony_ci                             filt2);
144cabdff1aSopenharmony_ci    SRARI_H2_SH(vt_out0, vt_out1, 5);
145cabdff1aSopenharmony_ci    SAT_SH2_SH(vt_out0, vt_out1, 7);
146cabdff1aSopenharmony_ci
147cabdff1aSopenharmony_ci    out0 = __msa_srari_h((hz_out0 + vt_out0), 1);
148cabdff1aSopenharmony_ci    out1 = __msa_srari_h((hz_out1 + vt_out1), 1);
149cabdff1aSopenharmony_ci
150cabdff1aSopenharmony_ci    SAT_SH2_SH(out0, out1, 7);
151cabdff1aSopenharmony_ci    out = PCKEV_XORI128_UB(out0, out1);
152cabdff1aSopenharmony_ci    ST_W4(out, 0, 1, 2, 3, dst, stride);
153cabdff1aSopenharmony_ci}
154cabdff1aSopenharmony_ci
155cabdff1aSopenharmony_cistatic void avc_luma_hv_qrt_8x8_msa(const uint8_t *src_x, const uint8_t *src_y,
156cabdff1aSopenharmony_ci                                    uint8_t *dst, int32_t stride)
157cabdff1aSopenharmony_ci{
158cabdff1aSopenharmony_ci    const int16_t filt_const0 = 0xfb01;
159cabdff1aSopenharmony_ci    const int16_t filt_const1 = 0x1414;
160cabdff1aSopenharmony_ci    const int16_t filt_const2 = 0x1fb;
161cabdff1aSopenharmony_ci    v16u8 out0, out1;
162cabdff1aSopenharmony_ci    v16i8 src_hz0, src_hz1, src_hz2, src_hz3, mask0, mask1, mask2;
163cabdff1aSopenharmony_ci    v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
164cabdff1aSopenharmony_ci    v16i8 src_vt7, src_vt8, src_vt9, src_vt10, src_vt11, src_vt12;
165cabdff1aSopenharmony_ci    v16i8 src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r, src_vt54_r;
166cabdff1aSopenharmony_ci    v16i8 src_vt65_r, src_vt76_r, src_vt87_r, src_vt98_r, src_vt109_r;
167cabdff1aSopenharmony_ci    v16i8 src_vt1110_r, src_vt1211_r, filt0, filt1, filt2;
168cabdff1aSopenharmony_ci    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2;
169cabdff1aSopenharmony_ci    v8i16 vt_out3, tmp0, tmp1, tmp2, tmp3;
170cabdff1aSopenharmony_ci
171cabdff1aSopenharmony_ci    filt0 = (v16i8) __msa_fill_h(filt_const0);
172cabdff1aSopenharmony_ci    filt1 = (v16i8) __msa_fill_h(filt_const1);
173cabdff1aSopenharmony_ci    filt2 = (v16i8) __msa_fill_h(filt_const2);
174cabdff1aSopenharmony_ci
175cabdff1aSopenharmony_ci    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
176cabdff1aSopenharmony_ci    LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
177cabdff1aSopenharmony_ci    src_y += (5 * stride);
178cabdff1aSopenharmony_ci
179cabdff1aSopenharmony_ci    XORI_B5_128_SB(src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
180cabdff1aSopenharmony_ci
181cabdff1aSopenharmony_ci    LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
182cabdff1aSopenharmony_ci    XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
183cabdff1aSopenharmony_ci    src_x += (4 * stride);
184cabdff1aSopenharmony_ci
185cabdff1aSopenharmony_ci    hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
186cabdff1aSopenharmony_ci    hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
187cabdff1aSopenharmony_ci    hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
188cabdff1aSopenharmony_ci    hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);
189cabdff1aSopenharmony_ci
190cabdff1aSopenharmony_ci    SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
191cabdff1aSopenharmony_ci    SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
192cabdff1aSopenharmony_ci
193cabdff1aSopenharmony_ci    LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8);
194cabdff1aSopenharmony_ci    src_y += (4 * stride);
195cabdff1aSopenharmony_ci    XORI_B4_128_SB(src_vt5, src_vt6, src_vt7, src_vt8);
196cabdff1aSopenharmony_ci
197cabdff1aSopenharmony_ci    ILVR_B4_SB(src_vt1, src_vt0, src_vt2, src_vt1, src_vt3, src_vt2, src_vt4,
198cabdff1aSopenharmony_ci               src_vt3, src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r);
199cabdff1aSopenharmony_ci    ILVR_B4_SB(src_vt5, src_vt4, src_vt6, src_vt5, src_vt7, src_vt6, src_vt8,
200cabdff1aSopenharmony_ci               src_vt7, src_vt54_r, src_vt65_r, src_vt76_r, src_vt87_r);
201cabdff1aSopenharmony_ci    vt_out0 = AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0, filt1,
202cabdff1aSopenharmony_ci                             filt2);
203cabdff1aSopenharmony_ci    vt_out1 = AVC_DOT_SH3_SH(src_vt21_r, src_vt43_r, src_vt65_r, filt0, filt1,
204cabdff1aSopenharmony_ci                             filt2);
205cabdff1aSopenharmony_ci    vt_out2 = AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0, filt1,
206cabdff1aSopenharmony_ci                             filt2);
207cabdff1aSopenharmony_ci    vt_out3 = AVC_DOT_SH3_SH(src_vt43_r, src_vt65_r, src_vt87_r, filt0, filt1,
208cabdff1aSopenharmony_ci                             filt2);
209cabdff1aSopenharmony_ci    SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
210cabdff1aSopenharmony_ci    SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);
211cabdff1aSopenharmony_ci
212cabdff1aSopenharmony_ci    tmp0 = __msa_srari_h((hz_out0 + vt_out0), 1);
213cabdff1aSopenharmony_ci    tmp1 = __msa_srari_h((hz_out1 + vt_out1), 1);
214cabdff1aSopenharmony_ci    tmp2 = __msa_srari_h((hz_out2 + vt_out2), 1);
215cabdff1aSopenharmony_ci    tmp3 = __msa_srari_h((hz_out3 + vt_out3), 1);
216cabdff1aSopenharmony_ci
217cabdff1aSopenharmony_ci    LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
218cabdff1aSopenharmony_ci    XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
219cabdff1aSopenharmony_ci
220cabdff1aSopenharmony_ci    SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
221cabdff1aSopenharmony_ci    out0 = PCKEV_XORI128_UB(tmp0, tmp1);
222cabdff1aSopenharmony_ci    out1 = PCKEV_XORI128_UB(tmp2, tmp3);
223cabdff1aSopenharmony_ci    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
224cabdff1aSopenharmony_ci    dst += (4 * stride);
225cabdff1aSopenharmony_ci
226cabdff1aSopenharmony_ci    LD_SB4(src_y, stride, src_vt9, src_vt10, src_vt11, src_vt12);
227cabdff1aSopenharmony_ci    XORI_B4_128_SB(src_vt9, src_vt10, src_vt11, src_vt12);
228cabdff1aSopenharmony_ci
229cabdff1aSopenharmony_ci    hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
230cabdff1aSopenharmony_ci    hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
231cabdff1aSopenharmony_ci    hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
232cabdff1aSopenharmony_ci    hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);
233cabdff1aSopenharmony_ci
234cabdff1aSopenharmony_ci    SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
235cabdff1aSopenharmony_ci    SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
236cabdff1aSopenharmony_ci
237cabdff1aSopenharmony_ci    ILVR_B4_SB(src_vt9, src_vt8, src_vt10, src_vt9, src_vt11, src_vt10,
238cabdff1aSopenharmony_ci               src_vt12, src_vt11, src_vt98_r, src_vt109_r, src_vt1110_r,
239cabdff1aSopenharmony_ci               src_vt1211_r);
240cabdff1aSopenharmony_ci    vt_out0 = AVC_DOT_SH3_SH(src_vt54_r, src_vt76_r, src_vt98_r, filt0, filt1,
241cabdff1aSopenharmony_ci                             filt2);
242cabdff1aSopenharmony_ci    vt_out1 = AVC_DOT_SH3_SH(src_vt65_r, src_vt87_r, src_vt109_r, filt0, filt1,
243cabdff1aSopenharmony_ci                             filt2);
244cabdff1aSopenharmony_ci    vt_out2 = AVC_DOT_SH3_SH(src_vt76_r, src_vt98_r, src_vt1110_r, filt0, filt1,
245cabdff1aSopenharmony_ci                             filt2);
246cabdff1aSopenharmony_ci    vt_out3 = AVC_DOT_SH3_SH(src_vt87_r, src_vt109_r, src_vt1211_r, filt0,
247cabdff1aSopenharmony_ci                             filt1, filt2);
248cabdff1aSopenharmony_ci    SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
249cabdff1aSopenharmony_ci    SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);
250cabdff1aSopenharmony_ci
251cabdff1aSopenharmony_ci    tmp0 = __msa_srari_h((hz_out0 + vt_out0), 1);
252cabdff1aSopenharmony_ci    tmp1 = __msa_srari_h((hz_out1 + vt_out1), 1);
253cabdff1aSopenharmony_ci    tmp2 = __msa_srari_h((hz_out2 + vt_out2), 1);
254cabdff1aSopenharmony_ci    tmp3 = __msa_srari_h((hz_out3 + vt_out3), 1);
255cabdff1aSopenharmony_ci
256cabdff1aSopenharmony_ci    SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
257cabdff1aSopenharmony_ci    out0 = PCKEV_XORI128_UB(tmp0, tmp1);
258cabdff1aSopenharmony_ci    out1 = PCKEV_XORI128_UB(tmp2, tmp3);
259cabdff1aSopenharmony_ci    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
260cabdff1aSopenharmony_ci}
261cabdff1aSopenharmony_ci
262cabdff1aSopenharmony_cistatic void avc_luma_hv_qrt_16x16_msa(const uint8_t *src_x,
263cabdff1aSopenharmony_ci                                      const uint8_t *src_y, uint8_t *dst,
264cabdff1aSopenharmony_ci                                      int32_t stride)
265cabdff1aSopenharmony_ci{
266cabdff1aSopenharmony_ci    const int16_t filt_const0 = 0xfb01;
267cabdff1aSopenharmony_ci    const int16_t filt_const1 = 0x1414;
268cabdff1aSopenharmony_ci    const int16_t filt_const2 = 0x1fb;
269cabdff1aSopenharmony_ci    const uint8_t *src_x_tmp = src_x;
270cabdff1aSopenharmony_ci    const uint8_t *src_y_tmp = src_y;
271cabdff1aSopenharmony_ci    uint8_t *dst_tmp = dst;
272cabdff1aSopenharmony_ci    uint32_t multiple8_cnt, loop_cnt;
273cabdff1aSopenharmony_ci    v16u8 tmp0, tmp1;
274cabdff1aSopenharmony_ci    v16i8 src_hz0, src_hz1, src_hz2, src_hz3, mask0, mask1, mask2;
275cabdff1aSopenharmony_ci    v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
276cabdff1aSopenharmony_ci    v16i8 src_vt7, src_vt8;
277cabdff1aSopenharmony_ci    v16i8 src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r, src_vt54_r;
278cabdff1aSopenharmony_ci    v16i8 src_vt65_r, src_vt76_r, src_vt87_r, filt0, filt1, filt2;
279cabdff1aSopenharmony_ci    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2;
280cabdff1aSopenharmony_ci    v8i16 vt_out3, out0, out1, out2, out3;
281cabdff1aSopenharmony_ci
282cabdff1aSopenharmony_ci    filt0 = (v16i8) __msa_fill_h(filt_const0);
283cabdff1aSopenharmony_ci    filt1 = (v16i8) __msa_fill_h(filt_const1);
284cabdff1aSopenharmony_ci    filt2 = (v16i8) __msa_fill_h(filt_const2);
285cabdff1aSopenharmony_ci
286cabdff1aSopenharmony_ci    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
287cabdff1aSopenharmony_ci
288cabdff1aSopenharmony_ci    for (multiple8_cnt = 2; multiple8_cnt--;) {
289cabdff1aSopenharmony_ci        src_x = src_x_tmp;
290cabdff1aSopenharmony_ci        src_y = src_y_tmp;
291cabdff1aSopenharmony_ci        dst = dst_tmp;
292cabdff1aSopenharmony_ci
293cabdff1aSopenharmony_ci        LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
294cabdff1aSopenharmony_ci        src_y += (5 * stride);
295cabdff1aSopenharmony_ci
296cabdff1aSopenharmony_ci        XORI_B5_128_SB(src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
297cabdff1aSopenharmony_ci
298cabdff1aSopenharmony_ci        for (loop_cnt = 4; loop_cnt--;) {
299cabdff1aSopenharmony_ci            LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
300cabdff1aSopenharmony_ci            XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
301cabdff1aSopenharmony_ci            src_x += (4 * stride);
302cabdff1aSopenharmony_ci
303cabdff1aSopenharmony_ci            hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
304cabdff1aSopenharmony_ci            hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
305cabdff1aSopenharmony_ci            hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
306cabdff1aSopenharmony_ci            hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);
307cabdff1aSopenharmony_ci            SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
308cabdff1aSopenharmony_ci            SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
309cabdff1aSopenharmony_ci
310cabdff1aSopenharmony_ci            LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8);
311cabdff1aSopenharmony_ci            src_y += (4 * stride);
312cabdff1aSopenharmony_ci
313cabdff1aSopenharmony_ci            XORI_B4_128_SB(src_vt5, src_vt6, src_vt7, src_vt8);
314cabdff1aSopenharmony_ci            ILVR_B4_SB(src_vt1, src_vt0, src_vt2, src_vt1, src_vt3, src_vt2,
315cabdff1aSopenharmony_ci                       src_vt4, src_vt3, src_vt10_r, src_vt21_r, src_vt32_r,
316cabdff1aSopenharmony_ci                       src_vt43_r);
317cabdff1aSopenharmony_ci            ILVR_B4_SB(src_vt5, src_vt4, src_vt6, src_vt5, src_vt7, src_vt6,
318cabdff1aSopenharmony_ci                       src_vt8, src_vt7, src_vt54_r, src_vt65_r, src_vt76_r,
319cabdff1aSopenharmony_ci                       src_vt87_r);
320cabdff1aSopenharmony_ci            vt_out0 = AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0,
321cabdff1aSopenharmony_ci                                     filt1, filt2);
322cabdff1aSopenharmony_ci            vt_out1 = AVC_DOT_SH3_SH(src_vt21_r, src_vt43_r, src_vt65_r, filt0,
323cabdff1aSopenharmony_ci                                     filt1, filt2);
324cabdff1aSopenharmony_ci            vt_out2 = AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0,
325cabdff1aSopenharmony_ci                                     filt1, filt2);
326cabdff1aSopenharmony_ci            vt_out3 = AVC_DOT_SH3_SH(src_vt43_r, src_vt65_r, src_vt87_r, filt0,
327cabdff1aSopenharmony_ci                                     filt1, filt2);
328cabdff1aSopenharmony_ci            SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
329cabdff1aSopenharmony_ci            SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);
330cabdff1aSopenharmony_ci
331cabdff1aSopenharmony_ci            out0 = __msa_srari_h((hz_out0 + vt_out0), 1);
332cabdff1aSopenharmony_ci            out1 = __msa_srari_h((hz_out1 + vt_out1), 1);
333cabdff1aSopenharmony_ci            out2 = __msa_srari_h((hz_out2 + vt_out2), 1);
334cabdff1aSopenharmony_ci            out3 = __msa_srari_h((hz_out3 + vt_out3), 1);
335cabdff1aSopenharmony_ci
336cabdff1aSopenharmony_ci            SAT_SH4_SH(out0, out1, out2, out3, 7);
337cabdff1aSopenharmony_ci            tmp0 = PCKEV_XORI128_UB(out0, out1);
338cabdff1aSopenharmony_ci            tmp1 = PCKEV_XORI128_UB(out2, out3);
339cabdff1aSopenharmony_ci            ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, stride);
340cabdff1aSopenharmony_ci            dst += (4 * stride);
341cabdff1aSopenharmony_ci
342cabdff1aSopenharmony_ci            src_vt0 = src_vt4;
343cabdff1aSopenharmony_ci            src_vt1 = src_vt5;
344cabdff1aSopenharmony_ci            src_vt2 = src_vt6;
345cabdff1aSopenharmony_ci            src_vt3 = src_vt7;
346cabdff1aSopenharmony_ci            src_vt4 = src_vt8;
347cabdff1aSopenharmony_ci        }
348cabdff1aSopenharmony_ci
349cabdff1aSopenharmony_ci        src_x_tmp += 8;
350cabdff1aSopenharmony_ci        src_y_tmp += 8;
351cabdff1aSopenharmony_ci        dst_tmp += 8;
352cabdff1aSopenharmony_ci    }
353cabdff1aSopenharmony_ci}
354cabdff1aSopenharmony_ci
355cabdff1aSopenharmony_cistatic void avc_luma_hv_qrt_and_aver_dst_4x4_msa(const uint8_t *src_x,
356cabdff1aSopenharmony_ci                                                 const uint8_t *src_y,
357cabdff1aSopenharmony_ci                                                 uint8_t *dst,
358cabdff1aSopenharmony_ci                                                 int32_t stride)
359cabdff1aSopenharmony_ci{
360cabdff1aSopenharmony_ci    uint32_t tp0, tp1, tp2, tp3;
361cabdff1aSopenharmony_ci    const int16_t filt_const0 = 0xfb01;
362cabdff1aSopenharmony_ci    const int16_t filt_const1 = 0x1414;
363cabdff1aSopenharmony_ci    const int16_t filt_const2 = 0x1fb;
364cabdff1aSopenharmony_ci    v16u8 res, dst0 = { 0 };
365cabdff1aSopenharmony_ci    v16i8 src_hz0, src_hz1, src_hz2, src_hz3, src_vt7, src_vt8;
366cabdff1aSopenharmony_ci    v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
367cabdff1aSopenharmony_ci    v16i8 src_vt10_r, src_vt32_r, src_vt54_r, src_vt76_r;
368cabdff1aSopenharmony_ci    v16i8 mask0, mask1, mask2, filt0, filt1, filt2;
369cabdff1aSopenharmony_ci    v8i16 hz_out0, hz_out1, vt_out0, vt_out1, res0, res1;
370cabdff1aSopenharmony_ci
371cabdff1aSopenharmony_ci    filt0 = (v16i8) __msa_fill_h(filt_const0);
372cabdff1aSopenharmony_ci    filt1 = (v16i8) __msa_fill_h(filt_const1);
373cabdff1aSopenharmony_ci    filt2 = (v16i8) __msa_fill_h(filt_const2);
374cabdff1aSopenharmony_ci
375cabdff1aSopenharmony_ci    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
376cabdff1aSopenharmony_ci
377cabdff1aSopenharmony_ci    LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
378cabdff1aSopenharmony_ci    src_y += (5 * stride);
379cabdff1aSopenharmony_ci
380cabdff1aSopenharmony_ci    src_vt0 = (v16i8) __msa_insve_w((v4i32) src_vt0, 1, (v4i32) src_vt1);
381cabdff1aSopenharmony_ci    src_vt1 = (v16i8) __msa_insve_w((v4i32) src_vt1, 1, (v4i32) src_vt2);
382cabdff1aSopenharmony_ci    src_vt2 = (v16i8) __msa_insve_w((v4i32) src_vt2, 1, (v4i32) src_vt3);
383cabdff1aSopenharmony_ci    src_vt3 = (v16i8) __msa_insve_w((v4i32) src_vt3, 1, (v4i32) src_vt4);
384cabdff1aSopenharmony_ci
385cabdff1aSopenharmony_ci    XORI_B4_128_SB(src_vt0, src_vt1, src_vt2, src_vt3);
386cabdff1aSopenharmony_ci
387cabdff1aSopenharmony_ci    LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
388cabdff1aSopenharmony_ci    XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
389cabdff1aSopenharmony_ci    hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz1, mask0, mask1, mask2);
390cabdff1aSopenharmony_ci    hz_out1 = AVC_HORZ_FILTER_SH(src_hz2, src_hz3, mask0, mask1, mask2);
391cabdff1aSopenharmony_ci
392cabdff1aSopenharmony_ci    SRARI_H2_SH(hz_out0, hz_out1, 5);
393cabdff1aSopenharmony_ci    SAT_SH2_SH(hz_out0, hz_out1, 7);
394cabdff1aSopenharmony_ci
395cabdff1aSopenharmony_ci    LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8);
396cabdff1aSopenharmony_ci
397cabdff1aSopenharmony_ci    src_vt4 = (v16i8) __msa_insve_w((v4i32) src_vt4, 1, (v4i32) src_vt5);
398cabdff1aSopenharmony_ci    src_vt5 = (v16i8) __msa_insve_w((v4i32) src_vt5, 1, (v4i32) src_vt6);
399cabdff1aSopenharmony_ci    src_vt6 = (v16i8) __msa_insve_w((v4i32) src_vt6, 1, (v4i32) src_vt7);
400cabdff1aSopenharmony_ci    src_vt7 = (v16i8) __msa_insve_w((v4i32) src_vt7, 1, (v4i32) src_vt8);
401cabdff1aSopenharmony_ci
402cabdff1aSopenharmony_ci    XORI_B4_128_SB(src_vt4, src_vt5, src_vt6, src_vt7);
403cabdff1aSopenharmony_ci    ILVR_B2_SB(src_vt1, src_vt0, src_vt3, src_vt2, src_vt10_r, src_vt32_r);
404cabdff1aSopenharmony_ci    ILVR_B2_SB(src_vt5, src_vt4, src_vt7, src_vt6, src_vt54_r, src_vt76_r);
405cabdff1aSopenharmony_ci    vt_out0 = AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0, filt1,
406cabdff1aSopenharmony_ci                             filt2);
407cabdff1aSopenharmony_ci    vt_out1 = AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0, filt1,
408cabdff1aSopenharmony_ci                             filt2);
409cabdff1aSopenharmony_ci    SRARI_H2_SH(vt_out0, vt_out1, 5);
410cabdff1aSopenharmony_ci    SAT_SH2_SH(vt_out0, vt_out1, 7);
411cabdff1aSopenharmony_ci    LW4(dst, stride, tp0, tp1, tp2, tp3);
412cabdff1aSopenharmony_ci    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
413cabdff1aSopenharmony_ci
414cabdff1aSopenharmony_ci    res1 = __msa_srari_h((hz_out1 + vt_out1), 1);
415cabdff1aSopenharmony_ci    res0 = __msa_srari_h((hz_out0 + vt_out0), 1);
416cabdff1aSopenharmony_ci
417cabdff1aSopenharmony_ci    SAT_SH2_SH(res0, res1, 7);
418cabdff1aSopenharmony_ci    res = PCKEV_XORI128_UB(res0, res1);
419cabdff1aSopenharmony_ci    dst0 = __msa_aver_u_b(res, dst0);
420cabdff1aSopenharmony_ci
421cabdff1aSopenharmony_ci    ST_W4(dst0, 0, 1, 2, 3, dst, stride);
422cabdff1aSopenharmony_ci}
423cabdff1aSopenharmony_ci
/*
 * Quarter-sample luma interpolation for one 8x8 block, blended into dst.
 *
 * A horizontal filter pass over rows read from src_x and a vertical filter
 * pass over rows read from src_y are each round-shifted by 5 and saturated.
 * The two results are combined with a rounding average, packed to bytes and
 * finally averaged with the bytes already stored in dst.
 *
 * src_x  : first row for the horizontal pass; 8 rows are read (2 loads of 4).
 * src_y  : first row for the vertical pass; 13 rows are read (5 + 4 + 4).
 * dst    : 8 rows of 8 bytes are read, blended and written back.
 * stride : byte distance between rows; the same value is used for src_x,
 *          src_y and dst.
 *
 * The work is done in two halves of four output rows each.
 */
424cabdff1aSopenharmony_cistatic void avc_luma_hv_qrt_and_aver_dst_8x8_msa(const uint8_t *src_x,
425cabdff1aSopenharmony_ci                                                 const uint8_t *src_y,
426cabdff1aSopenharmony_ci                                                 uint8_t *dst,
427cabdff1aSopenharmony_ci                                                 int32_t stride)
428cabdff1aSopenharmony_ci{
    /*
     * 16-bit patterns replicated by __msa_fill_h() below. Read as two bytes,
     * low byte first, they are (1, -5), (20, 20) and (-5, 1).
     * NOTE(review): 0xfb01 is larger than INT16_MAX, so the initializer
     * relies on the usual wrap-around conversion; only the bit pattern is
     * used.
     */
429cabdff1aSopenharmony_ci    const int16_t filt_const0 = 0xfb01;
430cabdff1aSopenharmony_ci    const int16_t filt_const1 = 0x1414;
431cabdff1aSopenharmony_ci    const int16_t filt_const2 = 0x1fb;
432cabdff1aSopenharmony_ci    uint64_t tp0, tp1, tp2, tp3;
433cabdff1aSopenharmony_ci    v16u8 out0, out1, dst0 = { 0 }, dst1 = { 0 };
434cabdff1aSopenharmony_ci    v16i8 src_hz0, src_hz1, src_hz2, src_hz3, src_vt0, src_vt1, src_vt2;
435cabdff1aSopenharmony_ci    v16i8 src_vt3, src_vt4, src_vt5, src_vt6, src_vt7, src_vt8;
436cabdff1aSopenharmony_ci    v16i8 src_vt9, src_vt10, src_vt11, src_vt12, mask0, mask1, mask2;
437cabdff1aSopenharmony_ci    v16i8 src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r, src_vt54_r;
438cabdff1aSopenharmony_ci    v16i8 src_vt65_r, src_vt76_r, src_vt87_r, src_vt98_r, src_vt109_r;
439cabdff1aSopenharmony_ci    v16i8 src_vt1110_r, src_vt1211_r, filt0, filt1, filt2;
440cabdff1aSopenharmony_ci    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2;
441cabdff1aSopenharmony_ci    v8i16 vt_out3, tmp0, tmp1, tmp2, tmp3;
442cabdff1aSopenharmony_ci
    /* Coefficient vectors for the vertical dot-product helper. */
443cabdff1aSopenharmony_ci    filt0 = (v16i8) __msa_fill_h(filt_const0);
444cabdff1aSopenharmony_ci    filt1 = (v16i8) __msa_fill_h(filt_const1);
445cabdff1aSopenharmony_ci    filt2 = (v16i8) __msa_fill_h(filt_const2);
446cabdff1aSopenharmony_ci
    /* The first three 16-byte rows of luma_mask_arr: the "8 width" masks. */
447cabdff1aSopenharmony_ci    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
    /* First five rows of the vertical window. */
448cabdff1aSopenharmony_ci    LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
449cabdff1aSopenharmony_ci    src_y += (5 * stride);
450cabdff1aSopenharmony_ci
    /*
     * NOTE(review): the XORI_*_128 macros presumably flip the top bit of each
     * byte so unsigned pixels can be handled as signed bytes; they are
     * defined in generic_macros_msa.h -- confirm there.
     */
451cabdff1aSopenharmony_ci    XORI_B5_128_SB(src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
452cabdff1aSopenharmony_ci
    /* Horizontal pass, rows 0..3. */
453cabdff1aSopenharmony_ci    LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
454cabdff1aSopenharmony_ci    XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
455cabdff1aSopenharmony_ci    src_x += (4 * stride);
456cabdff1aSopenharmony_ci
457cabdff1aSopenharmony_ci    hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
458cabdff1aSopenharmony_ci    hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
459cabdff1aSopenharmony_ci    hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
460cabdff1aSopenharmony_ci    hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);
461cabdff1aSopenharmony_ci
    /*
     * Rounding shift right by 5, then saturate.
     * NOTE(review): SAT_SH4_SH(..., 7) presumably clamps to the signed 8-bit
     * range -- confirm against the macro definition.
     */
462cabdff1aSopenharmony_ci    SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
463cabdff1aSopenharmony_ci    SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
464cabdff1aSopenharmony_ci
    /* Vertical rows 5..8; together with rows 0..4 they feed output rows 0..3. */
465cabdff1aSopenharmony_ci    LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8);
466cabdff1aSopenharmony_ci    src_y += (4 * stride);
467cabdff1aSopenharmony_ci    XORI_B4_128_SB(src_vt5, src_vt6, src_vt7, src_vt8);
468cabdff1aSopenharmony_ci
    /*
     * Interleave neighbouring rows; src_vtNM_r is built from rows N and M.
     * Each vertical output uses three such pairs, i.e. six consecutive rows.
     */
469cabdff1aSopenharmony_ci    ILVR_B4_SB(src_vt1, src_vt0, src_vt2, src_vt1, src_vt3, src_vt2, src_vt4,
470cabdff1aSopenharmony_ci               src_vt3, src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r);
471cabdff1aSopenharmony_ci    ILVR_B4_SB(src_vt5, src_vt4, src_vt6, src_vt5, src_vt7, src_vt6, src_vt8,
472cabdff1aSopenharmony_ci               src_vt7, src_vt54_r, src_vt65_r, src_vt76_r, src_vt87_r);
473cabdff1aSopenharmony_ci    vt_out0 = AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0, filt1,
474cabdff1aSopenharmony_ci                             filt2);
475cabdff1aSopenharmony_ci    vt_out1 = AVC_DOT_SH3_SH(src_vt21_r, src_vt43_r, src_vt65_r, filt0, filt1,
476cabdff1aSopenharmony_ci                             filt2);
477cabdff1aSopenharmony_ci    vt_out2 = AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0, filt1,
478cabdff1aSopenharmony_ci                             filt2);
479cabdff1aSopenharmony_ci    vt_out3 = AVC_DOT_SH3_SH(src_vt43_r, src_vt65_r, src_vt87_r, filt0, filt1,
480cabdff1aSopenharmony_ci                             filt2);
481cabdff1aSopenharmony_ci    SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
482cabdff1aSopenharmony_ci    SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);
483cabdff1aSopenharmony_ci
    /* Rounding average of the horizontal and vertical results. */
484cabdff1aSopenharmony_ci    tmp0 = __msa_srari_h((hz_out0 + vt_out0), 1);
485cabdff1aSopenharmony_ci    tmp1 = __msa_srari_h((hz_out1 + vt_out1), 1);
486cabdff1aSopenharmony_ci    tmp2 = __msa_srari_h((hz_out2 + vt_out2), 1);
487cabdff1aSopenharmony_ci    tmp3 = __msa_srari_h((hz_out3 + vt_out3), 1);
488cabdff1aSopenharmony_ci
    /* Horizontal rows 4..7 are loaded early, before the first store. */
489cabdff1aSopenharmony_ci    LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
490cabdff1aSopenharmony_ci    XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
491cabdff1aSopenharmony_ci
    /* Current dst rows 0..3, two 64-bit rows per vector. */
492cabdff1aSopenharmony_ci    LD4(dst, stride, tp0, tp1, tp2, tp3);
493cabdff1aSopenharmony_ci    INSERT_D2_UB(tp0, tp1, dst0);
494cabdff1aSopenharmony_ci    INSERT_D2_UB(tp2, tp3, dst1);
495cabdff1aSopenharmony_ci
    /* Pack to bytes, average with dst and store four 8-byte rows. */
496cabdff1aSopenharmony_ci    SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
497cabdff1aSopenharmony_ci    out0 = PCKEV_XORI128_UB(tmp0, tmp1);
498cabdff1aSopenharmony_ci    out1 = PCKEV_XORI128_UB(tmp2, tmp3);
499cabdff1aSopenharmony_ci    AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
500cabdff1aSopenharmony_ci    ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
501cabdff1aSopenharmony_ci    dst += (4 * stride);
502cabdff1aSopenharmony_ci
    /* Second half: vertical rows 9..12 complete the window for rows 4..7. */
503cabdff1aSopenharmony_ci    LD_SB4(src_y, stride, src_vt9, src_vt10, src_vt11, src_vt12);
504cabdff1aSopenharmony_ci    XORI_B4_128_SB(src_vt9, src_vt10, src_vt11, src_vt12);
505cabdff1aSopenharmony_ci
506cabdff1aSopenharmony_ci    hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
507cabdff1aSopenharmony_ci    hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
508cabdff1aSopenharmony_ci    hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
509cabdff1aSopenharmony_ci    hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);
510cabdff1aSopenharmony_ci
511cabdff1aSopenharmony_ci    SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
512cabdff1aSopenharmony_ci    SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
513cabdff1aSopenharmony_ci
    /* Row pairs 5/4 .. 8/7 from the first half are reused here. */
514cabdff1aSopenharmony_ci    ILVR_B4_SB(src_vt9, src_vt8, src_vt10, src_vt9, src_vt11, src_vt10,
515cabdff1aSopenharmony_ci               src_vt12, src_vt11, src_vt98_r, src_vt109_r, src_vt1110_r,
516cabdff1aSopenharmony_ci               src_vt1211_r);
517cabdff1aSopenharmony_ci    vt_out0 = AVC_DOT_SH3_SH(src_vt54_r, src_vt76_r, src_vt98_r, filt0, filt1,
518cabdff1aSopenharmony_ci                             filt2);
519cabdff1aSopenharmony_ci    vt_out1 = AVC_DOT_SH3_SH(src_vt65_r, src_vt87_r, src_vt109_r, filt0, filt1,
520cabdff1aSopenharmony_ci                             filt2);
521cabdff1aSopenharmony_ci    vt_out2 = AVC_DOT_SH3_SH(src_vt76_r, src_vt98_r, src_vt1110_r, filt0, filt1,
522cabdff1aSopenharmony_ci                             filt2);
523cabdff1aSopenharmony_ci    vt_out3 = AVC_DOT_SH3_SH(src_vt87_r, src_vt109_r, src_vt1211_r, filt0,
524cabdff1aSopenharmony_ci                             filt1, filt2);
525cabdff1aSopenharmony_ci    SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
526cabdff1aSopenharmony_ci    SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);
527cabdff1aSopenharmony_ci
528cabdff1aSopenharmony_ci    tmp0 = __msa_srari_h((hz_out0 + vt_out0), 1);
529cabdff1aSopenharmony_ci    tmp1 = __msa_srari_h((hz_out1 + vt_out1), 1);
530cabdff1aSopenharmony_ci    tmp2 = __msa_srari_h((hz_out2 + vt_out2), 1);
531cabdff1aSopenharmony_ci    tmp3 = __msa_srari_h((hz_out3 + vt_out3), 1);
532cabdff1aSopenharmony_ci
    /* Blend with dst rows 4..7 and store. */
533cabdff1aSopenharmony_ci    LD4(dst, stride, tp0, tp1, tp2, tp3);
534cabdff1aSopenharmony_ci    INSERT_D2_UB(tp0, tp1, dst0);
535cabdff1aSopenharmony_ci    INSERT_D2_UB(tp2, tp3, dst1);
536cabdff1aSopenharmony_ci
537cabdff1aSopenharmony_ci    SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
538cabdff1aSopenharmony_ci    out0 = PCKEV_XORI128_UB(tmp0, tmp1);
539cabdff1aSopenharmony_ci    out1 = PCKEV_XORI128_UB(tmp2, tmp3);
540cabdff1aSopenharmony_ci    AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
541cabdff1aSopenharmony_ci    ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
542cabdff1aSopenharmony_ci}
543cabdff1aSopenharmony_ci
/*
 * Quarter-sample luma interpolation for one 16x16 block, blended into dst.
 *
 * The block is processed as two strips of 8 columns (outer loop); each strip
 * is produced four rows at a time (inner loop, four iterations). Per group
 * of four rows, a horizontal pass over src_x rows and a vertical pass over
 * src_y rows are round-shifted by 5 and saturated, combined with a rounding
 * average, packed to bytes and averaged with the existing dst bytes.
 *
 * src_x  : first row for the horizontal pass; 16 rows are read per strip.
 * src_y  : first row for the vertical pass; 21 rows are read per strip
 *          (5 up front, then 4 per inner iteration).
 * dst    : 16 rows are read, blended and written back, 8 bytes per strip.
 * stride : byte distance between rows for src_x, src_y and dst.
 */
544cabdff1aSopenharmony_cistatic void avc_luma_hv_qrt_and_aver_dst_16x16_msa(const uint8_t *src_x,
545cabdff1aSopenharmony_ci                                                   const uint8_t *src_y,
546cabdff1aSopenharmony_ci                                                   uint8_t *dst,
547cabdff1aSopenharmony_ci                                                   int32_t stride)
548cabdff1aSopenharmony_ci{
    /*
     * 16-bit patterns replicated by __msa_fill_h() below. Read as two bytes,
     * low byte first, they are (1, -5), (20, 20) and (-5, 1).
     * NOTE(review): 0xfb01 is larger than INT16_MAX, so the initializer
     * relies on the usual wrap-around conversion; only the bit pattern is
     * used.
     */
549cabdff1aSopenharmony_ci    const int16_t filt_const0 = 0xfb01;
550cabdff1aSopenharmony_ci    const int16_t filt_const1 = 0x1414;
551cabdff1aSopenharmony_ci    const int16_t filt_const2 = 0x1fb;
    /* Strip base pointers; advanced by 8 bytes after each outer iteration. */
552cabdff1aSopenharmony_ci    const uint8_t *src_x_tmp = src_x;
553cabdff1aSopenharmony_ci    const uint8_t *src_y_tmp = src_y;
554cabdff1aSopenharmony_ci    uint8_t *dst_tmp = dst;
555cabdff1aSopenharmony_ci    uint32_t multiple8_cnt, loop_cnt;
556cabdff1aSopenharmony_ci    uint64_t tp0, tp1, tp2, tp3;
557cabdff1aSopenharmony_ci    v16u8 tmp0, tmp1, dst0 = { 0 }, dst1 = { 0 };
558cabdff1aSopenharmony_ci    v16i8 src_hz0, src_hz1, src_hz2, src_hz3, mask0, mask1, mask2;
559cabdff1aSopenharmony_ci    v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
560cabdff1aSopenharmony_ci    v16i8 src_vt7, src_vt8;
561cabdff1aSopenharmony_ci    v16i8 src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r, src_vt54_r;
562cabdff1aSopenharmony_ci    v16i8 src_vt65_r, src_vt76_r, src_vt87_r, filt0, filt1, filt2;
563cabdff1aSopenharmony_ci    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2;
564cabdff1aSopenharmony_ci    v8i16 vt_out3, out0, out1, out2, out3;
565cabdff1aSopenharmony_ci
    /* Coefficient vectors for the vertical dot-product helper. */
566cabdff1aSopenharmony_ci    filt0 = (v16i8) __msa_fill_h(filt_const0);
567cabdff1aSopenharmony_ci    filt1 = (v16i8) __msa_fill_h(filt_const1);
568cabdff1aSopenharmony_ci    filt2 = (v16i8) __msa_fill_h(filt_const2);
569cabdff1aSopenharmony_ci
    /* The first three 16-byte rows of luma_mask_arr: the "8 width" masks. */
570cabdff1aSopenharmony_ci    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
571cabdff1aSopenharmony_ci
    /* Two passes: left 8 columns, then right 8 columns. */
572cabdff1aSopenharmony_ci    for (multiple8_cnt = 2; multiple8_cnt--;) {
573cabdff1aSopenharmony_ci        src_x = src_x_tmp;
574cabdff1aSopenharmony_ci        src_y = src_y_tmp;
575cabdff1aSopenharmony_ci        dst = dst_tmp;
576cabdff1aSopenharmony_ci
        /* Prime the vertical window with its first five rows. */
577cabdff1aSopenharmony_ci        LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
578cabdff1aSopenharmony_ci        src_y += (5 * stride);
579cabdff1aSopenharmony_ci
        /*
         * NOTE(review): the XORI_*_128 macros presumably flip the top bit of
         * each byte so unsigned pixels can be handled as signed bytes --
         * confirm in generic_macros_msa.h.
         */
580cabdff1aSopenharmony_ci        XORI_B5_128_SB(src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
581cabdff1aSopenharmony_ci
        /* Four iterations of four output rows each. */
582cabdff1aSopenharmony_ci        for (loop_cnt = 4; loop_cnt--;) {
            /* Horizontal pass on the next four src_x rows. */
583cabdff1aSopenharmony_ci            LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
584cabdff1aSopenharmony_ci            XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
585cabdff1aSopenharmony_ci            src_x += (4 * stride);
586cabdff1aSopenharmony_ci
587cabdff1aSopenharmony_ci            hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
588cabdff1aSopenharmony_ci            hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
589cabdff1aSopenharmony_ci            hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
590cabdff1aSopenharmony_ci            hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);
591cabdff1aSopenharmony_ci            SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
592cabdff1aSopenharmony_ci            SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
593cabdff1aSopenharmony_ci
            /* Extend the vertical window by four rows (nine rows in total). */
594cabdff1aSopenharmony_ci            LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8);
595cabdff1aSopenharmony_ci            src_y += (4 * stride);
596cabdff1aSopenharmony_ci
597cabdff1aSopenharmony_ci            XORI_B4_128_SB(src_vt5, src_vt6, src_vt7, src_vt8);
            /* Interleave neighbouring rows; src_vtNM_r uses rows N and M. */
598cabdff1aSopenharmony_ci            ILVR_B4_SB(src_vt1, src_vt0, src_vt2, src_vt1, src_vt3, src_vt2,
599cabdff1aSopenharmony_ci                       src_vt4, src_vt3, src_vt10_r, src_vt21_r, src_vt32_r,
600cabdff1aSopenharmony_ci                       src_vt43_r);
601cabdff1aSopenharmony_ci            ILVR_B4_SB(src_vt5, src_vt4, src_vt6, src_vt5, src_vt7, src_vt6,
602cabdff1aSopenharmony_ci                       src_vt8, src_vt7, src_vt54_r, src_vt65_r, src_vt76_r,
603cabdff1aSopenharmony_ci                       src_vt87_r);
604cabdff1aSopenharmony_ci            vt_out0 = AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0,
605cabdff1aSopenharmony_ci                                     filt1, filt2);
606cabdff1aSopenharmony_ci            vt_out1 = AVC_DOT_SH3_SH(src_vt21_r, src_vt43_r, src_vt65_r, filt0,
607cabdff1aSopenharmony_ci                                     filt1, filt2);
608cabdff1aSopenharmony_ci            vt_out2 = AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0,
609cabdff1aSopenharmony_ci                                     filt1, filt2);
610cabdff1aSopenharmony_ci            vt_out3 = AVC_DOT_SH3_SH(src_vt43_r, src_vt65_r, src_vt87_r, filt0,
611cabdff1aSopenharmony_ci                                     filt1, filt2);
612cabdff1aSopenharmony_ci            SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
613cabdff1aSopenharmony_ci            SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);
614cabdff1aSopenharmony_ci
            /* Rounding average of the horizontal and vertical results. */
615cabdff1aSopenharmony_ci            out0 = __msa_srari_h((hz_out0 + vt_out0), 1);
616cabdff1aSopenharmony_ci            out1 = __msa_srari_h((hz_out1 + vt_out1), 1);
617cabdff1aSopenharmony_ci            out2 = __msa_srari_h((hz_out2 + vt_out2), 1);
618cabdff1aSopenharmony_ci            out3 = __msa_srari_h((hz_out3 + vt_out3), 1);
619cabdff1aSopenharmony_ci
            /* Current dst rows, two 64-bit rows per vector. */
620cabdff1aSopenharmony_ci            LD4(dst, stride, tp0, tp1, tp2, tp3);
621cabdff1aSopenharmony_ci            INSERT_D2_UB(tp0, tp1, dst0);
622cabdff1aSopenharmony_ci            INSERT_D2_UB(tp2, tp3, dst1);
623cabdff1aSopenharmony_ci
            /* Pack to bytes, average with dst and store four 8-byte rows. */
624cabdff1aSopenharmony_ci            SAT_SH4_SH(out0, out1, out2, out3, 7);
625cabdff1aSopenharmony_ci            tmp0 = PCKEV_XORI128_UB(out0, out1);
626cabdff1aSopenharmony_ci            tmp1 = PCKEV_XORI128_UB(out2, out3);
627cabdff1aSopenharmony_ci            AVER_UB2_UB(tmp0, dst0, tmp1, dst1, dst0, dst1);
628cabdff1aSopenharmony_ci            ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
629cabdff1aSopenharmony_ci            dst += (4 * stride);
630cabdff1aSopenharmony_ci
            /* Carry the last five rows over as the start of the next window. */
631cabdff1aSopenharmony_ci            src_vt0 = src_vt4;
632cabdff1aSopenharmony_ci            src_vt1 = src_vt5;
633cabdff1aSopenharmony_ci            src_vt2 = src_vt6;
634cabdff1aSopenharmony_ci            src_vt3 = src_vt7;
635cabdff1aSopenharmony_ci            src_vt4 = src_vt8;
636cabdff1aSopenharmony_ci        }
637cabdff1aSopenharmony_ci
        /* Move on to the next 8-column strip. */
638cabdff1aSopenharmony_ci        src_x_tmp += 8;
639cabdff1aSopenharmony_ci        src_y_tmp += 8;
640cabdff1aSopenharmony_ci        dst_tmp += 8;
641cabdff1aSopenharmony_ci    }
642cabdff1aSopenharmony_ci}
643cabdff1aSopenharmony_ci
/*
 * Copy 16 rows from src to dst, one v16u8 vector per row.
 *
 * dst    : destination; 16 rows are written.
 * src    : source; 16 rows are read.
 * stride : byte distance between rows, shared by src and dst.
 *
 * All 16 rows are loaded (two groups of 8) before anything is stored.
 */
644cabdff1aSopenharmony_civoid ff_put_h264_qpel16_mc00_msa(uint8_t *dst, const uint8_t *src,
645cabdff1aSopenharmony_ci                                 ptrdiff_t stride)
646cabdff1aSopenharmony_ci{
647cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
648cabdff1aSopenharmony_ci    v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
649cabdff1aSopenharmony_ci
    /* Rows 0..7, then rows 8..15. */
650cabdff1aSopenharmony_ci    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
651cabdff1aSopenharmony_ci    src += (8 * stride);
652cabdff1aSopenharmony_ci    LD_UB8(src, stride, src8, src9, src10, src11, src12, src13, src14, src15);
653cabdff1aSopenharmony_ci
    /* Write them out in the same order. */
654cabdff1aSopenharmony_ci    ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, stride);
655cabdff1aSopenharmony_ci    dst += (8 * stride);
656cabdff1aSopenharmony_ci    ST_UB8(src8, src9, src10, src11, src12, src13, src14, src15, dst, stride);
657cabdff1aSopenharmony_ci}
658cabdff1aSopenharmony_ci
/*
 * Copy 8 rows from src to dst, one 64-bit value per row.
 *
 * dst    : destination; 8 rows are written.
 * src    : source; 8 rows are read.
 * stride : byte distance between rows, shared by src and dst.
 *
 * All 8 rows are loaded (two groups of 4) before anything is stored.
 */
659cabdff1aSopenharmony_civoid ff_put_h264_qpel8_mc00_msa(uint8_t *dst, const uint8_t *src,
660cabdff1aSopenharmony_ci                                ptrdiff_t stride)
661cabdff1aSopenharmony_ci{
662cabdff1aSopenharmony_ci    uint64_t src0, src1, src2, src3, src4, src5, src6, src7;
663cabdff1aSopenharmony_ci
    /* Rows 0..3, then rows 4..7. */
664cabdff1aSopenharmony_ci    LD4(src, stride, src0, src1, src2, src3);
665cabdff1aSopenharmony_ci    src += 4 * stride;
666cabdff1aSopenharmony_ci    LD4(src, stride, src4, src5, src6, src7);
    /* Store rows 0..3, then rows 4..7. */
667cabdff1aSopenharmony_ci    SD4(src0, src1, src2, src3, dst, stride);
668cabdff1aSopenharmony_ci    dst += 4 * stride;
669cabdff1aSopenharmony_ci    SD4(src4, src5, src6, src7, dst, stride);
670cabdff1aSopenharmony_ci}
671cabdff1aSopenharmony_ci
/*
 * Average 16 rows of src into dst, one v16u8 vector per row.
 *
 * dst    : destination; 16 rows are read, averaged with src and rewritten.
 * src    : source; 16 rows are read.
 * stride : byte distance between rows, shared by src and dst.
 *
 * The work is split into two identical halves of 8 rows; the vector
 * variables are reused for the second half.
 */
672cabdff1aSopenharmony_civoid ff_avg_h264_qpel16_mc00_msa(uint8_t *dst, const uint8_t *src,
673cabdff1aSopenharmony_ci                                 ptrdiff_t stride)
674cabdff1aSopenharmony_ci{
675cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
676cabdff1aSopenharmony_ci    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
677cabdff1aSopenharmony_ci
    /* First half: rows 0..7 of src and dst. */
678cabdff1aSopenharmony_ci    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
679cabdff1aSopenharmony_ci    src += (8 * stride);
680cabdff1aSopenharmony_ci    LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
681cabdff1aSopenharmony_ci
    /* Pairwise average srcN with dstN; results land in dstN. */
682cabdff1aSopenharmony_ci    AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
683cabdff1aSopenharmony_ci                dst2, dst3);
684cabdff1aSopenharmony_ci    AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
685cabdff1aSopenharmony_ci                dst6, dst7);
686cabdff1aSopenharmony_ci    ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
687cabdff1aSopenharmony_ci    dst += (8 * stride);
688cabdff1aSopenharmony_ci
    /* Second half: rows 8..15. */
689cabdff1aSopenharmony_ci    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
690cabdff1aSopenharmony_ci    LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
691cabdff1aSopenharmony_ci
692cabdff1aSopenharmony_ci    AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
693cabdff1aSopenharmony_ci                dst2, dst3);
694cabdff1aSopenharmony_ci    AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
695cabdff1aSopenharmony_ci                dst6, dst7);
696cabdff1aSopenharmony_ci    ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
697cabdff1aSopenharmony_ci}
698cabdff1aSopenharmony_ci
/*
 * Average 8 rows of src into dst, 8 bytes (one 64-bit value) per row.
 *
 * dst    : destination; 8 rows are read, averaged with src and rewritten.
 * src    : source; 8 rows are read.
 * stride : byte distance between rows, shared by src and dst.
 *
 * Two rows are packed into each vector, so four vectors hold the 8 rows of
 * src and four more hold the 8 rows of dst.
 */
699cabdff1aSopenharmony_civoid ff_avg_h264_qpel8_mc00_msa(uint8_t *dst, const uint8_t *src,
700cabdff1aSopenharmony_ci                                ptrdiff_t stride)
701cabdff1aSopenharmony_ci{
702cabdff1aSopenharmony_ci    uint64_t tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
703cabdff1aSopenharmony_ci    v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
704cabdff1aSopenharmony_ci    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
705cabdff1aSopenharmony_ci
    /* Gather the 8 source rows into src0..src3. */
706cabdff1aSopenharmony_ci    LD4(src, stride, tp0, tp1, tp2, tp3);
707cabdff1aSopenharmony_ci    src += 4 * stride;
708cabdff1aSopenharmony_ci    LD4(src, stride, tp4, tp5, tp6, tp7);
709cabdff1aSopenharmony_ci    INSERT_D2_UB(tp0, tp1, src0);
710cabdff1aSopenharmony_ci    INSERT_D2_UB(tp2, tp3, src1);
711cabdff1aSopenharmony_ci    INSERT_D2_UB(tp4, tp5, src2);
712cabdff1aSopenharmony_ci    INSERT_D2_UB(tp6, tp7, src3);
713cabdff1aSopenharmony_ci
    /* Gather the 8 destination rows into dst0..dst3 (tp0..tp7 are reused). */
714cabdff1aSopenharmony_ci    LD4(dst, stride, tp0, tp1, tp2, tp3);
715cabdff1aSopenharmony_ci    LD4(dst + 4 * stride, stride, tp4, tp5, tp6, tp7);
716cabdff1aSopenharmony_ci    INSERT_D2_UB(tp0, tp1, dst0);
717cabdff1aSopenharmony_ci    INSERT_D2_UB(tp2, tp3, dst1);
718cabdff1aSopenharmony_ci    INSERT_D2_UB(tp4, tp5, dst2);
719cabdff1aSopenharmony_ci    INSERT_D2_UB(tp6, tp7, dst3);
720cabdff1aSopenharmony_ci
    /* Pairwise average srcN with dstN; results land in dstN. */
721cabdff1aSopenharmony_ci    AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
722cabdff1aSopenharmony_ci                dst2, dst3);
723cabdff1aSopenharmony_ci
    /* Store doublewords 0 and 1 of each vector as eight consecutive rows. */
724cabdff1aSopenharmony_ci    ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
725cabdff1aSopenharmony_ci}
726cabdff1aSopenharmony_ci
/*
 * Average 4 rows of src into dst, 4 bytes (one 32-bit word) per row.
 *
 * dst    : destination; 4 rows are read, averaged with src and rewritten.
 * src    : source; 4 rows are read.
 * stride : byte distance between rows, shared by src and dst.
 *
 * All four rows fit in a single vector, so one __msa_aver_u_b() call
 * handles the whole block.
 */
727cabdff1aSopenharmony_civoid ff_avg_h264_qpel4_mc00_msa(uint8_t *dst, const uint8_t *src,
728cabdff1aSopenharmony_ci                                ptrdiff_t stride)
729cabdff1aSopenharmony_ci{
730cabdff1aSopenharmony_ci    uint32_t tp0, tp1, tp2, tp3;
731cabdff1aSopenharmony_ci    v16u8 src0 = { 0 }, dst0 = { 0 };
732cabdff1aSopenharmony_ci
    /* Pack the four source rows, then the four destination rows. */
733cabdff1aSopenharmony_ci    LW4(src, stride, tp0, tp1, tp2, tp3);
734cabdff1aSopenharmony_ci    INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
735cabdff1aSopenharmony_ci    LW4(dst, stride, tp0, tp1, tp2, tp3);
736cabdff1aSopenharmony_ci    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
737cabdff1aSopenharmony_ci
    /* Unsigned byte-wise average of source and destination. */
738cabdff1aSopenharmony_ci    dst0 = __msa_aver_u_b(src0, dst0);
739cabdff1aSopenharmony_ci
    /* Write words 0..3 back as rows 0..3. */
740cabdff1aSopenharmony_ci    ST_W4(dst0, 0, 1, 2, 3, dst, stride);
741cabdff1aSopenharmony_ci}
742cabdff1aSopenharmony_ci
/*
 * 16x16 luma block: horizontally filtered samples averaged with source
 * samples taken at a fixed byte offset, written to dst.
 *
 * dst    : destination; 16 rows of one vector each are written.
 * src    : source; the function reads from src - 2 and, for every row, a
 *          second vector 16 bytes further on.
 * stride : byte distance between rows, shared by src and dst.
 *
 * Four rows are handled per loop iteration (four iterations). The filter
 * accumulates a pairwise add plus dot products with -5 and 20, which is then
 * round-shifted by 5 and saturated. The source operand of the final average
 * is selected with an SLDI offset of 2, the same amount src was moved back.
 */
743cabdff1aSopenharmony_civoid ff_put_h264_qpel16_mc10_msa(uint8_t *dst, const uint8_t *src,
744cabdff1aSopenharmony_ci                                 ptrdiff_t stride)
745cabdff1aSopenharmony_ci{
746cabdff1aSopenharmony_ci    uint32_t loop_cnt;
747cabdff1aSopenharmony_ci    v16i8 dst0, dst1, dst2, dst3, src0, src1, src2, src3, src4, src5, src6;
748cabdff1aSopenharmony_ci    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11;
749cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
750cabdff1aSopenharmony_ci    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    /* Byte-replicated multipliers used by the dot-product accumulations. */
751cabdff1aSopenharmony_ci    v16i8 minus5b = __msa_ldi_b(-5);
752cabdff1aSopenharmony_ci    v16i8 plus20b = __msa_ldi_b(20);
753cabdff1aSopenharmony_ci
    /*
     * mask0..2 come from the "8 width" rows of luma_mask_arr; mask3..5 are
     * the same index patterns shifted by 8 and are used with two-vector
     * (srcN, srcN+1) shuffles for the second group of outputs in each row.
     */
754cabdff1aSopenharmony_ci    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
755cabdff1aSopenharmony_ci    mask3 = mask0 + 8;
756cabdff1aSopenharmony_ci    mask4 = mask1 + 8;
757cabdff1aSopenharmony_ci    mask5 = mask2 + 8;
    /* Start two bytes to the left of the block. */
758cabdff1aSopenharmony_ci    src -= 2;
759cabdff1aSopenharmony_ci
760cabdff1aSopenharmony_ci    for (loop_cnt = 4; loop_cnt--;) {
        /* Four rows, two vectors (32 bytes) per row. */
761cabdff1aSopenharmony_ci        LD_SB2(src, 16, src0, src1);
762cabdff1aSopenharmony_ci        src += stride;
763cabdff1aSopenharmony_ci        LD_SB2(src, 16, src2, src3);
764cabdff1aSopenharmony_ci        src += stride;
765cabdff1aSopenharmony_ci        LD_SB2(src, 16, src4, src5);
766cabdff1aSopenharmony_ci        src += stride;
767cabdff1aSopenharmony_ci        LD_SB2(src, 16, src6, src7);
768cabdff1aSopenharmony_ci        src += stride;
769cabdff1aSopenharmony_ci
        /*
         * NOTE(review): XORI_B8_128_SB presumably flips the top bit of each
         * byte (unsigned to signed range) -- confirm in
         * generic_macros_msa.h.
         */
770cabdff1aSopenharmony_ci        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
        /* Rows 0 and 1: gather sample pairs, then accumulate into res0..3. */
771cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask3, vec0, vec3);
772cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src2, src3, mask0, mask3, vec6, vec9);
773cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask4, vec1, vec4);
774cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src2, src3, mask1, mask4, vec7, vec10);
775cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask5, vec2, vec5);
776cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src2, src3, mask2, mask5, vec8, vec11);
777cabdff1aSopenharmony_ci        HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
778cabdff1aSopenharmony_ci        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
779cabdff1aSopenharmony_ci                     minus5b, res0, res1, res2, res3);
780cabdff1aSopenharmony_ci        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
781cabdff1aSopenharmony_ci                     plus20b, res0, res1, res2, res3);
        /* Rows 2 and 3: same sequence, accumulating into res4..7. */
782cabdff1aSopenharmony_ci        VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3);
783cabdff1aSopenharmony_ci        VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9);
784cabdff1aSopenharmony_ci        VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4);
785cabdff1aSopenharmony_ci        VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10);
786cabdff1aSopenharmony_ci        VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5);
787cabdff1aSopenharmony_ci        VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11);
788cabdff1aSopenharmony_ci        HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
789cabdff1aSopenharmony_ci        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
790cabdff1aSopenharmony_ci                     minus5b, res4, res5, res6, res7);
791cabdff1aSopenharmony_ci        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
792cabdff1aSopenharmony_ci                     plus20b, res4, res5, res6, res7);
        /*
         * Build the source operand of the final average from each row's two
         * vectors with an offset of 2.
         * NOTE(review): presumably a 2-byte slide that undoes the src -= 2
         * above, i.e. selects the samples at the caller's src -- confirm the
         * SLDI_B4_SB operand order in generic_macros_msa.h.
         */
793cabdff1aSopenharmony_ci        SLDI_B4_SB(src1, src0, src3, src2, src5, src4, src7, src6, 2,
794cabdff1aSopenharmony_ci                   src0, src2, src4, src6);
        /* Rounding shift right by 5, saturate, pack halfwords to bytes. */
795cabdff1aSopenharmony_ci        SRARI_H4_SH(res0, res1, res2, res3, 5);
796cabdff1aSopenharmony_ci        SRARI_H4_SH(res4, res5, res6, res7, 5);
797cabdff1aSopenharmony_ci        SAT_SH4_SH(res0, res1, res2, res3, 7);
798cabdff1aSopenharmony_ci        SAT_SH4_SH(res4, res5, res6, res7, 7);
799cabdff1aSopenharmony_ci        PCKEV_B2_SB(res1, res0, res3, res2, dst0, dst1);
800cabdff1aSopenharmony_ci        PCKEV_B2_SB(res5, res4, res7, res6, dst2, dst3);
        /* Signed byte-wise average of filtered and source samples. */
801cabdff1aSopenharmony_ci        dst0 = __msa_aver_s_b(dst0, src0);
802cabdff1aSopenharmony_ci        dst1 = __msa_aver_s_b(dst1, src2);
803cabdff1aSopenharmony_ci        dst2 = __msa_aver_s_b(dst2, src4);
804cabdff1aSopenharmony_ci        dst3 = __msa_aver_s_b(dst3, src6);
        /* Apply the 128 XOR again before storing the four rows. */
805cabdff1aSopenharmony_ci        XORI_B4_128_SB(dst0, dst1, dst2, dst3);
806cabdff1aSopenharmony_ci        ST_SB4(dst0, dst1, dst2, dst3, dst, stride);
807cabdff1aSopenharmony_ci        dst += (4 * stride);
808cabdff1aSopenharmony_ci    }
809cabdff1aSopenharmony_ci}
810cabdff1aSopenharmony_ci
/*
 * 16x16 luma block: horizontally filtered samples averaged with source
 * samples taken at a fixed byte offset, written to dst.
 *
 * dst    : destination; 16 rows of one vector each are written.
 * src    : source; the function reads from src - 2 and, for every row, a
 *          second vector 16 bytes further on.
 * stride : byte distance between rows, shared by src and dst.
 *
 * Four rows are handled per loop iteration (four iterations). The filter
 * accumulates a pairwise add plus dot products with -5 and 20, which is then
 * round-shifted by 5 and saturated. The source operand of the final average
 * is selected with an SLDI offset of 3, one more than the amount src was
 * moved back.
 */
811cabdff1aSopenharmony_civoid ff_put_h264_qpel16_mc30_msa(uint8_t *dst, const uint8_t *src,
812cabdff1aSopenharmony_ci                                 ptrdiff_t stride)
813cabdff1aSopenharmony_ci{
814cabdff1aSopenharmony_ci    uint32_t loop_cnt;
815cabdff1aSopenharmony_ci    v16i8 dst0, dst1, dst2, dst3, src0, src1, src2, src3, src4, src5, src6;
816cabdff1aSopenharmony_ci    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11;
817cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
818cabdff1aSopenharmony_ci    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    /* Byte-replicated multipliers used by the dot-product accumulations. */
819cabdff1aSopenharmony_ci    v16i8 minus5b = __msa_ldi_b(-5);
820cabdff1aSopenharmony_ci    v16i8 plus20b = __msa_ldi_b(20);
821cabdff1aSopenharmony_ci
    /*
     * mask0..2 come from the "8 width" rows of luma_mask_arr; mask3..5 are
     * the same index patterns shifted by 8 and are used with two-vector
     * (srcN, srcN+1) shuffles for the second group of outputs in each row.
     */
822cabdff1aSopenharmony_ci    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
823cabdff1aSopenharmony_ci    mask3 = mask0 + 8;
824cabdff1aSopenharmony_ci    mask4 = mask1 + 8;
825cabdff1aSopenharmony_ci    mask5 = mask2 + 8;
    /* Start two bytes to the left of the block. */
826cabdff1aSopenharmony_ci    src -= 2;
827cabdff1aSopenharmony_ci
828cabdff1aSopenharmony_ci    for (loop_cnt = 4; loop_cnt--;) {
        /* Four rows, two vectors (32 bytes) per row. */
829cabdff1aSopenharmony_ci        LD_SB2(src, 16, src0, src1);
830cabdff1aSopenharmony_ci        src += stride;
831cabdff1aSopenharmony_ci        LD_SB2(src, 16, src2, src3);
832cabdff1aSopenharmony_ci        src += stride;
833cabdff1aSopenharmony_ci        LD_SB2(src, 16, src4, src5);
834cabdff1aSopenharmony_ci        src += stride;
835cabdff1aSopenharmony_ci        LD_SB2(src, 16, src6, src7);
836cabdff1aSopenharmony_ci        src += stride;
837cabdff1aSopenharmony_ci
        /*
         * NOTE(review): XORI_B8_128_SB presumably flips the top bit of each
         * byte (unsigned to signed range) -- confirm in
         * generic_macros_msa.h.
         */
838cabdff1aSopenharmony_ci        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
        /* Rows 0 and 1: gather sample pairs, then accumulate into res0..3. */
839cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask3, vec0, vec3);
840cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src2, src3, mask0, mask3, vec6, vec9);
841cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask4, vec1, vec4);
842cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src2, src3, mask1, mask4, vec7, vec10);
843cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask5, vec2, vec5);
844cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src2, src3, mask2, mask5, vec8, vec11);
845cabdff1aSopenharmony_ci        HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
846cabdff1aSopenharmony_ci        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
847cabdff1aSopenharmony_ci                     minus5b, res0, res1, res2, res3);
848cabdff1aSopenharmony_ci        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
849cabdff1aSopenharmony_ci                     plus20b, res0, res1, res2, res3);
        /* Rows 2 and 3: same sequence, accumulating into res4..7. */
850cabdff1aSopenharmony_ci        VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3);
851cabdff1aSopenharmony_ci        VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9);
852cabdff1aSopenharmony_ci        VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4);
853cabdff1aSopenharmony_ci        VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10);
854cabdff1aSopenharmony_ci        VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5);
855cabdff1aSopenharmony_ci        VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11);
856cabdff1aSopenharmony_ci        HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
857cabdff1aSopenharmony_ci        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
858cabdff1aSopenharmony_ci                     minus5b, res4, res5, res6, res7);
859cabdff1aSopenharmony_ci        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
860cabdff1aSopenharmony_ci                     plus20b, res4, res5, res6, res7);
        /*
         * Build the source operand of the final average from each row's two
         * vectors with an offset of 3.
         * NOTE(review): presumably a 3-byte slide which, after the src -= 2
         * above, selects the samples one byte to the right of the caller's
         * src -- confirm the SLDI_B4_SB operand order in
         * generic_macros_msa.h.
         */
861cabdff1aSopenharmony_ci        SLDI_B4_SB(src1, src0, src3, src2, src5, src4, src7, src6, 3,
862cabdff1aSopenharmony_ci                   src0, src2, src4, src6);
        /* Rounding shift right by 5, saturate, pack halfwords to bytes. */
863cabdff1aSopenharmony_ci        SRARI_H4_SH(res0, res1, res2, res3, 5);
864cabdff1aSopenharmony_ci        SRARI_H4_SH(res4, res5, res6, res7, 5);
865cabdff1aSopenharmony_ci        SAT_SH4_SH(res0, res1, res2, res3, 7);
866cabdff1aSopenharmony_ci        SAT_SH4_SH(res4, res5, res6, res7, 7);
867cabdff1aSopenharmony_ci        PCKEV_B2_SB(res1, res0, res3, res2, dst0, dst1);
868cabdff1aSopenharmony_ci        PCKEV_B2_SB(res5, res4, res7, res6, dst2, dst3);
        /* Signed byte-wise average of filtered and source samples. */
869cabdff1aSopenharmony_ci        dst0 = __msa_aver_s_b(dst0, src0);
870cabdff1aSopenharmony_ci        dst1 = __msa_aver_s_b(dst1, src2);
871cabdff1aSopenharmony_ci        dst2 = __msa_aver_s_b(dst2, src4);
872cabdff1aSopenharmony_ci        dst3 = __msa_aver_s_b(dst3, src6);
        /* Apply the 128 XOR again before storing the four rows. */
873cabdff1aSopenharmony_ci        XORI_B4_128_SB(dst0, dst1, dst2, dst3);
874cabdff1aSopenharmony_ci        ST_SB4(dst0, dst1, dst2, dst3, dst, stride);
875cabdff1aSopenharmony_ci        dst += (4 * stride);
876cabdff1aSopenharmony_ci    }
877cabdff1aSopenharmony_ci}
878cabdff1aSopenharmony_ci
879cabdff1aSopenharmony_civoid ff_put_h264_qpel8_mc10_msa(uint8_t *dst, const uint8_t *src,
880cabdff1aSopenharmony_ci                                ptrdiff_t stride)
881cabdff1aSopenharmony_ci{
882cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
883cabdff1aSopenharmony_ci    v16i8 tmp0, tmp1, tmp2, tmp3, vec11;
884cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
885cabdff1aSopenharmony_ci    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
886cabdff1aSopenharmony_ci    v16i8 minus5b = __msa_ldi_b(-5);
887cabdff1aSopenharmony_ci    v16i8 plus20b = __msa_ldi_b(20);
888cabdff1aSopenharmony_ci
889cabdff1aSopenharmony_ci    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
890cabdff1aSopenharmony_ci    LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
891cabdff1aSopenharmony_ci    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
892cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
893cabdff1aSopenharmony_ci    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
894cabdff1aSopenharmony_ci    HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
895cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
896cabdff1aSopenharmony_ci    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
897cabdff1aSopenharmony_ci    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
898cabdff1aSopenharmony_ci                 res0, res1, res2, res3);
899cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
900cabdff1aSopenharmony_ci    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
901cabdff1aSopenharmony_ci    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
902cabdff1aSopenharmony_ci                 res0, res1, res2, res3);
903cabdff1aSopenharmony_ci    VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
904cabdff1aSopenharmony_ci    VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
905cabdff1aSopenharmony_ci    HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
906cabdff1aSopenharmony_ci    VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
907cabdff1aSopenharmony_ci    VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
908cabdff1aSopenharmony_ci    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
909cabdff1aSopenharmony_ci                 res4, res5, res6, res7);
910cabdff1aSopenharmony_ci    VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
911cabdff1aSopenharmony_ci    VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
912cabdff1aSopenharmony_ci    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
913cabdff1aSopenharmony_ci                 res4, res5, res6, res7);
914cabdff1aSopenharmony_ci    SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 2,
915cabdff1aSopenharmony_ci               src0, src1, src2, src3);
916cabdff1aSopenharmony_ci    SLDI_B4_SB(src4, src4, src5, src5, src6, src6, src7, src7, 2,
917cabdff1aSopenharmony_ci               src4, src5, src6, src7);
918cabdff1aSopenharmony_ci    PCKEV_D2_SB(src1, src0, src3, src2, src0, src1);
919cabdff1aSopenharmony_ci    PCKEV_D2_SB(src5, src4, src7, src6, src4, src5);
920cabdff1aSopenharmony_ci    SRARI_H4_SH(res0, res1, res2, res3, 5);
921cabdff1aSopenharmony_ci    SRARI_H4_SH(res4, res5, res6, res7, 5);
922cabdff1aSopenharmony_ci    SAT_SH4_SH(res0, res1, res2, res3, 7);
923cabdff1aSopenharmony_ci    SAT_SH4_SH(res4, res5, res6, res7, 7);
924cabdff1aSopenharmony_ci    PCKEV_B2_SB(res1, res0, res3, res2, tmp0, tmp1);
925cabdff1aSopenharmony_ci    PCKEV_B2_SB(res5, res4, res7, res6, tmp2, tmp3);
926cabdff1aSopenharmony_ci    tmp0 = __msa_aver_s_b(tmp0, src0);
927cabdff1aSopenharmony_ci    tmp1 = __msa_aver_s_b(tmp1, src1);
928cabdff1aSopenharmony_ci    tmp2 = __msa_aver_s_b(tmp2, src4);
929cabdff1aSopenharmony_ci    tmp3 = __msa_aver_s_b(tmp3, src5);
930cabdff1aSopenharmony_ci    XORI_B4_128_SB(tmp0, tmp1, tmp2, tmp3);
931cabdff1aSopenharmony_ci    ST_D8(tmp0, tmp1, tmp2, tmp3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
932cabdff1aSopenharmony_ci}
933cabdff1aSopenharmony_ci
934cabdff1aSopenharmony_civoid ff_put_h264_qpel8_mc30_msa(uint8_t *dst, const uint8_t *src,
935cabdff1aSopenharmony_ci                                ptrdiff_t stride)
936cabdff1aSopenharmony_ci{
937cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
938cabdff1aSopenharmony_ci    v16i8 tmp0, tmp1, tmp2, tmp3, vec11;
939cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
940cabdff1aSopenharmony_ci    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
941cabdff1aSopenharmony_ci    v16i8 minus5b = __msa_ldi_b(-5);
942cabdff1aSopenharmony_ci    v16i8 plus20b = __msa_ldi_b(20);
943cabdff1aSopenharmony_ci
944cabdff1aSopenharmony_ci    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
945cabdff1aSopenharmony_ci    LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
946cabdff1aSopenharmony_ci    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
947cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
948cabdff1aSopenharmony_ci    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
949cabdff1aSopenharmony_ci    HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
950cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
951cabdff1aSopenharmony_ci    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
952cabdff1aSopenharmony_ci    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
953cabdff1aSopenharmony_ci                 res0, res1, res2, res3);
954cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
955cabdff1aSopenharmony_ci    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
956cabdff1aSopenharmony_ci    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
957cabdff1aSopenharmony_ci                 res0, res1, res2, res3);
958cabdff1aSopenharmony_ci    VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
959cabdff1aSopenharmony_ci    VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
960cabdff1aSopenharmony_ci    HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
961cabdff1aSopenharmony_ci    VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
962cabdff1aSopenharmony_ci    VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
963cabdff1aSopenharmony_ci    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
964cabdff1aSopenharmony_ci                 res4, res5, res6, res7);
965cabdff1aSopenharmony_ci    VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
966cabdff1aSopenharmony_ci    VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
967cabdff1aSopenharmony_ci    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
968cabdff1aSopenharmony_ci                 res4, res5, res6, res7);
969cabdff1aSopenharmony_ci    SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 3,
970cabdff1aSopenharmony_ci               src0, src1, src2, src3);
971cabdff1aSopenharmony_ci    SLDI_B4_SB(src4, src4, src5, src5, src6, src6, src7, src7, 3,
972cabdff1aSopenharmony_ci               src4, src5, src6, src7);
973cabdff1aSopenharmony_ci    PCKEV_D2_SB(src1, src0, src3, src2, src0, src1);
974cabdff1aSopenharmony_ci    PCKEV_D2_SB(src5, src4, src7, src6, src4, src5);
975cabdff1aSopenharmony_ci    SRARI_H4_SH(res0, res1, res2, res3, 5);
976cabdff1aSopenharmony_ci    SRARI_H4_SH(res4, res5, res6, res7, 5);
977cabdff1aSopenharmony_ci    SAT_SH4_SH(res0, res1, res2, res3, 7);
978cabdff1aSopenharmony_ci    SAT_SH4_SH(res4, res5, res6, res7, 7);
979cabdff1aSopenharmony_ci    PCKEV_B2_SB(res1, res0, res3, res2, tmp0, tmp1);
980cabdff1aSopenharmony_ci    PCKEV_B2_SB(res5, res4, res7, res6, tmp2, tmp3);
981cabdff1aSopenharmony_ci    tmp0 = __msa_aver_s_b(tmp0, src0);
982cabdff1aSopenharmony_ci    tmp1 = __msa_aver_s_b(tmp1, src1);
983cabdff1aSopenharmony_ci    tmp2 = __msa_aver_s_b(tmp2, src4);
984cabdff1aSopenharmony_ci    tmp3 = __msa_aver_s_b(tmp3, src5);
985cabdff1aSopenharmony_ci    XORI_B4_128_SB(tmp0, tmp1, tmp2, tmp3);
986cabdff1aSopenharmony_ci    ST_D8(tmp0, tmp1, tmp2, tmp3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
987cabdff1aSopenharmony_ci}
988cabdff1aSopenharmony_ci
989cabdff1aSopenharmony_civoid ff_put_h264_qpel4_mc10_msa(uint8_t *dst, const uint8_t *src,
990cabdff1aSopenharmony_ci                                ptrdiff_t stride)
991cabdff1aSopenharmony_ci{
992cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, res, mask0, mask1, mask2;
993cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
994cabdff1aSopenharmony_ci    v8i16 res0, res1;
995cabdff1aSopenharmony_ci    v16i8 minus5b = __msa_ldi_b(-5);
996cabdff1aSopenharmony_ci    v16i8 plus20b = __msa_ldi_b(20);
997cabdff1aSopenharmony_ci
998cabdff1aSopenharmony_ci    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
999cabdff1aSopenharmony_ci    LD_SB4(src - 2, stride, src0, src1, src2, src3);
1000cabdff1aSopenharmony_ci    XORI_B4_128_SB(src0, src1, src2, src3);
1001cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
1002cabdff1aSopenharmony_ci    HADD_SB2_SH(vec0, vec1, res0, res1);
1003cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
1004cabdff1aSopenharmony_ci    DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
1005cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
1006cabdff1aSopenharmony_ci    DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
1007cabdff1aSopenharmony_ci    SRARI_H2_SH(res0, res1, 5);
1008cabdff1aSopenharmony_ci    SAT_SH2_SH(res0, res1, 7);
1009cabdff1aSopenharmony_ci    res = __msa_pckev_b((v16i8) res1, (v16i8) res0);
1010cabdff1aSopenharmony_ci    SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 2,
1011cabdff1aSopenharmony_ci               src0, src1, src2, src3);
1012cabdff1aSopenharmony_ci    src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1);
1013cabdff1aSopenharmony_ci    src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
1014cabdff1aSopenharmony_ci    src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1);
1015cabdff1aSopenharmony_ci    res = __msa_aver_s_b(res, src0);
1016cabdff1aSopenharmony_ci    res = (v16i8) __msa_xori_b((v16u8) res, 128);
1017cabdff1aSopenharmony_ci    ST_W4(res, 0, 1, 2, 3, dst, stride);
1018cabdff1aSopenharmony_ci}
1019cabdff1aSopenharmony_ci
1020cabdff1aSopenharmony_civoid ff_put_h264_qpel4_mc30_msa(uint8_t *dst, const uint8_t *src,
1021cabdff1aSopenharmony_ci                                ptrdiff_t stride)
1022cabdff1aSopenharmony_ci{
1023cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, res, mask0, mask1, mask2;
1024cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
1025cabdff1aSopenharmony_ci    v8i16 res0, res1;
1026cabdff1aSopenharmony_ci    v16i8 minus5b = __msa_ldi_b(-5);
1027cabdff1aSopenharmony_ci    v16i8 plus20b = __msa_ldi_b(20);
1028cabdff1aSopenharmony_ci
1029cabdff1aSopenharmony_ci    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
1030cabdff1aSopenharmony_ci    LD_SB4(src - 2, stride, src0, src1, src2, src3);
1031cabdff1aSopenharmony_ci    XORI_B4_128_SB(src0, src1, src2, src3);
1032cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
1033cabdff1aSopenharmony_ci    HADD_SB2_SH(vec0, vec1, res0, res1);
1034cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
1035cabdff1aSopenharmony_ci    DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
1036cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
1037cabdff1aSopenharmony_ci    DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
1038cabdff1aSopenharmony_ci    SRARI_H2_SH(res0, res1, 5);
1039cabdff1aSopenharmony_ci    SAT_SH2_SH(res0, res1, 7);
1040cabdff1aSopenharmony_ci    res = __msa_pckev_b((v16i8) res1, (v16i8) res0);
1041cabdff1aSopenharmony_ci    SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 3,
1042cabdff1aSopenharmony_ci               src0, src1, src2, src3);
1043cabdff1aSopenharmony_ci    src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1);
1044cabdff1aSopenharmony_ci    src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
1045cabdff1aSopenharmony_ci    src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1);
1046cabdff1aSopenharmony_ci    res = __msa_aver_s_b(res, src0);
1047cabdff1aSopenharmony_ci    res = (v16i8) __msa_xori_b((v16u8) res, 128);
1048cabdff1aSopenharmony_ci    ST_W4(res, 0, 1, 2, 3, dst, stride);
1049cabdff1aSopenharmony_ci}
1050cabdff1aSopenharmony_ci
1051cabdff1aSopenharmony_civoid ff_put_h264_qpel16_mc20_msa(uint8_t *dst, const uint8_t *src,
1052cabdff1aSopenharmony_ci                                 ptrdiff_t stride)
1053cabdff1aSopenharmony_ci{
1054cabdff1aSopenharmony_ci    uint32_t loop_cnt;
1055cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
1056cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
1057cabdff1aSopenharmony_ci    v16i8 vec11;
1058cabdff1aSopenharmony_ci    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
1059cabdff1aSopenharmony_ci    v16i8 minus5b = __msa_ldi_b(-5);
1060cabdff1aSopenharmony_ci    v16i8 plus20b = __msa_ldi_b(20);
1061cabdff1aSopenharmony_ci
1062cabdff1aSopenharmony_ci    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
1063cabdff1aSopenharmony_ci    src -= 2;
1064cabdff1aSopenharmony_ci
1065cabdff1aSopenharmony_ci    for (loop_cnt = 4; loop_cnt--;) {
1066cabdff1aSopenharmony_ci        LD_SB2(src, 8, src0, src1);
1067cabdff1aSopenharmony_ci        src += stride;
1068cabdff1aSopenharmony_ci        LD_SB2(src, 8, src2, src3);
1069cabdff1aSopenharmony_ci        src += stride;
1070cabdff1aSopenharmony_ci        LD_SB2(src, 8, src4, src5);
1071cabdff1aSopenharmony_ci        src += stride;
1072cabdff1aSopenharmony_ci        LD_SB2(src, 8, src6, src7);
1073cabdff1aSopenharmony_ci        src += stride;
1074cabdff1aSopenharmony_ci
1075cabdff1aSopenharmony_ci        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
1076cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3);
1077cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9);
1078cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4);
1079cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10);
1080cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5);
1081cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11);
1082cabdff1aSopenharmony_ci        HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
1083cabdff1aSopenharmony_ci        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
1084cabdff1aSopenharmony_ci                     minus5b, res0, res1, res2, res3);
1085cabdff1aSopenharmony_ci        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
1086cabdff1aSopenharmony_ci                     plus20b, res0, res1, res2, res3);
1087cabdff1aSopenharmony_ci        VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec3);
1088cabdff1aSopenharmony_ci        VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec6, vec9);
1089cabdff1aSopenharmony_ci        VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec1, vec4);
1090cabdff1aSopenharmony_ci        VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec7, vec10);
1091cabdff1aSopenharmony_ci        VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec2, vec5);
1092cabdff1aSopenharmony_ci        VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec8, vec11);
1093cabdff1aSopenharmony_ci        HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
1094cabdff1aSopenharmony_ci        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
1095cabdff1aSopenharmony_ci                     minus5b, res4, res5, res6, res7);
1096cabdff1aSopenharmony_ci        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
1097cabdff1aSopenharmony_ci                     plus20b, res4, res5, res6, res7);
1098cabdff1aSopenharmony_ci        SRARI_H4_SH(res0, res1, res2, res3, 5);
1099cabdff1aSopenharmony_ci        SRARI_H4_SH(res4, res5, res6, res7, 5);
1100cabdff1aSopenharmony_ci        SAT_SH4_SH(res0, res1, res2, res3, 7);
1101cabdff1aSopenharmony_ci        SAT_SH4_SH(res4, res5, res6, res7, 7);
1102cabdff1aSopenharmony_ci        PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6, vec0, vec1,
1103cabdff1aSopenharmony_ci                    vec2, vec3);
1104cabdff1aSopenharmony_ci        XORI_B4_128_SB(vec0, vec1, vec2, vec3);
1105cabdff1aSopenharmony_ci        ST_SB4(vec0, vec1, vec2, vec3, dst, stride);
1106cabdff1aSopenharmony_ci        dst += (4 * stride);
1107cabdff1aSopenharmony_ci    }
1108cabdff1aSopenharmony_ci}
1109cabdff1aSopenharmony_ci
1110cabdff1aSopenharmony_civoid ff_put_h264_qpel8_mc20_msa(uint8_t *dst, const uint8_t *src,
1111cabdff1aSopenharmony_ci                                ptrdiff_t stride)
1112cabdff1aSopenharmony_ci{
1113cabdff1aSopenharmony_ci    v16u8 out0, out1, out2, out3;
1114cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
1115cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
1116cabdff1aSopenharmony_ci    v16i8 vec11;
1117cabdff1aSopenharmony_ci    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
1118cabdff1aSopenharmony_ci    v16i8 minus5b = __msa_ldi_b(-5);
1119cabdff1aSopenharmony_ci    v16i8 plus20b = __msa_ldi_b(20);
1120cabdff1aSopenharmony_ci
1121cabdff1aSopenharmony_ci    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
1122cabdff1aSopenharmony_ci    LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
1123cabdff1aSopenharmony_ci    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
1124cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
1125cabdff1aSopenharmony_ci    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
1126cabdff1aSopenharmony_ci    HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
1127cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
1128cabdff1aSopenharmony_ci    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
1129cabdff1aSopenharmony_ci    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
1130cabdff1aSopenharmony_ci                 res0, res1, res2, res3);
1131cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
1132cabdff1aSopenharmony_ci    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
1133cabdff1aSopenharmony_ci    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
1134cabdff1aSopenharmony_ci                 plus20b, res0, res1, res2, res3);
1135cabdff1aSopenharmony_ci    VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
1136cabdff1aSopenharmony_ci    VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
1137cabdff1aSopenharmony_ci    HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
1138cabdff1aSopenharmony_ci    VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
1139cabdff1aSopenharmony_ci    VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
1140cabdff1aSopenharmony_ci    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
1141cabdff1aSopenharmony_ci                 res4, res5, res6, res7);
1142cabdff1aSopenharmony_ci    VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
1143cabdff1aSopenharmony_ci    VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
1144cabdff1aSopenharmony_ci    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
1145cabdff1aSopenharmony_ci                 plus20b, res4, res5, res6, res7);
1146cabdff1aSopenharmony_ci    SRARI_H4_SH(res0, res1, res2, res3, 5);
1147cabdff1aSopenharmony_ci    SRARI_H4_SH(res4, res5, res6, res7, 5);
1148cabdff1aSopenharmony_ci    SAT_SH4_SH(res0, res1, res2, res3, 7);
1149cabdff1aSopenharmony_ci    SAT_SH4_SH(res4, res5, res6, res7, 7);
1150cabdff1aSopenharmony_ci    out0 = PCKEV_XORI128_UB(res0, res1);
1151cabdff1aSopenharmony_ci    out1 = PCKEV_XORI128_UB(res2, res3);
1152cabdff1aSopenharmony_ci    out2 = PCKEV_XORI128_UB(res4, res5);
1153cabdff1aSopenharmony_ci    out3 = PCKEV_XORI128_UB(res6, res7);
1154cabdff1aSopenharmony_ci    ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
1155cabdff1aSopenharmony_ci}
1156cabdff1aSopenharmony_ci
1157cabdff1aSopenharmony_civoid ff_put_h264_qpel4_mc20_msa(uint8_t *dst, const uint8_t *src,
1158cabdff1aSopenharmony_ci                                ptrdiff_t stride)
1159cabdff1aSopenharmony_ci{
1160cabdff1aSopenharmony_ci    v16u8 out;
1161cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, mask0, mask1, mask2;
1162cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
1163cabdff1aSopenharmony_ci    v8i16 res0, res1;
1164cabdff1aSopenharmony_ci    v16i8 minus5b = __msa_ldi_b(-5);
1165cabdff1aSopenharmony_ci    v16i8 plus20b = __msa_ldi_b(20);
1166cabdff1aSopenharmony_ci
1167cabdff1aSopenharmony_ci    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
1168cabdff1aSopenharmony_ci    LD_SB4(src - 2, stride, src0, src1, src2, src3);
1169cabdff1aSopenharmony_ci    XORI_B4_128_SB(src0, src1, src2, src3);
1170cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
1171cabdff1aSopenharmony_ci    HADD_SB2_SH(vec0, vec1, res0, res1);
1172cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
1173cabdff1aSopenharmony_ci    DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
1174cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
1175cabdff1aSopenharmony_ci    DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
1176cabdff1aSopenharmony_ci    SRARI_H2_SH(res0, res1, 5);
1177cabdff1aSopenharmony_ci    SAT_SH2_SH(res0, res1, 7);
1178cabdff1aSopenharmony_ci    out = PCKEV_XORI128_UB(res0, res1);
1179cabdff1aSopenharmony_ci    ST_W4(out, 0, 1, 2, 3, dst, stride);
1180cabdff1aSopenharmony_ci}
1181cabdff1aSopenharmony_ci
1182cabdff1aSopenharmony_civoid ff_put_h264_qpel16_mc01_msa(uint8_t *dst, const uint8_t *src,
1183cabdff1aSopenharmony_ci                                 ptrdiff_t stride)
1184cabdff1aSopenharmony_ci{
1185cabdff1aSopenharmony_ci    int32_t loop_cnt;
1186cabdff1aSopenharmony_ci    int16_t filt_const0 = 0xfb01;
1187cabdff1aSopenharmony_ci    int16_t filt_const1 = 0x1414;
1188cabdff1aSopenharmony_ci    int16_t filt_const2 = 0x1fb;
1189cabdff1aSopenharmony_ci    v16u8 res0, res1, res2, res3;
1190cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1191cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
1192cabdff1aSopenharmony_ci    v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
1193cabdff1aSopenharmony_ci    v16i8 src65_l, src87_l, filt0, filt1, filt2;
1194cabdff1aSopenharmony_ci    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
1195cabdff1aSopenharmony_ci
1196cabdff1aSopenharmony_ci    filt0 = (v16i8) __msa_fill_h(filt_const0);
1197cabdff1aSopenharmony_ci    filt1 = (v16i8) __msa_fill_h(filt_const1);
1198cabdff1aSopenharmony_ci    filt2 = (v16i8) __msa_fill_h(filt_const2);
1199cabdff1aSopenharmony_ci
1200cabdff1aSopenharmony_ci    src -= (stride * 2);
1201cabdff1aSopenharmony_ci
1202cabdff1aSopenharmony_ci    LD_SB5(src, stride, src0, src1, src2, src3, src4);
1203cabdff1aSopenharmony_ci    src += (5 * stride);
1204cabdff1aSopenharmony_ci
1205cabdff1aSopenharmony_ci    XORI_B5_128_SB(src0, src1, src2, src3, src4);
1206cabdff1aSopenharmony_ci    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
1207cabdff1aSopenharmony_ci               src32_r, src43_r);
1208cabdff1aSopenharmony_ci    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
1209cabdff1aSopenharmony_ci               src32_l, src43_l);
1210cabdff1aSopenharmony_ci
1211cabdff1aSopenharmony_ci    for (loop_cnt = 4; loop_cnt--;) {
1212cabdff1aSopenharmony_ci        LD_SB4(src, stride, src5, src6, src7, src8);
1213cabdff1aSopenharmony_ci        src += (4 * stride);
1214cabdff1aSopenharmony_ci
1215cabdff1aSopenharmony_ci        XORI_B4_128_SB(src5, src6, src7, src8);
1216cabdff1aSopenharmony_ci        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
1217cabdff1aSopenharmony_ci                   src65_r, src76_r, src87_r);
1218cabdff1aSopenharmony_ci        ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
1219cabdff1aSopenharmony_ci                   src65_l, src76_l, src87_l);
1220cabdff1aSopenharmony_ci        out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
1221cabdff1aSopenharmony_ci        out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
1222cabdff1aSopenharmony_ci        out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
1223cabdff1aSopenharmony_ci        out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
1224cabdff1aSopenharmony_ci        out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
1225cabdff1aSopenharmony_ci        out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
1226cabdff1aSopenharmony_ci        out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
1227cabdff1aSopenharmony_ci        out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
1228cabdff1aSopenharmony_ci        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
1229cabdff1aSopenharmony_ci        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1230cabdff1aSopenharmony_ci        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
1231cabdff1aSopenharmony_ci        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1232cabdff1aSopenharmony_ci        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1233cabdff1aSopenharmony_ci                    out3_r, res0, res1, res2, res3);
1234cabdff1aSopenharmony_ci        res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src2);
1235cabdff1aSopenharmony_ci        res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src3);
1236cabdff1aSopenharmony_ci        res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src4);
1237cabdff1aSopenharmony_ci        res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src5);
1238cabdff1aSopenharmony_ci        XORI_B4_128_UB(res0, res1, res2, res3);
1239cabdff1aSopenharmony_ci        ST_UB4(res0, res1, res2, res3, dst, stride);
1240cabdff1aSopenharmony_ci        dst += (4 * stride);
1241cabdff1aSopenharmony_ci
1242cabdff1aSopenharmony_ci        src10_r = src54_r;
1243cabdff1aSopenharmony_ci        src32_r = src76_r;
1244cabdff1aSopenharmony_ci        src21_r = src65_r;
1245cabdff1aSopenharmony_ci        src43_r = src87_r;
1246cabdff1aSopenharmony_ci        src10_l = src54_l;
1247cabdff1aSopenharmony_ci        src32_l = src76_l;
1248cabdff1aSopenharmony_ci        src21_l = src65_l;
1249cabdff1aSopenharmony_ci        src43_l = src87_l;
1250cabdff1aSopenharmony_ci        src2 = src6;
1251cabdff1aSopenharmony_ci        src3 = src7;
1252cabdff1aSopenharmony_ci        src4 = src8;
1253cabdff1aSopenharmony_ci    }
1254cabdff1aSopenharmony_ci}
1255cabdff1aSopenharmony_ci
1256cabdff1aSopenharmony_civoid ff_put_h264_qpel16_mc03_msa(uint8_t *dst, const uint8_t *src,
1257cabdff1aSopenharmony_ci                                 ptrdiff_t stride)
1258cabdff1aSopenharmony_ci{
1259cabdff1aSopenharmony_ci    int32_t loop_cnt;
1260cabdff1aSopenharmony_ci    int16_t filt_const0 = 0xfb01;
1261cabdff1aSopenharmony_ci    int16_t filt_const1 = 0x1414;
1262cabdff1aSopenharmony_ci    int16_t filt_const2 = 0x1fb;
1263cabdff1aSopenharmony_ci    v16u8 res0, res1, res2, res3;
1264cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1265cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
1266cabdff1aSopenharmony_ci    v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
1267cabdff1aSopenharmony_ci    v16i8 src65_l, src87_l, filt0, filt1, filt2;
1268cabdff1aSopenharmony_ci    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
1269cabdff1aSopenharmony_ci
1270cabdff1aSopenharmony_ci    filt0 = (v16i8) __msa_fill_h(filt_const0);
1271cabdff1aSopenharmony_ci    filt1 = (v16i8) __msa_fill_h(filt_const1);
1272cabdff1aSopenharmony_ci    filt2 = (v16i8) __msa_fill_h(filt_const2);
1273cabdff1aSopenharmony_ci
1274cabdff1aSopenharmony_ci    src -= (stride * 2);
1275cabdff1aSopenharmony_ci
1276cabdff1aSopenharmony_ci    LD_SB5(src, stride, src0, src1, src2, src3, src4);
1277cabdff1aSopenharmony_ci    src += (5 * stride);
1278cabdff1aSopenharmony_ci
1279cabdff1aSopenharmony_ci    XORI_B5_128_SB(src0, src1, src2, src3, src4);
1280cabdff1aSopenharmony_ci    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
1281cabdff1aSopenharmony_ci               src32_r, src43_r);
1282cabdff1aSopenharmony_ci    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
1283cabdff1aSopenharmony_ci               src32_l, src43_l);
1284cabdff1aSopenharmony_ci
1285cabdff1aSopenharmony_ci    for (loop_cnt = 4; loop_cnt--;) {
1286cabdff1aSopenharmony_ci        LD_SB4(src, stride, src5, src6, src7, src8);
1287cabdff1aSopenharmony_ci        src += (4 * stride);
1288cabdff1aSopenharmony_ci
1289cabdff1aSopenharmony_ci        XORI_B4_128_SB(src5, src6, src7, src8);
1290cabdff1aSopenharmony_ci        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
1291cabdff1aSopenharmony_ci                   src65_r, src76_r, src87_r);
1292cabdff1aSopenharmony_ci        ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
1293cabdff1aSopenharmony_ci                   src65_l, src76_l, src87_l);
1294cabdff1aSopenharmony_ci        out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
1295cabdff1aSopenharmony_ci        out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
1296cabdff1aSopenharmony_ci        out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
1297cabdff1aSopenharmony_ci        out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
1298cabdff1aSopenharmony_ci        out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
1299cabdff1aSopenharmony_ci        out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
1300cabdff1aSopenharmony_ci        out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
1301cabdff1aSopenharmony_ci        out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
1302cabdff1aSopenharmony_ci        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
1303cabdff1aSopenharmony_ci        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1304cabdff1aSopenharmony_ci        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
1305cabdff1aSopenharmony_ci        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1306cabdff1aSopenharmony_ci        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1307cabdff1aSopenharmony_ci                    out3_r, res0, res1, res2, res3);
1308cabdff1aSopenharmony_ci        res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src3);
1309cabdff1aSopenharmony_ci        res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src4);
1310cabdff1aSopenharmony_ci        res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src5);
1311cabdff1aSopenharmony_ci        res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src6);
1312cabdff1aSopenharmony_ci        XORI_B4_128_UB(res0, res1, res2, res3);
1313cabdff1aSopenharmony_ci        ST_UB4(res0, res1, res2, res3, dst, stride);
1314cabdff1aSopenharmony_ci        dst += (4 * stride);
1315cabdff1aSopenharmony_ci
1316cabdff1aSopenharmony_ci        src10_r = src54_r;
1317cabdff1aSopenharmony_ci        src32_r = src76_r;
1318cabdff1aSopenharmony_ci        src21_r = src65_r;
1319cabdff1aSopenharmony_ci        src43_r = src87_r;
1320cabdff1aSopenharmony_ci        src10_l = src54_l;
1321cabdff1aSopenharmony_ci        src32_l = src76_l;
1322cabdff1aSopenharmony_ci        src21_l = src65_l;
1323cabdff1aSopenharmony_ci        src43_l = src87_l;
1324cabdff1aSopenharmony_ci        src3 = src7;
1325cabdff1aSopenharmony_ci        src4 = src8;
1326cabdff1aSopenharmony_ci    }
1327cabdff1aSopenharmony_ci}
1328cabdff1aSopenharmony_ci
1329cabdff1aSopenharmony_civoid ff_put_h264_qpel8_mc01_msa(uint8_t *dst, const uint8_t *src,
1330cabdff1aSopenharmony_ci                                ptrdiff_t stride)
1331cabdff1aSopenharmony_ci{
1332cabdff1aSopenharmony_ci    const int16_t filt_const0 = 0xfb01;
1333cabdff1aSopenharmony_ci    const int16_t filt_const1 = 0x1414;
1334cabdff1aSopenharmony_ci    const int16_t filt_const2 = 0x1fb;
1335cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1336cabdff1aSopenharmony_ci    v16i8 src11, src12, src10_r, src32_r, src54_r, src65_r, src76_r, src98_r;
1337cabdff1aSopenharmony_ci    v16i8 src21_r, src43_r, src87_r, src109_r, src1211_r, src1110_r;
1338cabdff1aSopenharmony_ci    v16i8 tmp0, tmp1, tmp2, tmp3, filt0, filt1, filt2, out0, out1, out2, out3;
1339cabdff1aSopenharmony_ci    v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
1340cabdff1aSopenharmony_ci
1341cabdff1aSopenharmony_ci    filt0 = (v16i8) __msa_fill_h(filt_const0);
1342cabdff1aSopenharmony_ci    filt1 = (v16i8) __msa_fill_h(filt_const1);
1343cabdff1aSopenharmony_ci    filt2 = (v16i8) __msa_fill_h(filt_const2);
1344cabdff1aSopenharmony_ci
1345cabdff1aSopenharmony_ci    src -= (stride * 2);
1346cabdff1aSopenharmony_ci
1347cabdff1aSopenharmony_ci    LD_SB5(src, stride, src0, src1, src2, src3, src4);
1348cabdff1aSopenharmony_ci    src += (5 * stride);
1349cabdff1aSopenharmony_ci    LD_SB8(src, stride, src5, src6, src7, src8, src9, src10, src11, src12);
1350cabdff1aSopenharmony_ci    XORI_B8_128_SB(src5, src6, src7, src8, src9, src10, src11, src12);
1351cabdff1aSopenharmony_ci    XORI_B5_128_SB(src0, src1, src2, src3, src4);
1352cabdff1aSopenharmony_ci    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
1353cabdff1aSopenharmony_ci               src32_r, src43_r);
1354cabdff1aSopenharmony_ci    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
1355cabdff1aSopenharmony_ci               src76_r, src87_r);
1356cabdff1aSopenharmony_ci    ILVR_B4_SB(src9, src8, src10, src9, src11, src10, src12, src11, src98_r,
1357cabdff1aSopenharmony_ci               src109_r, src1110_r, src1211_r);
1358cabdff1aSopenharmony_ci    out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
1359cabdff1aSopenharmony_ci    out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
1360cabdff1aSopenharmony_ci    out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
1361cabdff1aSopenharmony_ci    out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
1362cabdff1aSopenharmony_ci    out4_r = AVC_DOT_SH3_SH(src54_r, src76_r, src98_r, filt0, filt1, filt2);
1363cabdff1aSopenharmony_ci    out5_r = AVC_DOT_SH3_SH(src65_r, src87_r, src109_r, filt0, filt1, filt2);
1364cabdff1aSopenharmony_ci    out6_r = AVC_DOT_SH3_SH(src76_r, src98_r, src1110_r, filt0, filt1, filt2);
1365cabdff1aSopenharmony_ci    out7_r = AVC_DOT_SH3_SH(src87_r, src109_r, src1211_r, filt0, filt1, filt2);
1366cabdff1aSopenharmony_ci    PCKEV_D2_SB(src3, src2, src5, src4, tmp0, tmp1);
1367cabdff1aSopenharmony_ci    PCKEV_D2_SB(src7, src6, src9, src8, tmp2, tmp3);
1368cabdff1aSopenharmony_ci    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
1369cabdff1aSopenharmony_ci    SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5);
1370cabdff1aSopenharmony_ci    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1371cabdff1aSopenharmony_ci    SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
1372cabdff1aSopenharmony_ci    PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, out0, out1);
1373cabdff1aSopenharmony_ci    PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3);
1374cabdff1aSopenharmony_ci    out0 = __msa_aver_s_b(out0, tmp0);
1375cabdff1aSopenharmony_ci    out1 = __msa_aver_s_b(out1, tmp1);
1376cabdff1aSopenharmony_ci    out2 = __msa_aver_s_b(out2, tmp2);
1377cabdff1aSopenharmony_ci    out3 = __msa_aver_s_b(out3, tmp3);
1378cabdff1aSopenharmony_ci    XORI_B4_128_SB(out0, out1, out2, out3);
1379cabdff1aSopenharmony_ci    ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
1380cabdff1aSopenharmony_ci}
1381cabdff1aSopenharmony_ci
1382cabdff1aSopenharmony_civoid ff_put_h264_qpel8_mc03_msa(uint8_t *dst, const uint8_t *src,
1383cabdff1aSopenharmony_ci                                ptrdiff_t stride)
1384cabdff1aSopenharmony_ci{
1385cabdff1aSopenharmony_ci    const int16_t filt_const0 = 0xfb01;
1386cabdff1aSopenharmony_ci    const int16_t filt_const1 = 0x1414;
1387cabdff1aSopenharmony_ci    const int16_t filt_const2 = 0x1fb;
1388cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1389cabdff1aSopenharmony_ci    v16i8 src11, src12, src10_r, src32_r, src54_r, src65_r, src76_r, src98_r;
1390cabdff1aSopenharmony_ci    v16i8 src21_r, src43_r, src87_r, src109_r, src1211_r, src1110_r;
1391cabdff1aSopenharmony_ci    v16i8 filt0, filt1, filt2, out0, out1, out2, out3, tmp0, tmp1, tmp2, tmp3;
1392cabdff1aSopenharmony_ci    v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
1393cabdff1aSopenharmony_ci
1394cabdff1aSopenharmony_ci    filt0 = (v16i8) __msa_fill_h(filt_const0);
1395cabdff1aSopenharmony_ci    filt1 = (v16i8) __msa_fill_h(filt_const1);
1396cabdff1aSopenharmony_ci    filt2 = (v16i8) __msa_fill_h(filt_const2);
1397cabdff1aSopenharmony_ci
1398cabdff1aSopenharmony_ci    src -= (stride * 2);
1399cabdff1aSopenharmony_ci
1400cabdff1aSopenharmony_ci    LD_SB5(src, stride, src0, src1, src2, src3, src4);
1401cabdff1aSopenharmony_ci    src += (5 * stride);
1402cabdff1aSopenharmony_ci    LD_SB8(src, stride, src5, src6, src7, src8, src9, src10, src11, src12);
1403cabdff1aSopenharmony_ci    XORI_B5_128_SB(src0, src1, src2, src3, src4);
1404cabdff1aSopenharmony_ci    XORI_B8_128_SB(src5, src6, src7, src8, src9, src10, src11, src12);
1405cabdff1aSopenharmony_ci    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
1406cabdff1aSopenharmony_ci               src32_r, src43_r);
1407cabdff1aSopenharmony_ci    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
1408cabdff1aSopenharmony_ci               src76_r, src87_r);
1409cabdff1aSopenharmony_ci    ILVR_B4_SB(src9, src8, src10, src9, src11, src10, src12, src11, src98_r,
1410cabdff1aSopenharmony_ci               src109_r, src1110_r, src1211_r);
1411cabdff1aSopenharmony_ci    out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
1412cabdff1aSopenharmony_ci    out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
1413cabdff1aSopenharmony_ci    out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
1414cabdff1aSopenharmony_ci    out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
1415cabdff1aSopenharmony_ci    out4_r = AVC_DOT_SH3_SH(src54_r, src76_r, src98_r, filt0, filt1, filt2);
1416cabdff1aSopenharmony_ci    out5_r = AVC_DOT_SH3_SH(src65_r, src87_r, src109_r, filt0, filt1, filt2);
1417cabdff1aSopenharmony_ci    out6_r = AVC_DOT_SH3_SH(src76_r, src98_r, src1110_r, filt0, filt1, filt2);
1418cabdff1aSopenharmony_ci    out7_r = AVC_DOT_SH3_SH(src87_r, src109_r, src1211_r, filt0, filt1, filt2);
1419cabdff1aSopenharmony_ci    PCKEV_D2_SB(src4, src3, src6, src5, tmp0, tmp1);
1420cabdff1aSopenharmony_ci    PCKEV_D2_SB(src8, src7, src10, src9, tmp2, tmp3);
1421cabdff1aSopenharmony_ci    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
1422cabdff1aSopenharmony_ci    SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5);
1423cabdff1aSopenharmony_ci    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1424cabdff1aSopenharmony_ci    SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
1425cabdff1aSopenharmony_ci    PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, out0, out1);
1426cabdff1aSopenharmony_ci    PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3);
1427cabdff1aSopenharmony_ci    out0 = __msa_aver_s_b(out0, tmp0);
1428cabdff1aSopenharmony_ci    out1 = __msa_aver_s_b(out1, tmp1);
1429cabdff1aSopenharmony_ci    out2 = __msa_aver_s_b(out2, tmp2);
1430cabdff1aSopenharmony_ci    out3 = __msa_aver_s_b(out3, tmp3);
1431cabdff1aSopenharmony_ci    XORI_B4_128_SB(out0, out1, out2, out3);
1432cabdff1aSopenharmony_ci    ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
1433cabdff1aSopenharmony_ci}
1434cabdff1aSopenharmony_ci
1435cabdff1aSopenharmony_civoid ff_put_h264_qpel4_mc01_msa(uint8_t *dst, const uint8_t *src,
1436cabdff1aSopenharmony_ci                                ptrdiff_t stride)
1437cabdff1aSopenharmony_ci{
1438cabdff1aSopenharmony_ci    int16_t filt_const0 = 0xfb01;
1439cabdff1aSopenharmony_ci    int16_t filt_const1 = 0x1414;
1440cabdff1aSopenharmony_ci    int16_t filt_const2 = 0x1fb;
1441cabdff1aSopenharmony_ci    v16u8 out;
1442cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1443cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
1444cabdff1aSopenharmony_ci    v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
1445cabdff1aSopenharmony_ci    v8i16 out10, out32;
1446cabdff1aSopenharmony_ci
1447cabdff1aSopenharmony_ci    filt0 = (v16i8) __msa_fill_h(filt_const0);
1448cabdff1aSopenharmony_ci    filt1 = (v16i8) __msa_fill_h(filt_const1);
1449cabdff1aSopenharmony_ci    filt2 = (v16i8) __msa_fill_h(filt_const2);
1450cabdff1aSopenharmony_ci
1451cabdff1aSopenharmony_ci    src -= (stride * 2);
1452cabdff1aSopenharmony_ci
1453cabdff1aSopenharmony_ci    LD_SB5(src, stride, src0, src1, src2, src3, src4);
1454cabdff1aSopenharmony_ci    src += (5 * stride);
1455cabdff1aSopenharmony_ci    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
1456cabdff1aSopenharmony_ci               src32_r, src43_r);
1457cabdff1aSopenharmony_ci    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
1458cabdff1aSopenharmony_ci    XORI_B2_128_SB(src2110, src4332);
1459cabdff1aSopenharmony_ci    LD_SB4(src, stride, src5, src6, src7, src8);
1460cabdff1aSopenharmony_ci    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
1461cabdff1aSopenharmony_ci               src76_r, src87_r);
1462cabdff1aSopenharmony_ci    ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
1463cabdff1aSopenharmony_ci    XORI_B2_128_SB(src6554, src8776);
1464cabdff1aSopenharmony_ci    out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
1465cabdff1aSopenharmony_ci    out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
1466cabdff1aSopenharmony_ci    SRARI_H2_SH(out10, out32, 5);
1467cabdff1aSopenharmony_ci    SAT_SH2_SH(out10, out32, 7);
1468cabdff1aSopenharmony_ci    out = PCKEV_XORI128_UB(out10, out32);
1469cabdff1aSopenharmony_ci    src32_r = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
1470cabdff1aSopenharmony_ci    src54_r = (v16i8) __msa_insve_w((v4i32) src4, 1, (v4i32) src5);
1471cabdff1aSopenharmony_ci    src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
1472cabdff1aSopenharmony_ci    out = __msa_aver_u_b(out, (v16u8) src32_r);
1473cabdff1aSopenharmony_ci    ST_W4(out, 0, 1, 2, 3, dst, stride);
1474cabdff1aSopenharmony_ci}
1475cabdff1aSopenharmony_ci
1476cabdff1aSopenharmony_civoid ff_put_h264_qpel4_mc03_msa(uint8_t *dst, const uint8_t *src,
1477cabdff1aSopenharmony_ci                                ptrdiff_t stride)
1478cabdff1aSopenharmony_ci{
1479cabdff1aSopenharmony_ci    int16_t filt_const0 = 0xfb01;
1480cabdff1aSopenharmony_ci    int16_t filt_const1 = 0x1414;
1481cabdff1aSopenharmony_ci    int16_t filt_const2 = 0x1fb;
1482cabdff1aSopenharmony_ci    v16u8 out;
1483cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1484cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
1485cabdff1aSopenharmony_ci    v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
1486cabdff1aSopenharmony_ci    v8i16 out10, out32;
1487cabdff1aSopenharmony_ci
1488cabdff1aSopenharmony_ci    filt0 = (v16i8) __msa_fill_h(filt_const0);
1489cabdff1aSopenharmony_ci    filt1 = (v16i8) __msa_fill_h(filt_const1);
1490cabdff1aSopenharmony_ci    filt2 = (v16i8) __msa_fill_h(filt_const2);
1491cabdff1aSopenharmony_ci
1492cabdff1aSopenharmony_ci    src -= (stride * 2);
1493cabdff1aSopenharmony_ci
1494cabdff1aSopenharmony_ci    LD_SB5(src, stride, src0, src1, src2, src3, src4);
1495cabdff1aSopenharmony_ci    src += (5 * stride);
1496cabdff1aSopenharmony_ci    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
1497cabdff1aSopenharmony_ci               src32_r, src43_r);
1498cabdff1aSopenharmony_ci    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
1499cabdff1aSopenharmony_ci    XORI_B2_128_SB(src2110, src4332);
1500cabdff1aSopenharmony_ci    LD_SB4(src, stride, src5, src6, src7, src8);
1501cabdff1aSopenharmony_ci    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
1502cabdff1aSopenharmony_ci               src76_r, src87_r);
1503cabdff1aSopenharmony_ci    ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
1504cabdff1aSopenharmony_ci    XORI_B2_128_SB(src6554, src8776);
1505cabdff1aSopenharmony_ci    out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
1506cabdff1aSopenharmony_ci    out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
1507cabdff1aSopenharmony_ci    SRARI_H2_SH(out10, out32, 5);
1508cabdff1aSopenharmony_ci    SAT_SH2_SH(out10, out32, 7);
1509cabdff1aSopenharmony_ci    out = PCKEV_XORI128_UB(out10, out32);
1510cabdff1aSopenharmony_ci    src32_r = (v16i8) __msa_insve_w((v4i32) src3, 1, (v4i32) src4);
1511cabdff1aSopenharmony_ci    src54_r = (v16i8) __msa_insve_w((v4i32) src5, 1, (v4i32) src6);
1512cabdff1aSopenharmony_ci    src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
1513cabdff1aSopenharmony_ci    out = __msa_aver_u_b(out, (v16u8) src32_r);
1514cabdff1aSopenharmony_ci    ST_W4(out, 0, 1, 2, 3, dst, stride);
1515cabdff1aSopenharmony_ci}
1516cabdff1aSopenharmony_ci
/*
 * 16x16 put, position mc11: forwards to avc_luma_hv_qrt_16x16_msa() with
 * src moved two bytes left as the first source pointer and src moved two
 * rows up as the second.
 */
void ff_put_h264_qpel16_mc11_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    const uint8_t *left2 = src - 2;
    const uint8_t *up2 = src - (stride * 2);

    avc_luma_hv_qrt_16x16_msa(left2, up2, dst, stride);
}
1522cabdff1aSopenharmony_ci
/*
 * 16x16 put, position mc31: forwards to avc_luma_hv_qrt_16x16_msa() with
 * src moved two bytes left as the first source pointer and src moved two
 * rows up and one byte right as the second.
 */
void ff_put_h264_qpel16_mc31_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    const uint8_t *left2 = src - 2;
    const uint8_t *up2_right1 = src - (stride * 2) + 1;

    avc_luma_hv_qrt_16x16_msa(left2, up2_right1, dst, stride);
}
1528cabdff1aSopenharmony_ci
/*
 * 16x16 put, position mc13: forwards to avc_luma_hv_qrt_16x16_msa() with
 * src moved one row down and two bytes left as the first source pointer and
 * src moved two rows up as the second.
 */
void ff_put_h264_qpel16_mc13_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    const uint8_t *down1_left2 = src + stride - 2;
    const uint8_t *up2 = src - (stride * 2);

    avc_luma_hv_qrt_16x16_msa(down1_left2, up2, dst, stride);
}
1535cabdff1aSopenharmony_ci
/*
 * 16x16 put, position mc33: forwards to avc_luma_hv_qrt_16x16_msa() with
 * src moved one row down and two bytes left as the first source pointer and
 * src moved two rows up and one byte right as the second.
 */
void ff_put_h264_qpel16_mc33_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    const uint8_t *down1_left2 = src + stride - 2;
    const uint8_t *up2_right1 = src - (stride * 2) + 1;

    avc_luma_hv_qrt_16x16_msa(down1_left2, up2_right1, dst, stride);
}
1542cabdff1aSopenharmony_ci
/*
 * 8x8 put, position mc11: forwards to avc_luma_hv_qrt_8x8_msa() with src
 * moved two bytes left as the first source pointer and src moved two rows
 * up as the second.
 */
void ff_put_h264_qpel8_mc11_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const uint8_t *left2 = src - 2;
    const uint8_t *up2 = src - (stride * 2);

    avc_luma_hv_qrt_8x8_msa(left2, up2, dst, stride);
}
1548cabdff1aSopenharmony_ci
/*
 * 8x8 put, position mc31: forwards to avc_luma_hv_qrt_8x8_msa() with src
 * moved two bytes left as the first source pointer and src moved two rows
 * up and one byte right as the second.
 */
void ff_put_h264_qpel8_mc31_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const uint8_t *left2 = src - 2;
    const uint8_t *up2_right1 = src - (stride * 2) + 1;

    avc_luma_hv_qrt_8x8_msa(left2, up2_right1, dst, stride);
}
1554cabdff1aSopenharmony_ci
/*
 * 8x8 put, position mc13: forwards to avc_luma_hv_qrt_8x8_msa() with src
 * moved one row down and two bytes left as the first source pointer and src
 * moved two rows up as the second.
 */
void ff_put_h264_qpel8_mc13_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const uint8_t *down1_left2 = src + stride - 2;
    const uint8_t *up2 = src - (stride * 2);

    avc_luma_hv_qrt_8x8_msa(down1_left2, up2, dst, stride);
}
1560cabdff1aSopenharmony_ci
/*
 * 8x8 put, position mc33: forwards to avc_luma_hv_qrt_8x8_msa() with src
 * moved one row down and two bytes left as the first source pointer and src
 * moved two rows up and one byte right as the second.
 */
void ff_put_h264_qpel8_mc33_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const uint8_t *down1_left2 = src + stride - 2;
    const uint8_t *up2_right1 = src - (stride * 2) + 1;

    avc_luma_hv_qrt_8x8_msa(down1_left2, up2_right1, dst, stride);
}
1567cabdff1aSopenharmony_ci
1568cabdff1aSopenharmony_ci
/*
 * 4x4 put, position mc11: forwards to avc_luma_hv_qrt_4x4_msa() with src
 * moved two bytes left as the first source pointer and src moved two rows
 * up as the second.
 */
void ff_put_h264_qpel4_mc11_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const uint8_t *left2 = src - 2;
    const uint8_t *up2 = src - (stride * 2);

    avc_luma_hv_qrt_4x4_msa(left2, up2, dst, stride);
}
1574cabdff1aSopenharmony_ci
/*
 * 4x4 put, position mc31: forwards to avc_luma_hv_qrt_4x4_msa() with src
 * moved two bytes left as the first source pointer and src moved two rows
 * up and one byte right as the second.
 */
void ff_put_h264_qpel4_mc31_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const uint8_t *left2 = src - 2;
    const uint8_t *up2_right1 = src - (stride * 2) + 1;

    avc_luma_hv_qrt_4x4_msa(left2, up2_right1, dst, stride);
}
1580cabdff1aSopenharmony_ci
/*
 * 4x4 put, position mc13: forwards to avc_luma_hv_qrt_4x4_msa() with src
 * moved one row down and two bytes left as the first source pointer and src
 * moved two rows up as the second.
 */
void ff_put_h264_qpel4_mc13_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const uint8_t *down1_left2 = src + stride - 2;
    const uint8_t *up2 = src - (stride * 2);

    avc_luma_hv_qrt_4x4_msa(down1_left2, up2, dst, stride);
}
1586cabdff1aSopenharmony_ci
/*
 * 4x4 put, position mc33: forwards to avc_luma_hv_qrt_4x4_msa() with src
 * moved one row down and two bytes left as the first source pointer and src
 * moved two rows up and one byte right as the second.
 */
void ff_put_h264_qpel4_mc33_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const uint8_t *down1_left2 = src + stride - 2;
    const uint8_t *up2_right1 = src - (stride * 2) + 1;

    avc_luma_hv_qrt_4x4_msa(down1_left2, up2_right1, dst, stride);
}
1593cabdff1aSopenharmony_ci
1594cabdff1aSopenharmony_civoid ff_put_h264_qpel16_mc21_msa(uint8_t *dst, const uint8_t *src,
1595cabdff1aSopenharmony_ci                                 ptrdiff_t stride)
1596cabdff1aSopenharmony_ci{
1597cabdff1aSopenharmony_ci    uint8_t *dst_tmp = dst;
1598cabdff1aSopenharmony_ci    const uint8_t *src_tmp = src - (2 * stride) - 2;
1599cabdff1aSopenharmony_ci    uint32_t multiple8_cnt, loop_cnt;
1600cabdff1aSopenharmony_ci    const int32_t filt_const0 = 0xfffb0001;
1601cabdff1aSopenharmony_ci    const int32_t filt_const1 = 0x140014;
1602cabdff1aSopenharmony_ci    const int32_t filt_const2 = 0x1fffb;
1603cabdff1aSopenharmony_ci    v16u8 out0, out1;
1604cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask0, mask1;
1605cabdff1aSopenharmony_ci    v16i8 mask2;
1606cabdff1aSopenharmony_ci    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1607cabdff1aSopenharmony_ci    v8i16 hz_out7, hz_out8, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1608cabdff1aSopenharmony_ci    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
1609cabdff1aSopenharmony_ci    v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
1610cabdff1aSopenharmony_ci    v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
1611cabdff1aSopenharmony_ci    v8i16 hz_out87_l, filt0, filt1, filt2;
1612cabdff1aSopenharmony_ci    v4i32 tmp0, tmp1;
1613cabdff1aSopenharmony_ci
1614cabdff1aSopenharmony_ci    filt0 = (v8i16) __msa_fill_w(filt_const0);
1615cabdff1aSopenharmony_ci    filt1 = (v8i16) __msa_fill_w(filt_const1);
1616cabdff1aSopenharmony_ci    filt2 = (v8i16) __msa_fill_w(filt_const2);
1617cabdff1aSopenharmony_ci
1618cabdff1aSopenharmony_ci    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
1619cabdff1aSopenharmony_ci
1620cabdff1aSopenharmony_ci    for (multiple8_cnt = 2; multiple8_cnt--;) {
1621cabdff1aSopenharmony_ci        dst = dst_tmp;
1622cabdff1aSopenharmony_ci        src = src_tmp;
1623cabdff1aSopenharmony_ci
1624cabdff1aSopenharmony_ci        LD_SB5(src, stride, src0, src1, src2, src3, src4);
1625cabdff1aSopenharmony_ci        XORI_B5_128_SB(src0, src1, src2, src3, src4);
1626cabdff1aSopenharmony_ci        src += (5 * stride);
1627cabdff1aSopenharmony_ci
1628cabdff1aSopenharmony_ci        hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
1629cabdff1aSopenharmony_ci        hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
1630cabdff1aSopenharmony_ci        hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
1631cabdff1aSopenharmony_ci        hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
1632cabdff1aSopenharmony_ci        hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
1633cabdff1aSopenharmony_ci
1634cabdff1aSopenharmony_ci        for (loop_cnt = 4; loop_cnt--;) {
1635cabdff1aSopenharmony_ci            LD_SB4(src, stride, src5, src6, src7, src8);
1636cabdff1aSopenharmony_ci            src += (4 * stride);
1637cabdff1aSopenharmony_ci
1638cabdff1aSopenharmony_ci            XORI_B4_128_SB(src5, src6, src7, src8);
1639cabdff1aSopenharmony_ci
1640cabdff1aSopenharmony_ci            hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
1641cabdff1aSopenharmony_ci            hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
1642cabdff1aSopenharmony_ci            hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
1643cabdff1aSopenharmony_ci            hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
1644cabdff1aSopenharmony_ci
1645cabdff1aSopenharmony_ci            ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
1646cabdff1aSopenharmony_ci                       hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
1647cabdff1aSopenharmony_ci                       hz_out43_r);
1648cabdff1aSopenharmony_ci            ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
1649cabdff1aSopenharmony_ci                       hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
1650cabdff1aSopenharmony_ci                       hz_out43_l);
1651cabdff1aSopenharmony_ci            ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
1652cabdff1aSopenharmony_ci                       hz_out8, hz_out7, hz_out54_r, hz_out65_r, hz_out76_r,
1653cabdff1aSopenharmony_ci                       hz_out87_r);
1654cabdff1aSopenharmony_ci            ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
1655cabdff1aSopenharmony_ci                       hz_out8, hz_out7, hz_out54_l, hz_out65_l, hz_out76_l,
1656cabdff1aSopenharmony_ci                       hz_out87_l);
1657cabdff1aSopenharmony_ci
1658cabdff1aSopenharmony_ci            tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0,
1659cabdff1aSopenharmony_ci                                  filt1, filt2);
1660cabdff1aSopenharmony_ci            tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0,
1661cabdff1aSopenharmony_ci                                  filt1, filt2);
1662cabdff1aSopenharmony_ci            dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1663cabdff1aSopenharmony_ci            tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0,
1664cabdff1aSopenharmony_ci                                  filt1, filt2);
1665cabdff1aSopenharmony_ci            tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0,
1666cabdff1aSopenharmony_ci                                  filt1, filt2);
1667cabdff1aSopenharmony_ci            dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1668cabdff1aSopenharmony_ci            tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0,
1669cabdff1aSopenharmony_ci                                  filt1, filt2);
1670cabdff1aSopenharmony_ci            tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0,
1671cabdff1aSopenharmony_ci                                  filt1, filt2);
1672cabdff1aSopenharmony_ci            dst4 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1673cabdff1aSopenharmony_ci            tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0,
1674cabdff1aSopenharmony_ci                                  filt1, filt2);
1675cabdff1aSopenharmony_ci            tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0,
1676cabdff1aSopenharmony_ci                                  filt1, filt2);
1677cabdff1aSopenharmony_ci            dst6 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1678cabdff1aSopenharmony_ci
1679cabdff1aSopenharmony_ci            dst1 = __msa_srari_h(hz_out2, 5);
1680cabdff1aSopenharmony_ci            dst3 = __msa_srari_h(hz_out3, 5);
1681cabdff1aSopenharmony_ci            dst5 = __msa_srari_h(hz_out4, 5);
1682cabdff1aSopenharmony_ci            dst7 = __msa_srari_h(hz_out5, 5);
1683cabdff1aSopenharmony_ci            SAT_SH4_SH(dst1, dst3, dst5, dst7, 7);
1684cabdff1aSopenharmony_ci
1685cabdff1aSopenharmony_ci            dst0 = __msa_aver_s_h(dst0, dst1);
1686cabdff1aSopenharmony_ci            dst1 = __msa_aver_s_h(dst2, dst3);
1687cabdff1aSopenharmony_ci            dst2 = __msa_aver_s_h(dst4, dst5);
1688cabdff1aSopenharmony_ci            dst3 = __msa_aver_s_h(dst6, dst7);
1689cabdff1aSopenharmony_ci
1690cabdff1aSopenharmony_ci            out0 = PCKEV_XORI128_UB(dst0, dst1);
1691cabdff1aSopenharmony_ci            out1 = PCKEV_XORI128_UB(dst2, dst3);
1692cabdff1aSopenharmony_ci            ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
1693cabdff1aSopenharmony_ci            dst += (4 * stride);
1694cabdff1aSopenharmony_ci
1695cabdff1aSopenharmony_ci            hz_out0 = hz_out4;
1696cabdff1aSopenharmony_ci            hz_out1 = hz_out5;
1697cabdff1aSopenharmony_ci            hz_out2 = hz_out6;
1698cabdff1aSopenharmony_ci            hz_out3 = hz_out7;
1699cabdff1aSopenharmony_ci            hz_out4 = hz_out8;
1700cabdff1aSopenharmony_ci        }
1701cabdff1aSopenharmony_ci
1702cabdff1aSopenharmony_ci        src_tmp += 8;
1703cabdff1aSopenharmony_ci        dst_tmp += 8;
1704cabdff1aSopenharmony_ci    }
1705cabdff1aSopenharmony_ci}
1706cabdff1aSopenharmony_ci
1707cabdff1aSopenharmony_civoid ff_put_h264_qpel16_mc23_msa(uint8_t *dst, const uint8_t *src,
1708cabdff1aSopenharmony_ci                                 ptrdiff_t stride)
1709cabdff1aSopenharmony_ci{
1710cabdff1aSopenharmony_ci    uint8_t *dst_tmp = dst;
1711cabdff1aSopenharmony_ci    const uint8_t *src_tmp = src - (2 * stride) - 2;
1712cabdff1aSopenharmony_ci    uint32_t multiple8_cnt, loop_cnt;
1713cabdff1aSopenharmony_ci    const int32_t filt_const0 = 0xfffb0001;
1714cabdff1aSopenharmony_ci    const int32_t filt_const1 = 0x140014;
1715cabdff1aSopenharmony_ci    const int32_t filt_const2 = 0x1fffb;
1716cabdff1aSopenharmony_ci    v16u8 out0, out1;
1717cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask0, mask1;
1718cabdff1aSopenharmony_ci    v16i8 mask2;
1719cabdff1aSopenharmony_ci    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1720cabdff1aSopenharmony_ci    v8i16 hz_out7, hz_out8, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1721cabdff1aSopenharmony_ci    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
1722cabdff1aSopenharmony_ci    v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
1723cabdff1aSopenharmony_ci    v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
1724cabdff1aSopenharmony_ci    v8i16 hz_out87_l, filt0, filt1, filt2;
1725cabdff1aSopenharmony_ci    v4i32 tmp0, tmp1;
1726cabdff1aSopenharmony_ci
1727cabdff1aSopenharmony_ci    filt0 = (v8i16) __msa_fill_w(filt_const0);
1728cabdff1aSopenharmony_ci    filt1 = (v8i16) __msa_fill_w(filt_const1);
1729cabdff1aSopenharmony_ci    filt2 = (v8i16) __msa_fill_w(filt_const2);
1730cabdff1aSopenharmony_ci
1731cabdff1aSopenharmony_ci    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
1732cabdff1aSopenharmony_ci
1733cabdff1aSopenharmony_ci    for (multiple8_cnt = 2; multiple8_cnt--;) {
1734cabdff1aSopenharmony_ci        dst = dst_tmp;
1735cabdff1aSopenharmony_ci        src = src_tmp;
1736cabdff1aSopenharmony_ci
1737cabdff1aSopenharmony_ci        LD_SB5(src, stride, src0, src1, src2, src3, src4);
1738cabdff1aSopenharmony_ci        XORI_B5_128_SB(src0, src1, src2, src3, src4);
1739cabdff1aSopenharmony_ci        src += (5 * stride);
1740cabdff1aSopenharmony_ci
1741cabdff1aSopenharmony_ci        hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
1742cabdff1aSopenharmony_ci        hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
1743cabdff1aSopenharmony_ci        hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
1744cabdff1aSopenharmony_ci        hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
1745cabdff1aSopenharmony_ci        hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
1746cabdff1aSopenharmony_ci
1747cabdff1aSopenharmony_ci        for (loop_cnt = 4; loop_cnt--;) {
1748cabdff1aSopenharmony_ci            LD_SB4(src, stride, src5, src6, src7, src8);
1749cabdff1aSopenharmony_ci            src += (4 * stride);
1750cabdff1aSopenharmony_ci
1751cabdff1aSopenharmony_ci            XORI_B4_128_SB(src5, src6, src7, src8);
1752cabdff1aSopenharmony_ci
1753cabdff1aSopenharmony_ci            hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
1754cabdff1aSopenharmony_ci            hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
1755cabdff1aSopenharmony_ci            hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
1756cabdff1aSopenharmony_ci            hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
1757cabdff1aSopenharmony_ci
1758cabdff1aSopenharmony_ci            ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
1759cabdff1aSopenharmony_ci                       hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
1760cabdff1aSopenharmony_ci                       hz_out43_r);
1761cabdff1aSopenharmony_ci            ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
1762cabdff1aSopenharmony_ci                       hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
1763cabdff1aSopenharmony_ci                       hz_out43_l);
1764cabdff1aSopenharmony_ci            ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
1765cabdff1aSopenharmony_ci                       hz_out8, hz_out7, hz_out54_r, hz_out65_r, hz_out76_r,
1766cabdff1aSopenharmony_ci                       hz_out87_r);
1767cabdff1aSopenharmony_ci            ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
1768cabdff1aSopenharmony_ci                       hz_out8, hz_out7, hz_out54_l, hz_out65_l, hz_out76_l,
1769cabdff1aSopenharmony_ci                       hz_out87_l);
1770cabdff1aSopenharmony_ci
1771cabdff1aSopenharmony_ci            tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0,
1772cabdff1aSopenharmony_ci                                  filt1, filt2);
1773cabdff1aSopenharmony_ci            tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0,
1774cabdff1aSopenharmony_ci                                  filt1, filt2);
1775cabdff1aSopenharmony_ci            dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1776cabdff1aSopenharmony_ci            tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0,
1777cabdff1aSopenharmony_ci                                  filt1, filt2);
1778cabdff1aSopenharmony_ci            tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0,
1779cabdff1aSopenharmony_ci                                  filt1, filt2);
1780cabdff1aSopenharmony_ci            dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1781cabdff1aSopenharmony_ci            tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0,
1782cabdff1aSopenharmony_ci                                  filt1, filt2);
1783cabdff1aSopenharmony_ci            tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0,
1784cabdff1aSopenharmony_ci                                  filt1, filt2);
1785cabdff1aSopenharmony_ci            dst4 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1786cabdff1aSopenharmony_ci            tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0,
1787cabdff1aSopenharmony_ci                                  filt1, filt2);
1788cabdff1aSopenharmony_ci            tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0,
1789cabdff1aSopenharmony_ci                                  filt1, filt2);
1790cabdff1aSopenharmony_ci            dst6 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1791cabdff1aSopenharmony_ci
1792cabdff1aSopenharmony_ci            dst1 = __msa_srari_h(hz_out3, 5);
1793cabdff1aSopenharmony_ci            dst3 = __msa_srari_h(hz_out4, 5);
1794cabdff1aSopenharmony_ci            dst5 = __msa_srari_h(hz_out5, 5);
1795cabdff1aSopenharmony_ci            dst7 = __msa_srari_h(hz_out6, 5);
1796cabdff1aSopenharmony_ci            SAT_SH4_SH(dst1, dst3, dst5, dst7, 7);
1797cabdff1aSopenharmony_ci
1798cabdff1aSopenharmony_ci            dst0 = __msa_aver_s_h(dst0, dst1);
1799cabdff1aSopenharmony_ci            dst1 = __msa_aver_s_h(dst2, dst3);
1800cabdff1aSopenharmony_ci            dst2 = __msa_aver_s_h(dst4, dst5);
1801cabdff1aSopenharmony_ci            dst3 = __msa_aver_s_h(dst6, dst7);
1802cabdff1aSopenharmony_ci
1803cabdff1aSopenharmony_ci            out0 = PCKEV_XORI128_UB(dst0, dst1);
1804cabdff1aSopenharmony_ci            out1 = PCKEV_XORI128_UB(dst2, dst3);
1805cabdff1aSopenharmony_ci            ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
1806cabdff1aSopenharmony_ci            dst += (4 * stride);
1807cabdff1aSopenharmony_ci
1808cabdff1aSopenharmony_ci            hz_out0 = hz_out4;
1809cabdff1aSopenharmony_ci            hz_out1 = hz_out5;
1810cabdff1aSopenharmony_ci            hz_out2 = hz_out6;
1811cabdff1aSopenharmony_ci            hz_out3 = hz_out7;
1812cabdff1aSopenharmony_ci            hz_out4 = hz_out8;
1813cabdff1aSopenharmony_ci        }
1814cabdff1aSopenharmony_ci
1815cabdff1aSopenharmony_ci        src_tmp += 8;
1816cabdff1aSopenharmony_ci        dst_tmp += 8;
1817cabdff1aSopenharmony_ci    }
1818cabdff1aSopenharmony_ci}
1819cabdff1aSopenharmony_ci
1820cabdff1aSopenharmony_civoid ff_put_h264_qpel8_mc21_msa(uint8_t *dst, const uint8_t *src,
1821cabdff1aSopenharmony_ci                                ptrdiff_t stride)
1822cabdff1aSopenharmony_ci{
1823cabdff1aSopenharmony_ci    const int32_t filt_const0 = 0xfffb0001;
1824cabdff1aSopenharmony_ci    const int32_t filt_const1 = 0x140014;
1825cabdff1aSopenharmony_ci    const int32_t filt_const2 = 0x1fffb;
1826cabdff1aSopenharmony_ci    v16u8 out0, out1;
1827cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1828cabdff1aSopenharmony_ci    v16i8 src11, src12, mask0, mask1, mask2;
1829cabdff1aSopenharmony_ci    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1830cabdff1aSopenharmony_ci    v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
1831cabdff1aSopenharmony_ci    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
1832cabdff1aSopenharmony_ci    v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
1833cabdff1aSopenharmony_ci    v8i16 hz_out1110_r, hz_out1211_r, dst0, dst1, dst2, dst3;
1834cabdff1aSopenharmony_ci    v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
1835cabdff1aSopenharmony_ci    v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
1836cabdff1aSopenharmony_ci    v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
1837cabdff1aSopenharmony_ci    v4i32 tmp0, tmp1;
1838cabdff1aSopenharmony_ci
1839cabdff1aSopenharmony_ci    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
1840cabdff1aSopenharmony_ci
1841cabdff1aSopenharmony_ci    filt0 = (v8i16) __msa_fill_w(filt_const0);
1842cabdff1aSopenharmony_ci    filt1 = (v8i16) __msa_fill_w(filt_const1);
1843cabdff1aSopenharmony_ci    filt2 = (v8i16) __msa_fill_w(filt_const2);
1844cabdff1aSopenharmony_ci
1845cabdff1aSopenharmony_ci    src -= ((2 * stride) + 2);
1846cabdff1aSopenharmony_ci
1847cabdff1aSopenharmony_ci    LD_SB5(src, stride, src0, src1, src2, src3, src4);
1848cabdff1aSopenharmony_ci    XORI_B5_128_SB(src0, src1, src2, src3, src4);
1849cabdff1aSopenharmony_ci    src += (5 * stride);
1850cabdff1aSopenharmony_ci
1851cabdff1aSopenharmony_ci    hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
1852cabdff1aSopenharmony_ci    hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
1853cabdff1aSopenharmony_ci    hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
1854cabdff1aSopenharmony_ci    hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
1855cabdff1aSopenharmony_ci    hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
1856cabdff1aSopenharmony_ci
1857cabdff1aSopenharmony_ci    LD_SB4(src, stride, src5, src6, src7, src8);
1858cabdff1aSopenharmony_ci    src += (4 * stride);
1859cabdff1aSopenharmony_ci    XORI_B4_128_SB(src5, src6, src7, src8);
1860cabdff1aSopenharmony_ci
1861cabdff1aSopenharmony_ci    hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
1862cabdff1aSopenharmony_ci    hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
1863cabdff1aSopenharmony_ci    hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
1864cabdff1aSopenharmony_ci    hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
1865cabdff1aSopenharmony_ci
1866cabdff1aSopenharmony_ci    ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
1867cabdff1aSopenharmony_ci               hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
1868cabdff1aSopenharmony_ci    ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
1869cabdff1aSopenharmony_ci               hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
1870cabdff1aSopenharmony_ci    ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
1871cabdff1aSopenharmony_ci               hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
1872cabdff1aSopenharmony_ci    ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
1873cabdff1aSopenharmony_ci               hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);
1874cabdff1aSopenharmony_ci
1875cabdff1aSopenharmony_ci    tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
1876cabdff1aSopenharmony_ci                          filt2);
1877cabdff1aSopenharmony_ci    tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
1878cabdff1aSopenharmony_ci                          filt2);
1879cabdff1aSopenharmony_ci    dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1880cabdff1aSopenharmony_ci    tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
1881cabdff1aSopenharmony_ci                          filt2);
1882cabdff1aSopenharmony_ci    tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
1883cabdff1aSopenharmony_ci                          filt2);
1884cabdff1aSopenharmony_ci    dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1885cabdff1aSopenharmony_ci    tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
1886cabdff1aSopenharmony_ci                          filt2);
1887cabdff1aSopenharmony_ci    tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
1888cabdff1aSopenharmony_ci                          filt2);
1889cabdff1aSopenharmony_ci    dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1890cabdff1aSopenharmony_ci    tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
1891cabdff1aSopenharmony_ci                          filt2);
1892cabdff1aSopenharmony_ci    tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
1893cabdff1aSopenharmony_ci                          filt2);
1894cabdff1aSopenharmony_ci    dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1895cabdff1aSopenharmony_ci
1896cabdff1aSopenharmony_ci    SRARI_H4_SH(hz_out2, hz_out3, hz_out4, hz_out5, 5);
1897cabdff1aSopenharmony_ci    SAT_SH4_SH(hz_out2, hz_out3, hz_out4, hz_out5, 7);
1898cabdff1aSopenharmony_ci
1899cabdff1aSopenharmony_ci    dst0 = __msa_aver_s_h(dst0, hz_out2);
1900cabdff1aSopenharmony_ci    dst1 = __msa_aver_s_h(dst1, hz_out3);
1901cabdff1aSopenharmony_ci    dst2 = __msa_aver_s_h(dst2, hz_out4);
1902cabdff1aSopenharmony_ci    dst3 = __msa_aver_s_h(dst3, hz_out5);
1903cabdff1aSopenharmony_ci
1904cabdff1aSopenharmony_ci    out0 = PCKEV_XORI128_UB(dst0, dst1);
1905cabdff1aSopenharmony_ci    out1 = PCKEV_XORI128_UB(dst2, dst3);
1906cabdff1aSopenharmony_ci    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
1907cabdff1aSopenharmony_ci    dst += (4 * stride);
1908cabdff1aSopenharmony_ci
1909cabdff1aSopenharmony_ci    LD_SB4(src, stride, src9, src10, src11, src12);
1910cabdff1aSopenharmony_ci    XORI_B4_128_SB(src9, src10, src11, src12);
1911cabdff1aSopenharmony_ci    hz_out9 = AVC_HORZ_FILTER_SH(src9, src9, mask0, mask1, mask2);
1912cabdff1aSopenharmony_ci    hz_out10 = AVC_HORZ_FILTER_SH(src10, src10, mask0, mask1, mask2);
1913cabdff1aSopenharmony_ci    hz_out11 = AVC_HORZ_FILTER_SH(src11, src11, mask0, mask1, mask2);
1914cabdff1aSopenharmony_ci    hz_out12 = AVC_HORZ_FILTER_SH(src12, src12, mask0, mask1, mask2);
1915cabdff1aSopenharmony_ci    ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
1916cabdff1aSopenharmony_ci               hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
1917cabdff1aSopenharmony_ci               hz_out1211_r);
1918cabdff1aSopenharmony_ci    ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
1919cabdff1aSopenharmony_ci               hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
1920cabdff1aSopenharmony_ci               hz_out1211_l);
1921cabdff1aSopenharmony_ci    tmp0 = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
1922cabdff1aSopenharmony_ci                          filt2);
1923cabdff1aSopenharmony_ci    tmp1 = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
1924cabdff1aSopenharmony_ci                          filt2);
1925cabdff1aSopenharmony_ci    dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1926cabdff1aSopenharmony_ci    tmp0 = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
1927cabdff1aSopenharmony_ci                          filt2);
1928cabdff1aSopenharmony_ci    tmp1 = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
1929cabdff1aSopenharmony_ci                          filt2);
1930cabdff1aSopenharmony_ci    dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1931cabdff1aSopenharmony_ci    tmp0 = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
1932cabdff1aSopenharmony_ci                          filt2);
1933cabdff1aSopenharmony_ci    tmp1 = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
1934cabdff1aSopenharmony_ci                          filt2);
1935cabdff1aSopenharmony_ci    dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1936cabdff1aSopenharmony_ci    tmp0 = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
1937cabdff1aSopenharmony_ci                          filt2);
1938cabdff1aSopenharmony_ci    tmp1 = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
1939cabdff1aSopenharmony_ci                          filt2);
1940cabdff1aSopenharmony_ci    dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1941cabdff1aSopenharmony_ci
1942cabdff1aSopenharmony_ci    SRARI_H4_SH(hz_out6, hz_out7, hz_out8, hz_out9, 5);
1943cabdff1aSopenharmony_ci    SAT_SH4_SH(hz_out6, hz_out7, hz_out8, hz_out9, 7);
1944cabdff1aSopenharmony_ci
1945cabdff1aSopenharmony_ci    dst0 = __msa_aver_s_h(dst0, hz_out6);
1946cabdff1aSopenharmony_ci    dst1 = __msa_aver_s_h(dst1, hz_out7);
1947cabdff1aSopenharmony_ci    dst2 = __msa_aver_s_h(dst2, hz_out8);
1948cabdff1aSopenharmony_ci    dst3 = __msa_aver_s_h(dst3, hz_out9);
1949cabdff1aSopenharmony_ci
1950cabdff1aSopenharmony_ci    out0 = PCKEV_XORI128_UB(dst0, dst1);
1951cabdff1aSopenharmony_ci    out1 = PCKEV_XORI128_UB(dst2, dst3);
1952cabdff1aSopenharmony_ci    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
1953cabdff1aSopenharmony_ci}
1954cabdff1aSopenharmony_ci
1955cabdff1aSopenharmony_civoid ff_put_h264_qpel8_mc23_msa(uint8_t *dst, const uint8_t *src,
1956cabdff1aSopenharmony_ci                                ptrdiff_t stride)
1957cabdff1aSopenharmony_ci{
1958cabdff1aSopenharmony_ci    const int32_t filt_const0 = 0xfffb0001;
1959cabdff1aSopenharmony_ci    const int32_t filt_const1 = 0x140014;
1960cabdff1aSopenharmony_ci    const int32_t filt_const2 = 0x1fffb;
1961cabdff1aSopenharmony_ci    v16u8 out0, out1;
1962cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1963cabdff1aSopenharmony_ci    v16i8 src11, src12, mask0, mask1, mask2;
1964cabdff1aSopenharmony_ci    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1965cabdff1aSopenharmony_ci    v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
1966cabdff1aSopenharmony_ci    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
1967cabdff1aSopenharmony_ci    v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
1968cabdff1aSopenharmony_ci    v8i16 hz_out1110_r, hz_out1211_r, dst0, dst1, dst2, dst3;
1969cabdff1aSopenharmony_ci    v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
1970cabdff1aSopenharmony_ci    v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
1971cabdff1aSopenharmony_ci    v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
1972cabdff1aSopenharmony_ci    v4i32 tmp0, tmp1;
1973cabdff1aSopenharmony_ci
1974cabdff1aSopenharmony_ci    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
1975cabdff1aSopenharmony_ci
1976cabdff1aSopenharmony_ci    filt0 = (v8i16) __msa_fill_w(filt_const0);
1977cabdff1aSopenharmony_ci    filt1 = (v8i16) __msa_fill_w(filt_const1);
1978cabdff1aSopenharmony_ci    filt2 = (v8i16) __msa_fill_w(filt_const2);
1979cabdff1aSopenharmony_ci
1980cabdff1aSopenharmony_ci    src -= ((2 * stride) + 2);
1981cabdff1aSopenharmony_ci
1982cabdff1aSopenharmony_ci    LD_SB5(src, stride, src0, src1, src2, src3, src4);
1983cabdff1aSopenharmony_ci    XORI_B5_128_SB(src0, src1, src2, src3, src4);
1984cabdff1aSopenharmony_ci    src += (5 * stride);
1985cabdff1aSopenharmony_ci
1986cabdff1aSopenharmony_ci    hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
1987cabdff1aSopenharmony_ci    hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
1988cabdff1aSopenharmony_ci    hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
1989cabdff1aSopenharmony_ci    hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
1990cabdff1aSopenharmony_ci    hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
1991cabdff1aSopenharmony_ci
1992cabdff1aSopenharmony_ci    LD_SB4(src, stride, src5, src6, src7, src8);
1993cabdff1aSopenharmony_ci    src += (4 * stride);
1994cabdff1aSopenharmony_ci    XORI_B4_128_SB(src5, src6, src7, src8);
1995cabdff1aSopenharmony_ci
1996cabdff1aSopenharmony_ci    hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
1997cabdff1aSopenharmony_ci    hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
1998cabdff1aSopenharmony_ci    hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
1999cabdff1aSopenharmony_ci    hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
2000cabdff1aSopenharmony_ci
2001cabdff1aSopenharmony_ci    ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
2002cabdff1aSopenharmony_ci               hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
2003cabdff1aSopenharmony_ci    ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
2004cabdff1aSopenharmony_ci               hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
2005cabdff1aSopenharmony_ci    ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
2006cabdff1aSopenharmony_ci               hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
2007cabdff1aSopenharmony_ci    ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
2008cabdff1aSopenharmony_ci               hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);
2009cabdff1aSopenharmony_ci
2010cabdff1aSopenharmony_ci    tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
2011cabdff1aSopenharmony_ci                          filt2);
2012cabdff1aSopenharmony_ci    tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
2013cabdff1aSopenharmony_ci                          filt2);
2014cabdff1aSopenharmony_ci    dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2015cabdff1aSopenharmony_ci    tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
2016cabdff1aSopenharmony_ci                          filt2);
2017cabdff1aSopenharmony_ci    tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
2018cabdff1aSopenharmony_ci                          filt2);
2019cabdff1aSopenharmony_ci    dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2020cabdff1aSopenharmony_ci    tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
2021cabdff1aSopenharmony_ci                          filt2);
2022cabdff1aSopenharmony_ci    tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
2023cabdff1aSopenharmony_ci                          filt2);
2024cabdff1aSopenharmony_ci    dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2025cabdff1aSopenharmony_ci    tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
2026cabdff1aSopenharmony_ci                          filt2);
2027cabdff1aSopenharmony_ci    tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
2028cabdff1aSopenharmony_ci                          filt2);
2029cabdff1aSopenharmony_ci    dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2030cabdff1aSopenharmony_ci
2031cabdff1aSopenharmony_ci    SRARI_H4_SH(hz_out3, hz_out4, hz_out5, hz_out6, 5);
2032cabdff1aSopenharmony_ci    SAT_SH4_SH(hz_out3, hz_out4, hz_out5, hz_out6, 7);
2033cabdff1aSopenharmony_ci
2034cabdff1aSopenharmony_ci    dst0 = __msa_aver_s_h(dst0, hz_out3);
2035cabdff1aSopenharmony_ci    dst1 = __msa_aver_s_h(dst1, hz_out4);
2036cabdff1aSopenharmony_ci    dst2 = __msa_aver_s_h(dst2, hz_out5);
2037cabdff1aSopenharmony_ci    dst3 = __msa_aver_s_h(dst3, hz_out6);
2038cabdff1aSopenharmony_ci
2039cabdff1aSopenharmony_ci    out0 = PCKEV_XORI128_UB(dst0, dst1);
2040cabdff1aSopenharmony_ci    out1 = PCKEV_XORI128_UB(dst2, dst3);
2041cabdff1aSopenharmony_ci    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
2042cabdff1aSopenharmony_ci    dst += (4 * stride);
2043cabdff1aSopenharmony_ci
2044cabdff1aSopenharmony_ci    LD_SB4(src, stride, src9, src10, src11, src12);
2045cabdff1aSopenharmony_ci    XORI_B4_128_SB(src9, src10, src11, src12);
2046cabdff1aSopenharmony_ci    hz_out9 = AVC_HORZ_FILTER_SH(src9, src9, mask0, mask1, mask2);
2047cabdff1aSopenharmony_ci    hz_out10 = AVC_HORZ_FILTER_SH(src10, src10, mask0, mask1, mask2);
2048cabdff1aSopenharmony_ci    hz_out11 = AVC_HORZ_FILTER_SH(src11, src11, mask0, mask1, mask2);
2049cabdff1aSopenharmony_ci    hz_out12 = AVC_HORZ_FILTER_SH(src12, src12, mask0, mask1, mask2);
2050cabdff1aSopenharmony_ci    ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
2051cabdff1aSopenharmony_ci               hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
2052cabdff1aSopenharmony_ci               hz_out1211_r);
2053cabdff1aSopenharmony_ci    ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
2054cabdff1aSopenharmony_ci               hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
2055cabdff1aSopenharmony_ci               hz_out1211_l);
2056cabdff1aSopenharmony_ci    tmp0 = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
2057cabdff1aSopenharmony_ci                          filt2);
2058cabdff1aSopenharmony_ci    tmp1 = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
2059cabdff1aSopenharmony_ci                          filt2);
2060cabdff1aSopenharmony_ci    dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2061cabdff1aSopenharmony_ci    tmp0 = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
2062cabdff1aSopenharmony_ci                          filt2);
2063cabdff1aSopenharmony_ci    tmp1 = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
2064cabdff1aSopenharmony_ci                          filt2);
2065cabdff1aSopenharmony_ci    dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2066cabdff1aSopenharmony_ci    tmp0 = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
2067cabdff1aSopenharmony_ci                          filt2);
2068cabdff1aSopenharmony_ci    tmp1 = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
2069cabdff1aSopenharmony_ci                          filt2);
2070cabdff1aSopenharmony_ci    dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2071cabdff1aSopenharmony_ci    tmp0 = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
2072cabdff1aSopenharmony_ci                          filt2);
2073cabdff1aSopenharmony_ci    tmp1 = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
2074cabdff1aSopenharmony_ci                          filt2);
2075cabdff1aSopenharmony_ci    dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2076cabdff1aSopenharmony_ci
2077cabdff1aSopenharmony_ci    SRARI_H4_SH(hz_out7, hz_out8, hz_out9, hz_out10, 5);
2078cabdff1aSopenharmony_ci    SAT_SH4_SH(hz_out7, hz_out8, hz_out9, hz_out10, 7);
2079cabdff1aSopenharmony_ci
2080cabdff1aSopenharmony_ci    dst0 = __msa_aver_s_h(dst0, hz_out7);
2081cabdff1aSopenharmony_ci    dst1 = __msa_aver_s_h(dst1, hz_out8);
2082cabdff1aSopenharmony_ci    dst2 = __msa_aver_s_h(dst2, hz_out9);
2083cabdff1aSopenharmony_ci    dst3 = __msa_aver_s_h(dst3, hz_out10);
2084cabdff1aSopenharmony_ci
2085cabdff1aSopenharmony_ci    out0 = PCKEV_XORI128_UB(dst0, dst1);
2086cabdff1aSopenharmony_ci    out1 = PCKEV_XORI128_UB(dst2, dst3);
2087cabdff1aSopenharmony_ci    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
2088cabdff1aSopenharmony_ci}
2089cabdff1aSopenharmony_ci
2090cabdff1aSopenharmony_civoid ff_put_h264_qpel4_mc21_msa(uint8_t *dst, const uint8_t *src,
2091cabdff1aSopenharmony_ci                                ptrdiff_t stride)
2092cabdff1aSopenharmony_ci{
2093cabdff1aSopenharmony_ci    const int32_t filt_const0 = 0xfffb0001;
2094cabdff1aSopenharmony_ci    const int32_t filt_const1 = 0x140014;
2095cabdff1aSopenharmony_ci    const int32_t filt_const2 = 0x1fffb;
2096cabdff1aSopenharmony_ci    v16u8 res;
2097cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2098cabdff1aSopenharmony_ci    v16i8 mask0, mask1, mask2;
2099cabdff1aSopenharmony_ci    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
2100cabdff1aSopenharmony_ci    v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
2101cabdff1aSopenharmony_ci    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
2102cabdff1aSopenharmony_ci    v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
2103cabdff1aSopenharmony_ci    v4i32 tmp0, tmp1;
2104cabdff1aSopenharmony_ci
2105cabdff1aSopenharmony_ci    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
2106cabdff1aSopenharmony_ci
2107cabdff1aSopenharmony_ci    filt0 = (v8i16) __msa_fill_w(filt_const0);
2108cabdff1aSopenharmony_ci    filt1 = (v8i16) __msa_fill_w(filt_const1);
2109cabdff1aSopenharmony_ci    filt2 = (v8i16) __msa_fill_w(filt_const2);
2110cabdff1aSopenharmony_ci
2111cabdff1aSopenharmony_ci    src -= ((2 * stride) + 2);
2112cabdff1aSopenharmony_ci
2113cabdff1aSopenharmony_ci    LD_SB5(src, stride, src0, src1, src2, src3, src4);
2114cabdff1aSopenharmony_ci    src += (5 * stride);
2115cabdff1aSopenharmony_ci    LD_SB4(src, stride, src5, src6, src7, src8);
2116cabdff1aSopenharmony_ci
2117cabdff1aSopenharmony_ci    XORI_B5_128_SB(src0, src1, src2, src3, src4);
2118cabdff1aSopenharmony_ci    XORI_B4_128_SB(src5, src6, src7, src8);
2119cabdff1aSopenharmony_ci
2120cabdff1aSopenharmony_ci    hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2);
2121cabdff1aSopenharmony_ci    hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2);
2122cabdff1aSopenharmony_ci    hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2);
2123cabdff1aSopenharmony_ci    hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2);
2124cabdff1aSopenharmony_ci    hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
2125cabdff1aSopenharmony_ci    PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
2126cabdff1aSopenharmony_ci    PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
2127cabdff1aSopenharmony_ci
2128cabdff1aSopenharmony_ci    ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
2129cabdff1aSopenharmony_ci               hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
2130cabdff1aSopenharmony_ci    ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
2131cabdff1aSopenharmony_ci               hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
2132cabdff1aSopenharmony_ci
2133cabdff1aSopenharmony_ci    tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
2134cabdff1aSopenharmony_ci                          filt2);
2135cabdff1aSopenharmony_ci    tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
2136cabdff1aSopenharmony_ci                          filt2);
2137cabdff1aSopenharmony_ci    dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2138cabdff1aSopenharmony_ci    tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
2139cabdff1aSopenharmony_ci                          filt2);
2140cabdff1aSopenharmony_ci    tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
2141cabdff1aSopenharmony_ci                          filt2);
2142cabdff1aSopenharmony_ci    dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2143cabdff1aSopenharmony_ci
2144cabdff1aSopenharmony_ci    SRARI_H2_SH(hz_out2, hz_out4, 5);
2145cabdff1aSopenharmony_ci    SAT_SH2_SH(hz_out2, hz_out4, 7);
2146cabdff1aSopenharmony_ci
2147cabdff1aSopenharmony_ci    dst0 = __msa_aver_s_h(dst0, hz_out2);
2148cabdff1aSopenharmony_ci    dst1 = __msa_aver_s_h(dst1, hz_out4);
2149cabdff1aSopenharmony_ci
2150cabdff1aSopenharmony_ci    res = PCKEV_XORI128_UB(dst0, dst1);
2151cabdff1aSopenharmony_ci    ST_W4(res, 0, 1, 2, 3, dst, stride);
2152cabdff1aSopenharmony_ci}
2153cabdff1aSopenharmony_ci
2154cabdff1aSopenharmony_civoid ff_put_h264_qpel4_mc23_msa(uint8_t *dst, const uint8_t *src,
2155cabdff1aSopenharmony_ci                                ptrdiff_t stride)
2156cabdff1aSopenharmony_ci{
2157cabdff1aSopenharmony_ci    const int32_t filt_const0 = 0xfffb0001;
2158cabdff1aSopenharmony_ci    const int32_t filt_const1 = 0x140014;
2159cabdff1aSopenharmony_ci    const int32_t filt_const2 = 0x1fffb;
2160cabdff1aSopenharmony_ci    v16u8 res;
2161cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2162cabdff1aSopenharmony_ci    v16i8 mask0, mask1, mask2;
2163cabdff1aSopenharmony_ci    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
2164cabdff1aSopenharmony_ci    v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
2165cabdff1aSopenharmony_ci    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
2166cabdff1aSopenharmony_ci    v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
2167cabdff1aSopenharmony_ci    v4i32 tmp0, tmp1;
2168cabdff1aSopenharmony_ci
2169cabdff1aSopenharmony_ci    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
2170cabdff1aSopenharmony_ci
2171cabdff1aSopenharmony_ci    filt0 = (v8i16) __msa_fill_w(filt_const0);
2172cabdff1aSopenharmony_ci    filt1 = (v8i16) __msa_fill_w(filt_const1);
2173cabdff1aSopenharmony_ci    filt2 = (v8i16) __msa_fill_w(filt_const2);
2174cabdff1aSopenharmony_ci
2175cabdff1aSopenharmony_ci    src -= ((2 * stride) + 2);
2176cabdff1aSopenharmony_ci
2177cabdff1aSopenharmony_ci    LD_SB5(src, stride, src0, src1, src2, src3, src4);
2178cabdff1aSopenharmony_ci    src += (5 * stride);
2179cabdff1aSopenharmony_ci    LD_SB4(src, stride, src5, src6, src7, src8);
2180cabdff1aSopenharmony_ci
2181cabdff1aSopenharmony_ci    XORI_B5_128_SB(src0, src1, src2, src3, src4);
2182cabdff1aSopenharmony_ci    XORI_B4_128_SB(src5, src6, src7, src8);
2183cabdff1aSopenharmony_ci
2184cabdff1aSopenharmony_ci    hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2);
2185cabdff1aSopenharmony_ci    hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2);
2186cabdff1aSopenharmony_ci    hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2);
2187cabdff1aSopenharmony_ci    hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2);
2188cabdff1aSopenharmony_ci    hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
2189cabdff1aSopenharmony_ci    PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
2190cabdff1aSopenharmony_ci    PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
2191cabdff1aSopenharmony_ci
2192cabdff1aSopenharmony_ci    ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
2193cabdff1aSopenharmony_ci               hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
2194cabdff1aSopenharmony_ci    ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
2195cabdff1aSopenharmony_ci               hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
2196cabdff1aSopenharmony_ci
2197cabdff1aSopenharmony_ci    tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
2198cabdff1aSopenharmony_ci                          filt2);
2199cabdff1aSopenharmony_ci    tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
2200cabdff1aSopenharmony_ci                          filt2);
2201cabdff1aSopenharmony_ci    dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2202cabdff1aSopenharmony_ci    tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
2203cabdff1aSopenharmony_ci                          filt2);
2204cabdff1aSopenharmony_ci    tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
2205cabdff1aSopenharmony_ci                          filt2);
2206cabdff1aSopenharmony_ci    dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2207cabdff1aSopenharmony_ci
2208cabdff1aSopenharmony_ci    PCKEV_D2_SH(hz_out4, hz_out3, hz_out6, hz_out5, hz_out0, hz_out1);
2209cabdff1aSopenharmony_ci    SRARI_H2_SH(hz_out0, hz_out1, 5);
2210cabdff1aSopenharmony_ci    SAT_SH2_SH(hz_out0, hz_out1, 7);
2211cabdff1aSopenharmony_ci
2212cabdff1aSopenharmony_ci    dst0 = __msa_aver_s_h(dst0, hz_out0);
2213cabdff1aSopenharmony_ci    dst1 = __msa_aver_s_h(dst1, hz_out1);
2214cabdff1aSopenharmony_ci
2215cabdff1aSopenharmony_ci    res = PCKEV_XORI128_UB(dst0, dst1);
2216cabdff1aSopenharmony_ci    ST_W4(res, 0, 1, 2, 3, dst, stride);
2217cabdff1aSopenharmony_ci}
2218cabdff1aSopenharmony_ci
2219cabdff1aSopenharmony_civoid ff_put_h264_qpel16_mc02_msa(uint8_t *dst, const uint8_t *src,
2220cabdff1aSopenharmony_ci                                 ptrdiff_t stride)
2221cabdff1aSopenharmony_ci{
2222cabdff1aSopenharmony_ci    int32_t loop_cnt;
2223cabdff1aSopenharmony_ci    int16_t filt_const0 = 0xfb01;
2224cabdff1aSopenharmony_ci    int16_t filt_const1 = 0x1414;
2225cabdff1aSopenharmony_ci    int16_t filt_const2 = 0x1fb;
2226cabdff1aSopenharmony_ci    v16u8 res0, res1, res2, res3;
2227cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2228cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
2229cabdff1aSopenharmony_ci    v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
2230cabdff1aSopenharmony_ci    v16i8 src65_l, src87_l, filt0, filt1, filt2;
2231cabdff1aSopenharmony_ci    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
2232cabdff1aSopenharmony_ci
2233cabdff1aSopenharmony_ci    filt0 = (v16i8) __msa_fill_h(filt_const0);
2234cabdff1aSopenharmony_ci    filt1 = (v16i8) __msa_fill_h(filt_const1);
2235cabdff1aSopenharmony_ci    filt2 = (v16i8) __msa_fill_h(filt_const2);
2236cabdff1aSopenharmony_ci    src -= (stride * 2);
2237cabdff1aSopenharmony_ci
2238cabdff1aSopenharmony_ci    LD_SB5(src, stride, src0, src1, src2, src3, src4);
2239cabdff1aSopenharmony_ci    src += (5 * stride);
2240cabdff1aSopenharmony_ci
2241cabdff1aSopenharmony_ci    XORI_B5_128_SB(src0, src1, src2, src3, src4);
2242cabdff1aSopenharmony_ci    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
2243cabdff1aSopenharmony_ci               src32_r, src43_r);
2244cabdff1aSopenharmony_ci    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
2245cabdff1aSopenharmony_ci               src32_l, src43_l);
2246cabdff1aSopenharmony_ci
2247cabdff1aSopenharmony_ci    for (loop_cnt = 4; loop_cnt--;) {
2248cabdff1aSopenharmony_ci        LD_SB4(src, stride, src5, src6, src7, src8);
2249cabdff1aSopenharmony_ci        src += (4 * stride);
2250cabdff1aSopenharmony_ci
2251cabdff1aSopenharmony_ci        XORI_B4_128_SB(src5, src6, src7, src8);
2252cabdff1aSopenharmony_ci        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
2253cabdff1aSopenharmony_ci                   src65_r, src76_r, src87_r);
2254cabdff1aSopenharmony_ci        ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
2255cabdff1aSopenharmony_ci                   src65_l, src76_l, src87_l);
2256cabdff1aSopenharmony_ci        out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
2257cabdff1aSopenharmony_ci        out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
2258cabdff1aSopenharmony_ci        out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
2259cabdff1aSopenharmony_ci        out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
2260cabdff1aSopenharmony_ci        out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
2261cabdff1aSopenharmony_ci        out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
2262cabdff1aSopenharmony_ci        out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
2263cabdff1aSopenharmony_ci        out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
2264cabdff1aSopenharmony_ci        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
2265cabdff1aSopenharmony_ci        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2266cabdff1aSopenharmony_ci        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
2267cabdff1aSopenharmony_ci        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
2268cabdff1aSopenharmony_ci        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
2269cabdff1aSopenharmony_ci                    out3_r, res0, res1, res2, res3);
2270cabdff1aSopenharmony_ci        XORI_B4_128_UB(res0, res1, res2, res3);
2271cabdff1aSopenharmony_ci        ST_UB4(res0, res1, res2, res3, dst, stride);
2272cabdff1aSopenharmony_ci        dst += (4 * stride);
2273cabdff1aSopenharmony_ci
2274cabdff1aSopenharmony_ci        src10_r = src54_r;
2275cabdff1aSopenharmony_ci        src32_r = src76_r;
2276cabdff1aSopenharmony_ci        src21_r = src65_r;
2277cabdff1aSopenharmony_ci        src43_r = src87_r;
2278cabdff1aSopenharmony_ci        src10_l = src54_l;
2279cabdff1aSopenharmony_ci        src32_l = src76_l;
2280cabdff1aSopenharmony_ci        src21_l = src65_l;
2281cabdff1aSopenharmony_ci        src43_l = src87_l;
2282cabdff1aSopenharmony_ci        src4 = src8;
2283cabdff1aSopenharmony_ci    }
2284cabdff1aSopenharmony_ci}
2285cabdff1aSopenharmony_ci
2286cabdff1aSopenharmony_civoid ff_put_h264_qpel8_mc02_msa(uint8_t *dst, const uint8_t *src,
2287cabdff1aSopenharmony_ci                                ptrdiff_t stride)
2288cabdff1aSopenharmony_ci{
2289cabdff1aSopenharmony_ci    const int16_t filt_const0 = 0xfb01;
2290cabdff1aSopenharmony_ci    const int16_t filt_const1 = 0x1414;
2291cabdff1aSopenharmony_ci    const int16_t filt_const2 = 0x1fb;
2292cabdff1aSopenharmony_ci    v16u8 out0, out1, out2, out3;
2293cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2294cabdff1aSopenharmony_ci    v16i8 src11, src12, src10_r, src21_r, src32_r, src43_r, src76_r, src87_r;
2295cabdff1aSopenharmony_ci    v16i8 src98_r, src109_r, src89_r, src910_r, src1110_r, src1211_r;
2296cabdff1aSopenharmony_ci    v16i8 filt0, filt1, filt2;
2297cabdff1aSopenharmony_ci    v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
2298cabdff1aSopenharmony_ci
2299cabdff1aSopenharmony_ci    filt0 = (v16i8) __msa_fill_h(filt_const0);
2300cabdff1aSopenharmony_ci    filt1 = (v16i8) __msa_fill_h(filt_const1);
2301cabdff1aSopenharmony_ci    filt2 = (v16i8) __msa_fill_h(filt_const2);
2302cabdff1aSopenharmony_ci
2303cabdff1aSopenharmony_ci    src -= (stride * 2);
2304cabdff1aSopenharmony_ci
2305cabdff1aSopenharmony_ci    LD_SB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
2306cabdff1aSopenharmony_ci    src += (8 * stride);
2307cabdff1aSopenharmony_ci    LD_SB5(src, stride, src8, src9, src10, src11, src12);
2308cabdff1aSopenharmony_ci    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
2309cabdff1aSopenharmony_ci               src32_r, src43_r);
2310cabdff1aSopenharmony_ci    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src76_r, src87_r,
2311cabdff1aSopenharmony_ci               src98_r, src109_r);
2312cabdff1aSopenharmony_ci    ILVR_B4_SB(src9, src8, src10, src9, src11, src10, src12, src11, src89_r,
2313cabdff1aSopenharmony_ci               src910_r, src1110_r, src1211_r);
2314cabdff1aSopenharmony_ci    XORI_B4_128_SB(src10_r, src21_r, src32_r, src43_r);
2315cabdff1aSopenharmony_ci    XORI_B4_128_SB(src76_r, src87_r, src98_r, src109_r);
2316cabdff1aSopenharmony_ci    XORI_B4_128_SB(src89_r, src910_r, src1110_r, src1211_r);
2317cabdff1aSopenharmony_ci    out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
2318cabdff1aSopenharmony_ci    out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
2319cabdff1aSopenharmony_ci    out2_r = AVC_DOT_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
2320cabdff1aSopenharmony_ci    out3_r = AVC_DOT_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
2321cabdff1aSopenharmony_ci    out4_r = AVC_DOT_SH3_SH(src76_r, src98_r, src89_r, filt0, filt1, filt2);
2322cabdff1aSopenharmony_ci    out5_r = AVC_DOT_SH3_SH(src87_r, src109_r, src910_r, filt0, filt1, filt2);
2323cabdff1aSopenharmony_ci    out6_r = AVC_DOT_SH3_SH(src98_r, src89_r, src1110_r, filt0, filt1, filt2);
2324cabdff1aSopenharmony_ci    out7_r = AVC_DOT_SH3_SH(src109_r, src910_r, src1211_r, filt0, filt1, filt2);
2325cabdff1aSopenharmony_ci    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
2326cabdff1aSopenharmony_ci    SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5);
2327cabdff1aSopenharmony_ci    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2328cabdff1aSopenharmony_ci    SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
2329cabdff1aSopenharmony_ci    out0 = PCKEV_XORI128_UB(out0_r, out1_r);
2330cabdff1aSopenharmony_ci    out1 = PCKEV_XORI128_UB(out2_r, out3_r);
2331cabdff1aSopenharmony_ci    out2 = PCKEV_XORI128_UB(out4_r, out5_r);
2332cabdff1aSopenharmony_ci    out3 = PCKEV_XORI128_UB(out6_r, out7_r);
2333cabdff1aSopenharmony_ci    ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
2334cabdff1aSopenharmony_ci}
2335cabdff1aSopenharmony_ci
2336cabdff1aSopenharmony_civoid ff_put_h264_qpel4_mc02_msa(uint8_t *dst, const uint8_t *src,
2337cabdff1aSopenharmony_ci                                ptrdiff_t stride)
2338cabdff1aSopenharmony_ci{
2339cabdff1aSopenharmony_ci    const int16_t filt_const0 = 0xfb01;
2340cabdff1aSopenharmony_ci    const int16_t filt_const1 = 0x1414;
2341cabdff1aSopenharmony_ci    const int16_t filt_const2 = 0x1fb;
2342cabdff1aSopenharmony_ci    v16u8 out;
2343cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2344cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
2345cabdff1aSopenharmony_ci    v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
2346cabdff1aSopenharmony_ci    v8i16 out10, out32;
2347cabdff1aSopenharmony_ci
2348cabdff1aSopenharmony_ci    filt0 = (v16i8) __msa_fill_h(filt_const0);
2349cabdff1aSopenharmony_ci    filt1 = (v16i8) __msa_fill_h(filt_const1);
2350cabdff1aSopenharmony_ci    filt2 = (v16i8) __msa_fill_h(filt_const2);
2351cabdff1aSopenharmony_ci
2352cabdff1aSopenharmony_ci    src -= (stride * 2);
2353cabdff1aSopenharmony_ci
2354cabdff1aSopenharmony_ci    LD_SB5(src, stride, src0, src1, src2, src3, src4);
2355cabdff1aSopenharmony_ci    src += (5 * stride);
2356cabdff1aSopenharmony_ci    LD_SB4(src, stride, src5, src6, src7, src8);
2357cabdff1aSopenharmony_ci
2358cabdff1aSopenharmony_ci    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
2359cabdff1aSopenharmony_ci               src32_r, src43_r);
2360cabdff1aSopenharmony_ci    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
2361cabdff1aSopenharmony_ci               src76_r, src87_r);
2362cabdff1aSopenharmony_ci    ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r,
2363cabdff1aSopenharmony_ci               src76_r, src2110, src4332, src6554, src8776);
2364cabdff1aSopenharmony_ci    XORI_B4_128_SB(src2110, src4332, src6554, src8776);
2365cabdff1aSopenharmony_ci    out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
2366cabdff1aSopenharmony_ci    out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
2367cabdff1aSopenharmony_ci    SRARI_H2_SH(out10, out32, 5);
2368cabdff1aSopenharmony_ci    SAT_SH2_SH(out10, out32, 7);
2369cabdff1aSopenharmony_ci    out = PCKEV_XORI128_UB(out10, out32);
2370cabdff1aSopenharmony_ci    ST_W4(out, 0, 1, 2, 3, dst, stride);
2371cabdff1aSopenharmony_ci}
2372cabdff1aSopenharmony_ci
2373cabdff1aSopenharmony_civoid ff_put_h264_qpel16_mc12_msa(uint8_t *dst, const uint8_t *src,
2374cabdff1aSopenharmony_ci                                 ptrdiff_t stride)
2375cabdff1aSopenharmony_ci{
2376cabdff1aSopenharmony_ci    uint32_t row;
2377cabdff1aSopenharmony_ci    v16u8 out;
2378cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2379cabdff1aSopenharmony_ci    v16i8 src11;
2380cabdff1aSopenharmony_ci    v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, mask3;
2381cabdff1aSopenharmony_ci    v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
2382cabdff1aSopenharmony_ci    v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11, mask4, mask5;
2383cabdff1aSopenharmony_ci    v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
2384cabdff1aSopenharmony_ci    v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
2385cabdff1aSopenharmony_ci    v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
2386cabdff1aSopenharmony_ci    v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
2387cabdff1aSopenharmony_ci    v8i16 minus5h = __msa_ldi_h(-5);
2388cabdff1aSopenharmony_ci    v8i16 plus20h = __msa_ldi_h(20);
2389cabdff1aSopenharmony_ci
2390cabdff1aSopenharmony_ci    mask3 = mask0 + 4;
2391cabdff1aSopenharmony_ci    mask4 = mask1 + 4;
2392cabdff1aSopenharmony_ci    mask5 = mask2 + 4;
2393cabdff1aSopenharmony_ci
2394cabdff1aSopenharmony_ci    src -= ((2 * stride) + 2);
2395cabdff1aSopenharmony_ci
2396cabdff1aSopenharmony_ci    LD_SB5(src, stride, src0, src1, src2, src3, src4);
2397cabdff1aSopenharmony_ci    LD_SB5(src + 8, stride, src7, src8, src9, src10, src11);
2398cabdff1aSopenharmony_ci    src += (5 * stride);
2399cabdff1aSopenharmony_ci    XORI_B5_128_SB(src0, src1, src2, src3, src4);
2400cabdff1aSopenharmony_ci    XORI_B5_128_SB(src7, src8, src9, src10, src11);
2401cabdff1aSopenharmony_ci
2402cabdff1aSopenharmony_ci    for (row = 16; row--;) {
2403cabdff1aSopenharmony_ci        LD_SB2(src, 8, src5, src6);
2404cabdff1aSopenharmony_ci        src += stride;
2405cabdff1aSopenharmony_ci        XORI_B2_128_SB(src5, src6);
2406cabdff1aSopenharmony_ci
2407cabdff1aSopenharmony_ci        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
2408cabdff1aSopenharmony_ci                                        vt_res0, vt_res1);
2409cabdff1aSopenharmony_ci        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src7, src8, src9, src10, src11, src6,
2410cabdff1aSopenharmony_ci                                        vt_res2, vt_res3);
2411cabdff1aSopenharmony_ci        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
2412cabdff1aSopenharmony_ci                   mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
2413cabdff1aSopenharmony_ci        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
2414cabdff1aSopenharmony_ci                   mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
2415cabdff1aSopenharmony_ci        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
2416cabdff1aSopenharmony_ci                   mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
2417cabdff1aSopenharmony_ci        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
2418cabdff1aSopenharmony_ci                   mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
2419cabdff1aSopenharmony_ci        hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
2420cabdff1aSopenharmony_ci        hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
2421cabdff1aSopenharmony_ci        hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
2422cabdff1aSopenharmony_ci        hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
2423cabdff1aSopenharmony_ci        DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
2424cabdff1aSopenharmony_ci        DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
2425cabdff1aSopenharmony_ci        DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
2426cabdff1aSopenharmony_ci        DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
2427cabdff1aSopenharmony_ci        SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
2428cabdff1aSopenharmony_ci        SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
2429cabdff1aSopenharmony_ci        dst0 = __msa_srari_h(shf_vec2, 5);
2430cabdff1aSopenharmony_ci        dst1 = __msa_srari_h(shf_vec5, 5);
2431cabdff1aSopenharmony_ci        dst2 = __msa_srari_h(shf_vec8, 5);
2432cabdff1aSopenharmony_ci        dst3 = __msa_srari_h(shf_vec11, 5);
2433cabdff1aSopenharmony_ci        SAT_SH4_SH(dst0, dst1, dst2, dst3, 7);
2434cabdff1aSopenharmony_ci        PCKEV_H2_SH(dst2, dst0, dst3, dst1, dst0, dst1);
2435cabdff1aSopenharmony_ci        PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, dst2, dst3);
2436cabdff1aSopenharmony_ci        dst0 = __msa_aver_s_h(dst2, dst0);
2437cabdff1aSopenharmony_ci        dst1 = __msa_aver_s_h(dst3, dst1);
2438cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(dst0, dst1);
2439cabdff1aSopenharmony_ci        ST_UB(out, dst);
2440cabdff1aSopenharmony_ci        dst += stride;
2441cabdff1aSopenharmony_ci
2442cabdff1aSopenharmony_ci        src0 = src1;
2443cabdff1aSopenharmony_ci        src1 = src2;
2444cabdff1aSopenharmony_ci        src2 = src3;
2445cabdff1aSopenharmony_ci        src3 = src4;
2446cabdff1aSopenharmony_ci        src4 = src5;
2447cabdff1aSopenharmony_ci        src7 = src8;
2448cabdff1aSopenharmony_ci        src8 = src9;
2449cabdff1aSopenharmony_ci        src9 = src10;
2450cabdff1aSopenharmony_ci        src10 = src11;
2451cabdff1aSopenharmony_ci        src11 = src6;
2452cabdff1aSopenharmony_ci    }
2453cabdff1aSopenharmony_ci}
2454cabdff1aSopenharmony_ci
2455cabdff1aSopenharmony_civoid ff_put_h264_qpel16_mc32_msa(uint8_t *dst, const uint8_t *src,
2456cabdff1aSopenharmony_ci                                 ptrdiff_t stride)
2457cabdff1aSopenharmony_ci{
2458cabdff1aSopenharmony_ci    uint32_t row;
2459cabdff1aSopenharmony_ci    v16u8 out;
2460cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2461cabdff1aSopenharmony_ci    v16i8 src11;
2462cabdff1aSopenharmony_ci    v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, mask3;
2463cabdff1aSopenharmony_ci    v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
2464cabdff1aSopenharmony_ci    v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11, mask4, mask5;
2465cabdff1aSopenharmony_ci    v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
2466cabdff1aSopenharmony_ci    v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
2467cabdff1aSopenharmony_ci    v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
2468cabdff1aSopenharmony_ci    v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
2469cabdff1aSopenharmony_ci    v8i16 minus5h = __msa_ldi_h(-5);
2470cabdff1aSopenharmony_ci    v8i16 plus20h = __msa_ldi_h(20);
2471cabdff1aSopenharmony_ci
2472cabdff1aSopenharmony_ci    mask3 = mask0 + 4;
2473cabdff1aSopenharmony_ci    mask4 = mask1 + 4;
2474cabdff1aSopenharmony_ci    mask5 = mask2 + 4;
2475cabdff1aSopenharmony_ci
2476cabdff1aSopenharmony_ci    src -= ((2 * stride) + 2);
2477cabdff1aSopenharmony_ci
2478cabdff1aSopenharmony_ci    LD_SB5(src, stride, src0, src1, src2, src3, src4);
2479cabdff1aSopenharmony_ci    LD_SB5(src + 8, stride, src7, src8, src9, src10, src11);
2480cabdff1aSopenharmony_ci    src += (5 * stride);
2481cabdff1aSopenharmony_ci    XORI_B5_128_SB(src0, src1, src2, src3, src4);
2482cabdff1aSopenharmony_ci    XORI_B5_128_SB(src7, src8, src9, src10, src11);
2483cabdff1aSopenharmony_ci
2484cabdff1aSopenharmony_ci    for (row = 16; row--;) {
2485cabdff1aSopenharmony_ci        LD_SB2(src, 8, src5, src6);
2486cabdff1aSopenharmony_ci        src += stride;
2487cabdff1aSopenharmony_ci        XORI_B2_128_SB(src5, src6);
2488cabdff1aSopenharmony_ci
2489cabdff1aSopenharmony_ci        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
2490cabdff1aSopenharmony_ci                                        vt_res0, vt_res1);
2491cabdff1aSopenharmony_ci        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src7, src8, src9, src10, src11, src6,
2492cabdff1aSopenharmony_ci                                        vt_res2, vt_res3);
2493cabdff1aSopenharmony_ci        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
2494cabdff1aSopenharmony_ci                   mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
2495cabdff1aSopenharmony_ci        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
2496cabdff1aSopenharmony_ci                   mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
2497cabdff1aSopenharmony_ci        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
2498cabdff1aSopenharmony_ci                   mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
2499cabdff1aSopenharmony_ci        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
2500cabdff1aSopenharmony_ci                   mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
2501cabdff1aSopenharmony_ci        hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
2502cabdff1aSopenharmony_ci        hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
2503cabdff1aSopenharmony_ci        hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
2504cabdff1aSopenharmony_ci        hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
2505cabdff1aSopenharmony_ci        DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
2506cabdff1aSopenharmony_ci        DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
2507cabdff1aSopenharmony_ci        DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
2508cabdff1aSopenharmony_ci        DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
2509cabdff1aSopenharmony_ci        SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
2510cabdff1aSopenharmony_ci        SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
2511cabdff1aSopenharmony_ci        dst0 = __msa_srari_h(shf_vec2, 5);
2512cabdff1aSopenharmony_ci        dst1 = __msa_srari_h(shf_vec5, 5);
2513cabdff1aSopenharmony_ci        dst2 = __msa_srari_h(shf_vec8, 5);
2514cabdff1aSopenharmony_ci        dst3 = __msa_srari_h(shf_vec11, 5);
2515cabdff1aSopenharmony_ci        SAT_SH4_SH(dst0, dst1, dst2, dst3, 7);
2516cabdff1aSopenharmony_ci        dst0 = __msa_pckod_h(dst2, dst0);
2517cabdff1aSopenharmony_ci        dst1 = __msa_pckod_h(dst3, dst1);
2518cabdff1aSopenharmony_ci        PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, dst2, dst3);
2519cabdff1aSopenharmony_ci        dst0 = __msa_aver_s_h(dst2, dst0);
2520cabdff1aSopenharmony_ci        dst1 = __msa_aver_s_h(dst3, dst1);
2521cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(dst0, dst1);
2522cabdff1aSopenharmony_ci        ST_UB(out, dst);
2523cabdff1aSopenharmony_ci        dst += stride;
2524cabdff1aSopenharmony_ci
2525cabdff1aSopenharmony_ci        src0 = src1;
2526cabdff1aSopenharmony_ci        src1 = src2;
2527cabdff1aSopenharmony_ci        src2 = src3;
2528cabdff1aSopenharmony_ci        src3 = src4;
2529cabdff1aSopenharmony_ci        src4 = src5;
2530cabdff1aSopenharmony_ci        src7 = src8;
2531cabdff1aSopenharmony_ci        src8 = src9;
2532cabdff1aSopenharmony_ci        src9 = src10;
2533cabdff1aSopenharmony_ci        src10 = src11;
2534cabdff1aSopenharmony_ci        src11 = src6;
2535cabdff1aSopenharmony_ci    }
2536cabdff1aSopenharmony_ci}
2537cabdff1aSopenharmony_ci
2538cabdff1aSopenharmony_civoid ff_put_h264_qpel8_mc12_msa(uint8_t *dst, const uint8_t *src,
2539cabdff1aSopenharmony_ci                                ptrdiff_t stride)
2540cabdff1aSopenharmony_ci{
2541cabdff1aSopenharmony_ci    uint32_t row;
2542cabdff1aSopenharmony_ci    v16u8 out;
2543cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6;
2544cabdff1aSopenharmony_ci    v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3;
2545cabdff1aSopenharmony_ci    v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
2546cabdff1aSopenharmony_ci    v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11;
2547cabdff1aSopenharmony_ci    v8i16 mask3, mask4, mask5;
2548cabdff1aSopenharmony_ci    v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
2549cabdff1aSopenharmony_ci    v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
2550cabdff1aSopenharmony_ci    v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
2551cabdff1aSopenharmony_ci    v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
2552cabdff1aSopenharmony_ci    v8i16 minus5h = __msa_ldi_h(-5);
2553cabdff1aSopenharmony_ci    v8i16 plus20h = __msa_ldi_h(20);
2554cabdff1aSopenharmony_ci
2555cabdff1aSopenharmony_ci    mask3 = mask0 + 4;
2556cabdff1aSopenharmony_ci    mask4 = mask1 + 4;
2557cabdff1aSopenharmony_ci    mask5 = mask2 + 4;
2558cabdff1aSopenharmony_ci
2559cabdff1aSopenharmony_ci    src -= ((2 * stride) + 2);
2560cabdff1aSopenharmony_ci
2561cabdff1aSopenharmony_ci    LD_SB5(src, stride, src0, src1, src2, src3, src4);
2562cabdff1aSopenharmony_ci    src += (5 * stride);
2563cabdff1aSopenharmony_ci    XORI_B5_128_SB(src0, src1, src2, src3, src4);
2564cabdff1aSopenharmony_ci
2565cabdff1aSopenharmony_ci    for (row = 4; row--;) {
2566cabdff1aSopenharmony_ci        LD_SB2(src, stride, src5, src6);
2567cabdff1aSopenharmony_ci        src += (2 * stride);
2568cabdff1aSopenharmony_ci        XORI_B2_128_SB(src5, src6);
2569cabdff1aSopenharmony_ci
2570cabdff1aSopenharmony_ci        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
2571cabdff1aSopenharmony_ci                                        vt_res0, vt_res1);
2572cabdff1aSopenharmony_ci        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src1, src2, src3, src4, src5, src6,
2573cabdff1aSopenharmony_ci                                        vt_res2, vt_res3);
2574cabdff1aSopenharmony_ci        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
2575cabdff1aSopenharmony_ci                   mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
2576cabdff1aSopenharmony_ci        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
2577cabdff1aSopenharmony_ci                   mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
2578cabdff1aSopenharmony_ci        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
2579cabdff1aSopenharmony_ci                   mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
2580cabdff1aSopenharmony_ci        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
2581cabdff1aSopenharmony_ci                   mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
2582cabdff1aSopenharmony_ci        hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
2583cabdff1aSopenharmony_ci        hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
2584cabdff1aSopenharmony_ci        hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
2585cabdff1aSopenharmony_ci        hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
2586cabdff1aSopenharmony_ci        DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
2587cabdff1aSopenharmony_ci        DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
2588cabdff1aSopenharmony_ci        DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
2589cabdff1aSopenharmony_ci        DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
2590cabdff1aSopenharmony_ci        SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
2591cabdff1aSopenharmony_ci        SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
2592cabdff1aSopenharmony_ci        dst0 = __msa_srari_h(shf_vec2, 5);
2593cabdff1aSopenharmony_ci        dst1 = __msa_srari_h(shf_vec5, 5);
2594cabdff1aSopenharmony_ci        dst2 = __msa_srari_h(shf_vec8, 5);
2595cabdff1aSopenharmony_ci        dst3 = __msa_srari_h(shf_vec11, 5);
2596cabdff1aSopenharmony_ci        SAT_SH4_SH(dst0, dst1, dst2, dst3, 7);
2597cabdff1aSopenharmony_ci        PCKEV_H2_SH(dst2, dst0, dst3, dst1, dst0, dst1);
2598cabdff1aSopenharmony_ci        PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, dst2, dst3);
2599cabdff1aSopenharmony_ci        dst0 = __msa_aver_s_h(dst2, dst0);
2600cabdff1aSopenharmony_ci        dst1 = __msa_aver_s_h(dst3, dst1);
2601cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(dst0, dst1);
2602cabdff1aSopenharmony_ci        ST_D2(out, 0, 1, dst, stride);
2603cabdff1aSopenharmony_ci        dst += (2 * stride);
2604cabdff1aSopenharmony_ci
2605cabdff1aSopenharmony_ci        src0 = src2;
2606cabdff1aSopenharmony_ci        src1 = src3;
2607cabdff1aSopenharmony_ci        src2 = src4;
2608cabdff1aSopenharmony_ci        src3 = src5;
2609cabdff1aSopenharmony_ci        src4 = src6;
2610cabdff1aSopenharmony_ci    }
2611cabdff1aSopenharmony_ci}
2612cabdff1aSopenharmony_ci
2613cabdff1aSopenharmony_civoid ff_put_h264_qpel8_mc32_msa(uint8_t *dst, const uint8_t *src,
2614cabdff1aSopenharmony_ci                                ptrdiff_t stride)
2615cabdff1aSopenharmony_ci{
2616cabdff1aSopenharmony_ci    uint32_t row;
2617cabdff1aSopenharmony_ci    v16u8 out;
2618cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6;
2619cabdff1aSopenharmony_ci    v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3;
2620cabdff1aSopenharmony_ci    v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
2621cabdff1aSopenharmony_ci    v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11;
2622cabdff1aSopenharmony_ci    v8i16 mask3, mask4, mask5;
2623cabdff1aSopenharmony_ci    v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
2624cabdff1aSopenharmony_ci    v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
2625cabdff1aSopenharmony_ci    v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
2626cabdff1aSopenharmony_ci    v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
2627cabdff1aSopenharmony_ci    v8i16 minus5h = __msa_ldi_h(-5);
2628cabdff1aSopenharmony_ci    v8i16 plus20h = __msa_ldi_h(20);
2629cabdff1aSopenharmony_ci
2630cabdff1aSopenharmony_ci    mask3 = mask0 + 4;
2631cabdff1aSopenharmony_ci    mask4 = mask1 + 4;
2632cabdff1aSopenharmony_ci    mask5 = mask2 + 4;
2633cabdff1aSopenharmony_ci
2634cabdff1aSopenharmony_ci    src -= ((2 * stride) + 2);
2635cabdff1aSopenharmony_ci
2636cabdff1aSopenharmony_ci    LD_SB5(src, stride, src0, src1, src2, src3, src4);
2637cabdff1aSopenharmony_ci    src += (5 * stride);
2638cabdff1aSopenharmony_ci    XORI_B5_128_SB(src0, src1, src2, src3, src4);
2639cabdff1aSopenharmony_ci
2640cabdff1aSopenharmony_ci    for (row = 4; row--;) {
2641cabdff1aSopenharmony_ci        LD_SB2(src, stride, src5, src6);
2642cabdff1aSopenharmony_ci        src += (2 * stride);
2643cabdff1aSopenharmony_ci        XORI_B2_128_SB(src5, src6);
2644cabdff1aSopenharmony_ci
2645cabdff1aSopenharmony_ci        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
2646cabdff1aSopenharmony_ci                                        vt_res0, vt_res1);
2647cabdff1aSopenharmony_ci        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src1, src2, src3, src4, src5, src6,
2648cabdff1aSopenharmony_ci                                        vt_res2, vt_res3);
2649cabdff1aSopenharmony_ci        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
2650cabdff1aSopenharmony_ci                   mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
2651cabdff1aSopenharmony_ci        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
2652cabdff1aSopenharmony_ci                   mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
2653cabdff1aSopenharmony_ci        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
2654cabdff1aSopenharmony_ci                   mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
2655cabdff1aSopenharmony_ci        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
2656cabdff1aSopenharmony_ci                   mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
2657cabdff1aSopenharmony_ci        hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
2658cabdff1aSopenharmony_ci        hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
2659cabdff1aSopenharmony_ci        hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
2660cabdff1aSopenharmony_ci        hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
2661cabdff1aSopenharmony_ci        DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
2662cabdff1aSopenharmony_ci        DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
2663cabdff1aSopenharmony_ci        DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
2664cabdff1aSopenharmony_ci        DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
2665cabdff1aSopenharmony_ci        SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
2666cabdff1aSopenharmony_ci        SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
2667cabdff1aSopenharmony_ci        dst0 = __msa_srari_h(shf_vec2, 5);
2668cabdff1aSopenharmony_ci        dst1 = __msa_srari_h(shf_vec5, 5);
2669cabdff1aSopenharmony_ci        dst2 = __msa_srari_h(shf_vec8, 5);
2670cabdff1aSopenharmony_ci        dst3 = __msa_srari_h(shf_vec11, 5);
2671cabdff1aSopenharmony_ci        SAT_SH4_SH(dst0, dst1, dst2, dst3, 7);
2672cabdff1aSopenharmony_ci        dst0 = __msa_pckod_h(dst2, dst0);
2673cabdff1aSopenharmony_ci        dst1 = __msa_pckod_h(dst3, dst1);
2674cabdff1aSopenharmony_ci        PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, dst2, dst3);
2675cabdff1aSopenharmony_ci        dst0 = __msa_aver_s_h(dst2, dst0);
2676cabdff1aSopenharmony_ci        dst1 = __msa_aver_s_h(dst3, dst1);
2677cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(dst0, dst1);
2678cabdff1aSopenharmony_ci        ST_D2(out, 0, 1, dst, stride);
2679cabdff1aSopenharmony_ci        dst += (2 * stride);
2680cabdff1aSopenharmony_ci
2681cabdff1aSopenharmony_ci        src0 = src2;
2682cabdff1aSopenharmony_ci        src1 = src3;
2683cabdff1aSopenharmony_ci        src2 = src4;
2684cabdff1aSopenharmony_ci        src3 = src5;
2685cabdff1aSopenharmony_ci        src4 = src6;
2686cabdff1aSopenharmony_ci    }
2687cabdff1aSopenharmony_ci}
2688cabdff1aSopenharmony_ci
/**
 * Luma quarter-pel interpolation that stores four 32-bit rows to dst.
 *
 * Nine source rows, starting two rows above and two columns left of src,
 * go through a vertical 6-tap pass with taps (1, -5, 20, 20, -5, 1) and then
 * a horizontal pass with the same taps over the 16-bit vertical results.
 * That two-pass value is averaged with the rounded vertical-only value
 * taken from the even halfwords of the centre-tap shuffle (mask2).
 *
 * @param dst    destination; rows are written stride bytes apart
 * @param src    source; rows from src - 2 * stride through src + 6 * stride
 *               are loaded, each starting two bytes to the left
 * @param stride row pitch in bytes, used for both src and dst
 *
 * NOTE(review): the helper macros (LD_SB*, XORI_B*_128_SB, ILV*_B4_SB,
 * AVC_DOT_SH3_SH, VSHF_H3_SH, DPADD_SH2_SW, SAT_*, ILVEV_H2_SH,
 * PCKEV_XORI128_UB, ST_W4) are defined outside this chunk; what the comments
 * say about them is inferred from their names and call sites - confirm
 * against their definitions.
 */
2689cabdff1aSopenharmony_civoid ff_put_h264_qpel4_mc12_msa(uint8_t *dst, const uint8_t *src,
2690cabdff1aSopenharmony_ci                                ptrdiff_t stride)
2691cabdff1aSopenharmony_ci{
    /* Vertical taps packed as byte pairs (low byte, high byte):
     * 0xfb01 -> (1, -5), 0x1414 -> (20, 20), 0x01fb -> (-5, 1). */
2692cabdff1aSopenharmony_ci    const int16_t filt_const0 = 0xfb01;
2693cabdff1aSopenharmony_ci    const int16_t filt_const1 = 0x1414;
2694cabdff1aSopenharmony_ci    const int16_t filt_const2 = 0x1fb;
2695cabdff1aSopenharmony_ci    v16u8 out;
2696cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2697cabdff1aSopenharmony_ci    v16i8 src10_r, src21_r, src32_r, src43_r, src54_r, src65_r, src76_r;
2698cabdff1aSopenharmony_ci    v16i8 src87_r, src10_l, src21_l, src32_l, src43_l, src54_l, src65_l;
2699cabdff1aSopenharmony_ci    v16i8 src76_l, src87_l, filt0, filt1, filt2;
2700cabdff1aSopenharmony_ci    v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, shf_vec7;
2701cabdff1aSopenharmony_ci    v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
2702cabdff1aSopenharmony_ci    v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
    /* Halfword selectors pairing the columns that share a horizontal tap,
     * for n = 0..3: mask0 -> (n, n + 5), mask1 -> (n + 1, n + 4),
     * mask2 -> (n + 2, n + 3). */
2703cabdff1aSopenharmony_ci    v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
2704cabdff1aSopenharmony_ci    v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
2705cabdff1aSopenharmony_ci    v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
    /* Horizontal weights for the mask1 and mask2 pairs; the mask0 pairs
     * carry weight 1 and are simply summed. */
2706cabdff1aSopenharmony_ci    v8i16 minus5h = __msa_ldi_h(-5);
2707cabdff1aSopenharmony_ci    v8i16 plus20h = __msa_ldi_h(20);
2708cabdff1aSopenharmony_ci    v8i16 zeros = { 0 };
2709cabdff1aSopenharmony_ci
2710cabdff1aSopenharmony_ci    filt0 = (v16i8) __msa_fill_h(filt_const0);
2711cabdff1aSopenharmony_ci    filt1 = (v16i8) __msa_fill_h(filt_const1);
2712cabdff1aSopenharmony_ci    filt2 = (v16i8) __msa_fill_h(filt_const2);
2713cabdff1aSopenharmony_ci
    /* Move to the top-left corner of the filter footprint:
     * two rows up and two columns left. */
2714cabdff1aSopenharmony_ci    src -= ((2 * stride) + 2);
2715cabdff1aSopenharmony_ci
    /* Load rows 0..4 and 5..8 of the footprint. The XOR with 128 presumably
     * re-biases unsigned pixels into signed bytes; PCKEV_XORI128_UB at the
     * end presumably undoes it - TODO confirm. */
2716cabdff1aSopenharmony_ci    LD_SB5(src, stride, src0, src1, src2, src3, src4);
2717cabdff1aSopenharmony_ci    src += (5 * stride);
2718cabdff1aSopenharmony_ci    XORI_B5_128_SB(src0, src1, src2, src3, src4);
2719cabdff1aSopenharmony_ci    LD_SB4(src, stride, src5, src6, src7, src8);
2720cabdff1aSopenharmony_ci    XORI_B4_128_SB(src5, src6, src7, src8);
2721cabdff1aSopenharmony_ci
    /* Interleave vertically adjacent rows (srcBA = rows B and A), for the
     * right (_r) and left (_l) halves of the vectors, so that each
     * byte-pair coefficient meets a pair of rows. */
2722cabdff1aSopenharmony_ci    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
2723cabdff1aSopenharmony_ci               src32_r, src43_r);
2724cabdff1aSopenharmony_ci    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
2725cabdff1aSopenharmony_ci               src76_r, src87_r);
2726cabdff1aSopenharmony_ci    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
2727cabdff1aSopenharmony_ci               src32_l, src43_l);
2728cabdff1aSopenharmony_ci    ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l,
2729cabdff1aSopenharmony_ci               src76_l, src87_l);
    /* Vertical pass for output row 0 (source rows 0..5 -> vt_res0/1) and
     * output row 1 (source rows 1..6 -> vt_res2/3). */
2730cabdff1aSopenharmony_ci    vt_res0 = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
2731cabdff1aSopenharmony_ci    vt_res1 = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
2732cabdff1aSopenharmony_ci    vt_res2 = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
2733cabdff1aSopenharmony_ci    vt_res3 = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
    /* Gather the tap pairs from the 16-bit vertical results; selector
     * index 8 in mask0 has to come from the second vector of each pair. */
2734cabdff1aSopenharmony_ci    VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
2735cabdff1aSopenharmony_ci               mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
2736cabdff1aSopenharmony_ci    VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
2737cabdff1aSopenharmony_ci               mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
    /* Horizontal pass in 32-bit lanes: hadd sums the outer pair (weight 1),
     * then the -5 and +20 pairs are accumulated on top. */
2738cabdff1aSopenharmony_ci    hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
2739cabdff1aSopenharmony_ci    DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
2740cabdff1aSopenharmony_ci    hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
2741cabdff1aSopenharmony_ci    DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
2742cabdff1aSopenharmony_ci
    /* Same two passes for output rows 2 and 3 (source rows 2..7 and 3..8).
     * The centre-pair shuffles land in shf_vec6/shf_vec7 so that shf_vec2
     * and shf_vec5 from rows 0 and 1 stay available below. */
2743cabdff1aSopenharmony_ci    vt_res0 = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
2744cabdff1aSopenharmony_ci    vt_res1 = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
2745cabdff1aSopenharmony_ci    vt_res2 = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
2746cabdff1aSopenharmony_ci    vt_res3 = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
2747cabdff1aSopenharmony_ci    VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
2748cabdff1aSopenharmony_ci               mask1, mask2, shf_vec0, shf_vec1, shf_vec6);
2749cabdff1aSopenharmony_ci    VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
2750cabdff1aSopenharmony_ci               mask1, mask2, shf_vec3, shf_vec4, shf_vec7);
2751cabdff1aSopenharmony_ci    hz_res2 = __msa_hadd_s_w(shf_vec0, shf_vec0);
2752cabdff1aSopenharmony_ci    DPADD_SH2_SW(shf_vec1, shf_vec6, minus5h, plus20h, hz_res2, hz_res2);
2753cabdff1aSopenharmony_ci    hz_res3 = __msa_hadd_s_w(shf_vec3, shf_vec3);
2754cabdff1aSopenharmony_ci    DPADD_SH2_SW(shf_vec4, shf_vec7, minus5h, plus20h, hz_res3, hz_res3);
2755cabdff1aSopenharmony_ci
    /* Round-shift the two-pass sums by 10 and saturate; the argument 7 is
     * presumably the highest magnitude bit, i.e. the signed 8-bit range. */
2756cabdff1aSopenharmony_ci    SRARI_W2_SW(hz_res0, hz_res1, 10);
2757cabdff1aSopenharmony_ci    SAT_SW2_SW(hz_res0, hz_res1, 7);
2758cabdff1aSopenharmony_ci    SRARI_W2_SW(hz_res2, hz_res3, 10);
2759cabdff1aSopenharmony_ci    SAT_SW2_SW(hz_res2, hz_res3, 7);
2760cabdff1aSopenharmony_ci
    /* Vertical-only value: round-shift the centre-pair shuffles by 5. */
2761cabdff1aSopenharmony_ci    dst0 = __msa_srari_h(shf_vec2, 5);
2762cabdff1aSopenharmony_ci    dst1 = __msa_srari_h(shf_vec5, 5);
2763cabdff1aSopenharmony_ci    dst2 = __msa_srari_h(shf_vec6, 5);
2764cabdff1aSopenharmony_ci    dst3 = __msa_srari_h(shf_vec7, 5);
2765cabdff1aSopenharmony_ci
2766cabdff1aSopenharmony_ci    SAT_SH2_SH(dst0, dst1, 7);
2767cabdff1aSopenharmony_ci    SAT_SH2_SH(dst2, dst3, 7);
    /* Presumably keeps the even halfwords (mask2 columns 2, 3, 4, 5) and
     * pairs each with a zero halfword, so the values line up with the
     * 32-bit lanes of hz_res* - TODO confirm against ILVEV_H2_SH. */
2768cabdff1aSopenharmony_ci    ILVEV_H2_SH(dst0, zeros, dst1, zeros, dst0, dst1);
2769cabdff1aSopenharmony_ci    ILVEV_H2_SH(dst2, zeros, dst3, zeros, dst2, dst3);
2770cabdff1aSopenharmony_ci
    /* Rounded average of the two-pass and the vertical-only values. */
2771cabdff1aSopenharmony_ci    hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0);
2772cabdff1aSopenharmony_ci    hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1);
2773cabdff1aSopenharmony_ci    hz_res2 = __msa_aver_s_w(hz_res2, (v4i32) dst2);
2774cabdff1aSopenharmony_ci    hz_res3 = __msa_aver_s_w(hz_res3, (v4i32) dst3);
2775cabdff1aSopenharmony_ci
    /* Narrow words -> halfwords -> bytes and store word elements 0..3 of
     * the result as four rows. */
2776cabdff1aSopenharmony_ci    PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2);
2777cabdff1aSopenharmony_ci    out = PCKEV_XORI128_UB(dst0, dst2);
2778cabdff1aSopenharmony_ci    ST_W4(out, 0, 1, 2, 3, dst, stride);
2779cabdff1aSopenharmony_ci}
2780cabdff1aSopenharmony_ci
/**
 * Luma quarter-pel interpolation that stores four 32-bit rows to dst.
 *
 * Nine source rows, starting two rows above and two columns left of src,
 * go through a vertical 6-tap pass with taps (1, -5, 20, 20, -5, 1) and then
 * a horizontal pass with the same taps over the 16-bit vertical results.
 * That two-pass value is averaged with the rounded vertical-only value
 * taken from the odd halfwords of the centre-tap shuffle (mask2), i.e. one
 * column further right than ff_put_h264_qpel4_mc12_msa uses.
 *
 * @param dst    destination; rows are written stride bytes apart
 * @param src    source; rows from src - 2 * stride through src + 6 * stride
 *               are loaded, each starting two bytes to the left
 * @param stride row pitch in bytes, used for both src and dst
 *
 * NOTE(review): the helper macros (LD_SB*, XORI_B*_128_SB, ILV*_B4_SB,
 * AVC_DOT_SH3_SH, VSHF_H3_SH, DPADD_SH2_SW, SAT_*, PCKEV_XORI128_UB, ST_W4)
 * are defined outside this chunk; what the comments say about them is
 * inferred from their names and call sites - confirm against their
 * definitions.
 */
2781cabdff1aSopenharmony_civoid ff_put_h264_qpel4_mc32_msa(uint8_t *dst, const uint8_t *src,
2782cabdff1aSopenharmony_ci                                ptrdiff_t stride)
2783cabdff1aSopenharmony_ci{
    /* Vertical taps packed as byte pairs (low byte, high byte):
     * 0xfb01 -> (1, -5), 0x1414 -> (20, 20), 0x01fb -> (-5, 1). */
2784cabdff1aSopenharmony_ci    const int16_t filt_const0 = 0xfb01;
2785cabdff1aSopenharmony_ci    const int16_t filt_const1 = 0x1414;
2786cabdff1aSopenharmony_ci    const int16_t filt_const2 = 0x1fb;
2787cabdff1aSopenharmony_ci    v16u8 out;
2788cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2789cabdff1aSopenharmony_ci    v16i8 src10_r, src21_r, src32_r, src43_r, src54_r, src65_r, src76_r;
2790cabdff1aSopenharmony_ci    v16i8 src87_r, src10_l, src21_l, src32_l, src43_l, src54_l, src65_l;
2791cabdff1aSopenharmony_ci    v16i8 src76_l, src87_l, filt0, filt1, filt2;
2792cabdff1aSopenharmony_ci    v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, shf_vec7;
2793cabdff1aSopenharmony_ci    v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
2794cabdff1aSopenharmony_ci    v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
    /* Halfword selectors pairing the columns that share a horizontal tap,
     * for n = 0..3: mask0 -> (n, n + 5), mask1 -> (n + 1, n + 4),
     * mask2 -> (n + 2, n + 3). */
2795cabdff1aSopenharmony_ci    v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
2796cabdff1aSopenharmony_ci    v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
2797cabdff1aSopenharmony_ci    v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
    /* Horizontal weights for the mask1 and mask2 pairs; the mask0 pairs
     * carry weight 1 and are simply summed. */
2798cabdff1aSopenharmony_ci    v8i16 minus5h = __msa_ldi_h(-5);
2799cabdff1aSopenharmony_ci    v8i16 plus20h = __msa_ldi_h(20);
2800cabdff1aSopenharmony_ci    v8i16 zeros = { 0 };
2801cabdff1aSopenharmony_ci
2802cabdff1aSopenharmony_ci    filt0 = (v16i8) __msa_fill_h(filt_const0);
2803cabdff1aSopenharmony_ci    filt1 = (v16i8) __msa_fill_h(filt_const1);
2804cabdff1aSopenharmony_ci    filt2 = (v16i8) __msa_fill_h(filt_const2);
2805cabdff1aSopenharmony_ci
    /* Move to the top-left corner of the filter footprint:
     * two rows up and two columns left. */
2806cabdff1aSopenharmony_ci    src -= ((2 * stride) + 2);
2807cabdff1aSopenharmony_ci
    /* Load rows 0..4 and 5..8 of the footprint. The XOR with 128 presumably
     * re-biases unsigned pixels into signed bytes; PCKEV_XORI128_UB at the
     * end presumably undoes it - TODO confirm. */
2808cabdff1aSopenharmony_ci    LD_SB5(src, stride, src0, src1, src2, src3, src4);
2809cabdff1aSopenharmony_ci    src += (5 * stride);
2810cabdff1aSopenharmony_ci    XORI_B5_128_SB(src0, src1, src2, src3, src4);
2811cabdff1aSopenharmony_ci    LD_SB4(src, stride, src5, src6, src7, src8);
2812cabdff1aSopenharmony_ci    XORI_B4_128_SB(src5, src6, src7, src8);
2813cabdff1aSopenharmony_ci
    /* Interleave vertically adjacent rows (srcBA = rows B and A), for the
     * right (_r) and left (_l) halves of the vectors, so that each
     * byte-pair coefficient meets a pair of rows. */
2814cabdff1aSopenharmony_ci    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
2815cabdff1aSopenharmony_ci               src32_r, src43_r);
2816cabdff1aSopenharmony_ci    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
2817cabdff1aSopenharmony_ci               src76_r, src87_r);
2818cabdff1aSopenharmony_ci    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
2819cabdff1aSopenharmony_ci               src32_l, src43_l);
2820cabdff1aSopenharmony_ci    ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l,
2821cabdff1aSopenharmony_ci               src76_l, src87_l);
2822cabdff1aSopenharmony_ci
    /* Vertical pass for output row 0 (source rows 0..5 -> vt_res0/1) and
     * output row 1 (source rows 1..6 -> vt_res2/3). */
2823cabdff1aSopenharmony_ci    vt_res0 = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
2824cabdff1aSopenharmony_ci    vt_res1 = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
2825cabdff1aSopenharmony_ci    vt_res2 = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
2826cabdff1aSopenharmony_ci    vt_res3 = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
    /* Gather the tap pairs from the 16-bit vertical results; selector
     * index 8 in mask0 has to come from the second vector of each pair. */
2827cabdff1aSopenharmony_ci    VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
2828cabdff1aSopenharmony_ci               mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
2829cabdff1aSopenharmony_ci    VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
2830cabdff1aSopenharmony_ci               mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
    /* Horizontal pass in 32-bit lanes: hadd sums the outer pair (weight 1),
     * then the -5 and +20 pairs are accumulated on top. */
2831cabdff1aSopenharmony_ci    hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
2832cabdff1aSopenharmony_ci    DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
2833cabdff1aSopenharmony_ci    hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
2834cabdff1aSopenharmony_ci    DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
2835cabdff1aSopenharmony_ci
    /* Same two passes for output rows 2 and 3 (source rows 2..7 and 3..8).
     * The centre-pair shuffles land in shf_vec6/shf_vec7 so that shf_vec2
     * and shf_vec5 from rows 0 and 1 stay available below. */
2836cabdff1aSopenharmony_ci    vt_res0 = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
2837cabdff1aSopenharmony_ci    vt_res1 = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
2838cabdff1aSopenharmony_ci    vt_res2 = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
2839cabdff1aSopenharmony_ci    vt_res3 = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
2840cabdff1aSopenharmony_ci    VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
2841cabdff1aSopenharmony_ci               mask1, mask2, shf_vec0, shf_vec1, shf_vec6);
2842cabdff1aSopenharmony_ci    VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
2843cabdff1aSopenharmony_ci               mask1, mask2, shf_vec3, shf_vec4, shf_vec7);
2844cabdff1aSopenharmony_ci    hz_res2 = __msa_hadd_s_w(shf_vec0, shf_vec0);
2845cabdff1aSopenharmony_ci    DPADD_SH2_SW(shf_vec1, shf_vec6, minus5h, plus20h, hz_res2, hz_res2);
2846cabdff1aSopenharmony_ci    hz_res3 = __msa_hadd_s_w(shf_vec3, shf_vec3);
2847cabdff1aSopenharmony_ci    DPADD_SH2_SW(shf_vec4, shf_vec7, minus5h, plus20h, hz_res3, hz_res3);
2848cabdff1aSopenharmony_ci
    /* Round-shift the two-pass sums by 10 and saturate; the argument 7 is
     * presumably the highest magnitude bit, i.e. the signed 8-bit range. */
2849cabdff1aSopenharmony_ci    SRARI_W2_SW(hz_res0, hz_res1, 10);
2850cabdff1aSopenharmony_ci    SAT_SW2_SW(hz_res0, hz_res1, 7);
2851cabdff1aSopenharmony_ci    SRARI_W2_SW(hz_res2, hz_res3, 10);
2852cabdff1aSopenharmony_ci    SAT_SW2_SW(hz_res2, hz_res3, 7);
2853cabdff1aSopenharmony_ci
    /* Vertical-only value: round-shift the centre-pair shuffles by 5. */
2854cabdff1aSopenharmony_ci    dst0 = __msa_srari_h(shf_vec2, 5);
2855cabdff1aSopenharmony_ci    dst1 = __msa_srari_h(shf_vec5, 5);
2856cabdff1aSopenharmony_ci    dst2 = __msa_srari_h(shf_vec6, 5);
2857cabdff1aSopenharmony_ci    dst3 = __msa_srari_h(shf_vec7, 5);
2858cabdff1aSopenharmony_ci
2859cabdff1aSopenharmony_ci    SAT_SH2_SH(dst0, dst1, 7);
2860cabdff1aSopenharmony_ci    SAT_SH2_SH(dst2, dst3, 7);
2861cabdff1aSopenharmony_ci
    /* Move the odd halfwords (mask2 columns 3, 4, 5, 6) into the even
     * positions with zeros beside them, so the values line up with the
     * 32-bit lanes of hz_res*. */
2862cabdff1aSopenharmony_ci    dst0 = __msa_ilvod_h(zeros, dst0);
2863cabdff1aSopenharmony_ci    dst1 = __msa_ilvod_h(zeros, dst1);
2864cabdff1aSopenharmony_ci    dst2 = __msa_ilvod_h(zeros, dst2);
2865cabdff1aSopenharmony_ci    dst3 = __msa_ilvod_h(zeros, dst3);
2866cabdff1aSopenharmony_ci
    /* Rounded average of the two-pass and the vertical-only values. */
2867cabdff1aSopenharmony_ci    hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0);
2868cabdff1aSopenharmony_ci    hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1);
2869cabdff1aSopenharmony_ci    hz_res2 = __msa_aver_s_w(hz_res2, (v4i32) dst2);
2870cabdff1aSopenharmony_ci    hz_res3 = __msa_aver_s_w(hz_res3, (v4i32) dst3);
2871cabdff1aSopenharmony_ci
    /* Narrow words -> halfwords -> bytes and store word elements 0..3 of
     * the result as four rows. */
2872cabdff1aSopenharmony_ci    PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2);
2873cabdff1aSopenharmony_ci    out = PCKEV_XORI128_UB(dst0, dst2);
2874cabdff1aSopenharmony_ci    ST_W4(out, 0, 1, 2, 3, dst, stride);
2875cabdff1aSopenharmony_ci}
2876cabdff1aSopenharmony_ci
/**
 * Two-pass (horizontal, then vertical) 6-tap luma interpolation over a
 * region 16 columns wide and 16 rows high.
 *
 * The region is handled as two 8-column halves. Within a half, five rows are
 * filtered horizontally up front, then each of four iterations filters four
 * more rows, runs the vertical taps (1, -5, 20, 20, -5, 1) over the 16-bit
 * horizontal results and stores four 8-byte rows.
 *
 * @param dst    destination; rows are written stride bytes apart
 * @param src    source; reading starts at src - 2 * stride - 2 and covers
 *               21 rows per half
 * @param stride row pitch in bytes, used for both src and dst
 *
 * NOTE(review): AVC_HORZ_FILTER_SH, AVC_DOT_SW3_SW, PCKEV_XORI128_UB and the
 * load/store/interleave macros are defined outside this chunk. Any rounding
 * or saturation of the final value presumably happens inside
 * AVC_DOT_SW3_SW - confirm against its definition.
 */
2877cabdff1aSopenharmony_civoid ff_put_h264_qpel16_mc22_msa(uint8_t *dst, const uint8_t *src,
2878cabdff1aSopenharmony_ci                                 ptrdiff_t stride)
2879cabdff1aSopenharmony_ci{
    /* Vertical taps packed as halfword pairs (low, high):
     * 0xfffb0001 -> (1, -5), 0x00140014 -> (20, 20), 0x0001fffb -> (-5, 1). */
2880cabdff1aSopenharmony_ci    const int32_t filt_const0 = 0xfffb0001;
2881cabdff1aSopenharmony_ci    const int32_t filt_const1 = 0x140014;
2882cabdff1aSopenharmony_ci    const int32_t filt_const2 = 0x1fffb;
    /* Top-left corner of the filter footprint: two rows up, two columns
     * left. src_tmp/dst_tmp remember the start of the current half. */
2883cabdff1aSopenharmony_ci    const uint8_t *src_tmp = src - (2 * stride) - 2;
2884cabdff1aSopenharmony_ci    uint8_t *dst_tmp = dst;
2885cabdff1aSopenharmony_ci    uint32_t multiple8_cnt, loop_cnt;
2886cabdff1aSopenharmony_ci    v16u8 out0, out1;
2887cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, mask0, mask1, mask2;
2888cabdff1aSopenharmony_ci    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
2889cabdff1aSopenharmony_ci    v8i16 hz_out7, hz_out8, dst0, dst1, dst2, dst3;
2890cabdff1aSopenharmony_ci    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
2891cabdff1aSopenharmony_ci    v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
2892cabdff1aSopenharmony_ci    v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
2893cabdff1aSopenharmony_ci    v8i16 hz_out87_l, filt0, filt1, filt2;
2894cabdff1aSopenharmony_ci    v4i32 tmp0, tmp1;
2895cabdff1aSopenharmony_ci
2896cabdff1aSopenharmony_ci    filt0 = (v8i16) __msa_fill_w(filt_const0);
2897cabdff1aSopenharmony_ci    filt1 = (v8i16) __msa_fill_w(filt_const1);
2898cabdff1aSopenharmony_ci    filt2 = (v8i16) __msa_fill_w(filt_const2);
2899cabdff1aSopenharmony_ci
    /* The first three 16-byte rows of luma_mask_arr: the "8 width cases". */
2900cabdff1aSopenharmony_ci    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
2901cabdff1aSopenharmony_ci
    /* Runs twice: once per 8-column half. */
2902cabdff1aSopenharmony_ci    for (multiple8_cnt = 2; multiple8_cnt--;) {
2903cabdff1aSopenharmony_ci        src = src_tmp;
2904cabdff1aSopenharmony_ci        dst = dst_tmp;
2905cabdff1aSopenharmony_ci
        /* Prime the vertical window with five horizontally filtered rows.
         * The XOR with 128 presumably re-biases unsigned pixels into signed
         * bytes - TODO confirm. */
2906cabdff1aSopenharmony_ci        LD_SB5(src, stride, src0, src1, src2, src3, src4);
2907cabdff1aSopenharmony_ci        XORI_B5_128_SB(src0, src1, src2, src3, src4);
2908cabdff1aSopenharmony_ci        src += (5 * stride);
2909cabdff1aSopenharmony_ci
2910cabdff1aSopenharmony_ci        hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
2911cabdff1aSopenharmony_ci        hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
2912cabdff1aSopenharmony_ci        hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
2913cabdff1aSopenharmony_ci        hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
2914cabdff1aSopenharmony_ci        hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
2915cabdff1aSopenharmony_ci
        /* Four iterations of four output rows each. */
2916cabdff1aSopenharmony_ci        for (loop_cnt = 4; loop_cnt--;) {
2917cabdff1aSopenharmony_ci            LD_SB4(src, stride, src0, src1, src2, src3);
2918cabdff1aSopenharmony_ci            XORI_B4_128_SB(src0, src1, src2, src3);
2919cabdff1aSopenharmony_ci            src += (4 * stride);
2920cabdff1aSopenharmony_ci
2921cabdff1aSopenharmony_ci            hz_out5 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
2922cabdff1aSopenharmony_ci            hz_out6 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
2923cabdff1aSopenharmony_ci            hz_out7 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
2924cabdff1aSopenharmony_ci            hz_out8 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
2925cabdff1aSopenharmony_ci
            /* Interleave vertically adjacent filtered rows (hz_outBA = rows
             * B and A), right and left halves, so each halfword-pair
             * coefficient meets a pair of rows. */
2926cabdff1aSopenharmony_ci            ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
2927cabdff1aSopenharmony_ci                       hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
2928cabdff1aSopenharmony_ci                       hz_out43_r);
2929cabdff1aSopenharmony_ci            ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
2930cabdff1aSopenharmony_ci                       hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
2931cabdff1aSopenharmony_ci                       hz_out43_l);
2932cabdff1aSopenharmony_ci            ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
2933cabdff1aSopenharmony_ci                       hz_out8, hz_out7, hz_out54_r, hz_out65_r, hz_out76_r,
2934cabdff1aSopenharmony_ci                       hz_out87_r);
2935cabdff1aSopenharmony_ci            ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
2936cabdff1aSopenharmony_ci                       hz_out8, hz_out7, hz_out54_l, hz_out65_l, hz_out76_l,
2937cabdff1aSopenharmony_ci                       hz_out87_l);
2938cabdff1aSopenharmony_ci
            /* Vertical pass: output row k uses window rows k..k+5. The
             * 32-bit right/left results are narrowed into one halfword
             * vector per output row. */
2939cabdff1aSopenharmony_ci            tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0,
2940cabdff1aSopenharmony_ci                                  filt1, filt2);
2941cabdff1aSopenharmony_ci            tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0,
2942cabdff1aSopenharmony_ci                                  filt1, filt2);
2943cabdff1aSopenharmony_ci            dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2944cabdff1aSopenharmony_ci            tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0,
2945cabdff1aSopenharmony_ci                                  filt1, filt2);
2946cabdff1aSopenharmony_ci            tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0,
2947cabdff1aSopenharmony_ci                                  filt1, filt2);
2948cabdff1aSopenharmony_ci            dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2949cabdff1aSopenharmony_ci            tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0,
2950cabdff1aSopenharmony_ci                                  filt1, filt2);
2951cabdff1aSopenharmony_ci            tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0,
2952cabdff1aSopenharmony_ci                                  filt1, filt2);
2953cabdff1aSopenharmony_ci            dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2954cabdff1aSopenharmony_ci            tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0,
2955cabdff1aSopenharmony_ci                                  filt1, filt2);
2956cabdff1aSopenharmony_ci            tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0,
2957cabdff1aSopenharmony_ci                                  filt1, filt2);
2958cabdff1aSopenharmony_ci            dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2959cabdff1aSopenharmony_ci
            /* Narrow to bytes (the 128 bias is presumably removed here) and
             * store doublewords 0 and 1 of out0, then of out1, as four
             * 8-byte rows. */
2960cabdff1aSopenharmony_ci            out0 = PCKEV_XORI128_UB(dst0, dst1);
2961cabdff1aSopenharmony_ci            out1 = PCKEV_XORI128_UB(dst2, dst3);
2962cabdff1aSopenharmony_ci            ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
2963cabdff1aSopenharmony_ci            dst += (4 * stride);
2964cabdff1aSopenharmony_ci
            /* Slide the window down four rows: the last five filtered rows
             * become the first five of the next iteration. */
2965cabdff1aSopenharmony_ci            hz_out0 = hz_out4;
2966cabdff1aSopenharmony_ci            hz_out1 = hz_out5;
2967cabdff1aSopenharmony_ci            hz_out2 = hz_out6;
2968cabdff1aSopenharmony_ci            hz_out3 = hz_out7;
2969cabdff1aSopenharmony_ci            hz_out4 = hz_out8;
2970cabdff1aSopenharmony_ci        }
2971cabdff1aSopenharmony_ci
        /* Advance to the right-hand 8-column half. */
2972cabdff1aSopenharmony_ci        src_tmp += 8;
2973cabdff1aSopenharmony_ci        dst_tmp += 8;
2974cabdff1aSopenharmony_ci    }
2975cabdff1aSopenharmony_ci}
2976cabdff1aSopenharmony_ci
/**
 * Two-pass (horizontal, then vertical) 6-tap luma interpolation that stores
 * eight 8-byte rows to dst, fully unrolled as two batches of four rows.
 *
 * Thirteen source rows (5 + 4 + 4) are filtered horizontally; the vertical
 * taps (1, -5, 20, 20, -5, 1) are then applied to the 16-bit horizontal
 * results.
 *
 * @param dst    destination; rows are written stride bytes apart
 * @param src    source; reading starts at src - 2 * stride - 2
 * @param stride row pitch in bytes, used for both src and dst
 *
 * NOTE(review): AVC_HORZ_FILTER_SH, AVC_DOT_SW3_SW, PCKEV_XORI128_UB and the
 * load/store/interleave macros are defined outside this chunk. Any rounding
 * or saturation of the final value presumably happens inside
 * AVC_DOT_SW3_SW - confirm against its definition.
 */
2977cabdff1aSopenharmony_civoid ff_put_h264_qpel8_mc22_msa(uint8_t *dst, const uint8_t *src,
2978cabdff1aSopenharmony_ci                                ptrdiff_t stride)
2979cabdff1aSopenharmony_ci{
    /* Vertical taps packed as halfword pairs (low, high):
     * 0xfffb0001 -> (1, -5), 0x00140014 -> (20, 20), 0x0001fffb -> (-5, 1). */
2980cabdff1aSopenharmony_ci    const int32_t filt_const0 = 0xfffb0001;
2981cabdff1aSopenharmony_ci    const int32_t filt_const1 = 0x140014;
2982cabdff1aSopenharmony_ci    const int32_t filt_const2 = 0x1fffb;
2983cabdff1aSopenharmony_ci    v16u8 out0, out1;
2984cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, mask0, mask1, mask2;
2985cabdff1aSopenharmony_ci    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
2986cabdff1aSopenharmony_ci    v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
2987cabdff1aSopenharmony_ci    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
2988cabdff1aSopenharmony_ci    v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
2989cabdff1aSopenharmony_ci    v8i16 hz_out1110_r, hz_out1211_r, dst0, dst1, dst2, dst3;
2990cabdff1aSopenharmony_ci    v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
2991cabdff1aSopenharmony_ci    v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
2992cabdff1aSopenharmony_ci    v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
2993cabdff1aSopenharmony_ci    v4i32 tmp0, tmp1;
2994cabdff1aSopenharmony_ci
2995cabdff1aSopenharmony_ci    filt0 = (v8i16) __msa_fill_w(filt_const0);
2996cabdff1aSopenharmony_ci    filt1 = (v8i16) __msa_fill_w(filt_const1);
2997cabdff1aSopenharmony_ci    filt2 = (v8i16) __msa_fill_w(filt_const2);
2998cabdff1aSopenharmony_ci
    /* The first three 16-byte rows of luma_mask_arr: the "8 width cases". */
2999cabdff1aSopenharmony_ci    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
3000cabdff1aSopenharmony_ci
    /* Start two rows up and two columns left, then horizontally filter
     * rows 0..4 of the footprint. The XOR with 128 presumably re-biases
     * unsigned pixels into signed bytes - TODO confirm. */
3001cabdff1aSopenharmony_ci    src -= ((2 * stride) + 2);
3002cabdff1aSopenharmony_ci    LD_SB5(src, stride, src0, src1, src2, src3, src4);
3003cabdff1aSopenharmony_ci    XORI_B5_128_SB(src0, src1, src2, src3, src4);
3004cabdff1aSopenharmony_ci    src += (5 * stride);
3005cabdff1aSopenharmony_ci
3006cabdff1aSopenharmony_ci    hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
3007cabdff1aSopenharmony_ci    hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
3008cabdff1aSopenharmony_ci    hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
3009cabdff1aSopenharmony_ci    hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
3010cabdff1aSopenharmony_ci    hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
3011cabdff1aSopenharmony_ci
    /* Rows 5..8, needed for output rows 0..3. */
3012cabdff1aSopenharmony_ci    LD_SB4(src, stride, src0, src1, src2, src3);
3013cabdff1aSopenharmony_ci    XORI_B4_128_SB(src0, src1, src2, src3);
3014cabdff1aSopenharmony_ci    src += (4 * stride);
3015cabdff1aSopenharmony_ci    hz_out5 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
3016cabdff1aSopenharmony_ci    hz_out6 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
3017cabdff1aSopenharmony_ci    hz_out7 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
3018cabdff1aSopenharmony_ci    hz_out8 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
    /* Interleave vertically adjacent filtered rows (hz_outBA = rows B and
     * A), right and left halves, so each halfword-pair coefficient meets a
     * pair of rows. */
3019cabdff1aSopenharmony_ci    ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
3020cabdff1aSopenharmony_ci               hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
3021cabdff1aSopenharmony_ci    ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
3022cabdff1aSopenharmony_ci               hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
3023cabdff1aSopenharmony_ci    ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
3024cabdff1aSopenharmony_ci               hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
3025cabdff1aSopenharmony_ci    ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
3026cabdff1aSopenharmony_ci               hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);
3027cabdff1aSopenharmony_ci
    /* Vertical pass for output rows 0..3: row k uses filtered rows k..k+5.
     * The 32-bit right/left results are narrowed into one halfword vector
     * per output row. */
3028cabdff1aSopenharmony_ci    tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
3029cabdff1aSopenharmony_ci                          filt2);
3030cabdff1aSopenharmony_ci    tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
3031cabdff1aSopenharmony_ci                          filt2);
3032cabdff1aSopenharmony_ci    dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3033cabdff1aSopenharmony_ci    tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
3034cabdff1aSopenharmony_ci                          filt2);
3035cabdff1aSopenharmony_ci    tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
3036cabdff1aSopenharmony_ci                          filt2);
3037cabdff1aSopenharmony_ci    dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3038cabdff1aSopenharmony_ci    tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
3039cabdff1aSopenharmony_ci                          filt2);
3040cabdff1aSopenharmony_ci    tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
3041cabdff1aSopenharmony_ci                          filt2);
3042cabdff1aSopenharmony_ci    dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3043cabdff1aSopenharmony_ci    tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
3044cabdff1aSopenharmony_ci                          filt2);
3045cabdff1aSopenharmony_ci    tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
3046cabdff1aSopenharmony_ci                          filt2);
3047cabdff1aSopenharmony_ci    dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    /* Narrow to bytes (the 128 bias is presumably removed here) and store
     * doublewords 0 and 1 of out0, then of out1, as four 8-byte rows. */
3048cabdff1aSopenharmony_ci    out0 = PCKEV_XORI128_UB(dst0, dst1);
3049cabdff1aSopenharmony_ci    out1 = PCKEV_XORI128_UB(dst2, dst3);
3050cabdff1aSopenharmony_ci    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
3051cabdff1aSopenharmony_ci    dst += (4 * stride);
3052cabdff1aSopenharmony_ci
    /* Rows 9..12, needed for output rows 4..7. This is the last load, so
     * src is not advanced afterwards. */
3053cabdff1aSopenharmony_ci    LD_SB4(src, stride, src0, src1, src2, src3);
3054cabdff1aSopenharmony_ci    XORI_B4_128_SB(src0, src1, src2, src3);
3055cabdff1aSopenharmony_ci    hz_out9 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
3056cabdff1aSopenharmony_ci    hz_out10 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
3057cabdff1aSopenharmony_ci    hz_out11 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
3058cabdff1aSopenharmony_ci    hz_out12 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
    /* Naming note: hz_out89_* receives the (hz_out9, hz_out8) interleave,
     * the same operand order that yields hz_out87_* from (hz_out8, hz_out7)
     * above; only the digit order in the name differs. */
3059cabdff1aSopenharmony_ci    ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
3060cabdff1aSopenharmony_ci               hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
3061cabdff1aSopenharmony_ci               hz_out1211_r);
3062cabdff1aSopenharmony_ci    ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
3063cabdff1aSopenharmony_ci               hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
3064cabdff1aSopenharmony_ci               hz_out1211_l);
    /* Vertical pass for output rows 4..7 (filtered rows 4..9, 5..10, 6..11
     * and 7..12). */
3065cabdff1aSopenharmony_ci    tmp0 = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
3066cabdff1aSopenharmony_ci                          filt2);
3067cabdff1aSopenharmony_ci    tmp1 = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
3068cabdff1aSopenharmony_ci                          filt2);
3069cabdff1aSopenharmony_ci    dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3070cabdff1aSopenharmony_ci    tmp0 = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
3071cabdff1aSopenharmony_ci                          filt2);
3072cabdff1aSopenharmony_ci    tmp1 = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
3073cabdff1aSopenharmony_ci                          filt2);
3074cabdff1aSopenharmony_ci    dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3075cabdff1aSopenharmony_ci    tmp0 = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
3076cabdff1aSopenharmony_ci                          filt2);
3077cabdff1aSopenharmony_ci    tmp1 = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
3078cabdff1aSopenharmony_ci                          filt2);
3079cabdff1aSopenharmony_ci    dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3080cabdff1aSopenharmony_ci    tmp0 = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
3081cabdff1aSopenharmony_ci                          filt2);
3082cabdff1aSopenharmony_ci    tmp1 = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
3083cabdff1aSopenharmony_ci                          filt2);
3084cabdff1aSopenharmony_ci    dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3085cabdff1aSopenharmony_ci    out0 = PCKEV_XORI128_UB(dst0, dst1);
3086cabdff1aSopenharmony_ci    out1 = PCKEV_XORI128_UB(dst2, dst3);
3087cabdff1aSopenharmony_ci    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
3088cabdff1aSopenharmony_ci}
3089cabdff1aSopenharmony_ci
/**
 * Two-pass (horizontal, then vertical) 6-tap luma interpolation that stores
 * four 32-bit rows to dst.
 *
 * Nine source rows are filtered horizontally, two rows per filter call
 * using the "4 width cases" masks; the vertical taps
 * (1, -5, 20, 20, -5, 1) are then applied to the 16-bit horizontal results.
 *
 * @param dst    destination; rows are written stride bytes apart
 * @param src    source; rows from src - 2 * stride through src + 6 * stride
 *               are loaded, each starting two bytes to the left
 * @param stride row pitch in bytes, used for both src and dst
 *
 * NOTE(review): AVC_HORZ_FILTER_SH, AVC_DOT_SW3_SW, PCKOD_D2_SH,
 * PCKEV_XORI128_UB and the load/store/interleave macros are defined outside
 * this chunk. Any rounding or saturation of the final value presumably
 * happens inside AVC_DOT_SW3_SW - confirm against its definition.
 */
3090cabdff1aSopenharmony_civoid ff_put_h264_qpel4_mc22_msa(uint8_t *dst, const uint8_t *src,
3091cabdff1aSopenharmony_ci                                ptrdiff_t stride)
3092cabdff1aSopenharmony_ci{
    /* Vertical taps packed as halfword pairs (low, high):
     * 0xfffb0001 -> (1, -5), 0x00140014 -> (20, 20), 0x0001fffb -> (-5, 1). */
3093cabdff1aSopenharmony_ci    const int32_t filt_const0 = 0xfffb0001;
3094cabdff1aSopenharmony_ci    const int32_t filt_const1 = 0x140014;
3095cabdff1aSopenharmony_ci    const int32_t filt_const2 = 0x1fffb;
3096cabdff1aSopenharmony_ci    v16u8 res;
3097cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3098cabdff1aSopenharmony_ci    v16i8 mask0, mask1, mask2;
3099cabdff1aSopenharmony_ci    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
3100cabdff1aSopenharmony_ci    v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
3101cabdff1aSopenharmony_ci    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
3102cabdff1aSopenharmony_ci    v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
3103cabdff1aSopenharmony_ci    v4i32 tmp0, tmp1;
3104cabdff1aSopenharmony_ci
    /* Offset 48 skips the three 16-byte "8 width cases" rows of
     * luma_mask_arr and lands on the "4 width cases" masks, whose indices
     * of 16 and above pick bytes from a second input vector. */
3105cabdff1aSopenharmony_ci    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
3106cabdff1aSopenharmony_ci
3107cabdff1aSopenharmony_ci    filt0 = (v8i16) __msa_fill_w(filt_const0);
3108cabdff1aSopenharmony_ci    filt1 = (v8i16) __msa_fill_w(filt_const1);
3109cabdff1aSopenharmony_ci    filt2 = (v8i16) __msa_fill_w(filt_const2);
3110cabdff1aSopenharmony_ci
    /* Move to the top-left corner of the filter footprint:
     * two rows up and two columns left. */
3111cabdff1aSopenharmony_ci    src -= ((2 * stride) + 2);
3112cabdff1aSopenharmony_ci
    /* Load rows 0..4 and 5..8 of the footprint. */
3113cabdff1aSopenharmony_ci    LD_SB5(src, stride, src0, src1, src2, src3, src4);
3114cabdff1aSopenharmony_ci    src += (5 * stride);
3115cabdff1aSopenharmony_ci    LD_SB4(src, stride, src5, src6, src7, src8);
3116cabdff1aSopenharmony_ci
    /* The XOR with 128 presumably re-biases unsigned pixels into signed
     * bytes; PCKEV_XORI128_UB at the end presumably undoes it - TODO
     * confirm. */
3117cabdff1aSopenharmony_ci    XORI_B5_128_SB(src0, src1, src2, src3, src4);
3118cabdff1aSopenharmony_ci    XORI_B4_128_SB(src5, src6, src7, src8);
    /* Horizontal pass, two source rows per call; the ninth row (src8) has
     * no partner and is paired with itself. */
3119cabdff1aSopenharmony_ci    hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2);
3120cabdff1aSopenharmony_ci    hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2);
3121cabdff1aSopenharmony_ci    hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2);
3122cabdff1aSopenharmony_ci    hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2);
3123cabdff1aSopenharmony_ci    hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
    /* Presumably copies the upper doubleword of each two-row result into
     * the odd-numbered hz_out, so that every filtered row sits in the low
     * half of its own vector - TODO confirm against PCKOD_D2_SH. */
3124cabdff1aSopenharmony_ci    PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
3125cabdff1aSopenharmony_ci    PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
    /* Interleave vertically adjacent filtered rows (hz_outBA = rows B and
     * A); only the right halves are needed for four columns. */
3126cabdff1aSopenharmony_ci    ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
3127cabdff1aSopenharmony_ci               hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
3128cabdff1aSopenharmony_ci    ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
3129cabdff1aSopenharmony_ci               hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
3130cabdff1aSopenharmony_ci
    /* Vertical pass: tmp0/tmp1 hold output rows 0 and 1, then rows 2 and 3;
     * each pair is narrowed into one halfword vector. */
3131cabdff1aSopenharmony_ci    tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
3132cabdff1aSopenharmony_ci                          filt2);
3133cabdff1aSopenharmony_ci    tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
3134cabdff1aSopenharmony_ci                          filt2);
3135cabdff1aSopenharmony_ci    dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3136cabdff1aSopenharmony_ci    tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
3137cabdff1aSopenharmony_ci                          filt2);
3138cabdff1aSopenharmony_ci    tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
3139cabdff1aSopenharmony_ci                          filt2);
3140cabdff1aSopenharmony_ci    dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    /* Narrow to bytes and store word elements 0..3 as four rows. */
3141cabdff1aSopenharmony_ci    res = PCKEV_XORI128_UB(dst0, dst1);
3142cabdff1aSopenharmony_ci    ST_W4(res, 0, 1, 2, 3, dst, stride);
3143cabdff1aSopenharmony_ci}
3144cabdff1aSopenharmony_ci
/**
 * 16x16 H.264 luma "mc10" prediction, averaged into dst.
 *
 * Each row is loaded starting at src - 2 and run through a horizontal
 * filter whose taps are paired by luma_mask_arr[0..47] (the "8 width
 * cases"): offsets (n, n+5), (n+1, n+4) and (n+2, n+3). The pairs are
 * combined with a plain add, a multiply-accumulate by -5 and a
 * multiply-accumulate by 20, then shifted right by 5. That result is
 * averaged with the source row shifted by 2 bytes (the pixels at the
 * original src column), and finally averaged with the bytes already in dst.
 *
 * NOTE(review): the MSA helper macros used here (LD_*, VSHF_*, HADD_*,
 * DPADD_*, SLDI_*, SRARI_*, SAT_*, PCKEV_*, AVER_*, ST_*) are defined in
 * generic_macros_msa.h, not in this file. Statements about their effect
 * follow their names and the way they are used here; confirm against
 * that header.
 *
 * @param dst    destination block; 16 rows are read and overwritten
 * @param src    source pixels; each row is loaded starting 2 bytes before src
 * @param stride byte distance between rows, used for both src and dst
 */
3145cabdff1aSopenharmony_civoid ff_avg_h264_qpel16_mc10_msa(uint8_t *dst, const uint8_t *src,
3146cabdff1aSopenharmony_ci                                 ptrdiff_t stride)
3147cabdff1aSopenharmony_ci{
3148cabdff1aSopenharmony_ci    uint32_t loop_cnt;
3149cabdff1aSopenharmony_ci    v16u8 dst0, dst1, dst2, dst3;
3150cabdff1aSopenharmony_ci    v16i8 out0, out1, out2, out3, src0, src1, src2, src3, src4, src5, src6;
3151cabdff1aSopenharmony_ci    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11;
3152cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
3153cabdff1aSopenharmony_ci    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
3154cabdff1aSopenharmony_ci    v16i8 minus5b = __msa_ldi_b(-5);
3155cabdff1aSopenharmony_ci    v16i8 plus20b = __msa_ldi_b(20);
3156cabdff1aSopenharmony_ci
    /* mask0..2: tap-pair shuffles for the left 8 output pixels.
     * mask3..5: the same indices + 8, used with (srcN, srcN+1) for the
     * right 8 output pixels. */
3157cabdff1aSopenharmony_ci    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
3158cabdff1aSopenharmony_ci    mask3 = mask0 + 8;
3159cabdff1aSopenharmony_ci    mask4 = mask1 + 8;
3160cabdff1aSopenharmony_ci    mask5 = mask2 + 8;
    /* The filter window starts two pixels to the left of the output. */
3161cabdff1aSopenharmony_ci    src -= 2;
3162cabdff1aSopenharmony_ci
    /* 4 iterations x 4 rows = 16 rows. */
3163cabdff1aSopenharmony_ci    for (loop_cnt = 4; loop_cnt--;) {
        /* Two vectors per row (offsets 0 and 16) for four rows. */
3164cabdff1aSopenharmony_ci        LD_SB2(src, 16, src0, src1);
3165cabdff1aSopenharmony_ci        src += stride;
3166cabdff1aSopenharmony_ci        LD_SB2(src, 16, src2, src3);
3167cabdff1aSopenharmony_ci        src += stride;
3168cabdff1aSopenharmony_ci        LD_SB2(src, 16, src4, src5);
3169cabdff1aSopenharmony_ci        src += stride;
3170cabdff1aSopenharmony_ci        LD_SB2(src, 16, src6, src7);
3171cabdff1aSopenharmony_ci        src += stride;
3172cabdff1aSopenharmony_ci
3173cabdff1aSopenharmony_ci        LD_UB4(dst, stride, dst0, dst1, dst2, dst3);
        /* Presumably flips the sign bit (XOR 128) so the signed byte
         * operations below can be used; reversed before the dst average. */
3174cabdff1aSopenharmony_ci        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
        /* Rows 0 and 1: build the three tap-pair vectors for each half. */
3175cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask3, vec0, vec3);
3176cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src2, src3, mask0, mask3, vec6, vec9);
3177cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask4, vec1, vec4);
3178cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src2, src3, mask1, mask4, vec7, vec10);
3179cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask5, vec2, vec5);
3180cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src2, src3, mask2, mask5, vec8, vec11);
        /* Outer pair added as-is, middle pair weighted by -5, inner pair
         * weighted by 20, accumulated into 16-bit lanes. */
3181cabdff1aSopenharmony_ci        HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
3182cabdff1aSopenharmony_ci        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
3183cabdff1aSopenharmony_ci                     minus5b, res0, res1, res2, res3);
3184cabdff1aSopenharmony_ci        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
3185cabdff1aSopenharmony_ci                     plus20b, res0, res1, res2, res3);
        /* Rows 2 and 3: same sequence, reusing vec0..vec11. */
3186cabdff1aSopenharmony_ci        VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3);
3187cabdff1aSopenharmony_ci        VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9);
3188cabdff1aSopenharmony_ci        VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4);
3189cabdff1aSopenharmony_ci        VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10);
3190cabdff1aSopenharmony_ci        VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5);
3191cabdff1aSopenharmony_ci        VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11);
3192cabdff1aSopenharmony_ci        HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
3193cabdff1aSopenharmony_ci        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
3194cabdff1aSopenharmony_ci                     minus5b, res4, res5, res6, res7);
3195cabdff1aSopenharmony_ci        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
3196cabdff1aSopenharmony_ci                     plus20b, res4, res5, res6, res7);
        /* Shift each row pair by 2 bytes, undoing the src -= 2 above:
         * src0/2/4/6 now hold the 16 pixels at the original src column.
         * This overwrites the loaded rows, so it must stay after the
         * shuffles. */
3197cabdff1aSopenharmony_ci        SLDI_B4_SB(src1, src0, src3, src2, src5, src4, src7, src6, 2,
3198cabdff1aSopenharmony_ci                   src0, src2, src4, src6);
        /* Shift the sums right by 5 (presumably rounding), limit them
         * (presumably to the signed 8-bit range) and pack to bytes. */
3199cabdff1aSopenharmony_ci        SRARI_H4_SH(res0, res1, res2, res3, 5);
3200cabdff1aSopenharmony_ci        SRARI_H4_SH(res4, res5, res6, res7, 5);
3201cabdff1aSopenharmony_ci        SAT_SH4_SH(res0, res1, res2, res3, 7);
3202cabdff1aSopenharmony_ci        SAT_SH4_SH(res4, res5, res6, res7, 7);
3203cabdff1aSopenharmony_ci        PCKEV_B2_SB(res1, res0, res3, res2, out0, out1);
3204cabdff1aSopenharmony_ci        PCKEV_B2_SB(res5, res4, res7, res6, out2, out3);
        /* Average the filtered row with the shifted source row. */
3205cabdff1aSopenharmony_ci        out0 = __msa_aver_s_b(out0, src0);
3206cabdff1aSopenharmony_ci        out1 = __msa_aver_s_b(out1, src2);
3207cabdff1aSopenharmony_ci        out2 = __msa_aver_s_b(out2, src4);
3208cabdff1aSopenharmony_ci        out3 = __msa_aver_s_b(out3, src6);
        /* Back to unsigned bytes, then average with the existing dst rows. */
3209cabdff1aSopenharmony_ci        XORI_B4_128_SB(out0, out1, out2, out3);
3210cabdff1aSopenharmony_ci        AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
3211cabdff1aSopenharmony_ci        AVER_UB2_UB(out2, dst2, out3, dst3, dst2, dst3);
3212cabdff1aSopenharmony_ci        ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
3213cabdff1aSopenharmony_ci        dst += (4 * stride);
3214cabdff1aSopenharmony_ci    }
3215cabdff1aSopenharmony_ci}
3216cabdff1aSopenharmony_ci
/**
 * 16x16 H.264 luma "mc30" prediction, averaged into dst.
 *
 * Each row is loaded starting at src - 2 and run through a horizontal
 * filter whose taps are paired by luma_mask_arr[0..47] (the "8 width
 * cases"): offsets (n, n+5), (n+1, n+4) and (n+2, n+3). The pairs are
 * combined with a plain add, a multiply-accumulate by -5 and a
 * multiply-accumulate by 20, then shifted right by 5. That result is
 * averaged with the source row shifted by 3 bytes (the pixels one column
 * to the right of the original src), and finally averaged with the bytes
 * already in dst.
 *
 * NOTE(review): the MSA helper macros used here (LD_*, VSHF_*, HADD_*,
 * DPADD_*, SLDI_*, SRARI_*, SAT_*, PCKEV_*, AVER_*, ST_*) are defined in
 * generic_macros_msa.h, not in this file. Statements about their effect
 * follow their names and the way they are used here; confirm against
 * that header.
 *
 * @param dst    destination block; 16 rows are read and overwritten
 * @param src    source pixels; each row is loaded starting 2 bytes before src
 * @param stride byte distance between rows, used for both src and dst
 */
3217cabdff1aSopenharmony_civoid ff_avg_h264_qpel16_mc30_msa(uint8_t *dst, const uint8_t *src,
3218cabdff1aSopenharmony_ci                                 ptrdiff_t stride)
3219cabdff1aSopenharmony_ci{
3220cabdff1aSopenharmony_ci    uint32_t loop_cnt;
3221cabdff1aSopenharmony_ci    v16u8 dst0, dst1, dst2, dst3;
3222cabdff1aSopenharmony_ci    v16i8 out0, out1, out2, out3, src0, src1, src2, src3, src4, src5, src6;
3223cabdff1aSopenharmony_ci    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11;
3224cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
3225cabdff1aSopenharmony_ci    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
3226cabdff1aSopenharmony_ci    v16i8 minus5b = __msa_ldi_b(-5);
3227cabdff1aSopenharmony_ci    v16i8 plus20b = __msa_ldi_b(20);
3228cabdff1aSopenharmony_ci
    /* mask0..2: tap-pair shuffles for the left 8 output pixels.
     * mask3..5: the same indices + 8, used with (srcN, srcN+1) for the
     * right 8 output pixels. */
3229cabdff1aSopenharmony_ci    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
3230cabdff1aSopenharmony_ci    mask3 = mask0 + 8;
3231cabdff1aSopenharmony_ci    mask4 = mask1 + 8;
3232cabdff1aSopenharmony_ci    mask5 = mask2 + 8;
    /* The filter window starts two pixels to the left of the output. */
3233cabdff1aSopenharmony_ci    src -= 2;
3234cabdff1aSopenharmony_ci
    /* 4 iterations x 4 rows = 16 rows. */
3235cabdff1aSopenharmony_ci    for (loop_cnt = 4; loop_cnt--;) {
        /* Two vectors per row (offsets 0 and 16) for four rows. */
3236cabdff1aSopenharmony_ci        LD_SB2(src, 16, src0, src1);
3237cabdff1aSopenharmony_ci        src += stride;
3238cabdff1aSopenharmony_ci        LD_SB2(src, 16, src2, src3);
3239cabdff1aSopenharmony_ci        src += stride;
3240cabdff1aSopenharmony_ci        LD_SB2(src, 16, src4, src5);
3241cabdff1aSopenharmony_ci        src += stride;
3242cabdff1aSopenharmony_ci        LD_SB2(src, 16, src6, src7);
3243cabdff1aSopenharmony_ci        src += stride;
3244cabdff1aSopenharmony_ci
3245cabdff1aSopenharmony_ci        LD_UB4(dst, stride, dst0, dst1, dst2, dst3);
        /* Presumably flips the sign bit (XOR 128) so the signed byte
         * operations below can be used; reversed before the dst average. */
3246cabdff1aSopenharmony_ci        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
        /* Rows 0 and 1: build the three tap-pair vectors for each half. */
3247cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask3, vec0, vec3);
3248cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src2, src3, mask0, mask3, vec6, vec9);
3249cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask4, vec1, vec4);
3250cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src2, src3, mask1, mask4, vec7, vec10);
3251cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask5, vec2, vec5);
3252cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src2, src3, mask2, mask5, vec8, vec11);
        /* Outer pair added as-is, middle pair weighted by -5, inner pair
         * weighted by 20, accumulated into 16-bit lanes. */
3253cabdff1aSopenharmony_ci        HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
3254cabdff1aSopenharmony_ci        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
3255cabdff1aSopenharmony_ci                     minus5b, res0, res1, res2, res3);
3256cabdff1aSopenharmony_ci        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
3257cabdff1aSopenharmony_ci                     plus20b, res0, res1, res2, res3);
        /* Rows 2 and 3: same sequence, reusing vec0..vec11. */
3258cabdff1aSopenharmony_ci        VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3);
3259cabdff1aSopenharmony_ci        VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9);
3260cabdff1aSopenharmony_ci        VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4);
3261cabdff1aSopenharmony_ci        VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10);
3262cabdff1aSopenharmony_ci        VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5);
3263cabdff1aSopenharmony_ci        VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11);
3264cabdff1aSopenharmony_ci        HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
3265cabdff1aSopenharmony_ci        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
3266cabdff1aSopenharmony_ci                     minus5b, res4, res5, res6, res7);
3267cabdff1aSopenharmony_ci        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
3268cabdff1aSopenharmony_ci                     plus20b, res4, res5, res6, res7);
        /* Shift each row pair by 3 bytes: with the src -= 2 above,
         * src0/2/4/6 now hold the 16 pixels starting one column to the
         * right of the original src. This overwrites the loaded rows, so
         * it must stay after the shuffles. */
3269cabdff1aSopenharmony_ci        SLDI_B4_SB(src1, src0, src3, src2, src5, src4, src7, src6, 3,
3270cabdff1aSopenharmony_ci                   src0, src2, src4, src6);
        /* Shift the sums right by 5 (presumably rounding), limit them
         * (presumably to the signed 8-bit range) and pack to bytes. */
3271cabdff1aSopenharmony_ci        SRARI_H4_SH(res0, res1, res2, res3, 5);
3272cabdff1aSopenharmony_ci        SRARI_H4_SH(res4, res5, res6, res7, 5);
3273cabdff1aSopenharmony_ci        SAT_SH4_SH(res0, res1, res2, res3, 7);
3274cabdff1aSopenharmony_ci        SAT_SH4_SH(res4, res5, res6, res7, 7);
3275cabdff1aSopenharmony_ci        PCKEV_B2_SB(res1, res0, res3, res2, out0, out1);
3276cabdff1aSopenharmony_ci        PCKEV_B2_SB(res5, res4, res7, res6, out2, out3);
        /* Average the filtered row with the shifted source row. */
3277cabdff1aSopenharmony_ci        out0 = __msa_aver_s_b(out0, src0);
3278cabdff1aSopenharmony_ci        out1 = __msa_aver_s_b(out1, src2);
3279cabdff1aSopenharmony_ci        out2 = __msa_aver_s_b(out2, src4);
3280cabdff1aSopenharmony_ci        out3 = __msa_aver_s_b(out3, src6);
        /* Back to unsigned bytes, then average with the existing dst rows. */
3281cabdff1aSopenharmony_ci        XORI_B4_128_SB(out0, out1, out2, out3);
3282cabdff1aSopenharmony_ci        AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
3283cabdff1aSopenharmony_ci        AVER_UB2_UB(out2, dst2, out3, dst3, dst2, dst3);
3284cabdff1aSopenharmony_ci        ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
3285cabdff1aSopenharmony_ci        dst += (4 * stride);
3286cabdff1aSopenharmony_ci    }
3287cabdff1aSopenharmony_ci}
3288cabdff1aSopenharmony_ci
/**
 * 8x8 H.264 luma "mc10" prediction, averaged into dst.
 *
 * All 8 rows are loaded at once starting at src - 2. Each row is run
 * through a horizontal filter whose taps are paired by
 * luma_mask_arr[0..47] (the "8 width cases"): offsets (n, n+5),
 * (n+1, n+4) and (n+2, n+3), combined with a plain add, a
 * multiply-accumulate by -5 and a multiply-accumulate by 20, then shifted
 * right by 5. The result is averaged with the source row shifted by
 * 2 bytes (the pixels at the original src column) and then with the bytes
 * already in dst.
 *
 * NOTE(review): the MSA helper macros used here (LD_*, VSHF_*, HADD_*,
 * DPADD_*, SLDI_*, PCKEV_*, SRARI_*, SAT_*, INSERT_*, AVER_*, ST_*) are
 * defined in generic_macros_msa.h, not in this file. Statements about
 * their effect follow their names and the way they are used here; confirm
 * against that header.
 *
 * @param dst    destination block; 8 rows are read and overwritten
 * @param src    source pixels; each row is loaded starting 2 bytes before src
 * @param stride byte distance between rows, used for both src and dst
 */
3289cabdff1aSopenharmony_civoid ff_avg_h264_qpel8_mc10_msa(uint8_t *dst, const uint8_t *src,
3290cabdff1aSopenharmony_ci                                ptrdiff_t stride)
3291cabdff1aSopenharmony_ci{
3292cabdff1aSopenharmony_ci    uint64_t tp0, tp1, tp2, tp3;
3293cabdff1aSopenharmony_ci    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
3294cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
3295cabdff1aSopenharmony_ci    v16i8 tmp0, tmp1, tmp2, tmp3, vec11;
3296cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
3297cabdff1aSopenharmony_ci    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
3298cabdff1aSopenharmony_ci    v16i8 minus5b = __msa_ldi_b(-5);
3299cabdff1aSopenharmony_ci    v16i8 plus20b = __msa_ldi_b(20);
3300cabdff1aSopenharmony_ci
3301cabdff1aSopenharmony_ci    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
3302cabdff1aSopenharmony_ci    LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    /* Presumably flips the sign bit (XOR 128) so the signed byte operations
     * below can be used; reversed before the dst average. */
3303cabdff1aSopenharmony_ci    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
    /* Rows 0..3: outer tap pair (plain add), then the -5 pair, then the
     * +20 pair, accumulated into res0..res3 (one vector per row). */
3304cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
3305cabdff1aSopenharmony_ci    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
3306cabdff1aSopenharmony_ci    HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
3307cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
3308cabdff1aSopenharmony_ci    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
3309cabdff1aSopenharmony_ci    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
3310cabdff1aSopenharmony_ci                 res0, res1, res2, res3);
3311cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
3312cabdff1aSopenharmony_ci    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
3313cabdff1aSopenharmony_ci    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
3314cabdff1aSopenharmony_ci                 res0, res1, res2, res3);
    /* Rows 4..7: same sequence into res4..res7, reusing vec0..vec11. */
3315cabdff1aSopenharmony_ci    VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
3316cabdff1aSopenharmony_ci    VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
3317cabdff1aSopenharmony_ci    HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
3318cabdff1aSopenharmony_ci    VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
3319cabdff1aSopenharmony_ci    VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
3320cabdff1aSopenharmony_ci    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
3321cabdff1aSopenharmony_ci                 res4, res5, res6, res7);
3322cabdff1aSopenharmony_ci    VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
3323cabdff1aSopenharmony_ci    VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
3324cabdff1aSopenharmony_ci    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
3325cabdff1aSopenharmony_ci                 res4, res5, res6, res7);
    /* Shift every row by 2 bytes so it starts at the original src column
     * (the load was from src - 2). Overwrites src0..src7, so this must stay
     * after the shuffles. */
3326cabdff1aSopenharmony_ci    SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 2,
3327cabdff1aSopenharmony_ci               src0, src1, src2, src3);
3328cabdff1aSopenharmony_ci    SLDI_B4_SB(src4, src4, src5, src5, src6, src6, src7, src7, 2,
3329cabdff1aSopenharmony_ci               src4, src5, src6, src7);
    /* Presumably packs the low 8 bytes of two rows into one vector, giving
     * rows {0,1}, {2,3}, {4,5}, {6,7} in src0, src1, src4, src5 to line up
     * with tmp0..tmp3 below. */
3330cabdff1aSopenharmony_ci    PCKEV_D2_SB(src1, src0, src3, src2, src0, src1);
3331cabdff1aSopenharmony_ci    PCKEV_D2_SB(src5, src4, src7, src6, src4, src5);
    /* Shift the sums right by 5 (presumably rounding), limit them
     * (presumably to the signed 8-bit range) and pack two rows per vector. */
3332cabdff1aSopenharmony_ci    SRARI_H4_SH(res0, res1, res2, res3, 5);
3333cabdff1aSopenharmony_ci    SRARI_H4_SH(res4, res5, res6, res7, 5);
3334cabdff1aSopenharmony_ci    SAT_SH4_SH(res0, res1, res2, res3, 7);
3335cabdff1aSopenharmony_ci    SAT_SH4_SH(res4, res5, res6, res7, 7);
3336cabdff1aSopenharmony_ci    PCKEV_B2_SB(res1, res0, res3, res2, tmp0, tmp1);
3337cabdff1aSopenharmony_ci    PCKEV_B2_SB(res5, res4, res7, res6, tmp2, tmp3);
    /* Average the filtered rows with the shifted source rows. */
3338cabdff1aSopenharmony_ci    tmp0 = __msa_aver_s_b(tmp0, src0);
3339cabdff1aSopenharmony_ci    tmp1 = __msa_aver_s_b(tmp1, src1);
3340cabdff1aSopenharmony_ci    tmp2 = __msa_aver_s_b(tmp2, src4);
3341cabdff1aSopenharmony_ci    tmp3 = __msa_aver_s_b(tmp3, src5);
    /* Back to unsigned bytes. */
3342cabdff1aSopenharmony_ci    XORI_B4_128_SB(tmp0, tmp1, tmp2, tmp3);
    /* Gather the 8 existing dst rows (64 bits each), two rows per vector. */
3343cabdff1aSopenharmony_ci    LD4(dst, stride, tp0, tp1, tp2, tp3);
3344cabdff1aSopenharmony_ci    INSERT_D2_UB(tp0, tp1, dst0);
3345cabdff1aSopenharmony_ci    INSERT_D2_UB(tp2, tp3, dst1);
3346cabdff1aSopenharmony_ci    LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
3347cabdff1aSopenharmony_ci    INSERT_D2_UB(tp0, tp1, dst2);
3348cabdff1aSopenharmony_ci    INSERT_D2_UB(tp2, tp3, dst3);
    /* Average with dst and write the 8 rows back. */
3349cabdff1aSopenharmony_ci    AVER_UB2_UB(tmp0, dst0, tmp1, dst1, dst0, dst1);
3350cabdff1aSopenharmony_ci    AVER_UB2_UB(tmp2, dst2, tmp3, dst3, dst2, dst3);
3351cabdff1aSopenharmony_ci    ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
3352cabdff1aSopenharmony_ci}
3353cabdff1aSopenharmony_ci
/**
 * 8x8 H.264 luma "mc30" prediction, averaged into dst.
 *
 * All 8 rows are loaded at once starting at src - 2. Each row is run
 * through a horizontal filter whose taps are paired by
 * luma_mask_arr[0..47] (the "8 width cases"): offsets (n, n+5),
 * (n+1, n+4) and (n+2, n+3), combined with a plain add, a
 * multiply-accumulate by -5 and a multiply-accumulate by 20, then shifted
 * right by 5. The result is averaged with the source row shifted by
 * 3 bytes (the pixels one column to the right of the original src) and
 * then with the bytes already in dst.
 *
 * NOTE(review): the MSA helper macros used here (LD_*, VSHF_*, HADD_*,
 * DPADD_*, SLDI_*, PCKEV_*, SRARI_*, SAT_*, INSERT_*, AVER_*, ST_*) are
 * defined in generic_macros_msa.h, not in this file. Statements about
 * their effect follow their names and the way they are used here; confirm
 * against that header.
 *
 * @param dst    destination block; 8 rows are read and overwritten
 * @param src    source pixels; each row is loaded starting 2 bytes before src
 * @param stride byte distance between rows, used for both src and dst
 */
3354cabdff1aSopenharmony_civoid ff_avg_h264_qpel8_mc30_msa(uint8_t *dst, const uint8_t *src,
3355cabdff1aSopenharmony_ci                                ptrdiff_t stride)
3356cabdff1aSopenharmony_ci{
3357cabdff1aSopenharmony_ci    uint64_t tp0, tp1, tp2, tp3;
3358cabdff1aSopenharmony_ci    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
3359cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
3360cabdff1aSopenharmony_ci    v16i8 tmp0, tmp1, tmp2, tmp3, vec11;
3361cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
3362cabdff1aSopenharmony_ci    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
3363cabdff1aSopenharmony_ci    v16i8 minus5b = __msa_ldi_b(-5);
3364cabdff1aSopenharmony_ci    v16i8 plus20b = __msa_ldi_b(20);
3365cabdff1aSopenharmony_ci
3366cabdff1aSopenharmony_ci    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
3367cabdff1aSopenharmony_ci    LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    /* Presumably flips the sign bit (XOR 128) so the signed byte operations
     * below can be used; reversed before the dst average. */
3368cabdff1aSopenharmony_ci    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
    /* Rows 0..3: outer tap pair (plain add), then the -5 pair, then the
     * +20 pair, accumulated into res0..res3 (one vector per row). */
3369cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
3370cabdff1aSopenharmony_ci    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
3371cabdff1aSopenharmony_ci    HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
3372cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
3373cabdff1aSopenharmony_ci    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
3374cabdff1aSopenharmony_ci    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
3375cabdff1aSopenharmony_ci                 res0, res1, res2, res3);
3376cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
3377cabdff1aSopenharmony_ci    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
3378cabdff1aSopenharmony_ci    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
3379cabdff1aSopenharmony_ci                 res0, res1, res2, res3);
    /* Rows 4..7: same sequence into res4..res7, reusing vec0..vec11. */
3380cabdff1aSopenharmony_ci    VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
3381cabdff1aSopenharmony_ci    VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
3382cabdff1aSopenharmony_ci    HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
3383cabdff1aSopenharmony_ci    VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
3384cabdff1aSopenharmony_ci    VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
3385cabdff1aSopenharmony_ci    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
3386cabdff1aSopenharmony_ci                 res4, res5, res6, res7);
3387cabdff1aSopenharmony_ci    VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
3388cabdff1aSopenharmony_ci    VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
3389cabdff1aSopenharmony_ci    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
3390cabdff1aSopenharmony_ci                 res4, res5, res6, res7);
    /* Shift every row by 3 bytes so it starts one column to the right of
     * the original src (the load was from src - 2). Overwrites src0..src7,
     * so this must stay after the shuffles. */
3391cabdff1aSopenharmony_ci    SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 3,
3392cabdff1aSopenharmony_ci               src0, src1, src2, src3);
3393cabdff1aSopenharmony_ci    SLDI_B4_SB(src4, src4, src5, src5, src6, src6, src7, src7, 3,
3394cabdff1aSopenharmony_ci               src4, src5, src6, src7);
    /* Presumably packs the low 8 bytes of two rows into one vector, giving
     * rows {0,1}, {2,3}, {4,5}, {6,7} in src0, src1, src4, src5 to line up
     * with tmp0..tmp3 below. */
3395cabdff1aSopenharmony_ci    PCKEV_D2_SB(src1, src0, src3, src2, src0, src1);
3396cabdff1aSopenharmony_ci    PCKEV_D2_SB(src5, src4, src7, src6, src4, src5);
    /* Shift the sums right by 5 (presumably rounding), limit them
     * (presumably to the signed 8-bit range) and pack two rows per vector. */
3397cabdff1aSopenharmony_ci    SRARI_H4_SH(res0, res1, res2, res3, 5);
3398cabdff1aSopenharmony_ci    SRARI_H4_SH(res4, res5, res6, res7, 5);
3399cabdff1aSopenharmony_ci    SAT_SH4_SH(res0, res1, res2, res3, 7);
3400cabdff1aSopenharmony_ci    SAT_SH4_SH(res4, res5, res6, res7, 7);
3401cabdff1aSopenharmony_ci    PCKEV_B2_SB(res1, res0, res3, res2, tmp0, tmp1);
3402cabdff1aSopenharmony_ci    PCKEV_B2_SB(res5, res4, res7, res6, tmp2, tmp3);
    /* Average the filtered rows with the shifted source rows. */
3403cabdff1aSopenharmony_ci    tmp0 = __msa_aver_s_b(tmp0, src0);
3404cabdff1aSopenharmony_ci    tmp1 = __msa_aver_s_b(tmp1, src1);
3405cabdff1aSopenharmony_ci    tmp2 = __msa_aver_s_b(tmp2, src4);
3406cabdff1aSopenharmony_ci    tmp3 = __msa_aver_s_b(tmp3, src5);
    /* Back to unsigned bytes. */
3407cabdff1aSopenharmony_ci    XORI_B4_128_SB(tmp0, tmp1, tmp2, tmp3);
    /* Gather the 8 existing dst rows (64 bits each), two rows per vector. */
3408cabdff1aSopenharmony_ci    LD4(dst, stride, tp0, tp1, tp2, tp3);
3409cabdff1aSopenharmony_ci    INSERT_D2_UB(tp0, tp1, dst0);
3410cabdff1aSopenharmony_ci    INSERT_D2_UB(tp2, tp3, dst1);
3411cabdff1aSopenharmony_ci    LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
3412cabdff1aSopenharmony_ci    INSERT_D2_UB(tp0, tp1, dst2);
3413cabdff1aSopenharmony_ci    INSERT_D2_UB(tp2, tp3, dst3);
    /* Average with dst and write the 8 rows back. */
3414cabdff1aSopenharmony_ci    AVER_UB2_UB(tmp0, dst0, tmp1, dst1, dst0, dst1);
3415cabdff1aSopenharmony_ci    AVER_UB2_UB(tmp2, dst2, tmp3, dst3, dst2, dst3);
3416cabdff1aSopenharmony_ci    ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
3417cabdff1aSopenharmony_ci}
3418cabdff1aSopenharmony_ci
/**
 * 4x4 H.264 luma "mc10" prediction, averaged into dst.
 *
 * Four rows are loaded starting at src - 2 and filtered horizontally two
 * rows per vector, using the "4 width cases" shuffle masks at
 * luma_mask_arr[48]. The tap pairs are combined with a plain add, a
 * multiply-accumulate by -5 and a multiply-accumulate by 20, then shifted
 * right by 5. The result is averaged with the source rows shifted by
 * 2 bytes (the pixels at the original src column) and then with the bytes
 * already in dst.
 *
 * NOTE(review): the MSA helper macros used here (LD_*, VSHF_*, HADD_*,
 * DPADD_*, SLDI_*, SRARI_*, SAT_*, LW4, INSERT_*, ST_*) are defined in
 * generic_macros_msa.h, not in this file. Statements about their effect
 * follow their names and the way they are used here; confirm against
 * that header.
 *
 * @param dst    destination block; 4 rows of 4 bytes are read and overwritten
 * @param src    source pixels; each row is loaded starting 2 bytes before src
 * @param stride byte distance between rows, used for both src and dst
 */
3419cabdff1aSopenharmony_civoid ff_avg_h264_qpel4_mc10_msa(uint8_t *dst, const uint8_t *src,
3420cabdff1aSopenharmony_ci                                ptrdiff_t stride)
3421cabdff1aSopenharmony_ci{
3422cabdff1aSopenharmony_ci    uint32_t tp0, tp1, tp2, tp3;
3423cabdff1aSopenharmony_ci    v16u8 dst0 = { 0 };
3424cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, res, vec0, vec1, vec2, vec3, vec4, vec5;
3425cabdff1aSopenharmony_ci    v16i8 mask0, mask1, mask2;
3426cabdff1aSopenharmony_ci    v8i16 out0, out1;
3427cabdff1aSopenharmony_ci    v16i8 minus5b = __msa_ldi_b(-5);
3428cabdff1aSopenharmony_ci    v16i8 plus20b = __msa_ldi_b(20);
3429cabdff1aSopenharmony_ci
    /* 4-width masks: each shuffle takes two different rows as its sources. */
3430cabdff1aSopenharmony_ci    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
3431cabdff1aSopenharmony_ci    LD_SB4(src - 2, stride, src0, src1, src2, src3);
    /* Presumably flips the sign bit (XOR 128) so the signed byte operations
     * below can be used; reversed by the explicit xori further down. */
3432cabdff1aSopenharmony_ci    XORI_B4_128_SB(src0, src1, src2, src3);
    /* Rows {0,1} go to out0 and rows {2,3} to out1: outer pair (plain add),
     * then the -5 pair, then the +20 pair. */
3433cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
3434cabdff1aSopenharmony_ci    HADD_SB2_SH(vec0, vec1, out0, out1);
3435cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
3436cabdff1aSopenharmony_ci    DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, out0, out1);
3437cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
3438cabdff1aSopenharmony_ci    DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, out0, out1);
    /* Shift the sums right by 5 (presumably rounding), limit them
     * (presumably to the signed 8-bit range) and pack all four rows into
     * one byte vector. */
3439cabdff1aSopenharmony_ci    SRARI_H2_SH(out0, out1, 5);
3440cabdff1aSopenharmony_ci    SAT_SH2_SH(out0, out1, 7);
3441cabdff1aSopenharmony_ci    res = __msa_pckev_b((v16i8) out1, (v16i8) out0);
    /* Shift every row by 2 bytes so it starts at the original src column
     * (the load was from src - 2). */
3442cabdff1aSopenharmony_ci    SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 2,
3443cabdff1aSopenharmony_ci               src0, src1, src2, src3);
    /* Gather the first 4 bytes of each shifted row into a single vector in
     * row order 0, 1, 2, 3, matching the layout of res. */
3444cabdff1aSopenharmony_ci    src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1);
3445cabdff1aSopenharmony_ci    src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
3446cabdff1aSopenharmony_ci    src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1);
    /* Average the filtered pixels with the source pixels, then XOR 128 to
     * return to unsigned bytes. */
3447cabdff1aSopenharmony_ci    res = __msa_aver_s_b(res, src0);
3448cabdff1aSopenharmony_ci    res = (v16i8) __msa_xori_b((v16u8) res, 128);
    /* Load the four existing 32-bit dst rows, average, and store back. */
3449cabdff1aSopenharmony_ci    LW4(dst, stride, tp0, tp1, tp2, tp3);
3450cabdff1aSopenharmony_ci    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
3451cabdff1aSopenharmony_ci    dst0 = __msa_aver_u_b((v16u8) res, dst0);
3452cabdff1aSopenharmony_ci    ST_W4(dst0, 0, 1, 2, 3, dst, stride);
3453cabdff1aSopenharmony_ci}
3454cabdff1aSopenharmony_ci
/**
 * 4x4 H.264 luma "mc30" prediction, averaged into dst.
 *
 * Four rows are loaded starting at src - 2 and filtered horizontally two
 * rows per vector, using the "4 width cases" shuffle masks at
 * luma_mask_arr[48]. The tap pairs are combined with a plain add, a
 * multiply-accumulate by -5 and a multiply-accumulate by 20, then shifted
 * right by 5. The result is averaged with the source rows shifted by
 * 3 bytes (the pixels one column to the right of the original src) and
 * then with the bytes already in dst.
 *
 * NOTE(review): the MSA helper macros used here (LD_*, VSHF_*, HADD_*,
 * DPADD_*, SLDI_*, SRARI_*, SAT_*, LW4, INSERT_*, ST_*) are defined in
 * generic_macros_msa.h, not in this file. Statements about their effect
 * follow their names and the way they are used here; confirm against
 * that header.
 *
 * @param dst    destination block; 4 rows of 4 bytes are read and overwritten
 * @param src    source pixels; each row is loaded starting 2 bytes before src
 * @param stride byte distance between rows, used for both src and dst
 */
3455cabdff1aSopenharmony_civoid ff_avg_h264_qpel4_mc30_msa(uint8_t *dst, const uint8_t *src,
3456cabdff1aSopenharmony_ci                                ptrdiff_t stride)
3457cabdff1aSopenharmony_ci{
3458cabdff1aSopenharmony_ci    uint32_t tp0, tp1, tp2, tp3;
3459cabdff1aSopenharmony_ci    v16u8 dst0 = { 0 };
3460cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, res, vec0, vec1, vec2, vec3, vec4, vec5;
3461cabdff1aSopenharmony_ci    v16i8 mask0, mask1, mask2;
3462cabdff1aSopenharmony_ci    v8i16 out0, out1;
3463cabdff1aSopenharmony_ci    v16i8 minus5b = __msa_ldi_b(-5);
3464cabdff1aSopenharmony_ci    v16i8 plus20b = __msa_ldi_b(20);
3465cabdff1aSopenharmony_ci
    /* 4-width masks: each shuffle takes two different rows as its sources. */
3466cabdff1aSopenharmony_ci    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
3467cabdff1aSopenharmony_ci    LD_SB4(src - 2, stride, src0, src1, src2, src3);
    /* Presumably flips the sign bit (XOR 128) so the signed byte operations
     * below can be used; reversed by the explicit xori further down. */
3468cabdff1aSopenharmony_ci    XORI_B4_128_SB(src0, src1, src2, src3);
    /* Rows {0,1} go to out0 and rows {2,3} to out1: outer pair (plain add),
     * then the -5 pair, then the +20 pair. */
3469cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
3470cabdff1aSopenharmony_ci    HADD_SB2_SH(vec0, vec1, out0, out1);
3471cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
3472cabdff1aSopenharmony_ci    DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, out0, out1);
3473cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
3474cabdff1aSopenharmony_ci    DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, out0, out1);
    /* Shift the sums right by 5 (presumably rounding), limit them
     * (presumably to the signed 8-bit range) and pack all four rows into
     * one byte vector. */
3475cabdff1aSopenharmony_ci    SRARI_H2_SH(out0, out1, 5);
3476cabdff1aSopenharmony_ci    SAT_SH2_SH(out0, out1, 7);
3477cabdff1aSopenharmony_ci    res = __msa_pckev_b((v16i8) out1, (v16i8) out0);
    /* Shift every row by 3 bytes so it starts one column to the right of
     * the original src (the load was from src - 2). */
3478cabdff1aSopenharmony_ci    SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 3,
3479cabdff1aSopenharmony_ci               src0, src1, src2, src3);
    /* Gather the first 4 bytes of each shifted row into a single vector in
     * row order 0, 1, 2, 3, matching the layout of res. */
3480cabdff1aSopenharmony_ci    src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1);
3481cabdff1aSopenharmony_ci    src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
3482cabdff1aSopenharmony_ci    src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1);
    /* Average the filtered pixels with the source pixels, then XOR 128 to
     * return to unsigned bytes. */
3483cabdff1aSopenharmony_ci    res = __msa_aver_s_b(res, src0);
3484cabdff1aSopenharmony_ci    res = (v16i8) __msa_xori_b((v16u8) res, 128);
    /* Load the four existing 32-bit dst rows, average, and store back. */
3485cabdff1aSopenharmony_ci    LW4(dst, stride, tp0, tp1, tp2, tp3);
3486cabdff1aSopenharmony_ci    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
3487cabdff1aSopenharmony_ci    dst0 = __msa_aver_u_b((v16u8) res, dst0);
3488cabdff1aSopenharmony_ci    ST_W4(dst0, 0, 1, 2, 3, dst, stride);
3489cabdff1aSopenharmony_ci}
3490cabdff1aSopenharmony_ci
/**
 * 16x16 H.264 luma "mc20" prediction, averaged into dst.
 *
 * Each row is loaded as two vectors, at src - 2 and src + 6, one per
 * 8-pixel half. Both halves are run through a horizontal filter whose
 * taps are paired by luma_mask_arr[0..47] (the "8 width cases"): offsets
 * (n, n+5), (n+1, n+4) and (n+2, n+3), combined with a plain add, a
 * multiply-accumulate by -5 and a multiply-accumulate by 20, then shifted
 * right by 5. Unlike the mc10/mc30 variants, the filtered value is not
 * blended with a source pixel; it is averaged directly with the bytes
 * already in dst.
 *
 * NOTE(review): the MSA helper macros used here (LD_*, VSHF_*, HADD_*,
 * DPADD_*, SRARI_*, SAT_*, PCKEV_*, AVER_*, ST_*) are defined in
 * generic_macros_msa.h, not in this file. Statements about their effect
 * follow their names and the way they are used here; confirm against
 * that header.
 *
 * @param dst    destination block; 16 rows are read and overwritten
 * @param src    source pixels; each row is loaded starting 2 bytes before src
 * @param stride byte distance between rows, used for both src and dst
 */
3491cabdff1aSopenharmony_civoid ff_avg_h264_qpel16_mc20_msa(uint8_t *dst, const uint8_t *src,
3492cabdff1aSopenharmony_ci                                 ptrdiff_t stride)
3493cabdff1aSopenharmony_ci{
3494cabdff1aSopenharmony_ci    uint32_t loop_cnt;
3495cabdff1aSopenharmony_ci    v16u8 dst0, dst1, dst2, dst3;
3496cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
3497cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
3498cabdff1aSopenharmony_ci    v16i8 vec11;
3499cabdff1aSopenharmony_ci    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
3500cabdff1aSopenharmony_ci    v16i8 minus5b = __msa_ldi_b(-5);
3501cabdff1aSopenharmony_ci    v16i8 plus20b = __msa_ldi_b(20);
3502cabdff1aSopenharmony_ci
3503cabdff1aSopenharmony_ci    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
    /* The filter window starts two pixels to the left of the output. */
3504cabdff1aSopenharmony_ci    src -= 2;
3505cabdff1aSopenharmony_ci
    /* 4 iterations x 4 rows = 16 rows. */
3506cabdff1aSopenharmony_ci    for (loop_cnt = 4; loop_cnt--;) {
        /* Two overlapping vectors per row, 8 bytes apart: the first feeds
         * the left 8 output pixels, the second the right 8, so the same
         * three masks serve both halves. */
3507cabdff1aSopenharmony_ci        LD_SB2(src, 8, src0, src1);
3508cabdff1aSopenharmony_ci        src += stride;
3509cabdff1aSopenharmony_ci        LD_SB2(src, 8, src2, src3);
3510cabdff1aSopenharmony_ci        src += stride;
3511cabdff1aSopenharmony_ci        LD_SB2(src, 8, src4, src5);
3512cabdff1aSopenharmony_ci        src += stride;
3513cabdff1aSopenharmony_ci        LD_SB2(src, 8, src6, src7);
3514cabdff1aSopenharmony_ci        src += stride;
3515cabdff1aSopenharmony_ci
3516cabdff1aSopenharmony_ci        LD_UB4(dst, stride, dst0, dst1, dst2, dst3);
        /* Presumably flips the sign bit (XOR 128) so the signed byte
         * operations below can be used; reversed before the dst average. */
3517cabdff1aSopenharmony_ci        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
        /* Rows 0 and 1: build the three tap-pair vectors for each half. */
3518cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3);
3519cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9);
3520cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4);
3521cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10);
3522cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5);
3523cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11);
        /* Outer pair added as-is, middle pair weighted by -5, inner pair
         * weighted by 20, accumulated into 16-bit lanes. */
3524cabdff1aSopenharmony_ci        HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
3525cabdff1aSopenharmony_ci        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
3526cabdff1aSopenharmony_ci                     minus5b, res0, res1, res2, res3);
3527cabdff1aSopenharmony_ci        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
3528cabdff1aSopenharmony_ci                     plus20b, res0, res1, res2, res3);
        /* Rows 2 and 3: same sequence, reusing vec0..vec11. */
3529cabdff1aSopenharmony_ci        VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec3);
3530cabdff1aSopenharmony_ci        VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec6, vec9);
3531cabdff1aSopenharmony_ci        VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec1, vec4);
3532cabdff1aSopenharmony_ci        VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec7, vec10);
3533cabdff1aSopenharmony_ci        VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec2, vec5);
3534cabdff1aSopenharmony_ci        VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec8, vec11);
3535cabdff1aSopenharmony_ci        HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
3536cabdff1aSopenharmony_ci        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
3537cabdff1aSopenharmony_ci                     minus5b, res4, res5, res6, res7);
3538cabdff1aSopenharmony_ci        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
3539cabdff1aSopenharmony_ci                     plus20b, res4, res5, res6, res7);
        /* Shift the sums right by 5 (presumably rounding), limit them
         * (presumably to the signed 8-bit range) and pack each row's two
         * halves into one 16-byte vector (vec0..vec3 are reused as
         * outputs). */
3540cabdff1aSopenharmony_ci        SRARI_H4_SH(res0, res1, res2, res3, 5);
3541cabdff1aSopenharmony_ci        SRARI_H4_SH(res4, res5, res6, res7, 5);
3542cabdff1aSopenharmony_ci        SAT_SH4_SH(res0, res1, res2, res3, 7);
3543cabdff1aSopenharmony_ci        SAT_SH4_SH(res4, res5, res6, res7, 7);
3544cabdff1aSopenharmony_ci        PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6, vec0, vec1,
3545cabdff1aSopenharmony_ci                    vec2, vec3);
        /* Back to unsigned bytes, then average with the existing dst rows. */
3546cabdff1aSopenharmony_ci        XORI_B4_128_SB(vec0, vec1, vec2, vec3);
3547cabdff1aSopenharmony_ci        AVER_UB2_UB(vec0, dst0, vec1, dst1, dst0, dst1);
3548cabdff1aSopenharmony_ci        AVER_UB2_UB(vec2, dst2, vec3, dst3, dst2, dst3);
3549cabdff1aSopenharmony_ci        ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
3550cabdff1aSopenharmony_ci        dst += (4 * stride);
3551cabdff1aSopenharmony_ci    }
3552cabdff1aSopenharmony_ci}
3553cabdff1aSopenharmony_ci
/*
 * Averaging ("avg") luma filter for an 8-row block that filters along each
 * row.
 *
 * Eight source rows are loaded starting at src - 2, filtered, rounded,
 * combined with the eight rows currently in dst via AVER_UB2_UB, and the
 * result is stored back to dst (8 bytes per row).
 *
 * Filter taps, as far as the visible code shows: mask0/mask1/mask2 are the
 * "8 width cases" rows of luma_mask_arr and pair byte offsets (0,5), (1,4)
 * and (2,3) of each window. The (0,5) pairs go through HADD, the (1,4) pairs
 * are accumulated with the constant -5 and the (2,3) pairs with the constant
 * 20, which matches a (1, -5, 20, 20, -5, 1) kernel followed by a rounding
 * shift of 5.
 * NOTE(review): the exact semantics of the MSA helper macros live in
 * generic_macros_msa.h, not in this file - confirm against that header.
 *
 * dst:    block that is both read (for the averaging) and overwritten
 * src:    source pixels; every row is read starting 2 bytes before src
 * stride: row pitch in bytes, used for both src and dst
 */
3554cabdff1aSopenharmony_civoid ff_avg_h264_qpel8_mc20_msa(uint8_t *dst, const uint8_t *src,
3555cabdff1aSopenharmony_ci                                ptrdiff_t stride)
3556cabdff1aSopenharmony_ci{
3557cabdff1aSopenharmony_ci    uint64_t tp0, tp1, tp2, tp3;
3558cabdff1aSopenharmony_ci    v16u8 out0, out1, out2 = { 0 }, out3 = { 0 };
3559cabdff1aSopenharmony_ci    v16u8 out4, out5, out6 = { 0 }, out7 = { 0 };
3560cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
3561cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
3562cabdff1aSopenharmony_ci    v16i8 vec11;
3563cabdff1aSopenharmony_ci    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
3564cabdff1aSopenharmony_ci    v16i8 minus5b = __msa_ldi_b(-5);
3565cabdff1aSopenharmony_ci    v16i8 plus20b = __msa_ldi_b(20);
3566cabdff1aSopenharmony_ci
    /* Three consecutive 16-byte shuffle tables from the 8-wide mask set. */
3567cabdff1aSopenharmony_ci    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
3568cabdff1aSopenharmony_ci
3569cabdff1aSopenharmony_ci    LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    /* XOR every byte with 128 - presumably to bring unsigned pixels into the
     * signed range expected by the signed HADD/DPADD helpers. */
3570cabdff1aSopenharmony_ci    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
    /* Rows 0-3 -> res0..res3: outer pairs, then the -5 pairs, then the 20
     * pairs. */
3571cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
3572cabdff1aSopenharmony_ci    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
3573cabdff1aSopenharmony_ci    HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
3574cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
3575cabdff1aSopenharmony_ci    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
3576cabdff1aSopenharmony_ci    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
3577cabdff1aSopenharmony_ci                 res0, res1, res2, res3);
3578cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
3579cabdff1aSopenharmony_ci    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
3580cabdff1aSopenharmony_ci    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
3581cabdff1aSopenharmony_ci                 res0, res1, res2, res3);
    /* Rows 4-7 -> res4..res7, same sequence; vec0..vec11 are reused. */
3582cabdff1aSopenharmony_ci    VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
3583cabdff1aSopenharmony_ci    VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
3584cabdff1aSopenharmony_ci    HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
3585cabdff1aSopenharmony_ci    VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
3586cabdff1aSopenharmony_ci    VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
3587cabdff1aSopenharmony_ci    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
3588cabdff1aSopenharmony_ci                 res4, res5, res6, res7);
3589cabdff1aSopenharmony_ci    VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
3590cabdff1aSopenharmony_ci    VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
3591cabdff1aSopenharmony_ci    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
3592cabdff1aSopenharmony_ci                 res4, res5, res6, res7);
    /* Rounding shift by 5, then saturation with argument 7. */
3593cabdff1aSopenharmony_ci    SRARI_H4_SH(res0, res1, res2, res3, 5);
3594cabdff1aSopenharmony_ci    SRARI_H4_SH(res4, res5, res6, res7, 5);
3595cabdff1aSopenharmony_ci    SAT_SH4_SH(res0, res1, res2, res3, 7);
3596cabdff1aSopenharmony_ci    SAT_SH4_SH(res4, res5, res6, res7, 7);
    /* Two 16-bit result vectors are combined into one byte vector each;
     * judging by the macro name this packs and undoes the 128 XOR. */
3597cabdff1aSopenharmony_ci    out0 = PCKEV_XORI128_UB(res0, res1);
3598cabdff1aSopenharmony_ci    out1 = PCKEV_XORI128_UB(res2, res3);
3599cabdff1aSopenharmony_ci    out4 = PCKEV_XORI128_UB(res4, res5);
3600cabdff1aSopenharmony_ci    out5 = PCKEV_XORI128_UB(res6, res7);
    /* Fetch the existing dst rows as 64-bit values, two rows per vector:
     * rows 0-3 into out2/out3, rows 4-7 into out6/out7. */
3601cabdff1aSopenharmony_ci    LD4(dst, stride, tp0, tp1, tp2, tp3);
3602cabdff1aSopenharmony_ci    INSERT_D2_UB(tp0, tp1, out2);
3603cabdff1aSopenharmony_ci    INSERT_D2_UB(tp2, tp3, out3);
3604cabdff1aSopenharmony_ci    LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
3605cabdff1aSopenharmony_ci    INSERT_D2_UB(tp0, tp1, out6);
3606cabdff1aSopenharmony_ci    INSERT_D2_UB(tp2, tp3, out7);
    /* Blend the filtered pixels with dst and write all eight rows. */
3607cabdff1aSopenharmony_ci    AVER_UB2_UB(out0, out2, out1, out3, out0, out1);
3608cabdff1aSopenharmony_ci    AVER_UB2_UB(out4, out6, out5, out7, out4, out5);
3609cabdff1aSopenharmony_ci    ST_D8(out0, out1, out4, out5, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
3610cabdff1aSopenharmony_ci}
3611cabdff1aSopenharmony_ci
/*
 * Averaging ("avg") luma filter for a 4-row block that filters along each
 * row.
 *
 * Four source rows are loaded starting at src - 2. The shuffle masks come
 * from luma_mask_arr[48] onwards (the "4 width cases" rows); each shuffle
 * takes two source rows at once, so two shuffles cover all four rows.
 * As in the 8-wide variant the mask0 pairs go through HADD, the mask1 pairs
 * are accumulated with -5 and the mask2 pairs with 20, followed by a
 * rounding shift of 5. The result is averaged with the four 32-bit words
 * read from dst and written back as four words.
 * NOTE(review): helper macro semantics are defined in generic_macros_msa.h;
 * confirm there.
 *
 * dst:    block that is both read (for the averaging) and overwritten
 * src:    source pixels; every row is read starting 2 bytes before src
 * stride: row pitch in bytes, used for both src and dst
 */
3612cabdff1aSopenharmony_civoid ff_avg_h264_qpel4_mc20_msa(uint8_t *dst, const uint8_t *src,
3613cabdff1aSopenharmony_ci                                ptrdiff_t stride)
3614cabdff1aSopenharmony_ci{
3615cabdff1aSopenharmony_ci    uint32_t tp0, tp1, tp2, tp3;
3616cabdff1aSopenharmony_ci    v16u8 res, dst0 = { 0 };
3617cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, vec4, vec5;
3618cabdff1aSopenharmony_ci    v16i8 mask0, mask1, mask2;
3619cabdff1aSopenharmony_ci    v8i16 res0, res1;
3620cabdff1aSopenharmony_ci    v16i8 minus5b = __msa_ldi_b(-5);
3621cabdff1aSopenharmony_ci    v16i8 plus20b = __msa_ldi_b(20);
3622cabdff1aSopenharmony_ci
3623cabdff1aSopenharmony_ci    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
3624cabdff1aSopenharmony_ci    LD_SB4(src - 2, stride, src0, src1, src2, src3);
    /* XOR with 128 - presumably moves the pixels into signed range. */
3625cabdff1aSopenharmony_ci    XORI_B4_128_SB(src0, src1, src2, src3);
    /* res0 accumulates rows 0-1 (src0/src1), res1 rows 2-3 (src2/src3). */
3626cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
3627cabdff1aSopenharmony_ci    HADD_SB2_SH(vec0, vec1, res0, res1);
3628cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
3629cabdff1aSopenharmony_ci    DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
3630cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
3631cabdff1aSopenharmony_ci    DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
    /* Rounding shift by 5, then saturation with argument 7. */
3632cabdff1aSopenharmony_ci    SRARI_H2_SH(res0, res1, 5);
3633cabdff1aSopenharmony_ci    SAT_SH2_SH(res0, res1, 7);
3634cabdff1aSopenharmony_ci    res = PCKEV_XORI128_UB(res0, res1);
    /* Collect the four existing 4-byte dst rows into one vector, blend, and
     * store the four words back row by row. */
3635cabdff1aSopenharmony_ci    LW4(dst, stride, tp0, tp1, tp2, tp3);
3636cabdff1aSopenharmony_ci    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
3637cabdff1aSopenharmony_ci    res = __msa_aver_u_b(res, dst0);
3638cabdff1aSopenharmony_ci    ST_W4(res, 0, 1, 2, 3, dst, stride);
3639cabdff1aSopenharmony_ci}
3640cabdff1aSopenharmony_ci
/*
 * Averaging ("avg") luma filter for a 16-row block that filters down the
 * columns and additionally blends with the source row at the output
 * position.
 *
 * src is moved back by two rows before loading, so src2 holds the row at
 * the caller's src. Each loop pass loads four more rows, produces four
 * output rows from interleaved row pairs, averages them first with source
 * rows src2..src5 and then with the rows already in dst, and stores one
 * full vector per row. Four passes cover 16 rows.
 *
 * The three 16-bit constants contain, low byte first, the byte pairs
 * (1, -5), (20, 20) and (-5, 1); they are applied to the row pairs
 * (n, n+1), (n+2, n+3) and (n+4, n+5), i.e. a (1, -5, 20, 20, -5, 1) column
 * kernel, followed by a rounding shift of 5.
 * NOTE(review): AVC_DOT_SH3_SH and the other helper macros are defined
 * outside this chunk; their semantics are assumed from names and usage.
 *
 * dst:    block that is both read (for the averaging) and overwritten
 * src:    source pixels; rows from two above src onwards are read
 * stride: row pitch in bytes, used for both src and dst
 */
3641cabdff1aSopenharmony_civoid ff_avg_h264_qpel16_mc01_msa(uint8_t *dst, const uint8_t *src,
3642cabdff1aSopenharmony_ci                                 ptrdiff_t stride)
3643cabdff1aSopenharmony_ci{
3644cabdff1aSopenharmony_ci    int32_t loop_cnt;
3645cabdff1aSopenharmony_ci    int16_t filt_const0 = 0xfb01;
3646cabdff1aSopenharmony_ci    int16_t filt_const1 = 0x1414;
3647cabdff1aSopenharmony_ci    int16_t filt_const2 = 0x1fb;
3648cabdff1aSopenharmony_ci    v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3;
3649cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3650cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
3651cabdff1aSopenharmony_ci    v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
3652cabdff1aSopenharmony_ci    v16i8 src65_l, src87_l, filt0, filt1, filt2;
3653cabdff1aSopenharmony_ci    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
3654cabdff1aSopenharmony_ci
3655cabdff1aSopenharmony_ci    filt0 = (v16i8) __msa_fill_h(filt_const0);
3656cabdff1aSopenharmony_ci    filt1 = (v16i8) __msa_fill_h(filt_const1);
3657cabdff1aSopenharmony_ci    filt2 = (v16i8) __msa_fill_h(filt_const2);
3658cabdff1aSopenharmony_ci
    /* Start two rows above the block so the kernel has its upper context. */
3659cabdff1aSopenharmony_ci    src -= (stride * 2);
3660cabdff1aSopenharmony_ci
3661cabdff1aSopenharmony_ci    LD_SB5(src, stride, src0, src1, src2, src3, src4);
3662cabdff1aSopenharmony_ci    src += (5 * stride);
3663cabdff1aSopenharmony_ci
3664cabdff1aSopenharmony_ci    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    /* Interleave neighbouring rows; the _r and _l sets are produced by the
     * ILVR and ILVL macros respectively (presumably the two vector halves). */
3665cabdff1aSopenharmony_ci    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
3666cabdff1aSopenharmony_ci               src32_r, src43_r);
3667cabdff1aSopenharmony_ci    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
3668cabdff1aSopenharmony_ci               src32_l, src43_l);
3669cabdff1aSopenharmony_ci
3670cabdff1aSopenharmony_ci    for (loop_cnt = 4; loop_cnt--;) {
3671cabdff1aSopenharmony_ci        LD_SB4(src, stride, src5, src6, src7, src8);
3672cabdff1aSopenharmony_ci        src += (4 * stride);
3673cabdff1aSopenharmony_ci
3674cabdff1aSopenharmony_ci        XORI_B4_128_SB(src5, src6, src7, src8);
3675cabdff1aSopenharmony_ci        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
3676cabdff1aSopenharmony_ci                   src65_r, src76_r, src87_r);
3677cabdff1aSopenharmony_ci        ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
3678cabdff1aSopenharmony_ci                   src65_l, src76_l, src87_l);
        /* Four output rows, each from three consecutive row pairs. */
3679cabdff1aSopenharmony_ci        out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
3680cabdff1aSopenharmony_ci        out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
3681cabdff1aSopenharmony_ci        out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
3682cabdff1aSopenharmony_ci        out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
3683cabdff1aSopenharmony_ci        out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
3684cabdff1aSopenharmony_ci        out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
3685cabdff1aSopenharmony_ci        out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
3686cabdff1aSopenharmony_ci        out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
3687cabdff1aSopenharmony_ci        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
3688cabdff1aSopenharmony_ci        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3689cabdff1aSopenharmony_ci        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
3690cabdff1aSopenharmony_ci        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
3691cabdff1aSopenharmony_ci        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
3692cabdff1aSopenharmony_ci                    out3_r, res0, res1, res2, res3);
        /* First blend: signed average with source rows src2..src5, which
         * are still XORed with 128 like the filter output. */
3693cabdff1aSopenharmony_ci        res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src2);
3694cabdff1aSopenharmony_ci        res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src3);
3695cabdff1aSopenharmony_ci        res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src4);
3696cabdff1aSopenharmony_ci        res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src5);
3697cabdff1aSopenharmony_ci        LD_UB4(dst, stride, dst0, dst1, dst2, dst3);
        /* Undo the 128 XOR, then do the second blend against dst. */
3698cabdff1aSopenharmony_ci        XORI_B4_128_UB(res0, res1, res2, res3);
3699cabdff1aSopenharmony_ci        AVER_UB2_UB(res0, dst0, res1, dst1, dst0, dst1);
3700cabdff1aSopenharmony_ci        AVER_UB2_UB(res2, dst2, res3, dst3, dst2, dst3);
3701cabdff1aSopenharmony_ci        ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
3702cabdff1aSopenharmony_ci        dst += (4 * stride);
3703cabdff1aSopenharmony_ci
        /* Slide the window down four rows: the newest row pairs become the
         * oldest ones for the next pass. */
3704cabdff1aSopenharmony_ci        src10_r = src54_r;
3705cabdff1aSopenharmony_ci        src32_r = src76_r;
3706cabdff1aSopenharmony_ci        src21_r = src65_r;
3707cabdff1aSopenharmony_ci        src43_r = src87_r;
3708cabdff1aSopenharmony_ci        src10_l = src54_l;
3709cabdff1aSopenharmony_ci        src32_l = src76_l;
3710cabdff1aSopenharmony_ci        src21_l = src65_l;
3711cabdff1aSopenharmony_ci        src43_l = src87_l;
        /* src2..src4 must again hold the source rows that line up with the
         * next three output rows; the fourth (src5) comes from the next
         * load. */
3712cabdff1aSopenharmony_ci        src2 = src6;
3713cabdff1aSopenharmony_ci        src3 = src7;
3714cabdff1aSopenharmony_ci        src4 = src8;
3715cabdff1aSopenharmony_ci    }
3716cabdff1aSopenharmony_ci}
3717cabdff1aSopenharmony_ci
/*
 * Averaging ("avg") luma filter for a 16-row block that filters down the
 * columns and additionally blends with the source row one below the output
 * position.
 *
 * src is moved back by two rows before loading, so src2 holds the row at
 * the caller's src and src3 the row below it. Each loop pass loads four
 * more rows, produces four output rows from interleaved row pairs, averages
 * them first with source rows src3..src6 and then with the rows already in
 * dst, and stores one full vector per row. Four passes cover 16 rows.
 *
 * The three 16-bit constants contain, low byte first, the byte pairs
 * (1, -5), (20, 20) and (-5, 1), i.e. a (1, -5, 20, 20, -5, 1) column
 * kernel, followed by a rounding shift of 5.
 * NOTE(review): AVC_DOT_SH3_SH and the other helper macros are defined
 * outside this chunk; their semantics are assumed from names and usage.
 *
 * dst:    block that is both read (for the averaging) and overwritten
 * src:    source pixels; rows from two above src onwards are read
 * stride: row pitch in bytes, used for both src and dst
 */
3718cabdff1aSopenharmony_civoid ff_avg_h264_qpel16_mc03_msa(uint8_t *dst, const uint8_t *src,
3719cabdff1aSopenharmony_ci                                 ptrdiff_t stride)
3720cabdff1aSopenharmony_ci{
3721cabdff1aSopenharmony_ci    int32_t loop_cnt;
3722cabdff1aSopenharmony_ci    int16_t filt_const0 = 0xfb01;
3723cabdff1aSopenharmony_ci    int16_t filt_const1 = 0x1414;
3724cabdff1aSopenharmony_ci    int16_t filt_const2 = 0x1fb;
3725cabdff1aSopenharmony_ci    v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3;
3726cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3727cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
3728cabdff1aSopenharmony_ci    v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
3729cabdff1aSopenharmony_ci    v16i8 src65_l, src87_l, filt0, filt1, filt2;
3730cabdff1aSopenharmony_ci    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
3731cabdff1aSopenharmony_ci
3732cabdff1aSopenharmony_ci    filt0 = (v16i8) __msa_fill_h(filt_const0);
3733cabdff1aSopenharmony_ci    filt1 = (v16i8) __msa_fill_h(filt_const1);
3734cabdff1aSopenharmony_ci    filt2 = (v16i8) __msa_fill_h(filt_const2);
3735cabdff1aSopenharmony_ci
    /* Start two rows above the block so the kernel has its upper context. */
3736cabdff1aSopenharmony_ci    src -= (stride * 2);
3737cabdff1aSopenharmony_ci
3738cabdff1aSopenharmony_ci    LD_SB5(src, stride, src0, src1, src2, src3, src4);
3739cabdff1aSopenharmony_ci    src += (5 * stride);
3740cabdff1aSopenharmony_ci
3741cabdff1aSopenharmony_ci    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    /* Interleave neighbouring rows; the _r and _l sets are produced by the
     * ILVR and ILVL macros respectively (presumably the two vector halves). */
3742cabdff1aSopenharmony_ci    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
3743cabdff1aSopenharmony_ci               src32_r, src43_r);
3744cabdff1aSopenharmony_ci    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
3745cabdff1aSopenharmony_ci               src32_l, src43_l);
3746cabdff1aSopenharmony_ci
3747cabdff1aSopenharmony_ci    for (loop_cnt = 4; loop_cnt--;) {
3748cabdff1aSopenharmony_ci        LD_SB4(src, stride, src5, src6, src7, src8);
3749cabdff1aSopenharmony_ci        src += (4 * stride);
3750cabdff1aSopenharmony_ci
3751cabdff1aSopenharmony_ci        XORI_B4_128_SB(src5, src6, src7, src8);
3752cabdff1aSopenharmony_ci        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
3753cabdff1aSopenharmony_ci                   src65_r, src76_r, src87_r);
3754cabdff1aSopenharmony_ci        ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
3755cabdff1aSopenharmony_ci                   src65_l, src76_l, src87_l);
        /* Four output rows, each from three consecutive row pairs. */
3756cabdff1aSopenharmony_ci        out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
3757cabdff1aSopenharmony_ci        out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
3758cabdff1aSopenharmony_ci        out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
3759cabdff1aSopenharmony_ci        out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
3760cabdff1aSopenharmony_ci        out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
3761cabdff1aSopenharmony_ci        out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
3762cabdff1aSopenharmony_ci        out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
3763cabdff1aSopenharmony_ci        out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
3764cabdff1aSopenharmony_ci        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
3765cabdff1aSopenharmony_ci        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3766cabdff1aSopenharmony_ci        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
3767cabdff1aSopenharmony_ci        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
3768cabdff1aSopenharmony_ci        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
3769cabdff1aSopenharmony_ci                    out3_r, res0, res1, res2, res3);
        /* First blend: signed average with source rows src3..src6 (one row
         * below each output row), still XORed with 128 like the filter
         * output. */
3770cabdff1aSopenharmony_ci        res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src3);
3771cabdff1aSopenharmony_ci        res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src4);
3772cabdff1aSopenharmony_ci        res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src5);
3773cabdff1aSopenharmony_ci        res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src6);
3774cabdff1aSopenharmony_ci        LD_UB4(dst, stride, dst0, dst1, dst2, dst3);
        /* Undo the 128 XOR, then do the second blend against dst. */
3775cabdff1aSopenharmony_ci        XORI_B4_128_UB(res0, res1, res2, res3);
3776cabdff1aSopenharmony_ci        AVER_UB2_UB(res0, dst0, res1, dst1, dst0, dst1);
3777cabdff1aSopenharmony_ci        AVER_UB2_UB(res2, dst2, res3, dst3, dst2, dst3);
3778cabdff1aSopenharmony_ci        ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
3779cabdff1aSopenharmony_ci        dst += (4 * stride);
3780cabdff1aSopenharmony_ci
        /* Slide the window down four rows: the newest row pairs become the
         * oldest ones for the next pass. */
3781cabdff1aSopenharmony_ci        src10_r = src54_r;
3782cabdff1aSopenharmony_ci        src32_r = src76_r;
3783cabdff1aSopenharmony_ci        src21_r = src65_r;
3784cabdff1aSopenharmony_ci        src43_r = src87_r;
3785cabdff1aSopenharmony_ci        src10_l = src54_l;
3786cabdff1aSopenharmony_ci        src32_l = src76_l;
3787cabdff1aSopenharmony_ci        src21_l = src65_l;
3788cabdff1aSopenharmony_ci        src43_l = src87_l;
        /* Only src3 and src4 need carrying over for the first blend; src5
         * and src6 are overwritten by the next load. */
3789cabdff1aSopenharmony_ci        src3 = src7;
3790cabdff1aSopenharmony_ci        src4 = src8;
3791cabdff1aSopenharmony_ci    }
3792cabdff1aSopenharmony_ci}
3793cabdff1aSopenharmony_ci
/*
 * Averaging ("avg") luma filter for an 8-row, 8-byte-wide block that
 * filters down the columns and additionally blends with the source row at
 * the output position.
 *
 * src is moved back by two rows; src0..src4 are the first five rows and
 * src7..src14 the following eight (there are no src5/src6 variables).
 * Eight output rows are computed without a loop, averaged first with the
 * source rows src2, src3, src4, src7, src8, src9, src10, src11 (packed two
 * rows per vector in tmp0..tmp3) and then with the rows already in dst, and
 * finally stored as 8 bytes per row.
 *
 * The three 16-bit constants contain, low byte first, the byte pairs
 * (1, -5), (20, 20) and (-5, 1), i.e. a (1, -5, 20, 20, -5, 1) column
 * kernel, followed by a rounding shift of 5.
 * NOTE(review): helper macro semantics are defined outside this chunk and
 * are assumed from names and usage.
 *
 * dst:    block that is both read (for the averaging) and overwritten
 * src:    source pixels; rows from two above src onwards are read
 * stride: row pitch in bytes, used for both src and dst
 */
3794cabdff1aSopenharmony_civoid ff_avg_h264_qpel8_mc01_msa(uint8_t *dst, const uint8_t *src,
3795cabdff1aSopenharmony_ci                                ptrdiff_t stride)
3796cabdff1aSopenharmony_ci{
3797cabdff1aSopenharmony_ci    uint64_t tp0, tp1, tp2, tp3;
3798cabdff1aSopenharmony_ci    const int16_t filt_const0 = 0xfb01;
3799cabdff1aSopenharmony_ci    const int16_t filt_const1 = 0x1414;
3800cabdff1aSopenharmony_ci    const int16_t filt_const2 = 0x1fb;
3801cabdff1aSopenharmony_ci    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
3802cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10, src11, src12;
3803cabdff1aSopenharmony_ci    v16i8 src13, src14, tmp0, tmp1, tmp2, tmp3, src109_r;
3804cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
3805cabdff1aSopenharmony_ci    v16i8 filt0, filt1, filt2, out0, out1, out2, out3;
3806cabdff1aSopenharmony_ci    v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
3807cabdff1aSopenharmony_ci
3808cabdff1aSopenharmony_ci    filt0 = (v16i8) __msa_fill_h(filt_const0);
3809cabdff1aSopenharmony_ci    filt1 = (v16i8) __msa_fill_h(filt_const1);
3810cabdff1aSopenharmony_ci    filt2 = (v16i8) __msa_fill_h(filt_const2);
3811cabdff1aSopenharmony_ci
    /* Start two rows above the block so the kernel has its upper context. */
3812cabdff1aSopenharmony_ci    src -= (stride * 2);
3813cabdff1aSopenharmony_ci
3814cabdff1aSopenharmony_ci    LD_SB5(src, stride, src0, src1, src2, src3, src4);
3815cabdff1aSopenharmony_ci    src += (5 * stride);
3816cabdff1aSopenharmony_ci
3817cabdff1aSopenharmony_ci    XORI_B5_128_SB(src0, src1, src2, src3, src4);
3818cabdff1aSopenharmony_ci    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
3819cabdff1aSopenharmony_ci               src32_r, src43_r);
3820cabdff1aSopenharmony_ci    LD_SB8(src, stride, src7, src8, src9, src10, src11, src12, src13, src14);
3821cabdff1aSopenharmony_ci    XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);
    /* src7 directly follows src4, so src76_r is the pair (src4, src7). */
3822cabdff1aSopenharmony_ci    ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r,
3823cabdff1aSopenharmony_ci               src87_r, src98_r, src109_r);
    /* Output rows 0-3. */
3824cabdff1aSopenharmony_ci    out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
3825cabdff1aSopenharmony_ci    out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
3826cabdff1aSopenharmony_ci    out2_r = AVC_DOT_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
3827cabdff1aSopenharmony_ci    out3_r = AVC_DOT_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
    /* Source rows for the first blend of output rows 0-3. */
3828cabdff1aSopenharmony_ci    PCKEV_D2_SB(src3, src2, src7, src4, tmp0, tmp1);
    /* src10_r..src43_r are reused from here on for the pairs built from
     * src10..src14; their names no longer describe their contents. */
3829cabdff1aSopenharmony_ci    ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13, src10_r,
3830cabdff1aSopenharmony_ci               src21_r, src32_r, src43_r);
    /* Output rows 4-7. */
3831cabdff1aSopenharmony_ci    out4_r = AVC_DOT_SH3_SH(src76_r, src98_r, src10_r, filt0, filt1, filt2);
3832cabdff1aSopenharmony_ci    out5_r = AVC_DOT_SH3_SH(src87_r, src109_r, src21_r, filt0, filt1, filt2);
3833cabdff1aSopenharmony_ci    out6_r = AVC_DOT_SH3_SH(src98_r, src10_r, src32_r, filt0, filt1, filt2);
3834cabdff1aSopenharmony_ci    out7_r = AVC_DOT_SH3_SH(src109_r, src21_r, src43_r, filt0, filt1, filt2);
    /* Source rows for the first blend of output rows 4-7. */
3835cabdff1aSopenharmony_ci    PCKEV_D2_SB(src9, src8, src11, src10, tmp2, tmp3);
3836cabdff1aSopenharmony_ci    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
3837cabdff1aSopenharmony_ci    SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5);
3838cabdff1aSopenharmony_ci    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3839cabdff1aSopenharmony_ci    SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
3840cabdff1aSopenharmony_ci
    /* Fetch the eight existing dst rows as 64-bit values, two per vector. */
3841cabdff1aSopenharmony_ci    LD4(dst, stride, tp0, tp1, tp2, tp3);
3842cabdff1aSopenharmony_ci    INSERT_D2_UB(tp0, tp1, dst0);
3843cabdff1aSopenharmony_ci    INSERT_D2_UB(tp2, tp3, dst1);
3844cabdff1aSopenharmony_ci    LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
3845cabdff1aSopenharmony_ci    INSERT_D2_UB(tp0, tp1, dst2);
3846cabdff1aSopenharmony_ci    INSERT_D2_UB(tp2, tp3, dst3);
3847cabdff1aSopenharmony_ci
3848cabdff1aSopenharmony_ci    PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, out0, out1);
3849cabdff1aSopenharmony_ci    PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3);
    /* First blend in the signed (XOR 128) domain with the source rows. */
3850cabdff1aSopenharmony_ci    out0 = __msa_aver_s_b(out0, tmp0);
3851cabdff1aSopenharmony_ci    out1 = __msa_aver_s_b(out1, tmp1);
3852cabdff1aSopenharmony_ci    out2 = __msa_aver_s_b(out2, tmp2);
3853cabdff1aSopenharmony_ci    out3 = __msa_aver_s_b(out3, tmp3);
    /* Undo the 128 XOR, then do the second blend against dst and store. */
3854cabdff1aSopenharmony_ci    XORI_B4_128_SB(out0, out1, out2, out3);
3855cabdff1aSopenharmony_ci    AVER_UB4_UB(out0, dst0, out1, dst1, out2, dst2, out3, dst3, dst0, dst1,
3856cabdff1aSopenharmony_ci                dst2, dst3);
3857cabdff1aSopenharmony_ci    ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
3858cabdff1aSopenharmony_ci}
3859cabdff1aSopenharmony_ci
/*
 * Averaging ("avg") luma filter for an 8-row, 8-byte-wide block that
 * filters down the columns and additionally blends with the source row one
 * below the output position.
 *
 * src is moved back by two rows; src0..src4 are the first five rows and
 * src7..src14 the following eight (there are no src5/src6 variables).
 * Eight output rows are computed without a loop, averaged first with the
 * source rows src3, src4, src7, src8, src9, src10, src11, src12 (packed two
 * rows per vector in tmp0..tmp3) and then with the rows already in dst, and
 * finally stored as 8 bytes per row.
 *
 * The three 16-bit constants contain, low byte first, the byte pairs
 * (1, -5), (20, 20) and (-5, 1), i.e. a (1, -5, 20, 20, -5, 1) column
 * kernel, followed by a rounding shift of 5.
 * NOTE(review): helper macro semantics are defined outside this chunk and
 * are assumed from names and usage.
 *
 * dst:    block that is both read (for the averaging) and overwritten
 * src:    source pixels; rows from two above src onwards are read
 * stride: row pitch in bytes, used for both src and dst
 */
3860cabdff1aSopenharmony_civoid ff_avg_h264_qpel8_mc03_msa(uint8_t *dst, const uint8_t *src,
3861cabdff1aSopenharmony_ci                                ptrdiff_t stride)
3862cabdff1aSopenharmony_ci{
3863cabdff1aSopenharmony_ci    uint64_t tp0, tp1, tp2, tp3;
3864cabdff1aSopenharmony_ci    const int16_t filt_const0 = 0xfb01;
3865cabdff1aSopenharmony_ci    const int16_t filt_const1 = 0x1414;
3866cabdff1aSopenharmony_ci    const int16_t filt_const2 = 0x1fb;
3867cabdff1aSopenharmony_ci    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
3868cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10, src11, src12;
3869cabdff1aSopenharmony_ci    v16i8 src13, src14, tmp0, tmp1, tmp2, tmp3, src109_r;
3870cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
3871cabdff1aSopenharmony_ci    v16i8 filt0, filt1, filt2, out0, out1, out2, out3;
3872cabdff1aSopenharmony_ci    v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
3873cabdff1aSopenharmony_ci
3874cabdff1aSopenharmony_ci    filt0 = (v16i8) __msa_fill_h(filt_const0);
3875cabdff1aSopenharmony_ci    filt1 = (v16i8) __msa_fill_h(filt_const1);
3876cabdff1aSopenharmony_ci    filt2 = (v16i8) __msa_fill_h(filt_const2);
3877cabdff1aSopenharmony_ci
    /* Start two rows above the block so the kernel has its upper context. */
3878cabdff1aSopenharmony_ci    src -= (stride * 2);
3879cabdff1aSopenharmony_ci
3880cabdff1aSopenharmony_ci    LD_SB5(src, stride, src0, src1, src2, src3, src4);
3881cabdff1aSopenharmony_ci    src += (5 * stride);
3882cabdff1aSopenharmony_ci
3883cabdff1aSopenharmony_ci    XORI_B5_128_SB(src0, src1, src2, src3, src4);
3884cabdff1aSopenharmony_ci    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
3885cabdff1aSopenharmony_ci               src32_r, src43_r);
3886cabdff1aSopenharmony_ci    LD_SB8(src, stride, src7, src8, src9, src10, src11, src12, src13, src14);
3887cabdff1aSopenharmony_ci    XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);
    /* src7 directly follows src4, so src76_r is the pair (src4, src7). */
3888cabdff1aSopenharmony_ci    ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r,
3889cabdff1aSopenharmony_ci               src87_r, src98_r, src109_r);
    /* Output rows 0-3. */
3890cabdff1aSopenharmony_ci    out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
3891cabdff1aSopenharmony_ci    out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
3892cabdff1aSopenharmony_ci    out2_r = AVC_DOT_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
3893cabdff1aSopenharmony_ci    out3_r = AVC_DOT_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
    /* Source rows (one below each output row) for the first blend of output
     * rows 0-3. */
3894cabdff1aSopenharmony_ci    PCKEV_D2_SB(src4, src3, src8, src7, tmp0, tmp1);
    /* src10_r..src43_r are reused from here on for the pairs built from
     * src10..src14; their names no longer describe their contents. */
3895cabdff1aSopenharmony_ci    ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13, src10_r,
3896cabdff1aSopenharmony_ci               src21_r, src32_r, src43_r);
    /* Output rows 4-7. */
3897cabdff1aSopenharmony_ci    out4_r = AVC_DOT_SH3_SH(src76_r, src98_r, src10_r, filt0, filt1, filt2);
3898cabdff1aSopenharmony_ci    out5_r = AVC_DOT_SH3_SH(src87_r, src109_r, src21_r, filt0, filt1, filt2);
3899cabdff1aSopenharmony_ci    out6_r = AVC_DOT_SH3_SH(src98_r, src10_r, src32_r, filt0, filt1, filt2);
3900cabdff1aSopenharmony_ci    out7_r = AVC_DOT_SH3_SH(src109_r, src21_r, src43_r, filt0, filt1, filt2);
    /* Source rows for the first blend of output rows 4-7. */
3901cabdff1aSopenharmony_ci    PCKEV_D2_SB(src10, src9, src12, src11, tmp2, tmp3);
3902cabdff1aSopenharmony_ci    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
3903cabdff1aSopenharmony_ci    SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5);
3904cabdff1aSopenharmony_ci    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3905cabdff1aSopenharmony_ci    SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
3906cabdff1aSopenharmony_ci
    /* Fetch the eight existing dst rows as 64-bit values, two per vector. */
3907cabdff1aSopenharmony_ci    LD4(dst, stride, tp0, tp1, tp2, tp3);
3908cabdff1aSopenharmony_ci    INSERT_D2_UB(tp0, tp1, dst0);
3909cabdff1aSopenharmony_ci    INSERT_D2_UB(tp2, tp3, dst1);
3910cabdff1aSopenharmony_ci    LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
3911cabdff1aSopenharmony_ci    INSERT_D2_UB(tp0, tp1, dst2);
3912cabdff1aSopenharmony_ci    INSERT_D2_UB(tp2, tp3, dst3);
3913cabdff1aSopenharmony_ci
3914cabdff1aSopenharmony_ci    PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, out0, out1);
3915cabdff1aSopenharmony_ci    PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3);
    /* First blend in the signed (XOR 128) domain with the source rows. */
3916cabdff1aSopenharmony_ci    out0 = __msa_aver_s_b(out0, tmp0);
3917cabdff1aSopenharmony_ci    out1 = __msa_aver_s_b(out1, tmp1);
3918cabdff1aSopenharmony_ci    out2 = __msa_aver_s_b(out2, tmp2);
3919cabdff1aSopenharmony_ci    out3 = __msa_aver_s_b(out3, tmp3);
    /* Undo the 128 XOR, then do the second blend against dst and store. */
3920cabdff1aSopenharmony_ci    XORI_B4_128_SB(out0, out1, out2, out3);
3921cabdff1aSopenharmony_ci    AVER_UB4_UB(out0, dst0, out1, dst1, out2, dst2, out3, dst3, dst0, dst1,
3922cabdff1aSopenharmony_ci                dst2, dst3);
3923cabdff1aSopenharmony_ci    ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
3924cabdff1aSopenharmony_ci}
3925cabdff1aSopenharmony_ci
/*
 * Averaging ("avg") luma filter for a 4-row, 4-byte-wide block that filters
 * down the columns and additionally blends with the source row at the
 * output position.
 *
 * src is moved back by two rows and nine rows (src0..src8) are loaded.
 * Neighbouring rows are interleaved and two interleaved pairs are packed
 * into each vector (src2110, src4332, src6554, src8776), so two output rows
 * are produced per AVC_DOT_SH3_SH call. Unlike the wider variants, the raw
 * rows are not XORed with 128; only the packed pair vectors are. The first
 * blend therefore uses the unsigned average with the untouched words of
 * src2..src5, after PCKEV_XORI128_UB has produced the filtered bytes. The
 * second blend is against the four 32-bit words read from dst.
 *
 * The three 16-bit constants contain, low byte first, the byte pairs
 * (1, -5), (20, 20) and (-5, 1), i.e. a (1, -5, 20, 20, -5, 1) column
 * kernel, followed by a rounding shift of 5.
 * NOTE(review): helper macro semantics are defined outside this chunk and
 * are assumed from names and usage.
 *
 * dst:    block that is both read (for the averaging) and overwritten
 * src:    source pixels; rows from two above src onwards are read
 * stride: row pitch in bytes, used for both src and dst
 */
3926cabdff1aSopenharmony_civoid ff_avg_h264_qpel4_mc01_msa(uint8_t *dst, const uint8_t *src,
3927cabdff1aSopenharmony_ci                                ptrdiff_t stride)
3928cabdff1aSopenharmony_ci{
3929cabdff1aSopenharmony_ci    uint32_t tp0, tp1, tp2, tp3;
3930cabdff1aSopenharmony_ci    int16_t filt_const0 = 0xfb01;
3931cabdff1aSopenharmony_ci    int16_t filt_const1 = 0x1414;
3932cabdff1aSopenharmony_ci    int16_t filt_const2 = 0x1fb;
3933cabdff1aSopenharmony_ci    v16u8 res, dst0 = { 0 };
3934cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3935cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
3936cabdff1aSopenharmony_ci    v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
3937cabdff1aSopenharmony_ci    v8i16 out10, out32;
3938cabdff1aSopenharmony_ci
3939cabdff1aSopenharmony_ci    filt0 = (v16i8) __msa_fill_h(filt_const0);
3940cabdff1aSopenharmony_ci    filt1 = (v16i8) __msa_fill_h(filt_const1);
3941cabdff1aSopenharmony_ci    filt2 = (v16i8) __msa_fill_h(filt_const2);
3942cabdff1aSopenharmony_ci
    /* Start two rows above the block so the kernel has its upper context. */
3943cabdff1aSopenharmony_ci    src -= (stride * 2);
3944cabdff1aSopenharmony_ci    LD_SB5(src, stride, src0, src1, src2, src3, src4);
3945cabdff1aSopenharmony_ci    src += (5 * stride);
3946cabdff1aSopenharmony_ci
3947cabdff1aSopenharmony_ci    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
3948cabdff1aSopenharmony_ci               src32_r, src43_r);
    /* Pack two interleaved row pairs per vector, then XOR with 128. */
3949cabdff1aSopenharmony_ci    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
3950cabdff1aSopenharmony_ci    XORI_B2_128_SB(src2110, src4332);
3951cabdff1aSopenharmony_ci    LD_SB4(src, stride, src5, src6, src7, src8);
3952cabdff1aSopenharmony_ci    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
3953cabdff1aSopenharmony_ci               src76_r, src87_r);
3954cabdff1aSopenharmony_ci    ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
3955cabdff1aSopenharmony_ci    XORI_B2_128_SB(src6554, src8776);
    /* src32_r/src54_r are no longer needed as interleaves and are reused to
     * gather the first word of src2, src3, src4 and src5 into one vector
     * for the first blend. */
3956cabdff1aSopenharmony_ci    src32_r = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
3957cabdff1aSopenharmony_ci    src54_r = (v16i8) __msa_insve_w((v4i32) src4, 1, (v4i32) src5);
3958cabdff1aSopenharmony_ci    src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
    /* out10 holds output rows 0-1, out32 output rows 2-3. */
3959cabdff1aSopenharmony_ci    out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
3960cabdff1aSopenharmony_ci    out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
3961cabdff1aSopenharmony_ci    SRARI_H2_SH(out10, out32, 5);
3962cabdff1aSopenharmony_ci    SAT_SH2_SH(out10, out32, 7);
3963cabdff1aSopenharmony_ci    LW4(dst, stride, tp0, tp1, tp2, tp3);
3964cabdff1aSopenharmony_ci    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
3965cabdff1aSopenharmony_ci    res = PCKEV_XORI128_UB(out10, out32);
    /* First blend with the source rows, second blend with dst. */
3966cabdff1aSopenharmony_ci    res = __msa_aver_u_b(res, (v16u8) src32_r);
3967cabdff1aSopenharmony_ci    dst0 = __msa_aver_u_b(res, dst0);
3968cabdff1aSopenharmony_ci    ST_W4(dst0, 0, 1, 2, 3, dst, stride);
3969cabdff1aSopenharmony_ci}
3970cabdff1aSopenharmony_ci
3971cabdff1aSopenharmony_civoid ff_avg_h264_qpel4_mc03_msa(uint8_t *dst, const uint8_t *src,
3972cabdff1aSopenharmony_ci                                ptrdiff_t stride)
3973cabdff1aSopenharmony_ci{
3974cabdff1aSopenharmony_ci    uint32_t tp0, tp1, tp2, tp3;
3975cabdff1aSopenharmony_ci    int16_t filt_const0 = 0xfb01;
3976cabdff1aSopenharmony_ci    int16_t filt_const1 = 0x1414;
3977cabdff1aSopenharmony_ci    int16_t filt_const2 = 0x1fb;
3978cabdff1aSopenharmony_ci    v16u8 res, dst0 = { 0 };
3979cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3980cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
3981cabdff1aSopenharmony_ci    v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
3982cabdff1aSopenharmony_ci    v8i16 out10, out32;
3983cabdff1aSopenharmony_ci
3984cabdff1aSopenharmony_ci    filt0 = (v16i8) __msa_fill_h(filt_const0);
3985cabdff1aSopenharmony_ci    filt1 = (v16i8) __msa_fill_h(filt_const1);
3986cabdff1aSopenharmony_ci    filt2 = (v16i8) __msa_fill_h(filt_const2);
3987cabdff1aSopenharmony_ci
3988cabdff1aSopenharmony_ci    src -= (stride * 2);
3989cabdff1aSopenharmony_ci
3990cabdff1aSopenharmony_ci    LD_SB5(src, stride, src0, src1, src2, src3, src4);
3991cabdff1aSopenharmony_ci    src += (5 * stride);
3992cabdff1aSopenharmony_ci
3993cabdff1aSopenharmony_ci    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
3994cabdff1aSopenharmony_ci               src32_r, src43_r);
3995cabdff1aSopenharmony_ci    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
3996cabdff1aSopenharmony_ci    XORI_B2_128_SB(src2110, src4332);
3997cabdff1aSopenharmony_ci    LD_SB4(src, stride, src5, src6, src7, src8);
3998cabdff1aSopenharmony_ci    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
3999cabdff1aSopenharmony_ci               src76_r, src87_r);
4000cabdff1aSopenharmony_ci    ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
4001cabdff1aSopenharmony_ci    XORI_B2_128_SB(src6554, src8776);
4002cabdff1aSopenharmony_ci    out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
4003cabdff1aSopenharmony_ci    out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
4004cabdff1aSopenharmony_ci    SRARI_H2_SH(out10, out32, 5);
4005cabdff1aSopenharmony_ci    SAT_SH2_SH(out10, out32, 7);
4006cabdff1aSopenharmony_ci    LW4(dst, stride, tp0, tp1, tp2, tp3);
4007cabdff1aSopenharmony_ci    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
4008cabdff1aSopenharmony_ci    res = PCKEV_XORI128_UB(out10, out32);
4009cabdff1aSopenharmony_ci    src32_r = (v16i8) __msa_insve_w((v4i32) src3, 1, (v4i32) src4);
4010cabdff1aSopenharmony_ci    src54_r = (v16i8) __msa_insve_w((v4i32) src5, 1, (v4i32) src6);
4011cabdff1aSopenharmony_ci    src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
4012cabdff1aSopenharmony_ci    res = __msa_aver_u_b(res, (v16u8) src32_r);
4013cabdff1aSopenharmony_ci    dst0 = __msa_aver_u_b(res, dst0);
4014cabdff1aSopenharmony_ci    ST_W4(dst0, 0, 1, 2, 3, dst, stride);
4015cabdff1aSopenharmony_ci}
4016cabdff1aSopenharmony_ci
/*
 * Averaging 16x16 luma quarter-pel MC, position "mc11".
 *
 * Forwards to avc_luma_hv_qrt_and_aver_dst_16x16_msa() with src moved two
 * bytes to the left as the first source pointer and src moved two rows up as
 * the second.
 * NOTE(review): the helper is defined outside this chunk; the first pointer
 * looks like the horizontal-filter input and the second the vertical-filter
 * input -- confirm against the helper.
 */
void ff_avg_h264_qpel16_mc11_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    const uint8_t *first_src  = src - 2;
    const uint8_t *second_src = src - (stride * 2);

    avc_luma_hv_qrt_and_aver_dst_16x16_msa(first_src, second_src, dst, stride);
}
4024cabdff1aSopenharmony_ci
/*
 * Averaging 16x16 luma quarter-pel MC, position "mc31".
 *
 * Forwards to avc_luma_hv_qrt_and_aver_dst_16x16_msa() with src moved two
 * bytes to the left as the first source pointer, and src moved two rows up
 * and one byte to the right as the second.
 * NOTE(review): the helper is defined outside this chunk; the first pointer
 * looks like the horizontal-filter input and the second the vertical-filter
 * input -- confirm against the helper.
 */
void ff_avg_h264_qpel16_mc31_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    const uint8_t *first_src  = src - 2;
    const uint8_t *second_src = src - (stride * 2) + sizeof(uint8_t);

    avc_luma_hv_qrt_and_aver_dst_16x16_msa(first_src, second_src, dst, stride);
}
4033cabdff1aSopenharmony_ci
/*
 * Averaging 16x16 luma quarter-pel MC, position "mc13".
 *
 * Forwards to avc_luma_hv_qrt_and_aver_dst_16x16_msa() with src moved one
 * row down and two bytes to the left as the first source pointer, and src
 * moved two rows up as the second.
 * NOTE(review): the helper is defined outside this chunk; the first pointer
 * looks like the horizontal-filter input and the second the vertical-filter
 * input -- confirm against the helper.
 */
void ff_avg_h264_qpel16_mc13_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    const uint8_t *first_src  = src + stride - 2;
    const uint8_t *second_src = src - (stride * 2);

    avc_luma_hv_qrt_and_aver_dst_16x16_msa(first_src, second_src, dst, stride);
}
4041cabdff1aSopenharmony_ci
/*
 * Averaging 16x16 luma quarter-pel MC, position "mc33".
 *
 * Forwards to avc_luma_hv_qrt_and_aver_dst_16x16_msa() with src moved one
 * row down and two bytes to the left as the first source pointer, and src
 * moved two rows up and one byte to the right as the second.
 * NOTE(review): the helper is defined outside this chunk; the first pointer
 * looks like the horizontal-filter input and the second the vertical-filter
 * input -- confirm against the helper.
 */
void ff_avg_h264_qpel16_mc33_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    const uint8_t *first_src  = src + stride - 2;
    const uint8_t *second_src = src - (stride * 2) + sizeof(uint8_t);

    avc_luma_hv_qrt_and_aver_dst_16x16_msa(first_src, second_src, dst, stride);
}
4050cabdff1aSopenharmony_ci
/*
 * Averaging 8x8 luma quarter-pel MC, position "mc11".
 *
 * Forwards to avc_luma_hv_qrt_and_aver_dst_8x8_msa() with src moved two
 * bytes to the left as the first source pointer and src moved two rows up as
 * the second.
 * NOTE(review): the helper is defined outside this chunk; the first pointer
 * looks like the horizontal-filter input and the second the vertical-filter
 * input -- confirm against the helper.
 */
void ff_avg_h264_qpel8_mc11_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const uint8_t *first_src  = src - 2;
    const uint8_t *second_src = src - (stride * 2);

    avc_luma_hv_qrt_and_aver_dst_8x8_msa(first_src, second_src, dst, stride);
}
4058cabdff1aSopenharmony_ci
/*
 * Averaging 8x8 luma quarter-pel MC, position "mc31".
 *
 * Forwards to avc_luma_hv_qrt_and_aver_dst_8x8_msa() with src moved two
 * bytes to the left as the first source pointer, and src moved two rows up
 * and one byte to the right as the second.
 * NOTE(review): the helper is defined outside this chunk; the first pointer
 * looks like the horizontal-filter input and the second the vertical-filter
 * input -- confirm against the helper.
 */
void ff_avg_h264_qpel8_mc31_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const uint8_t *first_src  = src - 2;
    const uint8_t *second_src = src - (stride * 2) + sizeof(uint8_t);

    avc_luma_hv_qrt_and_aver_dst_8x8_msa(first_src, second_src, dst, stride);
}
4066cabdff1aSopenharmony_ci
/*
 * Averaging 8x8 luma quarter-pel MC, position "mc13".
 *
 * Forwards to avc_luma_hv_qrt_and_aver_dst_8x8_msa() with src moved one row
 * down and two bytes to the left as the first source pointer, and src moved
 * two rows up as the second.
 * NOTE(review): the helper is defined outside this chunk; the first pointer
 * looks like the horizontal-filter input and the second the vertical-filter
 * input -- confirm against the helper.
 */
void ff_avg_h264_qpel8_mc13_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const uint8_t *first_src  = src + stride - 2;
    const uint8_t *second_src = src - (stride * 2);

    avc_luma_hv_qrt_and_aver_dst_8x8_msa(first_src, second_src, dst, stride);
}
4074cabdff1aSopenharmony_ci
/*
 * Averaging 8x8 luma quarter-pel MC, position "mc33".
 *
 * Forwards to avc_luma_hv_qrt_and_aver_dst_8x8_msa() with src moved one row
 * down and two bytes to the left as the first source pointer, and src moved
 * two rows up and one byte to the right as the second.
 * NOTE(review): the helper is defined outside this chunk; the first pointer
 * looks like the horizontal-filter input and the second the vertical-filter
 * input -- confirm against the helper.
 */
void ff_avg_h264_qpel8_mc33_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const uint8_t *first_src  = src + stride - 2;
    const uint8_t *second_src = src - (stride * 2) + sizeof(uint8_t);

    avc_luma_hv_qrt_and_aver_dst_8x8_msa(first_src, second_src, dst, stride);
}
4082cabdff1aSopenharmony_ci
4083cabdff1aSopenharmony_ci
/*
 * Averaging 4x4 luma quarter-pel MC, position "mc11".
 *
 * Forwards to avc_luma_hv_qrt_and_aver_dst_4x4_msa() with src moved two
 * bytes to the left as the first source pointer and src moved two rows up as
 * the second.
 * NOTE(review): the helper is defined outside this chunk; the first pointer
 * looks like the horizontal-filter input and the second the vertical-filter
 * input -- confirm against the helper.
 */
void ff_avg_h264_qpel4_mc11_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const uint8_t *first_src  = src - 2;
    const uint8_t *second_src = src - (stride * 2);

    avc_luma_hv_qrt_and_aver_dst_4x4_msa(first_src, second_src, dst, stride);
}
4091cabdff1aSopenharmony_ci
/*
 * Averaging 4x4 luma quarter-pel MC, position "mc31".
 *
 * Forwards to avc_luma_hv_qrt_and_aver_dst_4x4_msa() with src moved two
 * bytes to the left as the first source pointer, and src moved two rows up
 * and one byte to the right as the second.
 * NOTE(review): the helper is defined outside this chunk; the first pointer
 * looks like the horizontal-filter input and the second the vertical-filter
 * input -- confirm against the helper.
 */
void ff_avg_h264_qpel4_mc31_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const uint8_t *first_src  = src - 2;
    const uint8_t *second_src = src - (stride * 2) + sizeof(uint8_t);

    avc_luma_hv_qrt_and_aver_dst_4x4_msa(first_src, second_src, dst, stride);
}
4099cabdff1aSopenharmony_ci
/*
 * Averaging 4x4 luma quarter-pel MC, position "mc13".
 *
 * Forwards to avc_luma_hv_qrt_and_aver_dst_4x4_msa() with src moved one row
 * down and two bytes to the left as the first source pointer, and src moved
 * two rows up as the second.
 * NOTE(review): the helper is defined outside this chunk; the first pointer
 * looks like the horizontal-filter input and the second the vertical-filter
 * input -- confirm against the helper.
 */
void ff_avg_h264_qpel4_mc13_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const uint8_t *first_src  = src + stride - 2;
    const uint8_t *second_src = src - (stride * 2);

    avc_luma_hv_qrt_and_aver_dst_4x4_msa(first_src, second_src, dst, stride);
}
4107cabdff1aSopenharmony_ci
/*
 * Averaging 4x4 luma quarter-pel MC, position "mc33".
 *
 * Forwards to avc_luma_hv_qrt_and_aver_dst_4x4_msa() with src moved one row
 * down and two bytes to the left as the first source pointer, and src moved
 * two rows up and one byte to the right as the second.
 * NOTE(review): the helper is defined outside this chunk; the first pointer
 * looks like the horizontal-filter input and the second the vertical-filter
 * input -- confirm against the helper.
 */
void ff_avg_h264_qpel4_mc33_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const uint8_t *first_src  = src + stride - 2;
    const uint8_t *second_src = src - (stride * 2) + sizeof(uint8_t);

    avc_luma_hv_qrt_and_aver_dst_4x4_msa(first_src, second_src, dst, stride);
}
4115cabdff1aSopenharmony_ci
4116cabdff1aSopenharmony_civoid ff_avg_h264_qpel16_mc21_msa(uint8_t *dst, const uint8_t *src,
4117cabdff1aSopenharmony_ci                                 ptrdiff_t stride)
4118cabdff1aSopenharmony_ci{
4119cabdff1aSopenharmony_ci    uint64_t tp0, tp1, tp2, tp3;
4120cabdff1aSopenharmony_ci    uint8_t *dst_tmp = dst;
4121cabdff1aSopenharmony_ci    const uint8_t *src_tmp = src - (2 * stride) - 2;
4122cabdff1aSopenharmony_ci    uint32_t multiple8_cnt, loop_cnt;
4123cabdff1aSopenharmony_ci    const int32_t filt_const0 = 0xfffb0001;
4124cabdff1aSopenharmony_ci    const int32_t filt_const1 = 0x140014;
4125cabdff1aSopenharmony_ci    const int32_t filt_const2 = 0x1fffb;
4126cabdff1aSopenharmony_ci    v16u8 out0, out1, dst0 = { 0 }, dst1 = { 0 };
4127cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask0, mask1;
4128cabdff1aSopenharmony_ci    v16i8 mask2;
4129cabdff1aSopenharmony_ci    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
4130cabdff1aSopenharmony_ci    v8i16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
4131cabdff1aSopenharmony_ci    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
4132cabdff1aSopenharmony_ci    v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
4133cabdff1aSopenharmony_ci    v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
4134cabdff1aSopenharmony_ci    v8i16 hz_out87_l, filt0, filt1, filt2;
4135cabdff1aSopenharmony_ci    v4i32 tmp0_w, tmp1_w;
4136cabdff1aSopenharmony_ci
4137cabdff1aSopenharmony_ci    filt0 = (v8i16) __msa_fill_w(filt_const0);
4138cabdff1aSopenharmony_ci    filt1 = (v8i16) __msa_fill_w(filt_const1);
4139cabdff1aSopenharmony_ci    filt2 = (v8i16) __msa_fill_w(filt_const2);
4140cabdff1aSopenharmony_ci
4141cabdff1aSopenharmony_ci    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
4142cabdff1aSopenharmony_ci
4143cabdff1aSopenharmony_ci    for (multiple8_cnt = 2; multiple8_cnt--;) {
4144cabdff1aSopenharmony_ci        dst = dst_tmp;
4145cabdff1aSopenharmony_ci        src = src_tmp;
4146cabdff1aSopenharmony_ci
4147cabdff1aSopenharmony_ci        LD_SB5(src, stride, src0, src1, src2, src3, src4);
4148cabdff1aSopenharmony_ci        XORI_B5_128_SB(src0, src1, src2, src3, src4);
4149cabdff1aSopenharmony_ci        src += (5 * stride);
4150cabdff1aSopenharmony_ci
4151cabdff1aSopenharmony_ci        hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
4152cabdff1aSopenharmony_ci        hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
4153cabdff1aSopenharmony_ci        hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
4154cabdff1aSopenharmony_ci        hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
4155cabdff1aSopenharmony_ci        hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
4156cabdff1aSopenharmony_ci
4157cabdff1aSopenharmony_ci        for (loop_cnt = 4; loop_cnt--;) {
4158cabdff1aSopenharmony_ci            LD_SB2(src, stride, src5, src6);
4159cabdff1aSopenharmony_ci            src += (2 * stride);
4160cabdff1aSopenharmony_ci
4161cabdff1aSopenharmony_ci            XORI_B2_128_SB(src5, src6);
4162cabdff1aSopenharmony_ci            hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
4163cabdff1aSopenharmony_ci            hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
4164cabdff1aSopenharmony_ci            ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
4165cabdff1aSopenharmony_ci                       hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
4166cabdff1aSopenharmony_ci                       hz_out43_r);
4167cabdff1aSopenharmony_ci            ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
4168cabdff1aSopenharmony_ci                       hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
4169cabdff1aSopenharmony_ci                       hz_out43_l);
4170cabdff1aSopenharmony_ci            ILVR_H2_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out54_r,
4171cabdff1aSopenharmony_ci                       hz_out65_r);
4172cabdff1aSopenharmony_ci            ILVL_H2_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out54_l,
4173cabdff1aSopenharmony_ci                       hz_out65_l);
4174cabdff1aSopenharmony_ci            tmp0_w = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0,
4175cabdff1aSopenharmony_ci                                    filt1, filt2);
4176cabdff1aSopenharmony_ci            tmp1_w = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0,
4177cabdff1aSopenharmony_ci                                    filt1, filt2);
4178cabdff1aSopenharmony_ci            tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4179cabdff1aSopenharmony_ci            tmp0_w = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0,
4180cabdff1aSopenharmony_ci                                    filt1, filt2);
4181cabdff1aSopenharmony_ci            tmp1_w = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0,
4182cabdff1aSopenharmony_ci                                    filt1, filt2);
4183cabdff1aSopenharmony_ci            tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4184cabdff1aSopenharmony_ci
4185cabdff1aSopenharmony_ci            tmp1 = __msa_srari_h(hz_out2, 5);
4186cabdff1aSopenharmony_ci            tmp3 = __msa_srari_h(hz_out3, 5);
4187cabdff1aSopenharmony_ci            SAT_SH2_SH(tmp1, tmp3, 7);
4188cabdff1aSopenharmony_ci
4189cabdff1aSopenharmony_ci            tmp0 = __msa_aver_s_h(tmp0, tmp1);
4190cabdff1aSopenharmony_ci            tmp1 = __msa_aver_s_h(tmp2, tmp3);
4191cabdff1aSopenharmony_ci
4192cabdff1aSopenharmony_ci            LD2(dst, stride, tp0, tp1);
4193cabdff1aSopenharmony_ci            INSERT_D2_UB(tp0, tp1, dst0);
4194cabdff1aSopenharmony_ci
4195cabdff1aSopenharmony_ci            out0 = PCKEV_XORI128_UB(tmp0, tmp1);
4196cabdff1aSopenharmony_ci            dst0 = __msa_aver_u_b(out0, dst0);
4197cabdff1aSopenharmony_ci            ST_D2(dst0, 0, 1, dst, stride);
4198cabdff1aSopenharmony_ci            dst += (2 * stride);
4199cabdff1aSopenharmony_ci
4200cabdff1aSopenharmony_ci            LD_SB2(src, stride, src7, src8);
4201cabdff1aSopenharmony_ci            src += (2 * stride);
4202cabdff1aSopenharmony_ci
4203cabdff1aSopenharmony_ci            XORI_B2_128_SB(src7, src8);
4204cabdff1aSopenharmony_ci            hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
4205cabdff1aSopenharmony_ci            hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
4206cabdff1aSopenharmony_ci            ILVR_H2_SH(hz_out7, hz_out6, hz_out8, hz_out7, hz_out76_r,
4207cabdff1aSopenharmony_ci                       hz_out87_r);
4208cabdff1aSopenharmony_ci            ILVL_H2_SH(hz_out7, hz_out6, hz_out8, hz_out7, hz_out76_l,
4209cabdff1aSopenharmony_ci                       hz_out87_l);
4210cabdff1aSopenharmony_ci            tmp0_w = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0,
4211cabdff1aSopenharmony_ci                                    filt1, filt2);
4212cabdff1aSopenharmony_ci            tmp1_w = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0,
4213cabdff1aSopenharmony_ci                                    filt1, filt2);
4214cabdff1aSopenharmony_ci            tmp4 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4215cabdff1aSopenharmony_ci            tmp0_w = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0,
4216cabdff1aSopenharmony_ci                                    filt1, filt2);
4217cabdff1aSopenharmony_ci            tmp1_w = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0,
4218cabdff1aSopenharmony_ci                                    filt1, filt2);
4219cabdff1aSopenharmony_ci            tmp6 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4220cabdff1aSopenharmony_ci
4221cabdff1aSopenharmony_ci            tmp5 = __msa_srari_h(hz_out4, 5);
4222cabdff1aSopenharmony_ci            tmp7 = __msa_srari_h(hz_out5, 5);
4223cabdff1aSopenharmony_ci            SAT_SH2_SH(tmp5, tmp7, 7);
4224cabdff1aSopenharmony_ci
4225cabdff1aSopenharmony_ci            tmp2 = __msa_aver_s_h(tmp4, tmp5);
4226cabdff1aSopenharmony_ci            tmp3 = __msa_aver_s_h(tmp6, tmp7);
4227cabdff1aSopenharmony_ci
4228cabdff1aSopenharmony_ci            LD2(dst, stride, tp2, tp3);
4229cabdff1aSopenharmony_ci            INSERT_D2_UB(tp2, tp3, dst1);
4230cabdff1aSopenharmony_ci
4231cabdff1aSopenharmony_ci            out1 = PCKEV_XORI128_UB(tmp2, tmp3);
4232cabdff1aSopenharmony_ci            dst1 = __msa_aver_u_b(out1, dst1);
4233cabdff1aSopenharmony_ci            ST_D2(dst1, 0, 1, dst, stride);
4234cabdff1aSopenharmony_ci            dst += (2 * stride);
4235cabdff1aSopenharmony_ci
4236cabdff1aSopenharmony_ci            hz_out0 = hz_out4;
4237cabdff1aSopenharmony_ci            hz_out1 = hz_out5;
4238cabdff1aSopenharmony_ci            hz_out2 = hz_out6;
4239cabdff1aSopenharmony_ci            hz_out3 = hz_out7;
4240cabdff1aSopenharmony_ci            hz_out4 = hz_out8;
4241cabdff1aSopenharmony_ci        }
4242cabdff1aSopenharmony_ci
4243cabdff1aSopenharmony_ci        src_tmp += 8;
4244cabdff1aSopenharmony_ci        dst_tmp += 8;
4245cabdff1aSopenharmony_ci    }
4246cabdff1aSopenharmony_ci}
4247cabdff1aSopenharmony_ci
4248cabdff1aSopenharmony_civoid ff_avg_h264_qpel16_mc23_msa(uint8_t *dst, const uint8_t *src,
4249cabdff1aSopenharmony_ci                                 ptrdiff_t stride)
4250cabdff1aSopenharmony_ci{
4251cabdff1aSopenharmony_ci    uint64_t tp0, tp1, tp2, tp3;
4252cabdff1aSopenharmony_ci    uint8_t *dst_tmp = dst;
4253cabdff1aSopenharmony_ci    const uint8_t *src_tmp = src - (2 * stride) - 2;
4254cabdff1aSopenharmony_ci    uint32_t multiple8_cnt, loop_cnt;
4255cabdff1aSopenharmony_ci    const int32_t filt_const0 = 0xfffb0001;
4256cabdff1aSopenharmony_ci    const int32_t filt_const1 = 0x140014;
4257cabdff1aSopenharmony_ci    const int32_t filt_const2 = 0x1fffb;
4258cabdff1aSopenharmony_ci    v16u8 out0, out1, dst0 = { 0 }, dst1 = { 0 };
4259cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask0, mask1;
4260cabdff1aSopenharmony_ci    v16i8 mask2;
4261cabdff1aSopenharmony_ci    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
4262cabdff1aSopenharmony_ci    v8i16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
4263cabdff1aSopenharmony_ci    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
4264cabdff1aSopenharmony_ci    v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
4265cabdff1aSopenharmony_ci    v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
4266cabdff1aSopenharmony_ci    v8i16 hz_out87_l, filt0, filt1, filt2;
4267cabdff1aSopenharmony_ci    v4i32 tmp0_w, tmp1_w;
4268cabdff1aSopenharmony_ci
4269cabdff1aSopenharmony_ci    filt0 = (v8i16) __msa_fill_w(filt_const0);
4270cabdff1aSopenharmony_ci    filt1 = (v8i16) __msa_fill_w(filt_const1);
4271cabdff1aSopenharmony_ci    filt2 = (v8i16) __msa_fill_w(filt_const2);
4272cabdff1aSopenharmony_ci
4273cabdff1aSopenharmony_ci    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
4274cabdff1aSopenharmony_ci
4275cabdff1aSopenharmony_ci    for (multiple8_cnt = 2; multiple8_cnt--;) {
4276cabdff1aSopenharmony_ci        dst = dst_tmp;
4277cabdff1aSopenharmony_ci        src = src_tmp;
4278cabdff1aSopenharmony_ci
4279cabdff1aSopenharmony_ci        LD_SB5(src, stride, src0, src1, src2, src3, src4);
4280cabdff1aSopenharmony_ci        XORI_B5_128_SB(src0, src1, src2, src3, src4);
4281cabdff1aSopenharmony_ci        src += (5 * stride);
4282cabdff1aSopenharmony_ci
4283cabdff1aSopenharmony_ci        hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
4284cabdff1aSopenharmony_ci        hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
4285cabdff1aSopenharmony_ci        hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
4286cabdff1aSopenharmony_ci        hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
4287cabdff1aSopenharmony_ci        hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
4288cabdff1aSopenharmony_ci
4289cabdff1aSopenharmony_ci        for (loop_cnt = 4; loop_cnt--;) {
4290cabdff1aSopenharmony_ci            LD_SB2(src, stride, src5, src6);
4291cabdff1aSopenharmony_ci            src += (2 * stride);
4292cabdff1aSopenharmony_ci
4293cabdff1aSopenharmony_ci            XORI_B2_128_SB(src5, src6);
4294cabdff1aSopenharmony_ci            hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
4295cabdff1aSopenharmony_ci            hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
4296cabdff1aSopenharmony_ci            ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
4297cabdff1aSopenharmony_ci                       hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
4298cabdff1aSopenharmony_ci                       hz_out43_r);
4299cabdff1aSopenharmony_ci            ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
4300cabdff1aSopenharmony_ci                       hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
4301cabdff1aSopenharmony_ci                       hz_out43_l);
4302cabdff1aSopenharmony_ci            ILVR_H2_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out54_r, hz_out65_r);
4303cabdff1aSopenharmony_ci            ILVL_H2_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out54_l, hz_out65_l);
4304cabdff1aSopenharmony_ci
4305cabdff1aSopenharmony_ci            tmp0_w = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0,
4306cabdff1aSopenharmony_ci                                    filt1, filt2);
4307cabdff1aSopenharmony_ci            tmp1_w = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0,
4308cabdff1aSopenharmony_ci                                    filt1, filt2);
4309cabdff1aSopenharmony_ci            tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4310cabdff1aSopenharmony_ci            tmp0_w = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0,
4311cabdff1aSopenharmony_ci                                    filt1, filt2);
4312cabdff1aSopenharmony_ci            tmp1_w = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0,
4313cabdff1aSopenharmony_ci                                    filt1, filt2);
4314cabdff1aSopenharmony_ci            tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4315cabdff1aSopenharmony_ci
4316cabdff1aSopenharmony_ci            tmp1 = __msa_srari_h(hz_out3, 5);
4317cabdff1aSopenharmony_ci            tmp3 = __msa_srari_h(hz_out4, 5);
4318cabdff1aSopenharmony_ci            SAT_SH2_SH(tmp1, tmp3, 7);
4319cabdff1aSopenharmony_ci
4320cabdff1aSopenharmony_ci            tmp0 = __msa_aver_s_h(tmp0, tmp1);
4321cabdff1aSopenharmony_ci            tmp1 = __msa_aver_s_h(tmp2, tmp3);
4322cabdff1aSopenharmony_ci
4323cabdff1aSopenharmony_ci            LD2(dst, stride, tp0, tp1);
4324cabdff1aSopenharmony_ci            INSERT_D2_UB(tp0, tp1, dst0);
4325cabdff1aSopenharmony_ci            out0 = PCKEV_XORI128_UB(tmp0, tmp1);
4326cabdff1aSopenharmony_ci            dst0 = __msa_aver_u_b(out0, dst0);
4327cabdff1aSopenharmony_ci            ST_D2(dst0, 0, 1, dst, stride);
4328cabdff1aSopenharmony_ci            dst += (2 * stride);
4329cabdff1aSopenharmony_ci
4330cabdff1aSopenharmony_ci            LD_SB2(src, stride, src7, src8);
4331cabdff1aSopenharmony_ci            src += (2 * stride);
4332cabdff1aSopenharmony_ci
4333cabdff1aSopenharmony_ci            XORI_B2_128_SB(src7, src8);
4334cabdff1aSopenharmony_ci            hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
4335cabdff1aSopenharmony_ci            hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
4336cabdff1aSopenharmony_ci            ILVR_H2_SH(hz_out7, hz_out6, hz_out8, hz_out7, hz_out76_r,
4337cabdff1aSopenharmony_ci                       hz_out87_r);
4338cabdff1aSopenharmony_ci            ILVL_H2_SH(hz_out7, hz_out6, hz_out8, hz_out7, hz_out76_l,
4339cabdff1aSopenharmony_ci                       hz_out87_l);
4340cabdff1aSopenharmony_ci            tmp0_w = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0,
4341cabdff1aSopenharmony_ci                                    filt1, filt2);
4342cabdff1aSopenharmony_ci            tmp1_w = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0,
4343cabdff1aSopenharmony_ci                                    filt1, filt2);
4344cabdff1aSopenharmony_ci            tmp4 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4345cabdff1aSopenharmony_ci            tmp0_w = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0,
4346cabdff1aSopenharmony_ci                                    filt1, filt2);
4347cabdff1aSopenharmony_ci            tmp1_w = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0,
4348cabdff1aSopenharmony_ci                                    filt1, filt2);
4349cabdff1aSopenharmony_ci            tmp6 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4350cabdff1aSopenharmony_ci
4351cabdff1aSopenharmony_ci            tmp5 = __msa_srari_h(hz_out5, 5);
4352cabdff1aSopenharmony_ci            tmp7 = __msa_srari_h(hz_out6, 5);
4353cabdff1aSopenharmony_ci            SAT_SH2_SH(tmp5, tmp7, 7);
4354cabdff1aSopenharmony_ci
4355cabdff1aSopenharmony_ci            tmp2 = __msa_aver_s_h(tmp4, tmp5);
4356cabdff1aSopenharmony_ci            tmp3 = __msa_aver_s_h(tmp6, tmp7);
4357cabdff1aSopenharmony_ci
4358cabdff1aSopenharmony_ci            LD2(dst, stride, tp2, tp3);
4359cabdff1aSopenharmony_ci            INSERT_D2_UB(tp2, tp3, dst1);
4360cabdff1aSopenharmony_ci            out1 = PCKEV_XORI128_UB(tmp2, tmp3);
4361cabdff1aSopenharmony_ci            dst1 = __msa_aver_u_b(out1, dst1);
4362cabdff1aSopenharmony_ci            ST_D2(dst1, 0, 1, dst, stride);
4363cabdff1aSopenharmony_ci            dst += (2 * stride);
4364cabdff1aSopenharmony_ci
4365cabdff1aSopenharmony_ci            hz_out0 = hz_out4;
4366cabdff1aSopenharmony_ci            hz_out1 = hz_out5;
4367cabdff1aSopenharmony_ci            hz_out2 = hz_out6;
4368cabdff1aSopenharmony_ci            hz_out3 = hz_out7;
4369cabdff1aSopenharmony_ci            hz_out4 = hz_out8;
4370cabdff1aSopenharmony_ci        }
4371cabdff1aSopenharmony_ci
4372cabdff1aSopenharmony_ci        src_tmp += 8;
4373cabdff1aSopenharmony_ci        dst_tmp += 8;
4374cabdff1aSopenharmony_ci    }
4375cabdff1aSopenharmony_ci}
4376cabdff1aSopenharmony_ci
4377cabdff1aSopenharmony_civoid ff_avg_h264_qpel8_mc21_msa(uint8_t *dst, const uint8_t *src,
4378cabdff1aSopenharmony_ci                                ptrdiff_t stride)
4379cabdff1aSopenharmony_ci{
4380cabdff1aSopenharmony_ci    const int32_t filt_const0 = 0xfffb0001;
4381cabdff1aSopenharmony_ci    const int32_t filt_const1 = 0x140014;
4382cabdff1aSopenharmony_ci    const int32_t filt_const2 = 0x1fffb;
4383cabdff1aSopenharmony_ci    uint64_t tp0, tp1, tp2, tp3;
4384cabdff1aSopenharmony_ci    v16u8 dst0 = { 0 }, dst1 = { 0 }, out0, out1;
4385cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4386cabdff1aSopenharmony_ci    v16i8 src11, src12, mask0, mask1, mask2;
4387cabdff1aSopenharmony_ci    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
4388cabdff1aSopenharmony_ci    v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
4389cabdff1aSopenharmony_ci    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
4390cabdff1aSopenharmony_ci    v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
4391cabdff1aSopenharmony_ci    v8i16 hz_out1110_r, hz_out1211_r, tmp0, tmp1, tmp2, tmp3;
4392cabdff1aSopenharmony_ci    v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
4393cabdff1aSopenharmony_ci    v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
4394cabdff1aSopenharmony_ci    v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
4395cabdff1aSopenharmony_ci    v4i32 tmp0_w, tmp1_w;
4396cabdff1aSopenharmony_ci
4397cabdff1aSopenharmony_ci    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
4398cabdff1aSopenharmony_ci
4399cabdff1aSopenharmony_ci    filt0 = (v8i16) __msa_fill_w(filt_const0);
4400cabdff1aSopenharmony_ci    filt1 = (v8i16) __msa_fill_w(filt_const1);
4401cabdff1aSopenharmony_ci    filt2 = (v8i16) __msa_fill_w(filt_const2);
4402cabdff1aSopenharmony_ci
4403cabdff1aSopenharmony_ci    src -= ((2 * stride) + 2);
4404cabdff1aSopenharmony_ci
4405cabdff1aSopenharmony_ci    LD_SB5(src, stride, src0, src1, src2, src3, src4);
4406cabdff1aSopenharmony_ci    XORI_B5_128_SB(src0, src1, src2, src3, src4);
4407cabdff1aSopenharmony_ci    src += (5 * stride);
4408cabdff1aSopenharmony_ci
4409cabdff1aSopenharmony_ci    hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
4410cabdff1aSopenharmony_ci    hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
4411cabdff1aSopenharmony_ci    hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
4412cabdff1aSopenharmony_ci    hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
4413cabdff1aSopenharmony_ci    hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
4414cabdff1aSopenharmony_ci
4415cabdff1aSopenharmony_ci    LD_SB4(src, stride, src5, src6, src7, src8);
4416cabdff1aSopenharmony_ci    src += (4 * stride);
4417cabdff1aSopenharmony_ci    XORI_B4_128_SB(src5, src6, src7, src8);
4418cabdff1aSopenharmony_ci
4419cabdff1aSopenharmony_ci    hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
4420cabdff1aSopenharmony_ci    hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
4421cabdff1aSopenharmony_ci    hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
4422cabdff1aSopenharmony_ci    hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
4423cabdff1aSopenharmony_ci
4424cabdff1aSopenharmony_ci    ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
4425cabdff1aSopenharmony_ci               hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
4426cabdff1aSopenharmony_ci    ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
4427cabdff1aSopenharmony_ci               hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
4428cabdff1aSopenharmony_ci    ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
4429cabdff1aSopenharmony_ci               hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
4430cabdff1aSopenharmony_ci    ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
4431cabdff1aSopenharmony_ci               hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);
4432cabdff1aSopenharmony_ci
4433cabdff1aSopenharmony_ci    tmp0_w = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
4434cabdff1aSopenharmony_ci                            filt2);
4435cabdff1aSopenharmony_ci    tmp1_w = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
4436cabdff1aSopenharmony_ci                            filt2);
4437cabdff1aSopenharmony_ci    tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4438cabdff1aSopenharmony_ci    tmp0_w = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
4439cabdff1aSopenharmony_ci                            filt2);
4440cabdff1aSopenharmony_ci    tmp1_w = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
4441cabdff1aSopenharmony_ci                            filt2);
4442cabdff1aSopenharmony_ci    tmp1 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4443cabdff1aSopenharmony_ci    tmp0_w = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
4444cabdff1aSopenharmony_ci                            filt2);
4445cabdff1aSopenharmony_ci    tmp1_w = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
4446cabdff1aSopenharmony_ci                            filt2);
4447cabdff1aSopenharmony_ci    tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4448cabdff1aSopenharmony_ci    tmp0_w = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
4449cabdff1aSopenharmony_ci                            filt2);
4450cabdff1aSopenharmony_ci    tmp1_w = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
4451cabdff1aSopenharmony_ci                            filt2);
4452cabdff1aSopenharmony_ci    tmp3 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4453cabdff1aSopenharmony_ci
4454cabdff1aSopenharmony_ci    SRARI_H4_SH(hz_out2, hz_out3, hz_out4, hz_out5, 5);
4455cabdff1aSopenharmony_ci    SAT_SH4_SH(hz_out2, hz_out3, hz_out4, hz_out5, 7);
4456cabdff1aSopenharmony_ci
4457cabdff1aSopenharmony_ci    LD4(dst, stride, tp0, tp1, tp2, tp3);
4458cabdff1aSopenharmony_ci    INSERT_D2_UB(tp0, tp1, dst0);
4459cabdff1aSopenharmony_ci    INSERT_D2_UB(tp2, tp3, dst1);
4460cabdff1aSopenharmony_ci
4461cabdff1aSopenharmony_ci    tmp0 = __msa_aver_s_h(tmp0, hz_out2);
4462cabdff1aSopenharmony_ci    tmp1 = __msa_aver_s_h(tmp1, hz_out3);
4463cabdff1aSopenharmony_ci    tmp2 = __msa_aver_s_h(tmp2, hz_out4);
4464cabdff1aSopenharmony_ci    tmp3 = __msa_aver_s_h(tmp3, hz_out5);
4465cabdff1aSopenharmony_ci
4466cabdff1aSopenharmony_ci    out0 = PCKEV_XORI128_UB(tmp0, tmp1);
4467cabdff1aSopenharmony_ci    out1 = PCKEV_XORI128_UB(tmp2, tmp3);
4468cabdff1aSopenharmony_ci    AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
4469cabdff1aSopenharmony_ci    ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
4470cabdff1aSopenharmony_ci    dst += (4 * stride);
4471cabdff1aSopenharmony_ci
4472cabdff1aSopenharmony_ci    LD_SB4(src, stride, src9, src10, src11, src12);
4473cabdff1aSopenharmony_ci    XORI_B4_128_SB(src9, src10, src11, src12);
4474cabdff1aSopenharmony_ci    hz_out9 = AVC_HORZ_FILTER_SH(src9, src9, mask0, mask1, mask2);
4475cabdff1aSopenharmony_ci    hz_out10 = AVC_HORZ_FILTER_SH(src10, src10, mask0, mask1, mask2);
4476cabdff1aSopenharmony_ci    hz_out11 = AVC_HORZ_FILTER_SH(src11, src11, mask0, mask1, mask2);
4477cabdff1aSopenharmony_ci    hz_out12 = AVC_HORZ_FILTER_SH(src12, src12, mask0, mask1, mask2);
4478cabdff1aSopenharmony_ci    ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
4479cabdff1aSopenharmony_ci               hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
4480cabdff1aSopenharmony_ci               hz_out1211_r);
4481cabdff1aSopenharmony_ci    ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
4482cabdff1aSopenharmony_ci               hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
4483cabdff1aSopenharmony_ci               hz_out1211_l);
4484cabdff1aSopenharmony_ci    tmp0_w = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
4485cabdff1aSopenharmony_ci                            filt2);
4486cabdff1aSopenharmony_ci    tmp1_w = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
4487cabdff1aSopenharmony_ci                            filt2);
4488cabdff1aSopenharmony_ci    tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4489cabdff1aSopenharmony_ci    tmp0_w = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
4490cabdff1aSopenharmony_ci                            filt2);
4491cabdff1aSopenharmony_ci    tmp1_w = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
4492cabdff1aSopenharmony_ci                            filt2);
4493cabdff1aSopenharmony_ci    tmp1 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4494cabdff1aSopenharmony_ci    tmp0_w = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
4495cabdff1aSopenharmony_ci                            filt2);
4496cabdff1aSopenharmony_ci    tmp1_w = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
4497cabdff1aSopenharmony_ci                            filt2);
4498cabdff1aSopenharmony_ci    tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4499cabdff1aSopenharmony_ci    tmp0_w = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
4500cabdff1aSopenharmony_ci                            filt2);
4501cabdff1aSopenharmony_ci    tmp1_w = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
4502cabdff1aSopenharmony_ci                            filt2);
4503cabdff1aSopenharmony_ci    tmp3 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4504cabdff1aSopenharmony_ci
4505cabdff1aSopenharmony_ci    SRARI_H4_SH(hz_out6, hz_out7, hz_out8, hz_out9, 5);
4506cabdff1aSopenharmony_ci    SAT_SH4_SH(hz_out6, hz_out7, hz_out8, hz_out9, 7);
4507cabdff1aSopenharmony_ci
4508cabdff1aSopenharmony_ci    LD4(dst, stride, tp0, tp1, tp2, tp3);
4509cabdff1aSopenharmony_ci    INSERT_D2_UB(tp0, tp1, dst0);
4510cabdff1aSopenharmony_ci    INSERT_D2_UB(tp2, tp3, dst1);
4511cabdff1aSopenharmony_ci
4512cabdff1aSopenharmony_ci    tmp0 = __msa_aver_s_h(tmp0, hz_out6);
4513cabdff1aSopenharmony_ci    tmp1 = __msa_aver_s_h(tmp1, hz_out7);
4514cabdff1aSopenharmony_ci    tmp2 = __msa_aver_s_h(tmp2, hz_out8);
4515cabdff1aSopenharmony_ci    tmp3 = __msa_aver_s_h(tmp3, hz_out9);
4516cabdff1aSopenharmony_ci
4517cabdff1aSopenharmony_ci    out0 = PCKEV_XORI128_UB(tmp0, tmp1);
4518cabdff1aSopenharmony_ci    out1 = PCKEV_XORI128_UB(tmp2, tmp3);
4519cabdff1aSopenharmony_ci    AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
4520cabdff1aSopenharmony_ci    ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
4521cabdff1aSopenharmony_ci}
4522cabdff1aSopenharmony_ci
4523cabdff1aSopenharmony_civoid ff_avg_h264_qpel8_mc23_msa(uint8_t *dst, const uint8_t *src,
4524cabdff1aSopenharmony_ci                                ptrdiff_t stride)
4525cabdff1aSopenharmony_ci{
4526cabdff1aSopenharmony_ci    const int32_t filt_const0 = 0xfffb0001;
4527cabdff1aSopenharmony_ci    const int32_t filt_const1 = 0x140014;
4528cabdff1aSopenharmony_ci    const int32_t filt_const2 = 0x1fffb;
4529cabdff1aSopenharmony_ci    uint64_t tp0, tp1, tp2, tp3;
4530cabdff1aSopenharmony_ci    v16u8 dst0 = { 0 }, dst1 = { 0 }, out0, out1;
4531cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4532cabdff1aSopenharmony_ci    v16i8 src11, src12, mask0, mask1, mask2;
4533cabdff1aSopenharmony_ci    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
4534cabdff1aSopenharmony_ci    v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
4535cabdff1aSopenharmony_ci    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
4536cabdff1aSopenharmony_ci    v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
4537cabdff1aSopenharmony_ci    v8i16 hz_out1110_r, hz_out1211_r, tmp0, tmp1, tmp2, tmp3;
4538cabdff1aSopenharmony_ci    v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
4539cabdff1aSopenharmony_ci    v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
4540cabdff1aSopenharmony_ci    v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
4541cabdff1aSopenharmony_ci    v4i32 tmp0_w, tmp1_w;
4542cabdff1aSopenharmony_ci
4543cabdff1aSopenharmony_ci    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
4544cabdff1aSopenharmony_ci
4545cabdff1aSopenharmony_ci    filt0 = (v8i16) __msa_fill_w(filt_const0);
4546cabdff1aSopenharmony_ci    filt1 = (v8i16) __msa_fill_w(filt_const1);
4547cabdff1aSopenharmony_ci    filt2 = (v8i16) __msa_fill_w(filt_const2);
4548cabdff1aSopenharmony_ci
4549cabdff1aSopenharmony_ci    src -= ((2 * stride) + 2);
4550cabdff1aSopenharmony_ci
4551cabdff1aSopenharmony_ci    LD_SB5(src, stride, src0, src1, src2, src3, src4);
4552cabdff1aSopenharmony_ci    XORI_B5_128_SB(src0, src1, src2, src3, src4);
4553cabdff1aSopenharmony_ci    src += (5 * stride);
4554cabdff1aSopenharmony_ci
4555cabdff1aSopenharmony_ci    hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
4556cabdff1aSopenharmony_ci    hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
4557cabdff1aSopenharmony_ci    hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
4558cabdff1aSopenharmony_ci    hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
4559cabdff1aSopenharmony_ci    hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
4560cabdff1aSopenharmony_ci
4561cabdff1aSopenharmony_ci    LD_SB4(src, stride, src5, src6, src7, src8);
4562cabdff1aSopenharmony_ci    src += (4 * stride);
4563cabdff1aSopenharmony_ci    XORI_B4_128_SB(src5, src6, src7, src8);
4564cabdff1aSopenharmony_ci
4565cabdff1aSopenharmony_ci    hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
4566cabdff1aSopenharmony_ci    hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
4567cabdff1aSopenharmony_ci    hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
4568cabdff1aSopenharmony_ci    hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
4569cabdff1aSopenharmony_ci
4570cabdff1aSopenharmony_ci    ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
4571cabdff1aSopenharmony_ci               hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
4572cabdff1aSopenharmony_ci    ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
4573cabdff1aSopenharmony_ci               hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
4574cabdff1aSopenharmony_ci    ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
4575cabdff1aSopenharmony_ci               hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
4576cabdff1aSopenharmony_ci    ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
4577cabdff1aSopenharmony_ci               hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);
4578cabdff1aSopenharmony_ci
4579cabdff1aSopenharmony_ci    tmp0_w = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
4580cabdff1aSopenharmony_ci                            filt2);
4581cabdff1aSopenharmony_ci    tmp1_w = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
4582cabdff1aSopenharmony_ci                            filt2);
4583cabdff1aSopenharmony_ci    tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4584cabdff1aSopenharmony_ci    tmp0_w = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
4585cabdff1aSopenharmony_ci                            filt2);
4586cabdff1aSopenharmony_ci    tmp1_w = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
4587cabdff1aSopenharmony_ci                            filt2);
4588cabdff1aSopenharmony_ci    tmp1 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4589cabdff1aSopenharmony_ci    tmp0_w = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
4590cabdff1aSopenharmony_ci                            filt2);
4591cabdff1aSopenharmony_ci    tmp1_w = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
4592cabdff1aSopenharmony_ci                            filt2);
4593cabdff1aSopenharmony_ci    tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4594cabdff1aSopenharmony_ci    tmp0_w = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
4595cabdff1aSopenharmony_ci                            filt2);
4596cabdff1aSopenharmony_ci    tmp1_w = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
4597cabdff1aSopenharmony_ci                            filt2);
4598cabdff1aSopenharmony_ci    tmp3 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4599cabdff1aSopenharmony_ci
4600cabdff1aSopenharmony_ci    SRARI_H4_SH(hz_out3, hz_out4, hz_out5, hz_out6, 5);
4601cabdff1aSopenharmony_ci    SAT_SH4_SH(hz_out3, hz_out4, hz_out5, hz_out6, 7);
4602cabdff1aSopenharmony_ci
4603cabdff1aSopenharmony_ci    LD4(dst, stride, tp0, tp1, tp2, tp3);
4604cabdff1aSopenharmony_ci    INSERT_D2_UB(tp0, tp1, dst0);
4605cabdff1aSopenharmony_ci    INSERT_D2_UB(tp2, tp3, dst1);
4606cabdff1aSopenharmony_ci
4607cabdff1aSopenharmony_ci    tmp0 = __msa_aver_s_h(tmp0, hz_out3);
4608cabdff1aSopenharmony_ci    tmp1 = __msa_aver_s_h(tmp1, hz_out4);
4609cabdff1aSopenharmony_ci    tmp2 = __msa_aver_s_h(tmp2, hz_out5);
4610cabdff1aSopenharmony_ci    tmp3 = __msa_aver_s_h(tmp3, hz_out6);
4611cabdff1aSopenharmony_ci
4612cabdff1aSopenharmony_ci    out0 = PCKEV_XORI128_UB(tmp0, tmp1);
4613cabdff1aSopenharmony_ci    out1 = PCKEV_XORI128_UB(tmp2, tmp3);
4614cabdff1aSopenharmony_ci    AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
4615cabdff1aSopenharmony_ci    ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
4616cabdff1aSopenharmony_ci    dst += (4 * stride);
4617cabdff1aSopenharmony_ci
4618cabdff1aSopenharmony_ci    LD_SB4(src, stride, src9, src10, src11, src12);
4619cabdff1aSopenharmony_ci    XORI_B4_128_SB(src9, src10, src11, src12);
4620cabdff1aSopenharmony_ci    hz_out9 = AVC_HORZ_FILTER_SH(src9, src9, mask0, mask1, mask2);
4621cabdff1aSopenharmony_ci    hz_out10 = AVC_HORZ_FILTER_SH(src10, src10, mask0, mask1, mask2);
4622cabdff1aSopenharmony_ci    hz_out11 = AVC_HORZ_FILTER_SH(src11, src11, mask0, mask1, mask2);
4623cabdff1aSopenharmony_ci    hz_out12 = AVC_HORZ_FILTER_SH(src12, src12, mask0, mask1, mask2);
4624cabdff1aSopenharmony_ci    ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
4625cabdff1aSopenharmony_ci               hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
4626cabdff1aSopenharmony_ci               hz_out1211_r);
4627cabdff1aSopenharmony_ci    ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
4628cabdff1aSopenharmony_ci               hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
4629cabdff1aSopenharmony_ci               hz_out1211_l);
4630cabdff1aSopenharmony_ci    tmp0_w = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
4631cabdff1aSopenharmony_ci                            filt2);
4632cabdff1aSopenharmony_ci    tmp1_w = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
4633cabdff1aSopenharmony_ci                            filt2);
4634cabdff1aSopenharmony_ci    tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4635cabdff1aSopenharmony_ci    tmp0_w = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
4636cabdff1aSopenharmony_ci                            filt2);
4637cabdff1aSopenharmony_ci    tmp1_w = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
4638cabdff1aSopenharmony_ci                            filt2);
4639cabdff1aSopenharmony_ci    tmp1 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4640cabdff1aSopenharmony_ci    tmp0_w = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
4641cabdff1aSopenharmony_ci                            filt2);
4642cabdff1aSopenharmony_ci    tmp1_w = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
4643cabdff1aSopenharmony_ci                            filt2);
4644cabdff1aSopenharmony_ci    tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4645cabdff1aSopenharmony_ci    tmp0_w = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
4646cabdff1aSopenharmony_ci                            filt2);
4647cabdff1aSopenharmony_ci    tmp1_w = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
4648cabdff1aSopenharmony_ci                            filt2);
4649cabdff1aSopenharmony_ci    tmp3 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4650cabdff1aSopenharmony_ci
4651cabdff1aSopenharmony_ci    SRARI_H4_SH(hz_out7, hz_out8, hz_out9, hz_out10, 5);
4652cabdff1aSopenharmony_ci    SAT_SH4_SH(hz_out7, hz_out8, hz_out9, hz_out10, 7);
4653cabdff1aSopenharmony_ci
4654cabdff1aSopenharmony_ci    LD4(dst, stride, tp0, tp1, tp2, tp3);
4655cabdff1aSopenharmony_ci    INSERT_D2_UB(tp0, tp1, dst0);
4656cabdff1aSopenharmony_ci    INSERT_D2_UB(tp2, tp3, dst1);
4657cabdff1aSopenharmony_ci
4658cabdff1aSopenharmony_ci    tmp0 = __msa_aver_s_h(tmp0, hz_out7);
4659cabdff1aSopenharmony_ci    tmp1 = __msa_aver_s_h(tmp1, hz_out8);
4660cabdff1aSopenharmony_ci    tmp2 = __msa_aver_s_h(tmp2, hz_out9);
4661cabdff1aSopenharmony_ci    tmp3 = __msa_aver_s_h(tmp3, hz_out10);
4662cabdff1aSopenharmony_ci
4663cabdff1aSopenharmony_ci    out0 = PCKEV_XORI128_UB(tmp0, tmp1);
4664cabdff1aSopenharmony_ci    out1 = PCKEV_XORI128_UB(tmp2, tmp3);
4665cabdff1aSopenharmony_ci    AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
4666cabdff1aSopenharmony_ci    ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
4667cabdff1aSopenharmony_ci}
4668cabdff1aSopenharmony_ci
4669cabdff1aSopenharmony_civoid ff_avg_h264_qpel4_mc21_msa(uint8_t *dst, const uint8_t *src,
4670cabdff1aSopenharmony_ci                                ptrdiff_t stride)
4671cabdff1aSopenharmony_ci{
4672cabdff1aSopenharmony_ci    uint32_t tp0, tp1, tp2, tp3;
4673cabdff1aSopenharmony_ci    const int32_t filt_const0 = 0xfffb0001;
4674cabdff1aSopenharmony_ci    const int32_t filt_const1 = 0x140014;
4675cabdff1aSopenharmony_ci    const int32_t filt_const2 = 0x1fffb;
4676cabdff1aSopenharmony_ci    v16u8 res, out = { 0 };
4677cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
4678cabdff1aSopenharmony_ci    v16i8 mask0, mask1, mask2;
4679cabdff1aSopenharmony_ci    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
4680cabdff1aSopenharmony_ci    v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
4681cabdff1aSopenharmony_ci    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
4682cabdff1aSopenharmony_ci    v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
4683cabdff1aSopenharmony_ci    v4i32 tmp0, tmp1;
4684cabdff1aSopenharmony_ci
4685cabdff1aSopenharmony_ci    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
4686cabdff1aSopenharmony_ci
4687cabdff1aSopenharmony_ci    filt0 = (v8i16) __msa_fill_w(filt_const0);
4688cabdff1aSopenharmony_ci    filt1 = (v8i16) __msa_fill_w(filt_const1);
4689cabdff1aSopenharmony_ci    filt2 = (v8i16) __msa_fill_w(filt_const2);
4690cabdff1aSopenharmony_ci
4691cabdff1aSopenharmony_ci    src -= ((2 * stride) + 2);
4692cabdff1aSopenharmony_ci
4693cabdff1aSopenharmony_ci    LD_SB5(src, stride, src0, src1, src2, src3, src4);
4694cabdff1aSopenharmony_ci    src += (5 * stride);
4695cabdff1aSopenharmony_ci    LD_SB4(src, stride, src5, src6, src7, src8);
4696cabdff1aSopenharmony_ci
4697cabdff1aSopenharmony_ci    XORI_B5_128_SB(src0, src1, src2, src3, src4);
4698cabdff1aSopenharmony_ci    XORI_B4_128_SB(src5, src6, src7, src8);
4699cabdff1aSopenharmony_ci
4700cabdff1aSopenharmony_ci    hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2);
4701cabdff1aSopenharmony_ci    hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2);
4702cabdff1aSopenharmony_ci    hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2);
4703cabdff1aSopenharmony_ci    hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2);
4704cabdff1aSopenharmony_ci    hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
4705cabdff1aSopenharmony_ci    PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
4706cabdff1aSopenharmony_ci    PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
4707cabdff1aSopenharmony_ci
4708cabdff1aSopenharmony_ci    ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
4709cabdff1aSopenharmony_ci               hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
4710cabdff1aSopenharmony_ci    ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
4711cabdff1aSopenharmony_ci               hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
4712cabdff1aSopenharmony_ci
4713cabdff1aSopenharmony_ci    tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
4714cabdff1aSopenharmony_ci                          filt2);
4715cabdff1aSopenharmony_ci    tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
4716cabdff1aSopenharmony_ci                          filt2);
4717cabdff1aSopenharmony_ci    dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
4718cabdff1aSopenharmony_ci    tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
4719cabdff1aSopenharmony_ci                          filt2);
4720cabdff1aSopenharmony_ci    tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
4721cabdff1aSopenharmony_ci                          filt2);
4722cabdff1aSopenharmony_ci    dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
4723cabdff1aSopenharmony_ci
4724cabdff1aSopenharmony_ci    SRARI_H2_SH(hz_out2, hz_out4, 5);
4725cabdff1aSopenharmony_ci    SAT_SH2_SH(hz_out2, hz_out4, 7);
4726cabdff1aSopenharmony_ci
4727cabdff1aSopenharmony_ci    dst0 = __msa_aver_s_h(dst0, hz_out2);
4728cabdff1aSopenharmony_ci    dst1 = __msa_aver_s_h(dst1, hz_out4);
4729cabdff1aSopenharmony_ci    LW4(dst, stride, tp0, tp1, tp2, tp3);
4730cabdff1aSopenharmony_ci    INSERT_W4_UB(tp0, tp1, tp2, tp3, out);
4731cabdff1aSopenharmony_ci    res = PCKEV_XORI128_UB(dst0, dst1);
4732cabdff1aSopenharmony_ci    res = __msa_aver_u_b(res, out);
4733cabdff1aSopenharmony_ci    ST_W4(res, 0, 1, 2, 3, dst, stride);
4734cabdff1aSopenharmony_ci}
4735cabdff1aSopenharmony_ci
4736cabdff1aSopenharmony_civoid ff_avg_h264_qpel4_mc23_msa(uint8_t *dst, const uint8_t *src,
4737cabdff1aSopenharmony_ci                                ptrdiff_t stride)
4738cabdff1aSopenharmony_ci{
4739cabdff1aSopenharmony_ci    const int32_t filt_const0 = 0xfffb0001;
4740cabdff1aSopenharmony_ci    const int32_t filt_const1 = 0x140014;
4741cabdff1aSopenharmony_ci    const int32_t filt_const2 = 0x1fffb;
4742cabdff1aSopenharmony_ci    uint32_t tp0, tp1, tp2, tp3;
4743cabdff1aSopenharmony_ci    v16u8 res, out = { 0 };
4744cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
4745cabdff1aSopenharmony_ci    v16i8 mask0, mask1, mask2;
4746cabdff1aSopenharmony_ci    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
4747cabdff1aSopenharmony_ci    v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
4748cabdff1aSopenharmony_ci    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
4749cabdff1aSopenharmony_ci    v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
4750cabdff1aSopenharmony_ci    v4i32 tmp0, tmp1;
4751cabdff1aSopenharmony_ci
4752cabdff1aSopenharmony_ci    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
4753cabdff1aSopenharmony_ci
4754cabdff1aSopenharmony_ci    filt0 = (v8i16) __msa_fill_w(filt_const0);
4755cabdff1aSopenharmony_ci    filt1 = (v8i16) __msa_fill_w(filt_const1);
4756cabdff1aSopenharmony_ci    filt2 = (v8i16) __msa_fill_w(filt_const2);
4757cabdff1aSopenharmony_ci
4758cabdff1aSopenharmony_ci    src -= ((2 * stride) + 2);
4759cabdff1aSopenharmony_ci
4760cabdff1aSopenharmony_ci    LD_SB5(src, stride, src0, src1, src2, src3, src4);
4761cabdff1aSopenharmony_ci    src += (5 * stride);
4762cabdff1aSopenharmony_ci    LD_SB4(src, stride, src5, src6, src7, src8);
4763cabdff1aSopenharmony_ci
4764cabdff1aSopenharmony_ci    XORI_B5_128_SB(src0, src1, src2, src3, src4);
4765cabdff1aSopenharmony_ci    XORI_B4_128_SB(src5, src6, src7, src8);
4766cabdff1aSopenharmony_ci
4767cabdff1aSopenharmony_ci    hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2);
4768cabdff1aSopenharmony_ci    hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2);
4769cabdff1aSopenharmony_ci    hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2);
4770cabdff1aSopenharmony_ci    hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2);
4771cabdff1aSopenharmony_ci    hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
4772cabdff1aSopenharmony_ci    PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
4773cabdff1aSopenharmony_ci    PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
4774cabdff1aSopenharmony_ci
4775cabdff1aSopenharmony_ci    ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
4776cabdff1aSopenharmony_ci               hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
4777cabdff1aSopenharmony_ci    ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
4778cabdff1aSopenharmony_ci               hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
4779cabdff1aSopenharmony_ci
4780cabdff1aSopenharmony_ci    tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
4781cabdff1aSopenharmony_ci                          filt2);
4782cabdff1aSopenharmony_ci    tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
4783cabdff1aSopenharmony_ci                          filt2);
4784cabdff1aSopenharmony_ci    dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
4785cabdff1aSopenharmony_ci    tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
4786cabdff1aSopenharmony_ci                          filt2);
4787cabdff1aSopenharmony_ci    tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
4788cabdff1aSopenharmony_ci                          filt2);
4789cabdff1aSopenharmony_ci    dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
4790cabdff1aSopenharmony_ci
4791cabdff1aSopenharmony_ci    PCKEV_D2_SH(hz_out4, hz_out3, hz_out6, hz_out5, hz_out0, hz_out1);
4792cabdff1aSopenharmony_ci    SRARI_H2_SH(hz_out0, hz_out1, 5);
4793cabdff1aSopenharmony_ci    SAT_SH2_SH(hz_out0, hz_out1, 7);
4794cabdff1aSopenharmony_ci
4795cabdff1aSopenharmony_ci    dst0 = __msa_aver_s_h(dst0, hz_out0);
4796cabdff1aSopenharmony_ci    dst1 = __msa_aver_s_h(dst1, hz_out1);
4797cabdff1aSopenharmony_ci    LW4(dst, stride, tp0, tp1, tp2, tp3);
4798cabdff1aSopenharmony_ci    INSERT_W4_UB(tp0, tp1, tp2, tp3, out);
4799cabdff1aSopenharmony_ci    res = PCKEV_XORI128_UB(dst0, dst1);
4800cabdff1aSopenharmony_ci    res = __msa_aver_u_b(res, out);
4801cabdff1aSopenharmony_ci    ST_W4(res, 0, 1, 2, 3, dst, stride);
4802cabdff1aSopenharmony_ci}
4803cabdff1aSopenharmony_ci
4804cabdff1aSopenharmony_civoid ff_avg_h264_qpel16_mc02_msa(uint8_t *dst, const uint8_t *src,
4805cabdff1aSopenharmony_ci                                 ptrdiff_t stride)
4806cabdff1aSopenharmony_ci{
4807cabdff1aSopenharmony_ci    int32_t loop_cnt;
4808cabdff1aSopenharmony_ci    int16_t filt_const0 = 0xfb01;
4809cabdff1aSopenharmony_ci    int16_t filt_const1 = 0x1414;
4810cabdff1aSopenharmony_ci    int16_t filt_const2 = 0x1fb;
4811cabdff1aSopenharmony_ci    v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3;
4812cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
4813cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
4814cabdff1aSopenharmony_ci    v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
4815cabdff1aSopenharmony_ci    v16i8 src65_l, src87_l, filt0, filt1, filt2;
4816cabdff1aSopenharmony_ci    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
4817cabdff1aSopenharmony_ci
4818cabdff1aSopenharmony_ci    filt0 = (v16i8) __msa_fill_h(filt_const0);
4819cabdff1aSopenharmony_ci    filt1 = (v16i8) __msa_fill_h(filt_const1);
4820cabdff1aSopenharmony_ci    filt2 = (v16i8) __msa_fill_h(filt_const2);
4821cabdff1aSopenharmony_ci    src -= (stride * 2);
4822cabdff1aSopenharmony_ci
4823cabdff1aSopenharmony_ci    LD_SB5(src, stride, src0, src1, src2, src3, src4);
4824cabdff1aSopenharmony_ci    src += (5 * stride);
4825cabdff1aSopenharmony_ci
4826cabdff1aSopenharmony_ci    XORI_B5_128_SB(src0, src1, src2, src3, src4);
4827cabdff1aSopenharmony_ci    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
4828cabdff1aSopenharmony_ci               src32_r, src43_r);
4829cabdff1aSopenharmony_ci    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
4830cabdff1aSopenharmony_ci               src32_l, src43_l);
4831cabdff1aSopenharmony_ci
4832cabdff1aSopenharmony_ci    for (loop_cnt = 4; loop_cnt--;) {
4833cabdff1aSopenharmony_ci        LD_SB4(src, stride, src5, src6, src7, src8);
4834cabdff1aSopenharmony_ci        src += (4 * stride);
4835cabdff1aSopenharmony_ci
4836cabdff1aSopenharmony_ci        XORI_B4_128_SB(src5, src6, src7, src8);
4837cabdff1aSopenharmony_ci        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
4838cabdff1aSopenharmony_ci                   src65_r, src76_r, src87_r);
4839cabdff1aSopenharmony_ci        ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
4840cabdff1aSopenharmony_ci                   src65_l, src76_l, src87_l);
4841cabdff1aSopenharmony_ci        out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
4842cabdff1aSopenharmony_ci        out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
4843cabdff1aSopenharmony_ci        out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
4844cabdff1aSopenharmony_ci        out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
4845cabdff1aSopenharmony_ci        out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
4846cabdff1aSopenharmony_ci        out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
4847cabdff1aSopenharmony_ci        out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
4848cabdff1aSopenharmony_ci        out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
4849cabdff1aSopenharmony_ci        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
4850cabdff1aSopenharmony_ci        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
4851cabdff1aSopenharmony_ci        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
4852cabdff1aSopenharmony_ci        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
4853cabdff1aSopenharmony_ci        LD_UB4(dst, stride, dst0, dst1, dst2, dst3);
4854cabdff1aSopenharmony_ci        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
4855cabdff1aSopenharmony_ci                    out3_r, res0, res1, res2, res3);
4856cabdff1aSopenharmony_ci        XORI_B4_128_UB(res0, res1, res2, res3);
4857cabdff1aSopenharmony_ci        AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
4858cabdff1aSopenharmony_ci        AVER_UB2_UB(res2, dst2, res3, dst3, res2, res3);
4859cabdff1aSopenharmony_ci        ST_UB4(res0, res1, res2, res3, dst, stride);
4860cabdff1aSopenharmony_ci        dst += (4 * stride);
4861cabdff1aSopenharmony_ci
4862cabdff1aSopenharmony_ci        src10_r = src54_r;
4863cabdff1aSopenharmony_ci        src32_r = src76_r;
4864cabdff1aSopenharmony_ci        src21_r = src65_r;
4865cabdff1aSopenharmony_ci        src43_r = src87_r;
4866cabdff1aSopenharmony_ci        src10_l = src54_l;
4867cabdff1aSopenharmony_ci        src32_l = src76_l;
4868cabdff1aSopenharmony_ci        src21_l = src65_l;
4869cabdff1aSopenharmony_ci        src43_l = src87_l;
4870cabdff1aSopenharmony_ci        src4 = src8;
4871cabdff1aSopenharmony_ci    }
4872cabdff1aSopenharmony_ci}
4873cabdff1aSopenharmony_ci
/*
 * 8x8 block: vertical six-tap filter, rounded with a shift of 5, then
 * averaged with the pixels already stored in dst.
 *
 * Loads 13 source rows (5 + 4 + 4) starting two rows above src, i.e. rows
 * src - 2 * stride .. src + 10 * stride, and stores eight 8-byte rows to dst.
 * NOTE(review): "mc02" presumably names the (x = 0, y = 2) quarter-pel
 * position, i.e. the vertical half-sample; confirm against the table that
 * registers these functions.
 *
 * dst:    destination block; read first, then overwritten with the average
 * src:    source pixels at the block's top-left position
 * stride: byte distance between rows, used for both src and dst
 */
4874cabdff1aSopenharmony_civoid ff_avg_h264_qpel8_mc02_msa(uint8_t *dst, const uint8_t *src,
4875cabdff1aSopenharmony_ci                                ptrdiff_t stride)
4876cabdff1aSopenharmony_ci{
4877cabdff1aSopenharmony_ci    uint64_t tp0, tp1, tp2, tp3;
    /* Each halfword packs two signed byte taps (low byte, high byte):
     * 0xfb01 -> (1, -5), 0x1414 -> (20, 20), 0x01fb -> (-5, 1). */
4878cabdff1aSopenharmony_ci    const int16_t filt_const0 = 0xfb01;
4879cabdff1aSopenharmony_ci    const int16_t filt_const1 = 0x1414;
4880cabdff1aSopenharmony_ci    const int16_t filt_const2 = 0x1fb;
4881cabdff1aSopenharmony_ci    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
4882cabdff1aSopenharmony_ci    v16u8 out0, out1, out2, out3;
4883cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10, src109_r;
4884cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
4885cabdff1aSopenharmony_ci    v16i8 filt0, filt1, filt2;
4886cabdff1aSopenharmony_ci    v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
4887cabdff1aSopenharmony_ci
4888cabdff1aSopenharmony_ci    filt0 = (v16i8) __msa_fill_h(filt_const0);
4889cabdff1aSopenharmony_ci    filt1 = (v16i8) __msa_fill_h(filt_const1);
4890cabdff1aSopenharmony_ci    filt2 = (v16i8) __msa_fill_h(filt_const2);
4891cabdff1aSopenharmony_ci
    /* The six-row window for output row 0 begins two rows above it. */
4892cabdff1aSopenharmony_ci    src -= (stride * 2);
4893cabdff1aSopenharmony_ci
4894cabdff1aSopenharmony_ci    LD_SB5(src, stride, src0, src1, src2, src3, src4);
4895cabdff1aSopenharmony_ci    src += (5 * stride);
4896cabdff1aSopenharmony_ci
    /* XOR with 128 presumably re-biases the unsigned pixels into signed-byte
     * range for the signed dot products; PCKEV_XORI128_UB below would then
     * undo it (inferred from the macro names, TODO confirm). */
4897cabdff1aSopenharmony_ci    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    /* Pair every row with the row below it so that one two-tap halfword of
     * filtN can weight a vertical pixel pair. */
4898cabdff1aSopenharmony_ci    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
4899cabdff1aSopenharmony_ci               src32_r, src43_r);
4900cabdff1aSopenharmony_ci
    /* src7..src10 are simply the four rows that follow src4; the variable
     * names skip 5 and 6, so src76_r pairs src7 with src4. */
4901cabdff1aSopenharmony_ci    LD_SB4(src, stride, src7, src8, src9, src10);
4902cabdff1aSopenharmony_ci    src += (4 * stride);
4903cabdff1aSopenharmony_ci    XORI_B4_128_SB(src7, src8, src9, src10);
4904cabdff1aSopenharmony_ci    ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r,
4905cabdff1aSopenharmony_ci               src87_r, src98_r, src109_r);
    /* Output rows 0..3. */
4906cabdff1aSopenharmony_ci    out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
4907cabdff1aSopenharmony_ci    out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
4908cabdff1aSopenharmony_ci    out2_r = AVC_DOT_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
4909cabdff1aSopenharmony_ci    out3_r = AVC_DOT_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
4910cabdff1aSopenharmony_ci
    /* src0..src3 and src10_r..src43_r are recycled for the last four source
     * rows; the first pair joins the new src0 with src10 from above. */
4911cabdff1aSopenharmony_ci    LD_SB4(src, stride, src0, src1, src2, src3);
4912cabdff1aSopenharmony_ci    XORI_B4_128_SB(src0, src1, src2, src3);
4913cabdff1aSopenharmony_ci    ILVR_B4_SB(src0, src10, src1, src0, src2, src1, src3, src2, src10_r,
4914cabdff1aSopenharmony_ci               src21_r, src32_r, src43_r);
    /* Output rows 4..7. */
4915cabdff1aSopenharmony_ci    out4_r = AVC_DOT_SH3_SH(src76_r, src98_r, src10_r, filt0, filt1, filt2);
4916cabdff1aSopenharmony_ci    out5_r = AVC_DOT_SH3_SH(src87_r, src109_r, src21_r, filt0, filt1, filt2);
4917cabdff1aSopenharmony_ci    out6_r = AVC_DOT_SH3_SH(src98_r, src10_r, src32_r, filt0, filt1, filt2);
4918cabdff1aSopenharmony_ci    out7_r = AVC_DOT_SH3_SH(src109_r, src21_r, src43_r, filt0, filt1, filt2);
4919cabdff1aSopenharmony_ci
    /* Fetch the eight existing 8-byte destination rows, two per vector. */
4920cabdff1aSopenharmony_ci    LD4(dst, stride, tp0, tp1, tp2, tp3);
4921cabdff1aSopenharmony_ci    INSERT_D2_UB(tp0, tp1, dst0);
4922cabdff1aSopenharmony_ci    INSERT_D2_UB(tp2, tp3, dst1);
4923cabdff1aSopenharmony_ci    LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
4924cabdff1aSopenharmony_ci    INSERT_D2_UB(tp0, tp1, dst2);
4925cabdff1aSopenharmony_ci    INSERT_D2_UB(tp2, tp3, dst3);
4926cabdff1aSopenharmony_ci
    /* The taps sum to 32, hence the rounding shift by 5; the SAT step
     * presumably clamps to the signed 8-bit range before packing. */
4927cabdff1aSopenharmony_ci    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
4928cabdff1aSopenharmony_ci    SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5);
4929cabdff1aSopenharmony_ci    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
4930cabdff1aSopenharmony_ci    SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
4931cabdff1aSopenharmony_ci    out0 = PCKEV_XORI128_UB(out0_r, out1_r);
4932cabdff1aSopenharmony_ci    out1 = PCKEV_XORI128_UB(out2_r, out3_r);
4933cabdff1aSopenharmony_ci    out2 = PCKEV_XORI128_UB(out4_r, out5_r);
4934cabdff1aSopenharmony_ci    out3 = PCKEV_XORI128_UB(out6_r, out7_r);
    /* "avg" variant: blend the filtered rows with what dst already held. */
4935cabdff1aSopenharmony_ci    AVER_UB4_UB(out0, dst0, out1, dst1, out2, dst2, out3, dst3, dst0, dst1,
4936cabdff1aSopenharmony_ci                dst2, dst3);
4937cabdff1aSopenharmony_ci    ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
4938cabdff1aSopenharmony_ci}
4939cabdff1aSopenharmony_ci
/*
 * 4x4 block: vertical six-tap filter, rounded with a shift of 5, then
 * averaged with the pixels already stored in dst.
 *
 * Loads nine source rows (5 + 4) starting two rows above src and stores four
 * 4-byte rows to dst. Two interleaved row pairs are packed into each vector,
 * so each AVC_DOT_SH3_SH call yields two output rows.
 * NOTE(review): "mc02" presumably names the (x = 0, y = 2) quarter-pel
 * position, i.e. the vertical half-sample; confirm against the table that
 * registers these functions.
 *
 * dst:    destination block; read first, then overwritten with the average
 * src:    source pixels at the block's top-left position
 * stride: byte distance between rows, used for both src and dst
 */
4940cabdff1aSopenharmony_civoid ff_avg_h264_qpel4_mc02_msa(uint8_t *dst, const uint8_t *src,
4941cabdff1aSopenharmony_ci                                ptrdiff_t stride)
4942cabdff1aSopenharmony_ci{
4943cabdff1aSopenharmony_ci    uint32_t tp0, tp1, tp2, tp3;
    /* Each halfword packs two signed byte taps (low byte, high byte):
     * 0xfb01 -> (1, -5), 0x1414 -> (20, 20), 0x01fb -> (-5, 1). */
4944cabdff1aSopenharmony_ci    int16_t filt_const0 = 0xfb01;
4945cabdff1aSopenharmony_ci    int16_t filt_const1 = 0x1414;
4946cabdff1aSopenharmony_ci    int16_t filt_const2 = 0x1fb;
4947cabdff1aSopenharmony_ci    v16u8 res, dst0 = { 0 };
4948cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
4949cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
4950cabdff1aSopenharmony_ci    v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
4951cabdff1aSopenharmony_ci    v8i16 out10, out32;
4952cabdff1aSopenharmony_ci
4953cabdff1aSopenharmony_ci    filt0 = (v16i8) __msa_fill_h(filt_const0);
4954cabdff1aSopenharmony_ci    filt1 = (v16i8) __msa_fill_h(filt_const1);
4955cabdff1aSopenharmony_ci    filt2 = (v16i8) __msa_fill_h(filt_const2);
4956cabdff1aSopenharmony_ci
    /* The six-row window for output row 0 begins two rows above it. */
4957cabdff1aSopenharmony_ci    src -= (stride * 2);
4958cabdff1aSopenharmony_ci    LD_SB5(src, stride, src0, src1, src2, src3, src4);
4959cabdff1aSopenharmony_ci    src += (5 * stride);
4960cabdff1aSopenharmony_ci
    /* Pair each row with the row below it, then put two such pairs side by
     * side in one vector (e.g. src2110 = pairs 1|0 and 2|1). */
4961cabdff1aSopenharmony_ci    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
4962cabdff1aSopenharmony_ci               src32_r, src43_r);
4963cabdff1aSopenharmony_ci    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
    /* Unlike the 8-wide variant, the XOR with 128 is applied after the
     * packing, on the combined vectors. */
4964cabdff1aSopenharmony_ci    XORI_B2_128_SB(src2110, src4332);
4965cabdff1aSopenharmony_ci    LD_SB4(src, stride, src5, src6, src7, src8);
4966cabdff1aSopenharmony_ci    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
4967cabdff1aSopenharmony_ci               src76_r, src87_r);
4968cabdff1aSopenharmony_ci    ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
4969cabdff1aSopenharmony_ci    XORI_B2_128_SB(src6554, src8776);
    /* out10 holds output rows 0 and 1, out32 rows 2 and 3. */
4970cabdff1aSopenharmony_ci    out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
4971cabdff1aSopenharmony_ci    out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
    /* The taps sum to 32, hence the rounding shift by 5. */
4972cabdff1aSopenharmony_ci    SRARI_H2_SH(out10, out32, 5);
4973cabdff1aSopenharmony_ci    SAT_SH2_SH(out10, out32, 7);
    /* Gather the four existing 4-byte destination rows into one vector. */
4974cabdff1aSopenharmony_ci    LW4(dst, stride, tp0, tp1, tp2, tp3);
4975cabdff1aSopenharmony_ci    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
4976cabdff1aSopenharmony_ci    res = PCKEV_XORI128_UB(out10, out32);
    /* "avg" variant: blend the filtered rows with what dst already held. */
4977cabdff1aSopenharmony_ci    dst0 = __msa_aver_u_b(res, dst0);
4978cabdff1aSopenharmony_ci    ST_W4(dst0, 0, 1, 2, 3, dst, stride);
4979cabdff1aSopenharmony_ci}
4980cabdff1aSopenharmony_ci
/*
 * 16x16 block, one output row per loop iteration. For each row it:
 *   1. computes a vertical six-row result at halfword precision
 *      (AVC_CALC_DPADD_B_6PIX_2COEFF_SH; presumably the unrounded vertical
 *      six-tap sum, TODO confirm against the macro),
 *   2. runs a horizontal 1, -5, 20, 20, -5, 1 pass over those halfwords and
 *      round-shifts by 10,
 *   3. separately round-shifts the vertical result by 5 and keeps the value
 *      at the left of the two centre columns (even lanes of the centre
 *      pairs),
 *   4. averages 2 and 3, converts back to unsigned bytes, and averages that
 *      with the 16 bytes already in dst.
 *
 * The only difference from ff_avg_h264_qpel16_mc32_msa is step 3, which
 * there keeps the right centre column.
 *
 * Source rows src - 2 * stride .. src + 18 * stride are loaded, each as two
 * 16-byte vectors at column offsets -2 and +6.
 *
 * dst:    destination block; read first, then overwritten with the average
 * src:    source pixels at the block's top-left position
 * stride: byte distance between rows, used for both src and dst
 */
4981cabdff1aSopenharmony_civoid ff_avg_h264_qpel16_mc12_msa(uint8_t *dst, const uint8_t *src,
4982cabdff1aSopenharmony_ci                                 ptrdiff_t stride)
4983cabdff1aSopenharmony_ci{
4984cabdff1aSopenharmony_ci    uint32_t row;
4985cabdff1aSopenharmony_ci    v16u8 out, dst0;
4986cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4987cabdff1aSopenharmony_ci    v16i8 src11;
4988cabdff1aSopenharmony_ci    v8i16 vt_res0, vt_res1, vt_res2, vt_res3, tmp0, tmp1, tmp2, tmp3, mask3;
4989cabdff1aSopenharmony_ci    v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
4990cabdff1aSopenharmony_ci    v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11, mask4, mask5;
4991cabdff1aSopenharmony_ci    v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
    /* Halfword shuffle indices for four adjacent outputs i = 0..3:
     * mask0 -> (i, i + 5) outer pair, mask1 -> (i + 1, i + 4) middle pair,
     * mask2 -> (i + 2, i + 3) centre pair of a six-wide window. Indices of
     * 8 and above presumably select from the second shuffle source. */
4992cabdff1aSopenharmony_ci    v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
4993cabdff1aSopenharmony_ci    v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
4994cabdff1aSopenharmony_ci    v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
4995cabdff1aSopenharmony_ci    v8i16 minus5h = __msa_ldi_h(-5);
4996cabdff1aSopenharmony_ci    v8i16 plus20h = __msa_ldi_h(20);
4997cabdff1aSopenharmony_ci
    /* Same three pairings shifted by four columns, for outputs 4..7. */
4998cabdff1aSopenharmony_ci    mask3 = mask0 + 4;
4999cabdff1aSopenharmony_ci    mask4 = mask1 + 4;
5000cabdff1aSopenharmony_ci    mask5 = mask2 + 4;
5001cabdff1aSopenharmony_ci
    /* Back up two rows and two columns: both filter windows start there. */
5002cabdff1aSopenharmony_ci    src -= ((2 * stride) + 2);
5003cabdff1aSopenharmony_ci
    /* Prime the five-row sliding window: src0..src4 for the left vector,
     * src7..src11 for the vector eight bytes to the right. */
5004cabdff1aSopenharmony_ci    LD_SB5(src, stride, src0, src1, src2, src3, src4);
5005cabdff1aSopenharmony_ci    LD_SB5(src + 8, stride, src7, src8, src9, src10, src11);
5006cabdff1aSopenharmony_ci    src += (5 * stride);
5007cabdff1aSopenharmony_ci    XORI_B5_128_SB(src0, src1, src2, src3, src4);
5008cabdff1aSopenharmony_ci    XORI_B5_128_SB(src7, src8, src9, src10, src11);
5009cabdff1aSopenharmony_ci
5010cabdff1aSopenharmony_ci    for (row = 16; row--;) {
        /* Sixth row of the window: src5 (left) and src6 (right, +8 bytes). */
5011cabdff1aSopenharmony_ci        LD_SB2(src, 8, src5, src6);
5012cabdff1aSopenharmony_ci        src += stride;
5013cabdff1aSopenharmony_ci        XORI_B2_128_SB(src5, src6);
5014cabdff1aSopenharmony_ci        dst0 = LD_UB(dst);
5015cabdff1aSopenharmony_ci
        /* Vertical pass: vt_res0/1 from the left vector, vt_res2/3 from the
         * right one. */
5016cabdff1aSopenharmony_ci        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
5017cabdff1aSopenharmony_ci                                        vt_res0, vt_res1);
5018cabdff1aSopenharmony_ci        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src7, src8, src9, src10, src11, src6,
5019cabdff1aSopenharmony_ci                                        vt_res2, vt_res3);
        /* Tap pairs for output columns 0..3 (shf_vec0..2), 8..11
         * (shf_vec3..5), 4..7 (shf_vec6..8) and 12..15 (shf_vec9..11). */
5020cabdff1aSopenharmony_ci        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
5021cabdff1aSopenharmony_ci                   mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
5022cabdff1aSopenharmony_ci        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
5023cabdff1aSopenharmony_ci                   mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
5024cabdff1aSopenharmony_ci        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
5025cabdff1aSopenharmony_ci                   mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
5026cabdff1aSopenharmony_ci        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
5027cabdff1aSopenharmony_ci                   mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
        /* Horizontal pass: the outer pair is summed with weight 1 ... */
5028cabdff1aSopenharmony_ci        hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
5029cabdff1aSopenharmony_ci        hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
5030cabdff1aSopenharmony_ci        hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
5031cabdff1aSopenharmony_ci        hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
        /* ... then -5 * middle pair and 20 * centre pair are accumulated. */
5032cabdff1aSopenharmony_ci        DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
5033cabdff1aSopenharmony_ci        DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
5034cabdff1aSopenharmony_ci        DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
5035cabdff1aSopenharmony_ci        DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
        /* Two cascaded gain-32 passes, hence the rounding shift by 10. */
5036cabdff1aSopenharmony_ci        SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
5037cabdff1aSopenharmony_ci        SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
        /* Vertical-only value: the centre pairs rounded by 5 ... */
5038cabdff1aSopenharmony_ci        tmp0 = __msa_srari_h(shf_vec2, 5);
5039cabdff1aSopenharmony_ci        tmp1 = __msa_srari_h(shf_vec5, 5);
5040cabdff1aSopenharmony_ci        tmp2 = __msa_srari_h(shf_vec8, 5);
5041cabdff1aSopenharmony_ci        tmp3 = __msa_srari_h(shf_vec11, 5);
5042cabdff1aSopenharmony_ci        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        /* ... keeping the even lanes, i.e. the left member (i + 2) of each
         * centre pair. */
5043cabdff1aSopenharmony_ci        PCKEV_H2_SH(tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
        /* Narrow the 32-bit 2-D results to halfwords: tmp2 = columns 0..7,
         * tmp3 = columns 8..15. */
5044cabdff1aSopenharmony_ci        PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, tmp2, tmp3);
5045cabdff1aSopenharmony_ci        tmp0 = __msa_aver_s_h(tmp2, tmp0);
5046cabdff1aSopenharmony_ci        tmp1 = __msa_aver_s_h(tmp3, tmp1);
5047cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(tmp0, tmp1);
        /* "avg" variant: blend with the row already present in dst. */
5048cabdff1aSopenharmony_ci        out = __msa_aver_u_b(out, dst0);
5049cabdff1aSopenharmony_ci        ST_UB(out, dst);
5050cabdff1aSopenharmony_ci        dst += stride;
5051cabdff1aSopenharmony_ci
        /* Slide both row windows down by one row. */
5052cabdff1aSopenharmony_ci        src0 = src1;
5053cabdff1aSopenharmony_ci        src1 = src2;
5054cabdff1aSopenharmony_ci        src2 = src3;
5055cabdff1aSopenharmony_ci        src3 = src4;
5056cabdff1aSopenharmony_ci        src4 = src5;
5057cabdff1aSopenharmony_ci        src7 = src8;
5058cabdff1aSopenharmony_ci        src8 = src9;
5059cabdff1aSopenharmony_ci        src9 = src10;
5060cabdff1aSopenharmony_ci        src10 = src11;
5061cabdff1aSopenharmony_ci        src11 = src6;
5062cabdff1aSopenharmony_ci    }
5063cabdff1aSopenharmony_ci}
5064cabdff1aSopenharmony_ci
/*
 * 16x16 block, one output row per loop iteration. For each row it:
 *   1. computes a vertical six-row result at halfword precision
 *      (AVC_CALC_DPADD_B_6PIX_2COEFF_SH; presumably the unrounded vertical
 *      six-tap sum, TODO confirm against the macro),
 *   2. runs a horizontal 1, -5, 20, 20, -5, 1 pass over those halfwords and
 *      round-shifts by 10,
 *   3. separately round-shifts the vertical result by 5 and keeps the value
 *      at the right of the two centre columns (odd lanes of the centre
 *      pairs, selected with __msa_pckod_h),
 *   4. averages 2 and 3, converts back to unsigned bytes, and averages that
 *      with the 16 bytes already in dst.
 *
 * The only difference from ff_avg_h264_qpel16_mc12_msa is step 3, which
 * there keeps the left centre column.
 *
 * Source rows src - 2 * stride .. src + 18 * stride are loaded, each as two
 * 16-byte vectors at column offsets -2 and +6.
 *
 * dst:    destination block; read first, then overwritten with the average
 * src:    source pixels at the block's top-left position
 * stride: byte distance between rows, used for both src and dst
 */
5065cabdff1aSopenharmony_civoid ff_avg_h264_qpel16_mc32_msa(uint8_t *dst, const uint8_t *src,
5066cabdff1aSopenharmony_ci                                 ptrdiff_t stride)
5067cabdff1aSopenharmony_ci{
5068cabdff1aSopenharmony_ci    uint32_t row;
5069cabdff1aSopenharmony_ci    v16u8 out, dst0;
5070cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
5071cabdff1aSopenharmony_ci    v16i8 src11;
5072cabdff1aSopenharmony_ci    v8i16 vt_res0, vt_res1, vt_res2, vt_res3, tmp0, tmp1, tmp2, tmp3, mask3;
5073cabdff1aSopenharmony_ci    v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
5074cabdff1aSopenharmony_ci    v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11, mask4, mask5;
5075cabdff1aSopenharmony_ci    v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
    /* Halfword shuffle indices for four adjacent outputs i = 0..3:
     * mask0 -> (i, i + 5) outer pair, mask1 -> (i + 1, i + 4) middle pair,
     * mask2 -> (i + 2, i + 3) centre pair of a six-wide window. Indices of
     * 8 and above presumably select from the second shuffle source. */
5076cabdff1aSopenharmony_ci    v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
5077cabdff1aSopenharmony_ci    v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
5078cabdff1aSopenharmony_ci    v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
5079cabdff1aSopenharmony_ci    v8i16 minus5h = __msa_ldi_h(-5);
5080cabdff1aSopenharmony_ci    v8i16 plus20h = __msa_ldi_h(20);
5081cabdff1aSopenharmony_ci
    /* Same three pairings shifted by four columns, for outputs 4..7. */
5082cabdff1aSopenharmony_ci    mask3 = mask0 + 4;
5083cabdff1aSopenharmony_ci    mask4 = mask1 + 4;
5084cabdff1aSopenharmony_ci    mask5 = mask2 + 4;
5085cabdff1aSopenharmony_ci
    /* Back up two rows and two columns: both filter windows start there. */
5086cabdff1aSopenharmony_ci    src -= ((2 * stride) + 2);
5087cabdff1aSopenharmony_ci
    /* Prime the five-row sliding window: src0..src4 for the left vector,
     * src7..src11 for the vector eight bytes to the right. */
5088cabdff1aSopenharmony_ci    LD_SB5(src, stride, src0, src1, src2, src3, src4);
5089cabdff1aSopenharmony_ci    LD_SB5(src + 8, stride, src7, src8, src9, src10, src11);
5090cabdff1aSopenharmony_ci    src += (5 * stride);
5091cabdff1aSopenharmony_ci    XORI_B5_128_SB(src0, src1, src2, src3, src4);
5092cabdff1aSopenharmony_ci    XORI_B5_128_SB(src7, src8, src9, src10, src11);
5093cabdff1aSopenharmony_ci
5094cabdff1aSopenharmony_ci    for (row = 16; row--;) {
        /* Sixth row of the window: src5 (left) and src6 (right, +8 bytes). */
5095cabdff1aSopenharmony_ci        LD_SB2(src, 8, src5, src6);
5096cabdff1aSopenharmony_ci        src += stride;
5097cabdff1aSopenharmony_ci        XORI_B2_128_SB(src5, src6);
5098cabdff1aSopenharmony_ci        dst0 = LD_UB(dst);
5099cabdff1aSopenharmony_ci
        /* Vertical pass: vt_res0/1 from the left vector, vt_res2/3 from the
         * right one. */
5100cabdff1aSopenharmony_ci        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
5101cabdff1aSopenharmony_ci                                        vt_res0, vt_res1);
5102cabdff1aSopenharmony_ci        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src7, src8, src9, src10, src11, src6,
5103cabdff1aSopenharmony_ci                                        vt_res2, vt_res3);
        /* Tap pairs for output columns 0..3 (shf_vec0..2), 8..11
         * (shf_vec3..5), 4..7 (shf_vec6..8) and 12..15 (shf_vec9..11). */
5104cabdff1aSopenharmony_ci        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
5105cabdff1aSopenharmony_ci                   mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
5106cabdff1aSopenharmony_ci        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
5107cabdff1aSopenharmony_ci                   mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
5108cabdff1aSopenharmony_ci        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
5109cabdff1aSopenharmony_ci                   mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
5110cabdff1aSopenharmony_ci        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
5111cabdff1aSopenharmony_ci                   mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
        /* Horizontal pass: the outer pair is summed with weight 1 ... */
5112cabdff1aSopenharmony_ci        hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
5113cabdff1aSopenharmony_ci        hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
5114cabdff1aSopenharmony_ci        hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
5115cabdff1aSopenharmony_ci        hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
        /* ... then -5 * middle pair and 20 * centre pair are accumulated. */
5116cabdff1aSopenharmony_ci        DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
5117cabdff1aSopenharmony_ci        DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
5118cabdff1aSopenharmony_ci        DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
5119cabdff1aSopenharmony_ci        DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
        /* Two cascaded gain-32 passes, hence the rounding shift by 10. */
5120cabdff1aSopenharmony_ci        SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
5121cabdff1aSopenharmony_ci        SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
        /* Vertical-only value: the centre pairs rounded by 5 ... */
5122cabdff1aSopenharmony_ci        tmp0 = __msa_srari_h(shf_vec2, 5);
5123cabdff1aSopenharmony_ci        tmp1 = __msa_srari_h(shf_vec5, 5);
5124cabdff1aSopenharmony_ci        tmp2 = __msa_srari_h(shf_vec8, 5);
5125cabdff1aSopenharmony_ci        tmp3 = __msa_srari_h(shf_vec11, 5);
5126cabdff1aSopenharmony_ci        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        /* ... keeping the odd lanes, i.e. the right member (i + 3) of each
         * centre pair. This is what distinguishes mc32 from mc12. */
5127cabdff1aSopenharmony_ci        tmp0 = __msa_pckod_h(tmp2, tmp0);
5128cabdff1aSopenharmony_ci        tmp1 = __msa_pckod_h(tmp3, tmp1);
        /* Narrow the 32-bit 2-D results to halfwords: tmp2 = columns 0..7,
         * tmp3 = columns 8..15. */
5129cabdff1aSopenharmony_ci        PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, tmp2, tmp3);
5130cabdff1aSopenharmony_ci        tmp0 = __msa_aver_s_h(tmp2, tmp0);
5131cabdff1aSopenharmony_ci        tmp1 = __msa_aver_s_h(tmp3, tmp1);
5132cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(tmp0, tmp1);
        /* "avg" variant: blend with the row already present in dst. */
5133cabdff1aSopenharmony_ci        out = __msa_aver_u_b(out, dst0);
5134cabdff1aSopenharmony_ci        ST_UB(out, dst);
5135cabdff1aSopenharmony_ci        dst += stride;
5136cabdff1aSopenharmony_ci
        /* Slide both row windows down by one row. */
5137cabdff1aSopenharmony_ci        src0 = src1;
5138cabdff1aSopenharmony_ci        src1 = src2;
5139cabdff1aSopenharmony_ci        src2 = src3;
5140cabdff1aSopenharmony_ci        src3 = src4;
5141cabdff1aSopenharmony_ci        src4 = src5;
5142cabdff1aSopenharmony_ci        src7 = src8;
5143cabdff1aSopenharmony_ci        src8 = src9;
5144cabdff1aSopenharmony_ci        src9 = src10;
5145cabdff1aSopenharmony_ci        src10 = src11;
5146cabdff1aSopenharmony_ci        src11 = src6;
5147cabdff1aSopenharmony_ci    }
5148cabdff1aSopenharmony_ci}
5149cabdff1aSopenharmony_ci
/*
 * 8x8 block, two output rows per loop iteration (four iterations). For each
 * pair of rows it:
 *   1. computes vertical six-row results at halfword precision
 *      (AVC_CALC_DPADD_B_6PIX_2COEFF_SH; presumably the unrounded vertical
 *      six-tap sum, TODO confirm against the macro),
 *   2. runs a horizontal 1, -5, 20, 20, -5, 1 pass over those halfwords and
 *      round-shifts by 10,
 *   3. separately round-shifts the vertical result by 5 and keeps the value
 *      at the left of the two centre columns (even lanes of the centre
 *      pairs),
 *   4. averages 2 and 3, converts back to unsigned bytes, and averages that
 *      with the two 8-byte rows already in dst.
 *
 * The only difference from ff_avg_h264_qpel8_mc32_msa is step 3, which there
 * keeps the right centre column.
 *
 * Source rows src - 2 * stride .. src + 10 * stride are loaded, one vector
 * per row starting two bytes left of src.
 *
 * dst:    destination block; read first, then overwritten with the average
 * src:    source pixels at the block's top-left position
 * stride: byte distance between rows, used for both src and dst
 */
5150cabdff1aSopenharmony_civoid ff_avg_h264_qpel8_mc12_msa(uint8_t *dst, const uint8_t *src,
5151cabdff1aSopenharmony_ci                                ptrdiff_t stride)
5152cabdff1aSopenharmony_ci{
5153cabdff1aSopenharmony_ci    uint32_t row;
5154cabdff1aSopenharmony_ci    uint64_t tp0, tp1;
5155cabdff1aSopenharmony_ci    v16u8 out, dst0 = { 0 };
5156cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6;
5157cabdff1aSopenharmony_ci    v8i16 vt_res0, vt_res1, vt_res2, vt_res3, tmp0, tmp1, tmp2, tmp3;
5158cabdff1aSopenharmony_ci    v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
5159cabdff1aSopenharmony_ci    v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11;
5160cabdff1aSopenharmony_ci    v8i16 mask3, mask4, mask5;
5161cabdff1aSopenharmony_ci    v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
    /* Halfword shuffle indices for four adjacent outputs i = 0..3:
     * mask0 -> (i, i + 5) outer pair, mask1 -> (i + 1, i + 4) middle pair,
     * mask2 -> (i + 2, i + 3) centre pair of a six-wide window. Indices of
     * 8 and above presumably select from the second shuffle source. */
5162cabdff1aSopenharmony_ci    v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
5163cabdff1aSopenharmony_ci    v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
5164cabdff1aSopenharmony_ci    v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
5165cabdff1aSopenharmony_ci    v8i16 minus5h = __msa_ldi_h(-5);
5166cabdff1aSopenharmony_ci    v8i16 plus20h = __msa_ldi_h(20);
5167cabdff1aSopenharmony_ci
    /* Same three pairings shifted by four columns, for outputs 4..7. */
5168cabdff1aSopenharmony_ci    mask3 = mask0 + 4;
5169cabdff1aSopenharmony_ci    mask4 = mask1 + 4;
5170cabdff1aSopenharmony_ci    mask5 = mask2 + 4;
5171cabdff1aSopenharmony_ci
    /* Back up two rows and two columns: both filter windows start there. */
5172cabdff1aSopenharmony_ci    src -= ((2 * stride) + 2);
5173cabdff1aSopenharmony_ci
    /* Prime the sliding window with the first five rows. */
5174cabdff1aSopenharmony_ci    LD_SB5(src, stride, src0, src1, src2, src3, src4);
5175cabdff1aSopenharmony_ci    src += (5 * stride);
5176cabdff1aSopenharmony_ci    XORI_B5_128_SB(src0, src1, src2, src3, src4);
5177cabdff1aSopenharmony_ci
5178cabdff1aSopenharmony_ci    for (row = 4; row--;) {
5179cabdff1aSopenharmony_ci        LD_SB2(src, stride, src5, src6);
5180cabdff1aSopenharmony_ci        src += (2 * stride);
5181cabdff1aSopenharmony_ci        XORI_B2_128_SB(src5, src6);
5182cabdff1aSopenharmony_ci
        /* Vertical pass for two consecutive output rows: rows src0..src5
         * feed the first, rows src1..src6 the second. */
5183cabdff1aSopenharmony_ci        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
5184cabdff1aSopenharmony_ci                                        vt_res0, vt_res1);
5185cabdff1aSopenharmony_ci        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src1, src2, src3, src4, src5, src6,
5186cabdff1aSopenharmony_ci                                        vt_res2, vt_res3);
        /* Tap pairs: shf_vec0..2 = first row, columns 0..3; shf_vec3..5 =
         * second row, columns 0..3; shf_vec6..8 = first row, columns 4..7;
         * shf_vec9..11 = second row, columns 4..7. */
5187cabdff1aSopenharmony_ci        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
5188cabdff1aSopenharmony_ci                   mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
5189cabdff1aSopenharmony_ci        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
5190cabdff1aSopenharmony_ci                   mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
5191cabdff1aSopenharmony_ci        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
5192cabdff1aSopenharmony_ci                   mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
5193cabdff1aSopenharmony_ci        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
5194cabdff1aSopenharmony_ci                   mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
        /* Horizontal pass: the outer pair is summed with weight 1 ... */
5195cabdff1aSopenharmony_ci        hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
5196cabdff1aSopenharmony_ci        hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
5197cabdff1aSopenharmony_ci        hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
5198cabdff1aSopenharmony_ci        hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
        /* ... then -5 * middle pair and 20 * centre pair are accumulated. */
5199cabdff1aSopenharmony_ci        DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
5200cabdff1aSopenharmony_ci        DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
5201cabdff1aSopenharmony_ci        DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
5202cabdff1aSopenharmony_ci        DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
        /* Two cascaded gain-32 passes, hence the rounding shift by 10. */
5203cabdff1aSopenharmony_ci        SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
5204cabdff1aSopenharmony_ci        SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
        /* Vertical-only value: the centre pairs rounded by 5 ... */
5205cabdff1aSopenharmony_ci        tmp0 = __msa_srari_h(shf_vec2, 5);
5206cabdff1aSopenharmony_ci        tmp1 = __msa_srari_h(shf_vec5, 5);
5207cabdff1aSopenharmony_ci        tmp2 = __msa_srari_h(shf_vec8, 5);
5208cabdff1aSopenharmony_ci        tmp3 = __msa_srari_h(shf_vec11, 5);
        /* Existing destination pixels: two 8-byte rows in one vector. */
5209cabdff1aSopenharmony_ci        LD2(dst, stride, tp0, tp1);
5210cabdff1aSopenharmony_ci        INSERT_D2_UB(tp0, tp1, dst0);
5211cabdff1aSopenharmony_ci        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        /* ... keeping the even lanes, i.e. the left member (i + 2) of each
         * centre pair; tmp0 = first row, tmp1 = second row. */
5212cabdff1aSopenharmony_ci        PCKEV_H2_SH(tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
        /* Narrow the 32-bit 2-D results to halfwords, one vector per row. */
5213cabdff1aSopenharmony_ci        PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, tmp2, tmp3);
5214cabdff1aSopenharmony_ci        tmp0 = __msa_aver_s_h(tmp2, tmp0);
5215cabdff1aSopenharmony_ci        tmp1 = __msa_aver_s_h(tmp3, tmp1);
5216cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(tmp0, tmp1);
        /* "avg" variant: blend with the rows already present in dst. */
5217cabdff1aSopenharmony_ci        out = __msa_aver_u_b(out, dst0);
5218cabdff1aSopenharmony_ci        ST_D2(out, 0, 1, dst, stride);
5219cabdff1aSopenharmony_ci        dst += (2 * stride);
5220cabdff1aSopenharmony_ci
        /* Slide the row window down by two rows. */
5221cabdff1aSopenharmony_ci        src0 = src2;
5222cabdff1aSopenharmony_ci        src1 = src3;
5223cabdff1aSopenharmony_ci        src2 = src4;
5224cabdff1aSopenharmony_ci        src3 = src5;
5225cabdff1aSopenharmony_ci        src4 = src6;
5226cabdff1aSopenharmony_ci    }
5227cabdff1aSopenharmony_ci}
5228cabdff1aSopenharmony_ci
/*
 * 8x8 block, two output rows per loop iteration (four iterations). For each
 * pair of rows it:
 *   1. computes vertical six-row results at halfword precision
 *      (AVC_CALC_DPADD_B_6PIX_2COEFF_SH; presumably the unrounded vertical
 *      six-tap sum, TODO confirm against the macro),
 *   2. runs a horizontal 1, -5, 20, 20, -5, 1 pass over those halfwords and
 *      round-shifts by 10,
 *   3. separately round-shifts the vertical result by 5 and keeps the value
 *      at the right of the two centre columns (odd lanes of the centre
 *      pairs, selected with __msa_pckod_h),
 *   4. averages 2 and 3, converts back to unsigned bytes, and averages that
 *      with the two 8-byte rows already in dst.
 *
 * The only difference from ff_avg_h264_qpel8_mc12_msa is step 3, which there
 * keeps the left centre column.
 *
 * Source rows src - 2 * stride .. src + 10 * stride are loaded, one vector
 * per row starting two bytes left of src.
 *
 * dst:    destination block; read first, then overwritten with the average
 * src:    source pixels at the block's top-left position
 * stride: byte distance between rows, used for both src and dst
 */
5229cabdff1aSopenharmony_civoid ff_avg_h264_qpel8_mc32_msa(uint8_t *dst, const uint8_t *src,
5230cabdff1aSopenharmony_ci                                ptrdiff_t stride)
5231cabdff1aSopenharmony_ci{
5232cabdff1aSopenharmony_ci    uint32_t row;
5233cabdff1aSopenharmony_ci    uint64_t tp0, tp1;
5234cabdff1aSopenharmony_ci    v16u8 out, dst0 = { 0 };
5235cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6;
5236cabdff1aSopenharmony_ci    v8i16 vt_res0, vt_res1, vt_res2, vt_res3, tmp0, tmp1, tmp2, tmp3;
5237cabdff1aSopenharmony_ci    v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
5238cabdff1aSopenharmony_ci    v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11;
5239cabdff1aSopenharmony_ci    v8i16 mask3, mask4, mask5;
5240cabdff1aSopenharmony_ci    v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
    /* Halfword shuffle indices for four adjacent outputs i = 0..3:
     * mask0 -> (i, i + 5) outer pair, mask1 -> (i + 1, i + 4) middle pair,
     * mask2 -> (i + 2, i + 3) centre pair of a six-wide window. Indices of
     * 8 and above presumably select from the second shuffle source. */
5241cabdff1aSopenharmony_ci    v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
5242cabdff1aSopenharmony_ci    v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
5243cabdff1aSopenharmony_ci    v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
5244cabdff1aSopenharmony_ci    v8i16 minus5h = __msa_ldi_h(-5);
5245cabdff1aSopenharmony_ci    v8i16 plus20h = __msa_ldi_h(20);
5246cabdff1aSopenharmony_ci
    /* Same three pairings shifted by four columns, for outputs 4..7. */
5247cabdff1aSopenharmony_ci    mask3 = mask0 + 4;
5248cabdff1aSopenharmony_ci    mask4 = mask1 + 4;
5249cabdff1aSopenharmony_ci    mask5 = mask2 + 4;
5250cabdff1aSopenharmony_ci
    /* Back up two rows and two columns: both filter windows start there. */
5251cabdff1aSopenharmony_ci    src -= ((2 * stride) + 2);
5252cabdff1aSopenharmony_ci
    /* Prime the sliding window with the first five rows. */
5253cabdff1aSopenharmony_ci    LD_SB5(src, stride, src0, src1, src2, src3, src4);
5254cabdff1aSopenharmony_ci    src += (5 * stride);
5255cabdff1aSopenharmony_ci    XORI_B5_128_SB(src0, src1, src2, src3, src4);
5256cabdff1aSopenharmony_ci
5257cabdff1aSopenharmony_ci    for (row = 4; row--;) {
5258cabdff1aSopenharmony_ci        LD_SB2(src, stride, src5, src6);
5259cabdff1aSopenharmony_ci        src += (2 * stride);
5260cabdff1aSopenharmony_ci        XORI_B2_128_SB(src5, src6);
5261cabdff1aSopenharmony_ci
        /* Vertical pass for two consecutive output rows: rows src0..src5
         * feed the first, rows src1..src6 the second. */
5262cabdff1aSopenharmony_ci        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
5263cabdff1aSopenharmony_ci                                        vt_res0, vt_res1);
5264cabdff1aSopenharmony_ci        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src1, src2, src3, src4, src5, src6,
5265cabdff1aSopenharmony_ci                                        vt_res2, vt_res3);
        /* Tap pairs: shf_vec0..2 = first row, columns 0..3; shf_vec3..5 =
         * second row, columns 0..3; shf_vec6..8 = first row, columns 4..7;
         * shf_vec9..11 = second row, columns 4..7. */
5266cabdff1aSopenharmony_ci        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
5267cabdff1aSopenharmony_ci                   mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
5268cabdff1aSopenharmony_ci        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
5269cabdff1aSopenharmony_ci                   mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
5270cabdff1aSopenharmony_ci        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
5271cabdff1aSopenharmony_ci                   mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
5272cabdff1aSopenharmony_ci        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
5273cabdff1aSopenharmony_ci                   mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
        /* Horizontal pass: the outer pair is summed with weight 1 ... */
5274cabdff1aSopenharmony_ci        hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
5275cabdff1aSopenharmony_ci        hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
5276cabdff1aSopenharmony_ci        hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
5277cabdff1aSopenharmony_ci        hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
        /* ... then -5 * middle pair and 20 * centre pair are accumulated. */
5278cabdff1aSopenharmony_ci        DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
5279cabdff1aSopenharmony_ci        DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
5280cabdff1aSopenharmony_ci        DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
5281cabdff1aSopenharmony_ci        DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
        /* Two cascaded gain-32 passes, hence the rounding shift by 10. */
5282cabdff1aSopenharmony_ci        SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
5283cabdff1aSopenharmony_ci        SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
        /* Vertical-only value: the centre pairs rounded by 5 ... */
5284cabdff1aSopenharmony_ci        tmp0 = __msa_srari_h(shf_vec2, 5);
5285cabdff1aSopenharmony_ci        tmp1 = __msa_srari_h(shf_vec5, 5);
5286cabdff1aSopenharmony_ci        tmp2 = __msa_srari_h(shf_vec8, 5);
5287cabdff1aSopenharmony_ci        tmp3 = __msa_srari_h(shf_vec11, 5);
        /* Existing destination pixels: two 8-byte rows in one vector. */
5288cabdff1aSopenharmony_ci        LD2(dst, stride, tp0, tp1);
5289cabdff1aSopenharmony_ci        INSERT_D2_UB(tp0, tp1, dst0);
5290cabdff1aSopenharmony_ci        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        /* ... keeping the odd lanes, i.e. the right member (i + 3) of each
         * centre pair. This is what distinguishes mc32 from mc12. */
5291cabdff1aSopenharmony_ci        tmp0 = __msa_pckod_h(tmp2, tmp0);
5292cabdff1aSopenharmony_ci        tmp1 = __msa_pckod_h(tmp3, tmp1);
        /* Narrow the 32-bit 2-D results to halfwords, one vector per row. */
5293cabdff1aSopenharmony_ci        PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, tmp2, tmp3);
5294cabdff1aSopenharmony_ci        tmp0 = __msa_aver_s_h(tmp2, tmp0);
5295cabdff1aSopenharmony_ci        tmp1 = __msa_aver_s_h(tmp3, tmp1);
5296cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(tmp0, tmp1);
        /* "avg" variant: blend with the rows already present in dst. */
5297cabdff1aSopenharmony_ci        out = __msa_aver_u_b(out, dst0);
5298cabdff1aSopenharmony_ci        ST_D2(out, 0, 1, dst, stride);
5299cabdff1aSopenharmony_ci        dst += (2 * stride);
5300cabdff1aSopenharmony_ci
        /* Slide the row window down by two rows. */
5301cabdff1aSopenharmony_ci        src0 = src2;
5302cabdff1aSopenharmony_ci        src1 = src3;
5303cabdff1aSopenharmony_ci        src2 = src4;
5304cabdff1aSopenharmony_ci        src3 = src5;
5305cabdff1aSopenharmony_ci        src4 = src6;
5306cabdff1aSopenharmony_ci    }
5307cabdff1aSopenharmony_ci}
5308cabdff1aSopenharmony_ci
5309cabdff1aSopenharmony_civoid ff_avg_h264_qpel4_mc12_msa(uint8_t *dst, const uint8_t *src,
5310cabdff1aSopenharmony_ci                                ptrdiff_t stride)
5311cabdff1aSopenharmony_ci{
5312cabdff1aSopenharmony_ci    uint32_t tp0, tp1, tp2, tp3;
5313cabdff1aSopenharmony_ci    const int16_t filt_const0 = 0xfb01;
5314cabdff1aSopenharmony_ci    const int16_t filt_const1 = 0x1414;
5315cabdff1aSopenharmony_ci    const int16_t filt_const2 = 0x1fb;
5316cabdff1aSopenharmony_ci    v16u8 out, dstv = { 0 };
5317cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
5318cabdff1aSopenharmony_ci    v16i8 src10_r, src21_r, src32_r, src43_r, src54_r, src65_r, src76_r;
5319cabdff1aSopenharmony_ci    v16i8 src87_r, src10_l, src21_l, src32_l, src43_l, src54_l, src65_l;
5320cabdff1aSopenharmony_ci    v16i8 src76_l, src87_l, filt0, filt1, filt2;
5321cabdff1aSopenharmony_ci    v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, shf_vec7;
5322cabdff1aSopenharmony_ci    v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
5323cabdff1aSopenharmony_ci    v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
5324cabdff1aSopenharmony_ci    v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
5325cabdff1aSopenharmony_ci    v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
5326cabdff1aSopenharmony_ci    v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
5327cabdff1aSopenharmony_ci    v8i16 minus5h = __msa_ldi_h(-5);
5328cabdff1aSopenharmony_ci    v8i16 plus20h = __msa_ldi_h(20);
5329cabdff1aSopenharmony_ci    v8i16 zeros = { 0 };
5330cabdff1aSopenharmony_ci
5331cabdff1aSopenharmony_ci    filt0 = (v16i8) __msa_fill_h(filt_const0);
5332cabdff1aSopenharmony_ci    filt1 = (v16i8) __msa_fill_h(filt_const1);
5333cabdff1aSopenharmony_ci    filt2 = (v16i8) __msa_fill_h(filt_const2);
5334cabdff1aSopenharmony_ci
5335cabdff1aSopenharmony_ci    src -= ((2 * stride) + 2);
5336cabdff1aSopenharmony_ci
5337cabdff1aSopenharmony_ci    LD_SB5(src, stride, src0, src1, src2, src3, src4);
5338cabdff1aSopenharmony_ci    src += (5 * stride);
5339cabdff1aSopenharmony_ci    XORI_B5_128_SB(src0, src1, src2, src3, src4);
5340cabdff1aSopenharmony_ci    LD_SB4(src, stride, src5, src6, src7, src8);
5341cabdff1aSopenharmony_ci    XORI_B4_128_SB(src5, src6, src7, src8);
5342cabdff1aSopenharmony_ci
5343cabdff1aSopenharmony_ci    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
5344cabdff1aSopenharmony_ci               src32_r, src43_r);
5345cabdff1aSopenharmony_ci    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
5346cabdff1aSopenharmony_ci               src76_r, src87_r);
5347cabdff1aSopenharmony_ci    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
5348cabdff1aSopenharmony_ci               src32_l, src43_l);
5349cabdff1aSopenharmony_ci    ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l,
5350cabdff1aSopenharmony_ci               src76_l, src87_l);
5351cabdff1aSopenharmony_ci    vt_res0 = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
5352cabdff1aSopenharmony_ci    vt_res1 = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
5353cabdff1aSopenharmony_ci    vt_res2 = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
5354cabdff1aSopenharmony_ci    vt_res3 = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
5355cabdff1aSopenharmony_ci    VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
5356cabdff1aSopenharmony_ci               mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
5357cabdff1aSopenharmony_ci    VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
5358cabdff1aSopenharmony_ci               mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
5359cabdff1aSopenharmony_ci    hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
5360cabdff1aSopenharmony_ci    DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
5361cabdff1aSopenharmony_ci    hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
5362cabdff1aSopenharmony_ci    DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
5363cabdff1aSopenharmony_ci
5364cabdff1aSopenharmony_ci    vt_res0 = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
5365cabdff1aSopenharmony_ci    vt_res1 = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
5366cabdff1aSopenharmony_ci    vt_res2 = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
5367cabdff1aSopenharmony_ci    vt_res3 = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
5368cabdff1aSopenharmony_ci    VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
5369cabdff1aSopenharmony_ci               mask1, mask2, shf_vec0, shf_vec1, shf_vec6);
5370cabdff1aSopenharmony_ci    VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
5371cabdff1aSopenharmony_ci               mask1, mask2, shf_vec3, shf_vec4, shf_vec7);
5372cabdff1aSopenharmony_ci    hz_res2 = __msa_hadd_s_w(shf_vec0, shf_vec0);
5373cabdff1aSopenharmony_ci    DPADD_SH2_SW(shf_vec1, shf_vec6, minus5h, plus20h, hz_res2, hz_res2);
5374cabdff1aSopenharmony_ci    hz_res3 = __msa_hadd_s_w(shf_vec3, shf_vec3);
5375cabdff1aSopenharmony_ci    DPADD_SH2_SW(shf_vec4, shf_vec7, minus5h, plus20h, hz_res3, hz_res3);
5376cabdff1aSopenharmony_ci
5377cabdff1aSopenharmony_ci    SRARI_W2_SW(hz_res0, hz_res1, 10);
5378cabdff1aSopenharmony_ci    SAT_SW2_SW(hz_res0, hz_res1, 7);
5379cabdff1aSopenharmony_ci    SRARI_W2_SW(hz_res2, hz_res3, 10);
5380cabdff1aSopenharmony_ci    SAT_SW2_SW(hz_res2, hz_res3, 7);
5381cabdff1aSopenharmony_ci
5382cabdff1aSopenharmony_ci    dst0 = __msa_srari_h(shf_vec2, 5);
5383cabdff1aSopenharmony_ci    dst1 = __msa_srari_h(shf_vec5, 5);
5384cabdff1aSopenharmony_ci    dst2 = __msa_srari_h(shf_vec6, 5);
5385cabdff1aSopenharmony_ci    dst3 = __msa_srari_h(shf_vec7, 5);
5386cabdff1aSopenharmony_ci
5387cabdff1aSopenharmony_ci    SAT_SH2_SH(dst0, dst1, 7);
5388cabdff1aSopenharmony_ci    SAT_SH2_SH(dst2, dst3, 7);
5389cabdff1aSopenharmony_ci    ILVEV_H2_SH(dst0, zeros, dst1, zeros, dst0, dst1);
5390cabdff1aSopenharmony_ci    ILVEV_H2_SH(dst2, zeros, dst3, zeros, dst2, dst3);
5391cabdff1aSopenharmony_ci
5392cabdff1aSopenharmony_ci    hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0);
5393cabdff1aSopenharmony_ci    hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1);
5394cabdff1aSopenharmony_ci    hz_res2 = __msa_aver_s_w(hz_res2, (v4i32) dst2);
5395cabdff1aSopenharmony_ci    hz_res3 = __msa_aver_s_w(hz_res3, (v4i32) dst3);
5396cabdff1aSopenharmony_ci
5397cabdff1aSopenharmony_ci    LW4(dst, stride, tp0, tp1, tp2, tp3);
5398cabdff1aSopenharmony_ci    INSERT_W4_UB(tp0, tp1, tp2, tp3, dstv);
5399cabdff1aSopenharmony_ci    PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2);
5400cabdff1aSopenharmony_ci    out = PCKEV_XORI128_UB(dst0, dst2);
5401cabdff1aSopenharmony_ci    out = __msa_aver_u_b(out, dstv);
5402cabdff1aSopenharmony_ci    ST_W4(out, 0, 1, 2, 3, dst, stride);
5403cabdff1aSopenharmony_ci}
5404cabdff1aSopenharmony_ci
/*
 * 4x4 H.264 luma quarter-pel interpolation, "mc32" case, averaged into dst.
 *
 * Reads a 9-row window that starts at src - 2 * stride - 2, applies the
 * 6-tap (1, -5, 20, 20, -5, 1) filter vertically (16-bit intermediates) and
 * then horizontally (32-bit accumulators), averages that result with a
 * vertical-only filtered value, and finally averages with the 4x4 block
 * already present at dst before storing it back.
 *
 * dst:    destination block; four rows of four bytes are read and rewritten.
 * src:    source pixel the window is positioned relative to.
 * stride: byte distance between consecutive rows, used for src and dst.
 */
5405cabdff1aSopenharmony_civoid ff_avg_h264_qpel4_mc32_msa(uint8_t *dst, const uint8_t *src,
5406cabdff1aSopenharmony_ci                                ptrdiff_t stride)
5407cabdff1aSopenharmony_ci{
5408cabdff1aSopenharmony_ci    uint32_t tp0, tp1, tp2, tp3;
    /* Each 16-bit constant packs two signed 8-bit taps; listed low byte
     * first they are (1, -5), (20, 20) and (-5, 1). */
5409cabdff1aSopenharmony_ci    const int16_t filt_const0 = 0xfb01;
5410cabdff1aSopenharmony_ci    const int16_t filt_const1 = 0x1414;
5411cabdff1aSopenharmony_ci    const int16_t filt_const2 = 0x1fb;
5412cabdff1aSopenharmony_ci    v16u8 out, dstv = { 0 };
5413cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
5414cabdff1aSopenharmony_ci    v16i8 src10_r, src21_r, src32_r, src43_r, src54_r, src65_r, src76_r;
5415cabdff1aSopenharmony_ci    v16i8 src87_r, src10_l, src21_l, src32_l, src43_l, src54_l, src65_l;
5416cabdff1aSopenharmony_ci    v16i8 src76_l, src87_l, filt0, filt1, filt2;
5417cabdff1aSopenharmony_ci    v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, shf_vec7;
5418cabdff1aSopenharmony_ci    v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
5419cabdff1aSopenharmony_ci    v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
    /* Halfword shuffle indices pairing elements (i, i+5), (i+1, i+4) and
     * (i+2, i+3) for i = 0..3, i.e. the outer, middle and inner tap pairs of
     * a 6-tap window. Index 8 presumably selects element 0 of the second
     * shuffle source - confirm against VSHF_H3_SH. */
5420cabdff1aSopenharmony_ci    v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
5421cabdff1aSopenharmony_ci    v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
5422cabdff1aSopenharmony_ci    v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
    /* Horizontal-pass weights for the middle (-5) and inner (20) pairs. */
5423cabdff1aSopenharmony_ci    v8i16 minus5h = __msa_ldi_h(-5);
5424cabdff1aSopenharmony_ci    v8i16 plus20h = __msa_ldi_h(20);
5425cabdff1aSopenharmony_ci    v8i16 zeros = { 0 };
5426cabdff1aSopenharmony_ci
5427cabdff1aSopenharmony_ci    filt0 = (v16i8) __msa_fill_h(filt_const0);
5428cabdff1aSopenharmony_ci    filt1 = (v16i8) __msa_fill_h(filt_const1);
5429cabdff1aSopenharmony_ci    filt2 = (v16i8) __msa_fill_h(filt_const2);
5430cabdff1aSopenharmony_ci
    /* Move to the top-left corner of the filter support: two rows up and
     * two columns left of src. */
5431cabdff1aSopenharmony_ci    src -= ((2 * stride) + 2);
5432cabdff1aSopenharmony_ci
    /* Load nine consecutive rows (5 + 4). The XORI_*_128 macros XOR with 128
     * per their name - presumably re-biasing unsigned pixels to signed range;
     * PCKEV_XORI128_UB at the end appears to undo this. */
5433cabdff1aSopenharmony_ci    LD_SB5(src, stride, src0, src1, src2, src3, src4);
5434cabdff1aSopenharmony_ci    src += (5 * stride);
5435cabdff1aSopenharmony_ci    XORI_B5_128_SB(src0, src1, src2, src3, src4);
5436cabdff1aSopenharmony_ci    LD_SB4(src, stride, src5, src6, src7, src8);
5437cabdff1aSopenharmony_ci    XORI_B4_128_SB(src5, src6, src7, src8);
5438cabdff1aSopenharmony_ci
    /* Byte-interleave each pair of adjacent rows (right and left halves) so
     * that one packed two-tap constant can be applied per halfword. */
5439cabdff1aSopenharmony_ci    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
5440cabdff1aSopenharmony_ci               src32_r, src43_r);
5441cabdff1aSopenharmony_ci    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
5442cabdff1aSopenharmony_ci               src76_r, src87_r);
5443cabdff1aSopenharmony_ci    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
5444cabdff1aSopenharmony_ci               src32_l, src43_l);
5445cabdff1aSopenharmony_ci    ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l,
5446cabdff1aSopenharmony_ci               src76_l, src87_l);
    /* Vertical 6-tap results for output rows 0 (input rows 0..5) and
     * 1 (input rows 1..6), right and left halves, as 16-bit values. */
5447cabdff1aSopenharmony_ci    vt_res0 = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
5448cabdff1aSopenharmony_ci    vt_res1 = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
5449cabdff1aSopenharmony_ci    vt_res2 = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
5450cabdff1aSopenharmony_ci    vt_res3 = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
    /* Gather the three tap pairs of the horizontal window from the vertical
     * results. shf_vec2 / shf_vec5 (mask2 outputs) are kept for later use. */
5451cabdff1aSopenharmony_ci    VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
5452cabdff1aSopenharmony_ci               mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
5453cabdff1aSopenharmony_ci    VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
5454cabdff1aSopenharmony_ci               mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
    /* Horizontal pass in 32 bits: the outer pair is summed with weight 1,
     * then the middle pair * -5 and the inner pair * 20 are accumulated. */
5455cabdff1aSopenharmony_ci    hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
5456cabdff1aSopenharmony_ci    DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
5457cabdff1aSopenharmony_ci    hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
5458cabdff1aSopenharmony_ci    DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
5459cabdff1aSopenharmony_ci
    /* Same two stages for output rows 2 (input rows 2..7) and
     * 3 (input rows 3..8); the mask2 outputs go to shf_vec6 / shf_vec7. */
5460cabdff1aSopenharmony_ci    vt_res0 = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
5461cabdff1aSopenharmony_ci    vt_res1 = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
5462cabdff1aSopenharmony_ci    vt_res2 = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
5463cabdff1aSopenharmony_ci    vt_res3 = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
5464cabdff1aSopenharmony_ci    VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
5465cabdff1aSopenharmony_ci               mask1, mask2, shf_vec0, shf_vec1, shf_vec6);
5466cabdff1aSopenharmony_ci    VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
5467cabdff1aSopenharmony_ci               mask1, mask2, shf_vec3, shf_vec4, shf_vec7);
5468cabdff1aSopenharmony_ci    hz_res2 = __msa_hadd_s_w(shf_vec0, shf_vec0);
5469cabdff1aSopenharmony_ci    DPADD_SH2_SW(shf_vec1, shf_vec6, minus5h, plus20h, hz_res2, hz_res2);
5470cabdff1aSopenharmony_ci    hz_res3 = __msa_hadd_s_w(shf_vec3, shf_vec3);
5471cabdff1aSopenharmony_ci    DPADD_SH2_SW(shf_vec4, shf_vec7, minus5h, plus20h, hz_res3, hz_res3);
5472cabdff1aSopenharmony_ci
    /* Two cascaded filter passes: rounding shift by 10, then saturate
     * (SAT argument 7 - presumably the signed 8-bit range). */
5473cabdff1aSopenharmony_ci    SRARI_W2_SW(hz_res0, hz_res1, 10);
5474cabdff1aSopenharmony_ci    SAT_SW2_SW(hz_res0, hz_res1, 7);
5475cabdff1aSopenharmony_ci    SRARI_W2_SW(hz_res2, hz_res3, 10);
5476cabdff1aSopenharmony_ci    SAT_SW2_SW(hz_res2, hz_res3, 7);
5477cabdff1aSopenharmony_ci
    /* Vertical-only values: the inner-pair vectors hold vertical results for
     * elements (i+2, i+3); a single pass needs only a rounding shift by 5. */
5478cabdff1aSopenharmony_ci    dst0 = __msa_srari_h(shf_vec2, 5);
5479cabdff1aSopenharmony_ci    dst1 = __msa_srari_h(shf_vec5, 5);
5480cabdff1aSopenharmony_ci    dst2 = __msa_srari_h(shf_vec6, 5);
5481cabdff1aSopenharmony_ci    dst3 = __msa_srari_h(shf_vec7, 5);
5482cabdff1aSopenharmony_ci
5483cabdff1aSopenharmony_ci    SAT_SH2_SH(dst0, dst1, 7);
5484cabdff1aSopenharmony_ci    SAT_SH2_SH(dst2, dst3, 7);
5485cabdff1aSopenharmony_ci
    /* Keep the odd halfword of every pair (element i+3) and fill the other
     * halfword of each 32-bit lane from zeros, so the vector can be averaged
     * word-wise with hz_resN. NOTE(review): exact lane placement follows the
     * ILVOD.H definition - confirm against the MSA reference. */
5486cabdff1aSopenharmony_ci    dst0 = __msa_ilvod_h(zeros, dst0);
5487cabdff1aSopenharmony_ci    dst1 = __msa_ilvod_h(zeros, dst1);
5488cabdff1aSopenharmony_ci    dst2 = __msa_ilvod_h(zeros, dst2);
5489cabdff1aSopenharmony_ci    dst3 = __msa_ilvod_h(zeros, dst3);
5490cabdff1aSopenharmony_ci
    /* Average the two-pass result with the vertical-only value, per row. */
5491cabdff1aSopenharmony_ci    hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0);
5492cabdff1aSopenharmony_ci    hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1);
5493cabdff1aSopenharmony_ci    hz_res2 = __msa_aver_s_w(hz_res2, (v4i32) dst2);
5494cabdff1aSopenharmony_ci    hz_res3 = __msa_aver_s_w(hz_res3, (v4i32) dst3);
5495cabdff1aSopenharmony_ci
    /* Fetch the existing 4x4 destination block (one 32-bit word per row),
     * pack the four result rows down to bytes, average with the destination
     * and write the four words back. */
5496cabdff1aSopenharmony_ci    LW4(dst, stride, tp0, tp1, tp2, tp3);
5497cabdff1aSopenharmony_ci    INSERT_W4_UB(tp0, tp1, tp2, tp3, dstv);
5498cabdff1aSopenharmony_ci    PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2);
5499cabdff1aSopenharmony_ci    out = PCKEV_XORI128_UB(dst0, dst2);
5500cabdff1aSopenharmony_ci    out = __msa_aver_u_b(out, dstv);
5501cabdff1aSopenharmony_ci    ST_W4(out, 0, 1, 2, 3, dst, stride);
5502cabdff1aSopenharmony_ci}
5503cabdff1aSopenharmony_ci
/*
 * 16x16 H.264 luma interpolation, "mc22" case, averaged into dst.
 *
 * The block is processed as two 8-column halves. For each half the rows are
 * first filtered horizontally into 16-bit values, then four output rows at a
 * time are produced by a vertical 6-tap pass with 32-bit accumulators,
 * averaged with the pixels already at dst and stored.
 *
 * dst:    destination block; read and rewritten (16 rows of 16 bytes).
 * src:    source pixel the filter window is positioned relative to.
 * stride: byte distance between consecutive rows, used for src and dst.
 */
5504cabdff1aSopenharmony_civoid ff_avg_h264_qpel16_mc22_msa(uint8_t *dst, const uint8_t *src,
5505cabdff1aSopenharmony_ci                                 ptrdiff_t stride)
5506cabdff1aSopenharmony_ci{
    /* Each 32-bit constant packs two signed 16-bit taps; listed low halfword
     * first they are (1, -5), (20, 20) and (-5, 1). */
5507cabdff1aSopenharmony_ci    const int32_t filt_const0 = 0xfffb0001;
5508cabdff1aSopenharmony_ci    const int32_t filt_const1 = 0x140014;
5509cabdff1aSopenharmony_ci    const int32_t filt_const2 = 0x1fffb;
    /* Top-left of the filter support: two rows up, two columns left. */
5510cabdff1aSopenharmony_ci    const uint8_t *src_tmp = src - (2 * stride) - 2;
5511cabdff1aSopenharmony_ci    uint8_t *dst_tmp = dst;
5512cabdff1aSopenharmony_ci    uint64_t tp0, tp1, tp2, tp3;
5513cabdff1aSopenharmony_ci    uint32_t multiple8_cnt, loop_cnt;
5514cabdff1aSopenharmony_ci    v16u8 dst0, dst1, out0, out1;
5515cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, mask0, mask1, mask2;
5516cabdff1aSopenharmony_ci    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
5517cabdff1aSopenharmony_ci    v8i16 hz_out7, hz_out8, res0, res1, res2, res3;
5518cabdff1aSopenharmony_ci    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
5519cabdff1aSopenharmony_ci    v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
5520cabdff1aSopenharmony_ci    v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
5521cabdff1aSopenharmony_ci    v8i16 hz_out87_l, filt0, filt1, filt2;
5522cabdff1aSopenharmony_ci    v4i32 tmp0, tmp1;
5523cabdff1aSopenharmony_ci
5524cabdff1aSopenharmony_ci    filt0 = (v8i16) __msa_fill_w(filt_const0);
5525cabdff1aSopenharmony_ci    filt1 = (v8i16) __msa_fill_w(filt_const1);
5526cabdff1aSopenharmony_ci    filt2 = (v8i16) __msa_fill_w(filt_const2);
5527cabdff1aSopenharmony_ci
    /* First three 16-byte rows of luma_mask_arr: the "8 width" shuffle
     * masks. */
5528cabdff1aSopenharmony_ci    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
5529cabdff1aSopenharmony_ci
    /* Two passes, one per 8-column half of the 16-wide block. */
5530cabdff1aSopenharmony_ci    for (multiple8_cnt = 2; multiple8_cnt--;) {
5531cabdff1aSopenharmony_ci        src = src_tmp;
5532cabdff1aSopenharmony_ci        dst = dst_tmp;
5533cabdff1aSopenharmony_ci
        /* Prime the vertical window with the first five rows. XOR with 128
         * (per the macro name) presumably re-biases pixels to signed range;
         * PCKEV_XORI128_UB below appears to undo it. */
5534cabdff1aSopenharmony_ci        LD_SB5(src, stride, src0, src1, src2, src3, src4);
5535cabdff1aSopenharmony_ci        XORI_B5_128_SB(src0, src1, src2, src3, src4);
5536cabdff1aSopenharmony_ci        src += (5 * stride);
5537cabdff1aSopenharmony_ci
        /* Horizontal pass per row, producing 16-bit intermediates. */
5538cabdff1aSopenharmony_ci        hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
5539cabdff1aSopenharmony_ci        hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
5540cabdff1aSopenharmony_ci        hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
5541cabdff1aSopenharmony_ci        hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
5542cabdff1aSopenharmony_ci        hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
5543cabdff1aSopenharmony_ci
        /* Four iterations of four output rows each = 16 rows. */
5544cabdff1aSopenharmony_ci        for (loop_cnt = 4; loop_cnt--;) {
5545cabdff1aSopenharmony_ci            LD_SB4(src, stride, src0, src1, src2, src3);
5546cabdff1aSopenharmony_ci            XORI_B4_128_SB(src0, src1, src2, src3);
5547cabdff1aSopenharmony_ci            src += (4 * stride);
5548cabdff1aSopenharmony_ci
5549cabdff1aSopenharmony_ci            hz_out5 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
5550cabdff1aSopenharmony_ci            hz_out6 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
5551cabdff1aSopenharmony_ci            hz_out7 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
5552cabdff1aSopenharmony_ci            hz_out8 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
            /* Halfword-interleave adjacent rows (right and left halves) so
             * one packed two-tap constant applies per 32-bit lane. */
5553cabdff1aSopenharmony_ci            ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
5554cabdff1aSopenharmony_ci                       hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
5555cabdff1aSopenharmony_ci                       hz_out43_r);
5556cabdff1aSopenharmony_ci            ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
5557cabdff1aSopenharmony_ci                       hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
5558cabdff1aSopenharmony_ci                       hz_out43_l);
5559cabdff1aSopenharmony_ci            ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
5560cabdff1aSopenharmony_ci                       hz_out8, hz_out7, hz_out54_r, hz_out65_r, hz_out76_r,
5561cabdff1aSopenharmony_ci                       hz_out87_r);
5562cabdff1aSopenharmony_ci            ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
5563cabdff1aSopenharmony_ci                       hz_out8, hz_out7, hz_out54_l, hz_out65_l, hz_out76_l,
5564cabdff1aSopenharmony_ci                       hz_out87_l);
5565cabdff1aSopenharmony_ci
            /* Vertical 6-tap pass for four output rows; each res packs the
             * right and left 32-bit halves back into eight halfwords.
             * NOTE(review): no rounding shift or saturation appears here, so
             * it is presumably done inside AVC_DOT_SW3_SW - confirm. */
5566cabdff1aSopenharmony_ci            tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0,
5567cabdff1aSopenharmony_ci                                  filt1, filt2);
5568cabdff1aSopenharmony_ci            tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0,
5569cabdff1aSopenharmony_ci                                  filt1, filt2);
5570cabdff1aSopenharmony_ci            res0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5571cabdff1aSopenharmony_ci            tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0,
5572cabdff1aSopenharmony_ci                                  filt1, filt2);
5573cabdff1aSopenharmony_ci            tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0,
5574cabdff1aSopenharmony_ci                                  filt1, filt2);
5575cabdff1aSopenharmony_ci            res1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5576cabdff1aSopenharmony_ci            tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0,
5577cabdff1aSopenharmony_ci                                  filt1, filt2);
5578cabdff1aSopenharmony_ci            tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0,
5579cabdff1aSopenharmony_ci                                  filt1, filt2);
5580cabdff1aSopenharmony_ci            res2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5581cabdff1aSopenharmony_ci            tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0,
5582cabdff1aSopenharmony_ci                                  filt1, filt2);
5583cabdff1aSopenharmony_ci            tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0,
5584cabdff1aSopenharmony_ci                                  filt1, filt2);
5585cabdff1aSopenharmony_ci            res3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5586cabdff1aSopenharmony_ci
            /* Load four existing 8-byte destination rows (two per vector),
             * pack the results to bytes, average and store. */
5587cabdff1aSopenharmony_ci            LD4(dst, stride, tp0, tp1, tp2, tp3);
5588cabdff1aSopenharmony_ci            INSERT_D2_UB(tp0, tp1, dst0);
5589cabdff1aSopenharmony_ci            INSERT_D2_UB(tp2, tp3, dst1);
5590cabdff1aSopenharmony_ci            out0 = PCKEV_XORI128_UB(res0, res1);
5591cabdff1aSopenharmony_ci            out1 = PCKEV_XORI128_UB(res2, res3);
5592cabdff1aSopenharmony_ci            AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
5593cabdff1aSopenharmony_ci            ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
5594cabdff1aSopenharmony_ci            dst += (4 * stride);
5595cabdff1aSopenharmony_ci
            /* Slide the window down four rows: the last five filtered rows
             * become the first five of the next iteration. */
5596cabdff1aSopenharmony_ci            hz_out0 = hz_out4;
5597cabdff1aSopenharmony_ci            hz_out1 = hz_out5;
5598cabdff1aSopenharmony_ci            hz_out2 = hz_out6;
5599cabdff1aSopenharmony_ci            hz_out3 = hz_out7;
5600cabdff1aSopenharmony_ci            hz_out4 = hz_out8;
5601cabdff1aSopenharmony_ci        }
5602cabdff1aSopenharmony_ci
        /* Advance to the right-hand 8-column half. */
5603cabdff1aSopenharmony_ci        src_tmp += 8;
5604cabdff1aSopenharmony_ci        dst_tmp += 8;
5605cabdff1aSopenharmony_ci    }
5606cabdff1aSopenharmony_ci}
5607cabdff1aSopenharmony_ci
/*
 * 8x8 H.264 luma interpolation, "mc22" case, averaged into dst.
 *
 * Fully unrolled: 13 input rows are filtered horizontally into 16-bit
 * values, and two batches of four output rows are produced by a vertical
 * 6-tap pass with 32-bit accumulators. Each batch is averaged with the
 * pixels already at dst and stored.
 *
 * dst:    destination block; read and rewritten (8 rows of 8 bytes).
 * src:    source pixel the filter window is positioned relative to.
 * stride: byte distance between consecutive rows, used for src and dst.
 */
5608cabdff1aSopenharmony_civoid ff_avg_h264_qpel8_mc22_msa(uint8_t *dst, const uint8_t *src,
5609cabdff1aSopenharmony_ci                                ptrdiff_t stride)
5610cabdff1aSopenharmony_ci{
    /* Each 32-bit constant packs two signed 16-bit taps; listed low halfword
     * first they are (1, -5), (20, 20) and (-5, 1). */
5611cabdff1aSopenharmony_ci    const int32_t filt_const0 = 0xfffb0001;
5612cabdff1aSopenharmony_ci    const int32_t filt_const1 = 0x140014;
5613cabdff1aSopenharmony_ci    const int32_t filt_const2 = 0x1fffb;
5614cabdff1aSopenharmony_ci    uint64_t tp0, tp1, tp2, tp3;
5615cabdff1aSopenharmony_ci    v16u8 out0, out1, dst0 = { 0 }, dst1 = { 0 };
5616cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, mask0, mask1, mask2;
5617cabdff1aSopenharmony_ci    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
5618cabdff1aSopenharmony_ci    v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
    /* hz_outAB_r / _l hold rows A and B interleaved (right / left halves).
     * Note: hz_out89_* and hz_out910_* are named low-row-first, unlike the
     * others, but are built with the same (higher row, lower row) argument
     * order. */
5619cabdff1aSopenharmony_ci    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
5620cabdff1aSopenharmony_ci    v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
5621cabdff1aSopenharmony_ci    v8i16 hz_out1110_r, hz_out1211_r, res0, res1, res2, res3;
5622cabdff1aSopenharmony_ci    v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
5623cabdff1aSopenharmony_ci    v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
5624cabdff1aSopenharmony_ci    v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
5625cabdff1aSopenharmony_ci    v4i32 tmp0, tmp1;
5626cabdff1aSopenharmony_ci
5627cabdff1aSopenharmony_ci    filt0 = (v8i16) __msa_fill_w(filt_const0);
5628cabdff1aSopenharmony_ci    filt1 = (v8i16) __msa_fill_w(filt_const1);
5629cabdff1aSopenharmony_ci    filt2 = (v8i16) __msa_fill_w(filt_const2);
5630cabdff1aSopenharmony_ci
    /* First three 16-byte rows of luma_mask_arr: the "8 width" shuffle
     * masks. */
5631cabdff1aSopenharmony_ci    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
5632cabdff1aSopenharmony_ci
    /* Start two rows up and two columns left of src; load rows 0..4.
     * XOR with 128 (per the macro name) presumably re-biases pixels to
     * signed range; PCKEV_XORI128_UB below appears to undo it. */
5633cabdff1aSopenharmony_ci    src -= ((2 * stride) + 2);
5634cabdff1aSopenharmony_ci    LD_SB5(src, stride, src0, src1, src2, src3, src4);
5635cabdff1aSopenharmony_ci    XORI_B5_128_SB(src0, src1, src2, src3, src4);
5636cabdff1aSopenharmony_ci    src += (5 * stride);
5637cabdff1aSopenharmony_ci
    /* Horizontal pass per row, producing 16-bit intermediates. */
5638cabdff1aSopenharmony_ci    hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
5639cabdff1aSopenharmony_ci    hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
5640cabdff1aSopenharmony_ci    hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
5641cabdff1aSopenharmony_ci    hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
5642cabdff1aSopenharmony_ci    hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
5643cabdff1aSopenharmony_ci
    /* Rows 5..8. */
5644cabdff1aSopenharmony_ci    LD_SB4(src, stride, src0, src1, src2, src3);
5645cabdff1aSopenharmony_ci    XORI_B4_128_SB(src0, src1, src2, src3);
5646cabdff1aSopenharmony_ci    src += (4 * stride);
5647cabdff1aSopenharmony_ci    hz_out5 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
5648cabdff1aSopenharmony_ci    hz_out6 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
5649cabdff1aSopenharmony_ci    hz_out7 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
5650cabdff1aSopenharmony_ci    hz_out8 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
    /* Halfword-interleave adjacent rows so one packed two-tap constant
     * applies per 32-bit lane. */
5651cabdff1aSopenharmony_ci    ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
5652cabdff1aSopenharmony_ci               hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
5653cabdff1aSopenharmony_ci    ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
5654cabdff1aSopenharmony_ci               hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
5655cabdff1aSopenharmony_ci    ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
5656cabdff1aSopenharmony_ci               hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
5657cabdff1aSopenharmony_ci    ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
5658cabdff1aSopenharmony_ci               hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);
5659cabdff1aSopenharmony_ci
    /* Vertical pass for output rows 0..3 (input rows 0..5 up to 3..8).
     * NOTE(review): no rounding shift or saturation appears here, so it is
     * presumably done inside AVC_DOT_SW3_SW - confirm. */
5660cabdff1aSopenharmony_ci    tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
5661cabdff1aSopenharmony_ci                          filt2);
5662cabdff1aSopenharmony_ci    tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
5663cabdff1aSopenharmony_ci                          filt2);
5664cabdff1aSopenharmony_ci    res0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5665cabdff1aSopenharmony_ci    tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
5666cabdff1aSopenharmony_ci                          filt2);
5667cabdff1aSopenharmony_ci    tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
5668cabdff1aSopenharmony_ci                          filt2);
5669cabdff1aSopenharmony_ci    res1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5670cabdff1aSopenharmony_ci    tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
5671cabdff1aSopenharmony_ci                          filt2);
5672cabdff1aSopenharmony_ci    tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
5673cabdff1aSopenharmony_ci                          filt2);
5674cabdff1aSopenharmony_ci    res2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5675cabdff1aSopenharmony_ci    tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
5676cabdff1aSopenharmony_ci                          filt2);
5677cabdff1aSopenharmony_ci    tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
5678cabdff1aSopenharmony_ci                          filt2);
5679cabdff1aSopenharmony_ci    res3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    /* Average the first four rows with the existing destination and store. */
5680cabdff1aSopenharmony_ci    LD4(dst, stride, tp0, tp1, tp2, tp3);
5681cabdff1aSopenharmony_ci    INSERT_D2_UB(tp0, tp1, dst0);
5682cabdff1aSopenharmony_ci    INSERT_D2_UB(tp2, tp3, dst1);
5683cabdff1aSopenharmony_ci    out0 = PCKEV_XORI128_UB(res0, res1);
5684cabdff1aSopenharmony_ci    out1 = PCKEV_XORI128_UB(res2, res3);
5685cabdff1aSopenharmony_ci    AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
5686cabdff1aSopenharmony_ci    ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
5687cabdff1aSopenharmony_ci    dst += (4 * stride);
5688cabdff1aSopenharmony_ci
    /* Rows 9..12; src is not advanced afterwards because nothing further is
     * read. */
5689cabdff1aSopenharmony_ci    LD_SB4(src, stride, src0, src1, src2, src3);
5690cabdff1aSopenharmony_ci    XORI_B4_128_SB(src0, src1, src2, src3);
5691cabdff1aSopenharmony_ci    hz_out9 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
5692cabdff1aSopenharmony_ci    hz_out10 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
5693cabdff1aSopenharmony_ci    hz_out11 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
5694cabdff1aSopenharmony_ci    hz_out12 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
5695cabdff1aSopenharmony_ci    ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
5696cabdff1aSopenharmony_ci               hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
5697cabdff1aSopenharmony_ci               hz_out1211_r);
5698cabdff1aSopenharmony_ci    ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
5699cabdff1aSopenharmony_ci               hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
5700cabdff1aSopenharmony_ci               hz_out1211_l);
    /* Vertical pass for output rows 4..7 (input rows 4..9 up to 7..12),
     * reusing the interleaved pairs computed for the first batch. */
5701cabdff1aSopenharmony_ci    tmp0 = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
5702cabdff1aSopenharmony_ci                          filt2);
5703cabdff1aSopenharmony_ci    tmp1 = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
5704cabdff1aSopenharmony_ci                          filt2);
5705cabdff1aSopenharmony_ci    res0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5706cabdff1aSopenharmony_ci    tmp0 = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
5707cabdff1aSopenharmony_ci                          filt2);
5708cabdff1aSopenharmony_ci    tmp1 = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
5709cabdff1aSopenharmony_ci                          filt2);
5710cabdff1aSopenharmony_ci    res1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5711cabdff1aSopenharmony_ci    tmp0 = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
5712cabdff1aSopenharmony_ci                          filt2);
5713cabdff1aSopenharmony_ci    tmp1 = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
5714cabdff1aSopenharmony_ci                          filt2);
5715cabdff1aSopenharmony_ci    res2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5716cabdff1aSopenharmony_ci    tmp0 = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
5717cabdff1aSopenharmony_ci                          filt2);
5718cabdff1aSopenharmony_ci    tmp1 = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
5719cabdff1aSopenharmony_ci                          filt2);
5720cabdff1aSopenharmony_ci    res3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    /* Average the last four rows with the existing destination and store. */
5721cabdff1aSopenharmony_ci    LD4(dst, stride, tp0, tp1, tp2, tp3);
5722cabdff1aSopenharmony_ci    INSERT_D2_UB(tp0, tp1, dst0);
5723cabdff1aSopenharmony_ci    INSERT_D2_UB(tp2, tp3, dst1);
5724cabdff1aSopenharmony_ci    out0 = PCKEV_XORI128_UB(res0, res1);
5725cabdff1aSopenharmony_ci    out1 = PCKEV_XORI128_UB(res2, res3);
5726cabdff1aSopenharmony_ci    AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
5727cabdff1aSopenharmony_ci    ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
5728cabdff1aSopenharmony_ci}
5729cabdff1aSopenharmony_ci
/*
 * 4x4 H.264 luma interpolation, "mc22" case, averaged into dst.
 *
 * Nine input rows are filtered horizontally two rows per vector, then four
 * output rows are produced by a vertical 6-tap pass with 32-bit
 * accumulators, averaged with the 4x4 block already at dst and stored.
 *
 * dst:    destination block; four rows of four bytes are read and rewritten.
 * src:    source pixel the filter window is positioned relative to.
 * stride: byte distance between consecutive rows, used for src and dst.
 */
5730cabdff1aSopenharmony_civoid ff_avg_h264_qpel4_mc22_msa(uint8_t *dst, const uint8_t *src,
5731cabdff1aSopenharmony_ci                                ptrdiff_t stride)
5732cabdff1aSopenharmony_ci{
    /* Each 32-bit constant packs two signed 16-bit taps; listed low halfword
     * first they are (1, -5), (20, 20) and (-5, 1). */
5733cabdff1aSopenharmony_ci    const int32_t filt_const0 = 0xfffb0001;
5734cabdff1aSopenharmony_ci    const int32_t filt_const1 = 0x140014;
5735cabdff1aSopenharmony_ci    const int32_t filt_const2 = 0x1fffb;
5736cabdff1aSopenharmony_ci    uint32_t tp0, tp1, tp2, tp3;
5737cabdff1aSopenharmony_ci    v16u8 res, dst0 = { 0 };
5738cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
5739cabdff1aSopenharmony_ci    v16i8 mask0, mask1, mask2;
5740cabdff1aSopenharmony_ci    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
5741cabdff1aSopenharmony_ci    v8i16 hz_out7, hz_out8, res0, res1, filt0, filt1, filt2;
5742cabdff1aSopenharmony_ci    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
5743cabdff1aSopenharmony_ci    v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
5744cabdff1aSopenharmony_ci    v4i32 tmp0, tmp1;
5745cabdff1aSopenharmony_ci
    /* Offset 48 skips the three "8 width" rows of luma_mask_arr and selects
     * the "4 width" masks, whose indices >= 16 refer to a second source
     * vector. */
5746cabdff1aSopenharmony_ci    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
5747cabdff1aSopenharmony_ci
5748cabdff1aSopenharmony_ci    filt0 = (v8i16) __msa_fill_w(filt_const0);
5749cabdff1aSopenharmony_ci    filt1 = (v8i16) __msa_fill_w(filt_const1);
5750cabdff1aSopenharmony_ci    filt2 = (v8i16) __msa_fill_w(filt_const2);
5751cabdff1aSopenharmony_ci
    /* Start two rows up and two columns left of src; load rows 0..8. */
5752cabdff1aSopenharmony_ci    src -= ((2 * stride) + 2);
5753cabdff1aSopenharmony_ci
5754cabdff1aSopenharmony_ci    LD_SB5(src, stride, src0, src1, src2, src3, src4);
5755cabdff1aSopenharmony_ci    src += (5 * stride);
5756cabdff1aSopenharmony_ci    LD_SB4(src, stride, src5, src6, src7, src8);
5757cabdff1aSopenharmony_ci
    /* XOR with 128 (per the macro name) presumably re-biases pixels to
     * signed range; PCKEV_XORI128_UB below appears to undo it. */
5758cabdff1aSopenharmony_ci    XORI_B5_128_SB(src0, src1, src2, src3, src4);
5759cabdff1aSopenharmony_ci    XORI_B4_128_SB(src5, src6, src7, src8);
    /* Horizontal pass, two rows per call: each even-numbered hz_out
     * presumably holds row N in its low half and row N+1 in its high half.
     * Row 8 is paired with itself. */
5760cabdff1aSopenharmony_ci    hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2);
5761cabdff1aSopenharmony_ci    hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2);
5762cabdff1aSopenharmony_ci    hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2);
5763cabdff1aSopenharmony_ci    hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2);
5764cabdff1aSopenharmony_ci    hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
    /* Extract the odd doubleword of each paired vector to get the
     * odd-numbered rows 1, 3, 5 and 7 as vectors of their own. */
5765cabdff1aSopenharmony_ci    PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
5766cabdff1aSopenharmony_ci    PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
    /* Halfword-interleave adjacent rows; only the right halves are needed
     * for a 4-wide block. */
5767cabdff1aSopenharmony_ci    ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
5768cabdff1aSopenharmony_ci               hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
5769cabdff1aSopenharmony_ci    ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
5770cabdff1aSopenharmony_ci               hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
5771cabdff1aSopenharmony_ci
    /* Vertical pass: tmp0 / tmp1 are output rows 0 and 1, then 2 and 3; each
     * pair is packed into one vector of eight halfwords.
     * NOTE(review): no rounding shift or saturation appears here, so it is
     * presumably done inside AVC_DOT_SW3_SW - confirm. */
5772cabdff1aSopenharmony_ci    tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
5773cabdff1aSopenharmony_ci                          filt2);
5774cabdff1aSopenharmony_ci    tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
5775cabdff1aSopenharmony_ci                          filt2);
5776cabdff1aSopenharmony_ci    res0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5777cabdff1aSopenharmony_ci    tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
5778cabdff1aSopenharmony_ci                          filt2);
5779cabdff1aSopenharmony_ci    tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
5780cabdff1aSopenharmony_ci                          filt2);
5781cabdff1aSopenharmony_ci    res1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    /* Fetch the existing 4x4 destination block (one 32-bit word per row),
     * pack the results to bytes, average with it and write four words back. */
5782cabdff1aSopenharmony_ci    LW4(dst, stride, tp0, tp1, tp2, tp3);
5783cabdff1aSopenharmony_ci    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
5784cabdff1aSopenharmony_ci    res = PCKEV_XORI128_UB(res0, res1);
5785cabdff1aSopenharmony_ci    res = __msa_aver_u_b(res, dst0);
5786cabdff1aSopenharmony_ci    ST_W4(res, 0, 1, 2, 3, dst, stride);
5787cabdff1aSopenharmony_ci}
5788