1cabdff1aSopenharmony_ci/*
2cabdff1aSopenharmony_ci * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
3cabdff1aSopenharmony_ci *
4cabdff1aSopenharmony_ci * This file is part of FFmpeg.
5cabdff1aSopenharmony_ci *
6cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or
7cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public
8cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either
9cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version.
10cabdff1aSopenharmony_ci *
11cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful,
12cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of
13cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14cabdff1aSopenharmony_ci * Lesser General Public License for more details.
15cabdff1aSopenharmony_ci *
16cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public
17cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software
18cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19cabdff1aSopenharmony_ci */
20cabdff1aSopenharmony_ci
21cabdff1aSopenharmony_ci#include "libavutil/mips/generic_macros_msa.h"
22cabdff1aSopenharmony_ci#include "libavcodec/mips/hevcdsp_mips.h"
23cabdff1aSopenharmony_ci#include "libavcodec/mips/hevc_macros_msa.h"
24cabdff1aSopenharmony_ci
/*
 * Two rows of 16 byte indices each, stored back to back and aligned to
 * 64 bytes.  Every row lists overlapping index pairs (n, n + 1).
 * NOTE(review): presumably loaded as byte-shuffle control vectors by the
 * filter kernels; in the second row the indices >= 16 would then select
 * bytes from a second source vector -- confirm at the use sites.
 */
static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(64))) = {
    /* 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4,
    4, 5, 5, 6, 6, 7, 7, 8,
    /* 4 width cases */
    0, 1, 1, 2, 2, 3, 3, 4,
    16, 17, 17, 18, 18, 19, 19, 20
};
31cabdff1aSopenharmony_ci
/*
 * Uni-directional weighting of two vectors of 16-bit samples.
 *
 *   in0_h, in1_h   : input halfword vectors
 *   wgt_w          : weight, one copy per 32-bit lane
 *   offset_h       : offset, one copy per 16-bit lane
 *   rnd_w          : rounding shift amount, one copy per 32-bit lane
 *   out0_h, out1_h : results (halfword vectors, clipped to 0..255)
 *
 * Each input is interleaved with itself into a right and a left word
 * vector, run through the dot-product helper with wgt_w, shifted with
 * SRAR by rnd_w and packed back to halfwords.  The offset is then added
 * with the ADDS helper and the result clipped with CLIP_SH2_0_255.
 *
 * The inputs are read into temporaries before the outputs are written, so
 * out0_h / out1_h may name the same variables as in0_h / in1_h.
 */
#define HEVC_UNIW_RND_CLIP2_MAX_SATU_H(in0_h, in1_h, wgt_w, offset_h, rnd_w,  \
                                       out0_h, out1_h)                        \
{                                                                             \
    v4i32 v0_r_m, v0_l_m;                                                     \
    v4i32 v1_r_m, v1_l_m;                                                     \
                                                                              \
    /* pair every halfword with a copy of itself inside one word lane */      \
    ILVRL_H2_SW(in0_h, in0_h, v0_r_m, v0_l_m);                                \
    ILVRL_H2_SW(in1_h, in1_h, v1_r_m, v1_l_m);                                \
    /* apply the weight, then the rounding shift, to all four vectors */      \
    DOTP_SH4_SW(v0_r_m, v1_r_m, v0_l_m, v1_l_m,                               \
                wgt_w, wgt_w, wgt_w, wgt_w,                                   \
                v0_r_m, v1_r_m, v0_l_m, v1_l_m);                              \
    SRAR_W4_SW(v0_r_m, v1_r_m, v0_l_m, v1_l_m, rnd_w);                        \
    /* narrow to halfwords, add the offset and clip */                        \
    PCKEV_H2_SH(v0_l_m, v0_r_m, v1_l_m, v1_r_m, out0_h, out1_h);              \
    ADDS_SH2_SH(out0_h, offset_h, out1_h, offset_h, out0_h, out1_h);          \
    CLIP_SH2_0_255(out0_h, out1_h);                                           \
}
46cabdff1aSopenharmony_ci
/*
 * Four-vector form of HEVC_UNIW_RND_CLIP2_MAX_SATU_H: the two-vector macro
 * is applied to (in0_h, in1_h) -> (out0_h, out1_h) and then to
 * (in2_h, in3_h) -> (out2_h, out3_h), sharing the same weight, offset and
 * rounding vectors.
 */
#define HEVC_UNIW_RND_CLIP4_MAX_SATU_H(in0_h, in1_h, in2_h, in3_h, wgt_w,     \
                                       offset_h, rnd_w, out0_h, out1_h,       \
                                       out2_h, out3_h)                        \
{                                                                             \
    /* first pair */                                                          \
    HEVC_UNIW_RND_CLIP2_MAX_SATU_H(in0_h, in1_h, wgt_w, offset_h, rnd_w,      \
                                   out0_h, out1_h);                           \
    /* second pair */                                                         \
    HEVC_UNIW_RND_CLIP2_MAX_SATU_H(in2_h, in3_h, wgt_w, offset_h, rnd_w,      \
                                   out2_h, out3_h);                           \
}
56cabdff1aSopenharmony_ci
57cabdff1aSopenharmony_cistatic void hevc_uniwgt_copy_4w_msa(uint8_t *src,
58cabdff1aSopenharmony_ci                                    int32_t src_stride,
59cabdff1aSopenharmony_ci                                    uint8_t *dst,
60cabdff1aSopenharmony_ci                                    int32_t dst_stride,
61cabdff1aSopenharmony_ci                                    int32_t height,
62cabdff1aSopenharmony_ci                                    int32_t weight,
63cabdff1aSopenharmony_ci                                    int32_t offset,
64cabdff1aSopenharmony_ci                                    int32_t rnd_val)
65cabdff1aSopenharmony_ci{
66cabdff1aSopenharmony_ci    uint32_t loop_cnt, tp0, tp1, tp2, tp3;
67cabdff1aSopenharmony_ci    v16i8 zero = { 0 };
68cabdff1aSopenharmony_ci    v16u8 out0, out1;
69cabdff1aSopenharmony_ci    v16i8 src0 = { 0 }, src1 = { 0 };
70cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, offset_vec;
71cabdff1aSopenharmony_ci    v4i32 weight_vec, rnd_vec;
72cabdff1aSopenharmony_ci
73cabdff1aSopenharmony_ci    weight = weight & 0x0000FFFF;
74cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
75cabdff1aSopenharmony_ci    offset_vec = __msa_fill_h(offset);
76cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val);
77cabdff1aSopenharmony_ci
78cabdff1aSopenharmony_ci    if (2 == height) {
79cabdff1aSopenharmony_ci        v4i32 dst0_r, dst0_l;
80cabdff1aSopenharmony_ci
81cabdff1aSopenharmony_ci        LW2(src, src_stride, tp0, tp1);
82cabdff1aSopenharmony_ci        INSERT_W2_SB(tp0, tp1, src0);
83cabdff1aSopenharmony_ci        dst0 = (v8i16) __msa_ilvr_b(zero, src0);
84cabdff1aSopenharmony_ci        dst0 <<= 6;
85cabdff1aSopenharmony_ci
86cabdff1aSopenharmony_ci        ILVRL_H2_SW(dst0, dst0, dst0_r, dst0_l);
87cabdff1aSopenharmony_ci        DOTP_SH2_SW(dst0_r, dst0_l, weight_vec, weight_vec, dst0_r, dst0_l);
88cabdff1aSopenharmony_ci        SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
89cabdff1aSopenharmony_ci        dst0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
90cabdff1aSopenharmony_ci        dst0 += offset_vec;
91cabdff1aSopenharmony_ci        CLIP_SH_0_255(dst0);
92cabdff1aSopenharmony_ci        out0 = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
93cabdff1aSopenharmony_ci        ST_W2(out0, 0, 1, dst, dst_stride);
94cabdff1aSopenharmony_ci    } else if (4 == height) {
95cabdff1aSopenharmony_ci        LW4(src, src_stride, tp0, tp1, tp2, tp3);
96cabdff1aSopenharmony_ci        INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
97cabdff1aSopenharmony_ci        ILVRL_B2_SH(zero, src0, dst0, dst1);
98cabdff1aSopenharmony_ci        SLLI_2V(dst0, dst1, 6);
99cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec,
100cabdff1aSopenharmony_ci                                       rnd_vec, dst0, dst1);
101cabdff1aSopenharmony_ci        out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
102cabdff1aSopenharmony_ci        ST_W4(out0, 0, 1, 2, 3, dst, dst_stride);
103cabdff1aSopenharmony_ci    } else if (0 == (height % 8)) {
104cabdff1aSopenharmony_ci        for (loop_cnt = (height >> 3); loop_cnt--;) {
105cabdff1aSopenharmony_ci            LW4(src, src_stride, tp0, tp1, tp2, tp3);
106cabdff1aSopenharmony_ci            src += 4 * src_stride;
107cabdff1aSopenharmony_ci            INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
108cabdff1aSopenharmony_ci            LW4(src, src_stride, tp0, tp1, tp2, tp3);
109cabdff1aSopenharmony_ci            src += 4 * src_stride;
110cabdff1aSopenharmony_ci            INSERT_W4_SB(tp0, tp1, tp2, tp3, src1);
111cabdff1aSopenharmony_ci            ILVRL_B2_SH(zero, src0, dst0, dst1);
112cabdff1aSopenharmony_ci            ILVRL_B2_SH(zero, src1, dst2, dst3);
113cabdff1aSopenharmony_ci            SLLI_4V(dst0, dst1, dst2, dst3, 6);
114cabdff1aSopenharmony_ci            HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
115cabdff1aSopenharmony_ci                                           offset_vec, rnd_vec, dst0, dst1,
116cabdff1aSopenharmony_ci                                           dst2, dst3);
117cabdff1aSopenharmony_ci            PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
118cabdff1aSopenharmony_ci            ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
119cabdff1aSopenharmony_ci            dst += 8 * dst_stride;
120cabdff1aSopenharmony_ci        }
121cabdff1aSopenharmony_ci    }
122cabdff1aSopenharmony_ci}
123cabdff1aSopenharmony_ci
124cabdff1aSopenharmony_cistatic void hevc_uniwgt_copy_6w_msa(uint8_t *src,
125cabdff1aSopenharmony_ci                                    int32_t src_stride,
126cabdff1aSopenharmony_ci                                    uint8_t *dst,
127cabdff1aSopenharmony_ci                                    int32_t dst_stride,
128cabdff1aSopenharmony_ci                                    int32_t height,
129cabdff1aSopenharmony_ci                                    int32_t weight,
130cabdff1aSopenharmony_ci                                    int32_t offset,
131cabdff1aSopenharmony_ci                                    int32_t rnd_val)
132cabdff1aSopenharmony_ci{
133cabdff1aSopenharmony_ci    uint32_t loop_cnt;
134cabdff1aSopenharmony_ci    uint64_t tp0, tp1, tp2, tp3;
135cabdff1aSopenharmony_ci    v16i8 zero = { 0 };
136cabdff1aSopenharmony_ci    v16u8 out0, out1, out2, out3;
137cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3;
138cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
139cabdff1aSopenharmony_ci    v4i32 weight_vec, rnd_vec;
140cabdff1aSopenharmony_ci
141cabdff1aSopenharmony_ci    weight = weight & 0x0000FFFF;
142cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
143cabdff1aSopenharmony_ci    offset_vec = __msa_fill_h(offset);
144cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val);
145cabdff1aSopenharmony_ci
146cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 3); loop_cnt--;) {
147cabdff1aSopenharmony_ci        LD4(src, src_stride, tp0, tp1, tp2, tp3);
148cabdff1aSopenharmony_ci        src += (4 * src_stride);
149cabdff1aSopenharmony_ci        INSERT_D2_SB(tp0, tp1, src0);
150cabdff1aSopenharmony_ci        INSERT_D2_SB(tp2, tp3, src1);
151cabdff1aSopenharmony_ci        LD4(src, src_stride, tp0, tp1, tp2, tp3);
152cabdff1aSopenharmony_ci        src += (4 * src_stride);
153cabdff1aSopenharmony_ci        INSERT_D2_SB(tp0, tp1, src2);
154cabdff1aSopenharmony_ci        INSERT_D2_SB(tp2, tp3, src3);
155cabdff1aSopenharmony_ci
156cabdff1aSopenharmony_ci        ILVRL_B2_SH(zero, src0, dst0, dst1);
157cabdff1aSopenharmony_ci        ILVRL_B2_SH(zero, src1, dst2, dst3);
158cabdff1aSopenharmony_ci        ILVRL_B2_SH(zero, src2, dst4, dst5);
159cabdff1aSopenharmony_ci        ILVRL_B2_SH(zero, src3, dst6, dst7);
160cabdff1aSopenharmony_ci
161cabdff1aSopenharmony_ci        SLLI_4V(dst0, dst1, dst2, dst3, 6);
162cabdff1aSopenharmony_ci        SLLI_4V(dst4, dst5, dst6, dst7, 6);
163cabdff1aSopenharmony_ci
164cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
165cabdff1aSopenharmony_ci                                       offset_vec, rnd_vec, dst0, dst1, dst2,
166cabdff1aSopenharmony_ci                                       dst3);
167cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
168cabdff1aSopenharmony_ci                                       offset_vec, rnd_vec, dst4, dst5, dst6,
169cabdff1aSopenharmony_ci                                       dst7);
170cabdff1aSopenharmony_ci        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
171cabdff1aSopenharmony_ci        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
172cabdff1aSopenharmony_ci
173cabdff1aSopenharmony_ci        ST_W2(out0, 0, 2, dst, dst_stride);
174cabdff1aSopenharmony_ci        ST_H2(out0, 2, 6, dst + 4, dst_stride);
175cabdff1aSopenharmony_ci        ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
176cabdff1aSopenharmony_ci        ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
177cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
178cabdff1aSopenharmony_ci        ST_W2(out2, 0, 2, dst, dst_stride);
179cabdff1aSopenharmony_ci        ST_H2(out2, 2, 6, dst + 4, dst_stride);
180cabdff1aSopenharmony_ci        ST_W2(out3, 0, 2, dst + 2 * dst_stride, dst_stride);
181cabdff1aSopenharmony_ci        ST_H2(out3, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
182cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
183cabdff1aSopenharmony_ci    }
184cabdff1aSopenharmony_ci}
185cabdff1aSopenharmony_ci
186cabdff1aSopenharmony_cistatic void hevc_uniwgt_copy_8w_msa(uint8_t *src,
187cabdff1aSopenharmony_ci                                    int32_t src_stride,
188cabdff1aSopenharmony_ci                                    uint8_t *dst,
189cabdff1aSopenharmony_ci                                    int32_t dst_stride,
190cabdff1aSopenharmony_ci                                    int32_t height,
191cabdff1aSopenharmony_ci                                    int32_t weight,
192cabdff1aSopenharmony_ci                                    int32_t offset,
193cabdff1aSopenharmony_ci                                    int32_t rnd_val)
194cabdff1aSopenharmony_ci{
195cabdff1aSopenharmony_ci    uint32_t loop_cnt;
196cabdff1aSopenharmony_ci    uint64_t tp0, tp1, tp2, tp3;
197cabdff1aSopenharmony_ci    v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
198cabdff1aSopenharmony_ci    v16i8 zero = { 0 };
199cabdff1aSopenharmony_ci    v16u8 out0, out1, out2, out3;
200cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
201cabdff1aSopenharmony_ci    v4i32 weight_vec, rnd_vec;
202cabdff1aSopenharmony_ci
203cabdff1aSopenharmony_ci    weight = weight & 0x0000FFFF;
204cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
205cabdff1aSopenharmony_ci    offset_vec = __msa_fill_h(offset);
206cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val);
207cabdff1aSopenharmony_ci
208cabdff1aSopenharmony_ci    if (2 == height) {
209cabdff1aSopenharmony_ci        LD2(src, src_stride, tp0, tp1);
210cabdff1aSopenharmony_ci        INSERT_D2_SB(tp0, tp1, src0);
211cabdff1aSopenharmony_ci        ILVRL_B2_SH(zero, src0, dst0, dst1);
212cabdff1aSopenharmony_ci        SLLI_2V(dst0, dst1, 6);
213cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec,
214cabdff1aSopenharmony_ci                                       rnd_vec, dst0, dst1);
215cabdff1aSopenharmony_ci        out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
216cabdff1aSopenharmony_ci        ST_D2(out0, 0, 1, dst, dst_stride);
217cabdff1aSopenharmony_ci    } else if (4 == height) {
218cabdff1aSopenharmony_ci        LD4(src, src_stride, tp0, tp1, tp2, tp3);
219cabdff1aSopenharmony_ci        INSERT_D2_SB(tp0, tp1, src0);
220cabdff1aSopenharmony_ci        INSERT_D2_SB(tp2, tp3, src1);
221cabdff1aSopenharmony_ci        ILVRL_B2_SH(zero, src0, dst0, dst1);
222cabdff1aSopenharmony_ci        ILVRL_B2_SH(zero, src1, dst2, dst3);
223cabdff1aSopenharmony_ci        SLLI_4V(dst0, dst1, dst2, dst3, 6);
224cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
225cabdff1aSopenharmony_ci                                       offset_vec, rnd_vec, dst0, dst1, dst2,
226cabdff1aSopenharmony_ci                                       dst3);
227cabdff1aSopenharmony_ci        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
228cabdff1aSopenharmony_ci        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
229cabdff1aSopenharmony_ci    } else if (6 == height) {
230cabdff1aSopenharmony_ci        LD4(src, src_stride, tp0, tp1, tp2, tp3);
231cabdff1aSopenharmony_ci        src += 4 * src_stride;
232cabdff1aSopenharmony_ci        INSERT_D2_SB(tp0, tp1, src0);
233cabdff1aSopenharmony_ci        INSERT_D2_SB(tp2, tp3, src1);
234cabdff1aSopenharmony_ci        LD2(src, src_stride, tp0, tp1);
235cabdff1aSopenharmony_ci        INSERT_D2_SB(tp0, tp1, src2);
236cabdff1aSopenharmony_ci        ILVRL_B2_SH(zero, src0, dst0, dst1);
237cabdff1aSopenharmony_ci        ILVRL_B2_SH(zero, src1, dst2, dst3);
238cabdff1aSopenharmony_ci        ILVRL_B2_SH(zero, src2, dst4, dst5);
239cabdff1aSopenharmony_ci        SLLI_4V(dst0, dst1, dst2, dst3, 6);
240cabdff1aSopenharmony_ci        SLLI_2V(dst4, dst5, 6);
241cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
242cabdff1aSopenharmony_ci                                       offset_vec, rnd_vec, dst0, dst1, dst2,
243cabdff1aSopenharmony_ci                                       dst3);
244cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
245cabdff1aSopenharmony_ci                                       rnd_vec, dst4, dst5);
246cabdff1aSopenharmony_ci        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
247cabdff1aSopenharmony_ci        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
248cabdff1aSopenharmony_ci        ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
249cabdff1aSopenharmony_ci    } else if (0 == height % 8) {
250cabdff1aSopenharmony_ci        for (loop_cnt = (height >> 3); loop_cnt--;) {
251cabdff1aSopenharmony_ci            LD4(src, src_stride, tp0, tp1, tp2, tp3);
252cabdff1aSopenharmony_ci            src += 4 * src_stride;
253cabdff1aSopenharmony_ci            INSERT_D2_SB(tp0, tp1, src0);
254cabdff1aSopenharmony_ci            INSERT_D2_SB(tp2, tp3, src1);
255cabdff1aSopenharmony_ci            LD4(src, src_stride, tp0, tp1, tp2, tp3);
256cabdff1aSopenharmony_ci            src += 4 * src_stride;
257cabdff1aSopenharmony_ci            INSERT_D2_SB(tp0, tp1, src2);
258cabdff1aSopenharmony_ci            INSERT_D2_SB(tp2, tp3, src3);
259cabdff1aSopenharmony_ci
260cabdff1aSopenharmony_ci            ILVRL_B2_SH(zero, src0, dst0, dst1);
261cabdff1aSopenharmony_ci            ILVRL_B2_SH(zero, src1, dst2, dst3);
262cabdff1aSopenharmony_ci            ILVRL_B2_SH(zero, src2, dst4, dst5);
263cabdff1aSopenharmony_ci            ILVRL_B2_SH(zero, src3, dst6, dst7);
264cabdff1aSopenharmony_ci            SLLI_4V(dst0, dst1, dst2, dst3, 6);
265cabdff1aSopenharmony_ci            SLLI_4V(dst4, dst5, dst6, dst7, 6);
266cabdff1aSopenharmony_ci            HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
267cabdff1aSopenharmony_ci                                           offset_vec, rnd_vec, dst0, dst1,
268cabdff1aSopenharmony_ci                                           dst2, dst3);
269cabdff1aSopenharmony_ci            HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
270cabdff1aSopenharmony_ci                                           offset_vec, rnd_vec, dst4, dst5,
271cabdff1aSopenharmony_ci                                           dst6, dst7);
272cabdff1aSopenharmony_ci            PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
273cabdff1aSopenharmony_ci            PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
274cabdff1aSopenharmony_ci            ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1,
275cabdff1aSopenharmony_ci                  dst, dst_stride);
276cabdff1aSopenharmony_ci            dst += (8 * dst_stride);
277cabdff1aSopenharmony_ci        }
278cabdff1aSopenharmony_ci    }
279cabdff1aSopenharmony_ci}
280cabdff1aSopenharmony_ci
281cabdff1aSopenharmony_cistatic void hevc_uniwgt_copy_12w_msa(uint8_t *src,
282cabdff1aSopenharmony_ci                                     int32_t src_stride,
283cabdff1aSopenharmony_ci                                     uint8_t *dst,
284cabdff1aSopenharmony_ci                                     int32_t dst_stride,
285cabdff1aSopenharmony_ci                                     int32_t height,
286cabdff1aSopenharmony_ci                                     int32_t weight,
287cabdff1aSopenharmony_ci                                     int32_t offset,
288cabdff1aSopenharmony_ci                                     int32_t rnd_val)
289cabdff1aSopenharmony_ci{
290cabdff1aSopenharmony_ci    uint32_t loop_cnt;
291cabdff1aSopenharmony_ci    v16u8 out0, out1, out2;
292cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3;
293cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
294cabdff1aSopenharmony_ci    v8i16 offset_vec;
295cabdff1aSopenharmony_ci    v16i8 zero = { 0 };
296cabdff1aSopenharmony_ci    v4i32 weight_vec, rnd_vec;
297cabdff1aSopenharmony_ci
298cabdff1aSopenharmony_ci    weight = weight & 0x0000FFFF;
299cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
300cabdff1aSopenharmony_ci    offset_vec = __msa_fill_h(offset);
301cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val);
302cabdff1aSopenharmony_ci
303cabdff1aSopenharmony_ci    for (loop_cnt = 4; loop_cnt--;) {
304cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src0, src1, src2, src3);
305cabdff1aSopenharmony_ci        src += (4 * src_stride);
306cabdff1aSopenharmony_ci        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
307cabdff1aSopenharmony_ci                   dst0, dst1, dst2, dst3);
308cabdff1aSopenharmony_ci
309cabdff1aSopenharmony_ci        ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
310cabdff1aSopenharmony_ci        ILVR_B2_SH(zero, src0, zero, src1, dst4, dst5);
311cabdff1aSopenharmony_ci        SLLI_4V(dst0, dst1, dst2, dst3, 6);
312cabdff1aSopenharmony_ci        SLLI_2V(dst4, dst5, 6);
313cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
314cabdff1aSopenharmony_ci                                       offset_vec, rnd_vec, dst0, dst1, dst2,
315cabdff1aSopenharmony_ci                                       dst3);
316cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
317cabdff1aSopenharmony_ci                                       rnd_vec, dst4, dst5);
318cabdff1aSopenharmony_ci
319cabdff1aSopenharmony_ci        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
320cabdff1aSopenharmony_ci        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
321cabdff1aSopenharmony_ci        ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
322cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
323cabdff1aSopenharmony_ci    }
324cabdff1aSopenharmony_ci}
325cabdff1aSopenharmony_ci
326cabdff1aSopenharmony_cistatic void hevc_uniwgt_copy_16w_msa(uint8_t *src,
327cabdff1aSopenharmony_ci                                     int32_t src_stride,
328cabdff1aSopenharmony_ci                                     uint8_t *dst,
329cabdff1aSopenharmony_ci                                     int32_t dst_stride,
330cabdff1aSopenharmony_ci                                     int32_t height,
331cabdff1aSopenharmony_ci                                     int32_t weight,
332cabdff1aSopenharmony_ci                                     int32_t offset,
333cabdff1aSopenharmony_ci                                     int32_t rnd_val)
334cabdff1aSopenharmony_ci{
335cabdff1aSopenharmony_ci    uint32_t loop_cnt;
336cabdff1aSopenharmony_ci    v16u8 out0, out1, out2, out3;
337cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3;
338cabdff1aSopenharmony_ci    v16i8 zero = { 0 };
339cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
340cabdff1aSopenharmony_ci    v4i32 weight_vec, rnd_vec;
341cabdff1aSopenharmony_ci
342cabdff1aSopenharmony_ci    weight = weight & 0x0000FFFF;
343cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
344cabdff1aSopenharmony_ci    offset_vec = __msa_fill_h(offset);
345cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val);
346cabdff1aSopenharmony_ci
347cabdff1aSopenharmony_ci    for (loop_cnt = height >> 2; loop_cnt--;) {
348cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src0, src1, src2, src3);
349cabdff1aSopenharmony_ci        src += (4 * src_stride);
350cabdff1aSopenharmony_ci        ILVRL_B2_SH(zero, src0, dst0, dst1);
351cabdff1aSopenharmony_ci        ILVRL_B2_SH(zero, src1, dst2, dst3);
352cabdff1aSopenharmony_ci        ILVRL_B2_SH(zero, src2, dst4, dst5);
353cabdff1aSopenharmony_ci        ILVRL_B2_SH(zero, src3, dst6, dst7);
354cabdff1aSopenharmony_ci        SLLI_4V(dst0, dst1, dst2, dst3, 6);
355cabdff1aSopenharmony_ci        SLLI_4V(dst4, dst5, dst6, dst7, 6);
356cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
357cabdff1aSopenharmony_ci                                       offset_vec, rnd_vec, dst0, dst1, dst2,
358cabdff1aSopenharmony_ci                                       dst3);
359cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
360cabdff1aSopenharmony_ci                                       offset_vec, rnd_vec, dst4, dst5, dst6,
361cabdff1aSopenharmony_ci                                       dst7);
362cabdff1aSopenharmony_ci        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
363cabdff1aSopenharmony_ci        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
364cabdff1aSopenharmony_ci        ST_UB4(out0, out1, out2, out3, dst, dst_stride);
365cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
366cabdff1aSopenharmony_ci    }
367cabdff1aSopenharmony_ci}
368cabdff1aSopenharmony_ci
369cabdff1aSopenharmony_cistatic void hevc_uniwgt_copy_24w_msa(uint8_t *src,
370cabdff1aSopenharmony_ci                                     int32_t src_stride,
371cabdff1aSopenharmony_ci                                     uint8_t *dst,
372cabdff1aSopenharmony_ci                                     int32_t dst_stride,
373cabdff1aSopenharmony_ci                                     int32_t height,
374cabdff1aSopenharmony_ci                                     int32_t weight,
375cabdff1aSopenharmony_ci                                     int32_t offset,
376cabdff1aSopenharmony_ci                                     int32_t rnd_val)
377cabdff1aSopenharmony_ci{
378cabdff1aSopenharmony_ci    uint32_t loop_cnt;
379cabdff1aSopenharmony_ci    v16u8 out0, out1, out2, out3, out4, out5;
380cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
381cabdff1aSopenharmony_ci    v16i8 zero = { 0 };
382cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
383cabdff1aSopenharmony_ci    v8i16 dst8, dst9, dst10, dst11;
384cabdff1aSopenharmony_ci    v4i32 weight_vec, rnd_vec;
385cabdff1aSopenharmony_ci
386cabdff1aSopenharmony_ci    weight = weight & 0x0000FFFF;
387cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
388cabdff1aSopenharmony_ci    offset_vec = __msa_fill_h(offset);
389cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val);
390cabdff1aSopenharmony_ci
391cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
392cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src0, src1, src4, src5);
393cabdff1aSopenharmony_ci        LD_SB4(src + 16, src_stride, src2, src3, src6, src7);
394cabdff1aSopenharmony_ci        src += (4 * src_stride);
395cabdff1aSopenharmony_ci
396cabdff1aSopenharmony_ci        ILVRL_B2_SH(zero, src0, dst0, dst1);
397cabdff1aSopenharmony_ci        ILVRL_B2_SH(zero, src1, dst2, dst3);
398cabdff1aSopenharmony_ci        ILVR_B2_SH(zero, src2, zero, src3, dst4, dst5);
399cabdff1aSopenharmony_ci        ILVRL_B2_SH(zero, src4, dst6, dst7);
400cabdff1aSopenharmony_ci        ILVRL_B2_SH(zero, src5, dst8, dst9);
401cabdff1aSopenharmony_ci        ILVR_B2_SH(zero, src6, zero, src7, dst10, dst11);
402cabdff1aSopenharmony_ci        SLLI_4V(dst0, dst1, dst2, dst3, 6);
403cabdff1aSopenharmony_ci        SLLI_4V(dst4, dst5, dst6, dst7, 6);
404cabdff1aSopenharmony_ci        SLLI_4V(dst8, dst9, dst10, dst11, 6);
405cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
406cabdff1aSopenharmony_ci                                       offset_vec, rnd_vec, dst0, dst1, dst2,
407cabdff1aSopenharmony_ci                                       dst3);
408cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
409cabdff1aSopenharmony_ci                                       offset_vec, rnd_vec, dst4, dst5, dst6,
410cabdff1aSopenharmony_ci                                       dst7);
411cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst8, dst9, dst10, dst11, weight_vec,
412cabdff1aSopenharmony_ci                                       offset_vec, rnd_vec, dst8, dst9, dst10,
413cabdff1aSopenharmony_ci                                       dst11);
414cabdff1aSopenharmony_ci        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
415cabdff1aSopenharmony_ci        PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
416cabdff1aSopenharmony_ci        ST_UB4(out0, out1, out3, out4, dst, dst_stride);
417cabdff1aSopenharmony_ci        ST_D4(out2, out5, 0, 1, 0, 1, dst + 16, dst_stride);
418cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
419cabdff1aSopenharmony_ci    }
420cabdff1aSopenharmony_ci}
421cabdff1aSopenharmony_ci
422cabdff1aSopenharmony_cistatic void hevc_uniwgt_copy_32w_msa(uint8_t *src,
423cabdff1aSopenharmony_ci                                     int32_t src_stride,
424cabdff1aSopenharmony_ci                                     uint8_t *dst,
425cabdff1aSopenharmony_ci                                     int32_t dst_stride,
426cabdff1aSopenharmony_ci                                     int32_t height,
427cabdff1aSopenharmony_ci                                     int32_t weight,
428cabdff1aSopenharmony_ci                                     int32_t offset,
429cabdff1aSopenharmony_ci                                     int32_t rnd_val)
430cabdff1aSopenharmony_ci{
431cabdff1aSopenharmony_ci    uint32_t loop_cnt;
432cabdff1aSopenharmony_ci    v16u8 out0, out1, out2, out3;
433cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3;
434cabdff1aSopenharmony_ci    v16i8 zero = { 0 };
435cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
436cabdff1aSopenharmony_ci    v4i32 weight_vec, rnd_vec;
437cabdff1aSopenharmony_ci
438cabdff1aSopenharmony_ci    weight = weight & 0x0000FFFF;
439cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
440cabdff1aSopenharmony_ci    offset_vec = __msa_fill_h(offset);
441cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val);
442cabdff1aSopenharmony_ci
443cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 1); loop_cnt--;) {
444cabdff1aSopenharmony_ci        LD_SB2(src, src_stride, src0, src1);
445cabdff1aSopenharmony_ci        LD_SB2(src + 16, src_stride, src2, src3);
446cabdff1aSopenharmony_ci        src += (2 * src_stride);
447cabdff1aSopenharmony_ci
448cabdff1aSopenharmony_ci        ILVRL_B2_SH(zero, src0, dst0, dst1);
449cabdff1aSopenharmony_ci        ILVRL_B2_SH(zero, src1, dst2, dst3);
450cabdff1aSopenharmony_ci        ILVRL_B2_SH(zero, src2, dst4, dst5);
451cabdff1aSopenharmony_ci        ILVRL_B2_SH(zero, src3, dst6, dst7);
452cabdff1aSopenharmony_ci        SLLI_4V(dst0, dst1, dst2, dst3, 6);
453cabdff1aSopenharmony_ci        SLLI_4V(dst4, dst5, dst6, dst7, 6);
454cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
455cabdff1aSopenharmony_ci                                       offset_vec, rnd_vec, dst0, dst1, dst2,
456cabdff1aSopenharmony_ci                                       dst3);
457cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
458cabdff1aSopenharmony_ci                                       offset_vec, rnd_vec, dst4, dst5, dst6,
459cabdff1aSopenharmony_ci                                       dst7);
460cabdff1aSopenharmony_ci        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
461cabdff1aSopenharmony_ci        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
462cabdff1aSopenharmony_ci        ST_UB2(out0, out1, dst, dst_stride);
463cabdff1aSopenharmony_ci        ST_UB2(out2, out3, dst + 16, dst_stride);
464cabdff1aSopenharmony_ci        dst += (2 * dst_stride);
465cabdff1aSopenharmony_ci    }
466cabdff1aSopenharmony_ci}
467cabdff1aSopenharmony_ci
468cabdff1aSopenharmony_cistatic void hevc_uniwgt_copy_48w_msa(uint8_t *src,
469cabdff1aSopenharmony_ci                                     int32_t src_stride,
470cabdff1aSopenharmony_ci                                     uint8_t *dst,
471cabdff1aSopenharmony_ci                                     int32_t dst_stride,
472cabdff1aSopenharmony_ci                                     int32_t height,
473cabdff1aSopenharmony_ci                                     int32_t weight,
474cabdff1aSopenharmony_ci                                     int32_t offset,
475cabdff1aSopenharmony_ci                                     int32_t rnd_val)
476cabdff1aSopenharmony_ci{
477cabdff1aSopenharmony_ci    uint32_t loop_cnt;
478cabdff1aSopenharmony_ci    v16u8 out0, out1, out2, out3, out4, out5;
479cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5;
480cabdff1aSopenharmony_ci    v16i8 zero = { 0 };
481cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, offset_vec;
482cabdff1aSopenharmony_ci    v8i16 dst6, dst7, dst8, dst9, dst10, dst11;
483cabdff1aSopenharmony_ci    v4i32 weight_vec, rnd_vec;
484cabdff1aSopenharmony_ci
485cabdff1aSopenharmony_ci    weight = weight & 0x0000FFFF;
486cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
487cabdff1aSopenharmony_ci    offset_vec = __msa_fill_h(offset);
488cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val);
489cabdff1aSopenharmony_ci
490cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 1); loop_cnt--;) {
491cabdff1aSopenharmony_ci        LD_SB3(src, 16, src0, src1, src2);
492cabdff1aSopenharmony_ci        src += src_stride;
493cabdff1aSopenharmony_ci        LD_SB3(src, 16, src3, src4, src5);
494cabdff1aSopenharmony_ci        src += src_stride;
495cabdff1aSopenharmony_ci
496cabdff1aSopenharmony_ci        ILVRL_B2_SH(zero, src0, dst0, dst1);
497cabdff1aSopenharmony_ci        ILVRL_B2_SH(zero, src1, dst2, dst3);
498cabdff1aSopenharmony_ci        ILVRL_B2_SH(zero, src2, dst4, dst5);
499cabdff1aSopenharmony_ci        ILVRL_B2_SH(zero, src3, dst6, dst7);
500cabdff1aSopenharmony_ci        ILVRL_B2_SH(zero, src4, dst8, dst9);
501cabdff1aSopenharmony_ci        ILVRL_B2_SH(zero, src5, dst10, dst11);
502cabdff1aSopenharmony_ci        SLLI_4V(dst0, dst1, dst2, dst3, 6);
503cabdff1aSopenharmony_ci        SLLI_4V(dst4, dst5, dst6, dst7, 6);
504cabdff1aSopenharmony_ci        SLLI_4V(dst8, dst9, dst10, dst11, 6);
505cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
506cabdff1aSopenharmony_ci                                       offset_vec, rnd_vec, dst0, dst1, dst2,
507cabdff1aSopenharmony_ci                                       dst3);
508cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
509cabdff1aSopenharmony_ci                                       offset_vec, rnd_vec, dst4, dst5, dst6,
510cabdff1aSopenharmony_ci                                       dst7);
511cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst8, dst9, dst10, dst11, weight_vec,
512cabdff1aSopenharmony_ci                                       offset_vec, rnd_vec, dst8, dst9, dst10,
513cabdff1aSopenharmony_ci                                       dst11);
514cabdff1aSopenharmony_ci        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
515cabdff1aSopenharmony_ci        PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
516cabdff1aSopenharmony_ci        ST_UB2(out0, out1, dst, 16);
517cabdff1aSopenharmony_ci        ST_UB(out2, dst + 32);
518cabdff1aSopenharmony_ci        dst += dst_stride;
519cabdff1aSopenharmony_ci        ST_UB2(out3, out4, dst, 16);
520cabdff1aSopenharmony_ci        ST_UB(out5, dst + 32);
521cabdff1aSopenharmony_ci        dst += dst_stride;
522cabdff1aSopenharmony_ci    }
523cabdff1aSopenharmony_ci}
524cabdff1aSopenharmony_ci
525cabdff1aSopenharmony_cistatic void hevc_uniwgt_copy_64w_msa(uint8_t *src,
526cabdff1aSopenharmony_ci                                     int32_t src_stride,
527cabdff1aSopenharmony_ci                                     uint8_t *dst,
528cabdff1aSopenharmony_ci                                     int32_t dst_stride,
529cabdff1aSopenharmony_ci                                     int32_t height,
530cabdff1aSopenharmony_ci                                     int32_t weight,
531cabdff1aSopenharmony_ci                                     int32_t offset,
532cabdff1aSopenharmony_ci                                     int32_t rnd_val)
533cabdff1aSopenharmony_ci{
534cabdff1aSopenharmony_ci    uint32_t loop_cnt;
535cabdff1aSopenharmony_ci    v16u8 out0, out1, out2, out3, out4, out5, out6, out7;
536cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
537cabdff1aSopenharmony_ci    v16i8 zero = { 0 };
538cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
539cabdff1aSopenharmony_ci    v8i16 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
540cabdff1aSopenharmony_ci    v4i32 weight_vec, rnd_vec;
541cabdff1aSopenharmony_ci
542cabdff1aSopenharmony_ci    weight = weight & 0x0000FFFF;
543cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
544cabdff1aSopenharmony_ci    offset_vec = __msa_fill_h(offset);
545cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val);
546cabdff1aSopenharmony_ci
547cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 1); loop_cnt--;) {
548cabdff1aSopenharmony_ci        LD_SB4(src, 16, src0, src1, src2, src3);
549cabdff1aSopenharmony_ci        src += src_stride;
550cabdff1aSopenharmony_ci        LD_SB4(src, 16, src4, src5, src6, src7);
551cabdff1aSopenharmony_ci        src += src_stride;
552cabdff1aSopenharmony_ci
553cabdff1aSopenharmony_ci        ILVRL_B2_SH(zero, src0, dst0, dst1);
554cabdff1aSopenharmony_ci        ILVRL_B2_SH(zero, src1, dst2, dst3);
555cabdff1aSopenharmony_ci        ILVRL_B2_SH(zero, src2, dst4, dst5);
556cabdff1aSopenharmony_ci        ILVRL_B2_SH(zero, src3, dst6, dst7);
557cabdff1aSopenharmony_ci        ILVRL_B2_SH(zero, src4, dst8, dst9);
558cabdff1aSopenharmony_ci        ILVRL_B2_SH(zero, src5, dst10, dst11);
559cabdff1aSopenharmony_ci        ILVRL_B2_SH(zero, src6, dst12, dst13);
560cabdff1aSopenharmony_ci        ILVRL_B2_SH(zero, src7, dst14, dst15);
561cabdff1aSopenharmony_ci        SLLI_4V(dst0, dst1, dst2, dst3, 6);
562cabdff1aSopenharmony_ci        SLLI_4V(dst4, dst5, dst6, dst7, 6);
563cabdff1aSopenharmony_ci        SLLI_4V(dst8, dst9, dst10, dst11, 6);
564cabdff1aSopenharmony_ci        SLLI_4V(dst12, dst13, dst14, dst15, 6);
565cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
566cabdff1aSopenharmony_ci                                       offset_vec, rnd_vec, dst0, dst1, dst2,
567cabdff1aSopenharmony_ci                                       dst3);
568cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
569cabdff1aSopenharmony_ci                                       offset_vec, rnd_vec, dst4, dst5, dst6,
570cabdff1aSopenharmony_ci                                       dst7);
571cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst8, dst9, dst10, dst11, weight_vec,
572cabdff1aSopenharmony_ci                                       offset_vec, rnd_vec, dst8, dst9, dst10,
573cabdff1aSopenharmony_ci                                       dst11);
574cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst12, dst13, dst14, dst15, weight_vec,
575cabdff1aSopenharmony_ci                                       offset_vec, rnd_vec, dst12, dst13, dst14,
576cabdff1aSopenharmony_ci                                       dst15);
577cabdff1aSopenharmony_ci        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
578cabdff1aSopenharmony_ci        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
579cabdff1aSopenharmony_ci        PCKEV_B2_UB(dst9, dst8, dst11, dst10, out4, out5);
580cabdff1aSopenharmony_ci        PCKEV_B2_UB(dst13, dst12, dst15, dst14, out6, out7);
581cabdff1aSopenharmony_ci        ST_UB4(out0, out1, out2, out3, dst, 16);
582cabdff1aSopenharmony_ci        dst += dst_stride;
583cabdff1aSopenharmony_ci        ST_UB4(out4, out5, out6, out7, dst, 16);
584cabdff1aSopenharmony_ci        dst += dst_stride;
585cabdff1aSopenharmony_ci    }
586cabdff1aSopenharmony_ci}
587cabdff1aSopenharmony_ci
588cabdff1aSopenharmony_cistatic void hevc_hz_uniwgt_8t_4w_msa(uint8_t *src,
589cabdff1aSopenharmony_ci                                     int32_t src_stride,
590cabdff1aSopenharmony_ci                                     uint8_t *dst,
591cabdff1aSopenharmony_ci                                     int32_t dst_stride,
592cabdff1aSopenharmony_ci                                     const int8_t *filter,
593cabdff1aSopenharmony_ci                                     int32_t height,
594cabdff1aSopenharmony_ci                                     int32_t weight,
595cabdff1aSopenharmony_ci                                     int32_t offset,
596cabdff1aSopenharmony_ci                                     int32_t rnd_val)
597cabdff1aSopenharmony_ci{
598cabdff1aSopenharmony_ci    uint32_t loop_cnt;
599cabdff1aSopenharmony_ci    v16u8 out0, out1;
600cabdff1aSopenharmony_ci    v8i16 filt0, filt1, filt2, filt3;
601cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
602cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
603cabdff1aSopenharmony_ci    v16i8 mask0, mask1, mask2, mask3, vec11, vec12, vec13, vec14, vec15;
604cabdff1aSopenharmony_ci    v8i16 filter_vec, dst01, dst23, dst45, dst67;
605cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, weight_vec_h, offset_vec, denom_vec;
606cabdff1aSopenharmony_ci    v4i32 weight_vec, rnd_vec;
607cabdff1aSopenharmony_ci
608cabdff1aSopenharmony_ci    src -= 3;
609cabdff1aSopenharmony_ci    weight = weight & 0x0000FFFF;
610cabdff1aSopenharmony_ci
611cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
612cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val);
613cabdff1aSopenharmony_ci
614cabdff1aSopenharmony_ci    weight *= 128;
615cabdff1aSopenharmony_ci    rnd_val -= 6;
616cabdff1aSopenharmony_ci
617cabdff1aSopenharmony_ci    weight_vec_h = __msa_fill_h(weight);
618cabdff1aSopenharmony_ci    offset_vec = __msa_fill_h(offset);
619cabdff1aSopenharmony_ci    denom_vec = __msa_fill_h(rnd_val);
620cabdff1aSopenharmony_ci
621cabdff1aSopenharmony_ci    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
622cabdff1aSopenharmony_ci    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
623cabdff1aSopenharmony_ci
624cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
625cabdff1aSopenharmony_ci    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
626cabdff1aSopenharmony_ci
627cabdff1aSopenharmony_ci    mask0 = LD_SB(&ff_hevc_mask_arr[16]);
628cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
629cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
630cabdff1aSopenharmony_ci    mask3 = mask0 + 6;
631cabdff1aSopenharmony_ci
632cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 3); loop_cnt--;) {
633cabdff1aSopenharmony_ci        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
634cabdff1aSopenharmony_ci        src += (8 * src_stride);
635cabdff1aSopenharmony_ci        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
636cabdff1aSopenharmony_ci
637cabdff1aSopenharmony_ci        VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,
638cabdff1aSopenharmony_ci                   vec0, vec1, vec2, vec3);
639cabdff1aSopenharmony_ci        VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3,
640cabdff1aSopenharmony_ci                   vec4, vec5, vec6, vec7);
641cabdff1aSopenharmony_ci        VSHF_B4_SB(src4, src5, mask0, mask1, mask2, mask3,
642cabdff1aSopenharmony_ci                   vec8, vec9, vec10, vec11);
643cabdff1aSopenharmony_ci        VSHF_B4_SB(src6, src7, mask0, mask1, mask2, mask3,
644cabdff1aSopenharmony_ci                   vec12, vec13, vec14, vec15);
645cabdff1aSopenharmony_ci        dst01 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
646cabdff1aSopenharmony_ci                                  filt3);
647cabdff1aSopenharmony_ci        dst23 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
648cabdff1aSopenharmony_ci                                  filt3);
649cabdff1aSopenharmony_ci        dst45 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
650cabdff1aSopenharmony_ci                                  filt3);
651cabdff1aSopenharmony_ci        dst67 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
652cabdff1aSopenharmony_ci                                  filt2, filt3);
653cabdff1aSopenharmony_ci
654cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst01, dst23, dst45, dst67, weight_vec,
655cabdff1aSopenharmony_ci                                       offset_vec, rnd_vec, dst0, dst1, dst2,
656cabdff1aSopenharmony_ci                                       dst3);
657cabdff1aSopenharmony_ci
658cabdff1aSopenharmony_ci        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
659cabdff1aSopenharmony_ci        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
660cabdff1aSopenharmony_ci        dst += (8 * dst_stride);
661cabdff1aSopenharmony_ci    }
662cabdff1aSopenharmony_ci}
663cabdff1aSopenharmony_ci
664cabdff1aSopenharmony_cistatic void hevc_hz_uniwgt_8t_8w_msa(uint8_t *src,
665cabdff1aSopenharmony_ci                                     int32_t src_stride,
666cabdff1aSopenharmony_ci                                     uint8_t *dst,
667cabdff1aSopenharmony_ci                                     int32_t dst_stride,
668cabdff1aSopenharmony_ci                                     const int8_t *filter,
669cabdff1aSopenharmony_ci                                     int32_t height,
670cabdff1aSopenharmony_ci                                     int32_t weight,
671cabdff1aSopenharmony_ci                                     int32_t offset,
672cabdff1aSopenharmony_ci                                     int32_t rnd_val)
673cabdff1aSopenharmony_ci{
674cabdff1aSopenharmony_ci    uint32_t loop_cnt;
675cabdff1aSopenharmony_ci    v16u8 out0, out1;
676cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3;
677cabdff1aSopenharmony_ci    v8i16 filt0, filt1, filt2, filt3;
678cabdff1aSopenharmony_ci    v16i8 mask0, mask1, mask2, mask3;
679cabdff1aSopenharmony_ci    v8i16 filter_vec;
680cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
681cabdff1aSopenharmony_ci    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
682cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3;
683cabdff1aSopenharmony_ci    v8i16 weight_vec_h, offset_vec, denom_vec;
684cabdff1aSopenharmony_ci    v4i32 weight_vec, rnd_vec;
685cabdff1aSopenharmony_ci
686cabdff1aSopenharmony_ci    src -= 3;
687cabdff1aSopenharmony_ci    weight = weight & 0x0000FFFF;
688cabdff1aSopenharmony_ci
689cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
690cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val);
691cabdff1aSopenharmony_ci
692cabdff1aSopenharmony_ci    weight *= 128;
693cabdff1aSopenharmony_ci    rnd_val -= 6;
694cabdff1aSopenharmony_ci
695cabdff1aSopenharmony_ci    weight_vec_h = __msa_fill_h(weight);
696cabdff1aSopenharmony_ci    offset_vec = __msa_fill_h(offset);
697cabdff1aSopenharmony_ci    denom_vec = __msa_fill_h(rnd_val);
698cabdff1aSopenharmony_ci
699cabdff1aSopenharmony_ci    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
700cabdff1aSopenharmony_ci    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
701cabdff1aSopenharmony_ci
702cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
703cabdff1aSopenharmony_ci    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
704cabdff1aSopenharmony_ci
705cabdff1aSopenharmony_ci    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
706cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
707cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
708cabdff1aSopenharmony_ci    mask3 = mask0 + 6;
709cabdff1aSopenharmony_ci
710cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
711cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src0, src1, src2, src3);
712cabdff1aSopenharmony_ci        src += (4 * src_stride);
713cabdff1aSopenharmony_ci        XORI_B4_128_SB(src0, src1, src2, src3);
714cabdff1aSopenharmony_ci
715cabdff1aSopenharmony_ci        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
716cabdff1aSopenharmony_ci                   vec0, vec1, vec2, vec3);
717cabdff1aSopenharmony_ci        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
718cabdff1aSopenharmony_ci                   vec4, vec5, vec6, vec7);
719cabdff1aSopenharmony_ci        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
720cabdff1aSopenharmony_ci                   vec8, vec9, vec10, vec11);
721cabdff1aSopenharmony_ci        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
722cabdff1aSopenharmony_ci                   vec12, vec13, vec14, vec15);
723cabdff1aSopenharmony_ci        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
724cabdff1aSopenharmony_ci                                 filt3);
725cabdff1aSopenharmony_ci        dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
726cabdff1aSopenharmony_ci                                 filt3);
727cabdff1aSopenharmony_ci        dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
728cabdff1aSopenharmony_ci                                 filt3);
729cabdff1aSopenharmony_ci        dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
730cabdff1aSopenharmony_ci                                 filt2, filt3);
731cabdff1aSopenharmony_ci
732cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
733cabdff1aSopenharmony_ci                                       offset_vec, rnd_vec, dst0, dst1, dst2,
734cabdff1aSopenharmony_ci                                       dst3);
735cabdff1aSopenharmony_ci
736cabdff1aSopenharmony_ci        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
737cabdff1aSopenharmony_ci        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
738cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
739cabdff1aSopenharmony_ci    }
740cabdff1aSopenharmony_ci}
741cabdff1aSopenharmony_ci
742cabdff1aSopenharmony_cistatic void hevc_hz_uniwgt_8t_12w_msa(uint8_t *src,
743cabdff1aSopenharmony_ci                                      int32_t src_stride,
744cabdff1aSopenharmony_ci                                      uint8_t *dst,
745cabdff1aSopenharmony_ci                                      int32_t dst_stride,
746cabdff1aSopenharmony_ci                                      const int8_t *filter,
747cabdff1aSopenharmony_ci                                      int32_t height,
748cabdff1aSopenharmony_ci                                      int32_t weight,
749cabdff1aSopenharmony_ci                                      int32_t offset,
750cabdff1aSopenharmony_ci                                      int32_t rnd_val)
751cabdff1aSopenharmony_ci{
752cabdff1aSopenharmony_ci    uint32_t loop_cnt;
753cabdff1aSopenharmony_ci    v16u8 out0, out1, out2;
754cabdff1aSopenharmony_ci    v8i16 filt0, filt1, filt2, filt3;
755cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
756cabdff1aSopenharmony_ci    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
757cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
758cabdff1aSopenharmony_ci    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
759cabdff1aSopenharmony_ci    v8i16 filter_vec;
760cabdff1aSopenharmony_ci    v8i16 dst01, dst23, dst0, dst1, dst2, dst3, dst4, dst5;
761cabdff1aSopenharmony_ci    v8i16 weight_vec_h, offset_vec, denom_vec;
762cabdff1aSopenharmony_ci    v4i32 weight_vec, rnd_vec;
763cabdff1aSopenharmony_ci
764cabdff1aSopenharmony_ci    src -= 3;
765cabdff1aSopenharmony_ci    weight = weight & 0x0000FFFF;
766cabdff1aSopenharmony_ci
767cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
768cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val);
769cabdff1aSopenharmony_ci
770cabdff1aSopenharmony_ci    weight *= 128;
771cabdff1aSopenharmony_ci    rnd_val -= 6;
772cabdff1aSopenharmony_ci
773cabdff1aSopenharmony_ci    weight_vec_h = __msa_fill_h(weight);
774cabdff1aSopenharmony_ci    offset_vec = __msa_fill_h(offset);
775cabdff1aSopenharmony_ci    denom_vec = __msa_fill_h(rnd_val);
776cabdff1aSopenharmony_ci
777cabdff1aSopenharmony_ci    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
778cabdff1aSopenharmony_ci    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
779cabdff1aSopenharmony_ci
780cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
781cabdff1aSopenharmony_ci    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
782cabdff1aSopenharmony_ci
783cabdff1aSopenharmony_ci    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
784cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
785cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
786cabdff1aSopenharmony_ci    mask3 = mask0 + 6;
787cabdff1aSopenharmony_ci    mask4 = LD_SB(&ff_hevc_mask_arr[16]);
788cabdff1aSopenharmony_ci    mask5 = mask4 + 2;
789cabdff1aSopenharmony_ci    mask6 = mask4 + 4;
790cabdff1aSopenharmony_ci    mask7 = mask4 + 6;
791cabdff1aSopenharmony_ci
792cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
793cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src0, src1, src2, src3);
794cabdff1aSopenharmony_ci        LD_SB4(src + 8, src_stride, src4, src5, src6, src7);
795cabdff1aSopenharmony_ci        src += (4 * src_stride);
796cabdff1aSopenharmony_ci        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
797cabdff1aSopenharmony_ci
798cabdff1aSopenharmony_ci        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
799cabdff1aSopenharmony_ci                   vec0, vec1, vec2, vec3);
800cabdff1aSopenharmony_ci        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
801cabdff1aSopenharmony_ci                   vec4, vec5, vec6, vec7);
802cabdff1aSopenharmony_ci        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
803cabdff1aSopenharmony_ci                   vec8, vec9, vec10, vec11);
804cabdff1aSopenharmony_ci        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
805cabdff1aSopenharmony_ci                   vec12, vec13, vec14, vec15);
806cabdff1aSopenharmony_ci        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
807cabdff1aSopenharmony_ci                                 filt3);
808cabdff1aSopenharmony_ci        dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
809cabdff1aSopenharmony_ci                                 filt3);
810cabdff1aSopenharmony_ci        dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
811cabdff1aSopenharmony_ci                                 filt3);
812cabdff1aSopenharmony_ci        dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
813cabdff1aSopenharmony_ci                                 filt2, filt3);
814cabdff1aSopenharmony_ci        VSHF_B4_SB(src4, src5, mask4, mask5, mask6, mask7,
815cabdff1aSopenharmony_ci                   vec0, vec1, vec2, vec3);
816cabdff1aSopenharmony_ci        VSHF_B4_SB(src6, src7, mask4, mask5, mask6, mask7,
817cabdff1aSopenharmony_ci                   vec4, vec5, vec6, vec7);
818cabdff1aSopenharmony_ci        dst01 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
819cabdff1aSopenharmony_ci                                  filt3);
820cabdff1aSopenharmony_ci        dst23 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
821cabdff1aSopenharmony_ci                                  filt3);
822cabdff1aSopenharmony_ci
823cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
824cabdff1aSopenharmony_ci                                       offset_vec, rnd_vec, dst0, dst1, dst2,
825cabdff1aSopenharmony_ci                                       dst3);
826cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst01, dst23, weight_vec, offset_vec,
827cabdff1aSopenharmony_ci                                       rnd_vec, dst4, dst5);
828cabdff1aSopenharmony_ci
829cabdff1aSopenharmony_ci        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
830cabdff1aSopenharmony_ci        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
831cabdff1aSopenharmony_ci        ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
832cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
833cabdff1aSopenharmony_ci    }
834cabdff1aSopenharmony_ci}
835cabdff1aSopenharmony_ci
836cabdff1aSopenharmony_cistatic void hevc_hz_uniwgt_8t_16w_msa(uint8_t *src,
837cabdff1aSopenharmony_ci                                      int32_t src_stride,
838cabdff1aSopenharmony_ci                                      uint8_t *dst,
839cabdff1aSopenharmony_ci                                      int32_t dst_stride,
840cabdff1aSopenharmony_ci                                      const int8_t *filter,
841cabdff1aSopenharmony_ci                                      int32_t height,
842cabdff1aSopenharmony_ci                                      int32_t weight,
843cabdff1aSopenharmony_ci                                      int32_t offset,
844cabdff1aSopenharmony_ci                                      int32_t rnd_val)
845cabdff1aSopenharmony_ci{
846cabdff1aSopenharmony_ci    uint32_t loop_cnt;
847cabdff1aSopenharmony_ci    v16u8 out0, out1;
848cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3;
849cabdff1aSopenharmony_ci    v8i16 filt0, filt1, filt2, filt3;
850cabdff1aSopenharmony_ci    v16i8 mask0, mask1, mask2, mask3;
851cabdff1aSopenharmony_ci    v8i16 filter_vec;
852cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
853cabdff1aSopenharmony_ci    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
854cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3;
855cabdff1aSopenharmony_ci    v8i16 weight_vec_h, offset_vec, denom_vec;
856cabdff1aSopenharmony_ci    v4i32 weight_vec, rnd_vec;
857cabdff1aSopenharmony_ci
858cabdff1aSopenharmony_ci    src -= 3;
859cabdff1aSopenharmony_ci
860cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
861cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val);
862cabdff1aSopenharmony_ci
863cabdff1aSopenharmony_ci    weight *= 128;
864cabdff1aSopenharmony_ci    rnd_val -= 6;
865cabdff1aSopenharmony_ci
866cabdff1aSopenharmony_ci    weight_vec_h = __msa_fill_h(weight);
867cabdff1aSopenharmony_ci    offset_vec = __msa_fill_h(offset);
868cabdff1aSopenharmony_ci    denom_vec = __msa_fill_h(rnd_val);
869cabdff1aSopenharmony_ci
870cabdff1aSopenharmony_ci    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
871cabdff1aSopenharmony_ci    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
872cabdff1aSopenharmony_ci
873cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
874cabdff1aSopenharmony_ci    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
875cabdff1aSopenharmony_ci
876cabdff1aSopenharmony_ci    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
877cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
878cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
879cabdff1aSopenharmony_ci    mask3 = mask0 + 6;
880cabdff1aSopenharmony_ci
881cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 1); loop_cnt--;) {
882cabdff1aSopenharmony_ci        LD_SB2(src, src_stride, src0, src2);
883cabdff1aSopenharmony_ci        LD_SB2(src + 8, src_stride, src1, src3);
884cabdff1aSopenharmony_ci        src += (2 * src_stride);
885cabdff1aSopenharmony_ci        XORI_B4_128_SB(src0, src1, src2, src3);
886cabdff1aSopenharmony_ci
887cabdff1aSopenharmony_ci        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
888cabdff1aSopenharmony_ci                   vec0, vec1, vec2, vec3);
889cabdff1aSopenharmony_ci        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
890cabdff1aSopenharmony_ci                   vec4, vec5, vec6, vec7);
891cabdff1aSopenharmony_ci        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
892cabdff1aSopenharmony_ci                   vec8, vec9, vec10, vec11);
893cabdff1aSopenharmony_ci        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
894cabdff1aSopenharmony_ci                   vec12, vec13, vec14, vec15);
895cabdff1aSopenharmony_ci        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
896cabdff1aSopenharmony_ci                                 filt3);
897cabdff1aSopenharmony_ci        dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
898cabdff1aSopenharmony_ci                                 filt3);
899cabdff1aSopenharmony_ci        dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
900cabdff1aSopenharmony_ci                                 filt3);
901cabdff1aSopenharmony_ci        dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
902cabdff1aSopenharmony_ci                                 filt2, filt3);
903cabdff1aSopenharmony_ci
904cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
905cabdff1aSopenharmony_ci                                       offset_vec, rnd_vec, dst0, dst1, dst2,
906cabdff1aSopenharmony_ci                                       dst3);
907cabdff1aSopenharmony_ci
908cabdff1aSopenharmony_ci        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
909cabdff1aSopenharmony_ci        ST_UB2(out0, out1, dst, dst_stride);
910cabdff1aSopenharmony_ci        dst += (2 * dst_stride);
911cabdff1aSopenharmony_ci    }
912cabdff1aSopenharmony_ci}
913cabdff1aSopenharmony_ci
914cabdff1aSopenharmony_cistatic void hevc_hz_uniwgt_8t_24w_msa(uint8_t *src,
915cabdff1aSopenharmony_ci                                      int32_t src_stride,
916cabdff1aSopenharmony_ci                                      uint8_t *dst,
917cabdff1aSopenharmony_ci                                      int32_t dst_stride,
918cabdff1aSopenharmony_ci                                      const int8_t *filter,
919cabdff1aSopenharmony_ci                                      int32_t height,
920cabdff1aSopenharmony_ci                                      int32_t weight,
921cabdff1aSopenharmony_ci                                      int32_t offset,
922cabdff1aSopenharmony_ci                                      int32_t rnd_val)
923cabdff1aSopenharmony_ci{
924cabdff1aSopenharmony_ci    uint32_t loop_cnt;
925cabdff1aSopenharmony_ci    v16u8 out0, out1, out2;
926cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3;
927cabdff1aSopenharmony_ci    v8i16 filt0, filt1, filt2, filt3;
928cabdff1aSopenharmony_ci    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
929cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
930cabdff1aSopenharmony_ci    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
931cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
932cabdff1aSopenharmony_ci    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
933cabdff1aSopenharmony_ci    v4i32 weight_vec, rnd_vec;
934cabdff1aSopenharmony_ci
935cabdff1aSopenharmony_ci    src -= 3;
936cabdff1aSopenharmony_ci
937cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
938cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val);
939cabdff1aSopenharmony_ci
940cabdff1aSopenharmony_ci    weight *= 128;
941cabdff1aSopenharmony_ci    rnd_val -= 6;
942cabdff1aSopenharmony_ci
943cabdff1aSopenharmony_ci    weight_vec_h = __msa_fill_h(weight);
944cabdff1aSopenharmony_ci    offset_vec = __msa_fill_h(offset);
945cabdff1aSopenharmony_ci    denom_vec = __msa_fill_h(rnd_val);
946cabdff1aSopenharmony_ci
947cabdff1aSopenharmony_ci    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
948cabdff1aSopenharmony_ci    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
949cabdff1aSopenharmony_ci
950cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
951cabdff1aSopenharmony_ci    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
952cabdff1aSopenharmony_ci
953cabdff1aSopenharmony_ci    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
954cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
955cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
956cabdff1aSopenharmony_ci    mask3 = mask0 + 6;
957cabdff1aSopenharmony_ci    mask4 = mask0 + 8;
958cabdff1aSopenharmony_ci    mask5 = mask0 + 10;
959cabdff1aSopenharmony_ci    mask6 = mask0 + 12;
960cabdff1aSopenharmony_ci    mask7 = mask0 + 14;
961cabdff1aSopenharmony_ci
962cabdff1aSopenharmony_ci    for (loop_cnt = 16; loop_cnt--;) {
963cabdff1aSopenharmony_ci        LD_SB2(src, 16, src0, src1);
964cabdff1aSopenharmony_ci        src += src_stride;
965cabdff1aSopenharmony_ci        LD_SB2(src, 16, src2, src3);
966cabdff1aSopenharmony_ci        src += src_stride;
967cabdff1aSopenharmony_ci        XORI_B4_128_SB(src0, src1, src2, src3);
968cabdff1aSopenharmony_ci        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
969cabdff1aSopenharmony_ci                   vec0, vec1, vec2, vec3);
970cabdff1aSopenharmony_ci        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
971cabdff1aSopenharmony_ci                   vec4, vec5, vec6, vec7);
972cabdff1aSopenharmony_ci        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
973cabdff1aSopenharmony_ci                   vec8, vec9, vec10, vec11);
974cabdff1aSopenharmony_ci        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
975cabdff1aSopenharmony_ci                   vec12, vec13, vec14, vec15);
976cabdff1aSopenharmony_ci        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
977cabdff1aSopenharmony_ci                                 filt3);
978cabdff1aSopenharmony_ci        dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
979cabdff1aSopenharmony_ci                                 filt3);
980cabdff1aSopenharmony_ci        dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
981cabdff1aSopenharmony_ci                                 filt3);
982cabdff1aSopenharmony_ci        dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
983cabdff1aSopenharmony_ci                                 filt2, filt3);
984cabdff1aSopenharmony_ci
985cabdff1aSopenharmony_ci        VSHF_B4_SB(src2, src3, mask4, mask5, mask6, mask7,
986cabdff1aSopenharmony_ci                   vec0, vec1, vec2, vec3);
987cabdff1aSopenharmony_ci        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
988cabdff1aSopenharmony_ci                   vec4, vec5, vec6, vec7);
989cabdff1aSopenharmony_ci        dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
990cabdff1aSopenharmony_ci                                 filt3);
991cabdff1aSopenharmony_ci        dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
992cabdff1aSopenharmony_ci                                 filt3);
993cabdff1aSopenharmony_ci
994cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
995cabdff1aSopenharmony_ci                                       offset_vec, rnd_vec, dst0, dst1, dst2,
996cabdff1aSopenharmony_ci                                       dst3);
997cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
998cabdff1aSopenharmony_ci                                       rnd_vec, dst4, dst5);
999cabdff1aSopenharmony_ci
1000cabdff1aSopenharmony_ci        PCKEV_B3_UB(dst1, dst0, dst4, dst3, dst5, dst2, out0, out1, out2);
1001cabdff1aSopenharmony_ci        ST_UB2(out0, out1, dst, dst_stride);
1002cabdff1aSopenharmony_ci        ST_D2(out2, 0, 1, dst + 16, dst_stride);
1003cabdff1aSopenharmony_ci        dst += (2 * dst_stride);
1004cabdff1aSopenharmony_ci    }
1005cabdff1aSopenharmony_ci}
1006cabdff1aSopenharmony_ci
1007cabdff1aSopenharmony_cistatic void hevc_hz_uniwgt_8t_32w_msa(uint8_t *src,
1008cabdff1aSopenharmony_ci                                      int32_t src_stride,
1009cabdff1aSopenharmony_ci                                      uint8_t *dst,
1010cabdff1aSopenharmony_ci                                      int32_t dst_stride,
1011cabdff1aSopenharmony_ci                                      const int8_t *filter,
1012cabdff1aSopenharmony_ci                                      int32_t height,
1013cabdff1aSopenharmony_ci                                      int32_t weight,
1014cabdff1aSopenharmony_ci                                      int32_t offset,
1015cabdff1aSopenharmony_ci                                      int32_t rnd_val)
1016cabdff1aSopenharmony_ci{
1017cabdff1aSopenharmony_ci    uint32_t loop_cnt;
1018cabdff1aSopenharmony_ci    v16u8 out0, out1, out2, out3;
1019cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
1020cabdff1aSopenharmony_ci    v8i16 filt0, filt1, filt2, filt3;
1021cabdff1aSopenharmony_ci    v16i8 mask0, mask1, mask2, mask3;
1022cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1023cabdff1aSopenharmony_ci    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1024cabdff1aSopenharmony_ci    v8i16 filter_vec;
1025cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1026cabdff1aSopenharmony_ci    v8i16 weight_vec_h, offset_vec, denom_vec;
1027cabdff1aSopenharmony_ci    v4i32 weight_vec, rnd_vec;
1028cabdff1aSopenharmony_ci
1029cabdff1aSopenharmony_ci    src -= 3;
1030cabdff1aSopenharmony_ci
1031cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
1032cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val);
1033cabdff1aSopenharmony_ci
1034cabdff1aSopenharmony_ci    weight *= 128;
1035cabdff1aSopenharmony_ci    rnd_val -= 6;
1036cabdff1aSopenharmony_ci
1037cabdff1aSopenharmony_ci    weight_vec_h = __msa_fill_h(weight);
1038cabdff1aSopenharmony_ci    offset_vec = __msa_fill_h(offset);
1039cabdff1aSopenharmony_ci    denom_vec = __msa_fill_h(rnd_val);
1040cabdff1aSopenharmony_ci
1041cabdff1aSopenharmony_ci    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
1042cabdff1aSopenharmony_ci    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
1043cabdff1aSopenharmony_ci
1044cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
1045cabdff1aSopenharmony_ci    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1046cabdff1aSopenharmony_ci
1047cabdff1aSopenharmony_ci    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
1048cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
1049cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
1050cabdff1aSopenharmony_ci    mask3 = mask0 + 6;
1051cabdff1aSopenharmony_ci
1052cabdff1aSopenharmony_ci    for (loop_cnt = height >> 1; loop_cnt--;) {
1053cabdff1aSopenharmony_ci        LD_SB4(src, 8, src0, src1, src2, src3);
1054cabdff1aSopenharmony_ci        src += src_stride;
1055cabdff1aSopenharmony_ci        LD_SB4(src, 8, src4, src5, src6, src7);
1056cabdff1aSopenharmony_ci        src += src_stride;
1057cabdff1aSopenharmony_ci        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
1058cabdff1aSopenharmony_ci
1059cabdff1aSopenharmony_ci        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1060cabdff1aSopenharmony_ci                   vec0, vec1, vec2, vec3);
1061cabdff1aSopenharmony_ci        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1062cabdff1aSopenharmony_ci                   vec4, vec5, vec6, vec7);
1063cabdff1aSopenharmony_ci        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1064cabdff1aSopenharmony_ci                   vec8, vec9, vec10, vec11);
1065cabdff1aSopenharmony_ci        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1066cabdff1aSopenharmony_ci                   vec12, vec13, vec14, vec15);
1067cabdff1aSopenharmony_ci        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1068cabdff1aSopenharmony_ci                                 filt3);
1069cabdff1aSopenharmony_ci        dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1070cabdff1aSopenharmony_ci                                 filt3);
1071cabdff1aSopenharmony_ci        dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1072cabdff1aSopenharmony_ci                                 filt3);
1073cabdff1aSopenharmony_ci        dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1074cabdff1aSopenharmony_ci                                 filt2, filt3);
1075cabdff1aSopenharmony_ci
1076cabdff1aSopenharmony_ci        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
1077cabdff1aSopenharmony_ci                   vec0, vec1, vec2, vec3);
1078cabdff1aSopenharmony_ci        VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
1079cabdff1aSopenharmony_ci                   vec4, vec5, vec6, vec7);
1080cabdff1aSopenharmony_ci        VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
1081cabdff1aSopenharmony_ci                   vec8, vec9, vec10, vec11);
1082cabdff1aSopenharmony_ci        VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
1083cabdff1aSopenharmony_ci                   vec12, vec13, vec14, vec15);
1084cabdff1aSopenharmony_ci        dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1085cabdff1aSopenharmony_ci                                 filt3);
1086cabdff1aSopenharmony_ci        dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1087cabdff1aSopenharmony_ci                                 filt3);
1088cabdff1aSopenharmony_ci        dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1089cabdff1aSopenharmony_ci                                 filt3);
1090cabdff1aSopenharmony_ci        dst7 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1091cabdff1aSopenharmony_ci                                 filt2, filt3);
1092cabdff1aSopenharmony_ci
1093cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
1094cabdff1aSopenharmony_ci                                       offset_vec, rnd_vec, dst0, dst1, dst2,
1095cabdff1aSopenharmony_ci                                       dst3);
1096cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
1097cabdff1aSopenharmony_ci                                       offset_vec, rnd_vec, dst4, dst5, dst6,
1098cabdff1aSopenharmony_ci                                       dst7);
1099cabdff1aSopenharmony_ci
1100cabdff1aSopenharmony_ci        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
1101cabdff1aSopenharmony_ci        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
1102cabdff1aSopenharmony_ci        ST_UB2(out0, out1, dst, 16);
1103cabdff1aSopenharmony_ci        dst += dst_stride;
1104cabdff1aSopenharmony_ci        ST_UB2(out2, out3, dst, 16);
1105cabdff1aSopenharmony_ci        dst += dst_stride;
1106cabdff1aSopenharmony_ci    }
1107cabdff1aSopenharmony_ci}
1108cabdff1aSopenharmony_ci
1109cabdff1aSopenharmony_cistatic void hevc_hz_uniwgt_8t_48w_msa(uint8_t *src,
1110cabdff1aSopenharmony_ci                                      int32_t src_stride,
1111cabdff1aSopenharmony_ci                                      uint8_t *dst,
1112cabdff1aSopenharmony_ci                                      int32_t dst_stride,
1113cabdff1aSopenharmony_ci                                      const int8_t *filter,
1114cabdff1aSopenharmony_ci                                      int32_t height,
1115cabdff1aSopenharmony_ci                                      int32_t weight,
1116cabdff1aSopenharmony_ci                                      int32_t offset,
1117cabdff1aSopenharmony_ci                                      int32_t rnd_val)
1118cabdff1aSopenharmony_ci{
1119cabdff1aSopenharmony_ci    uint32_t loop_cnt;
1120cabdff1aSopenharmony_ci    v16u8 out0, out1, out2;
1121cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3;
1122cabdff1aSopenharmony_ci    v8i16 filt0, filt1, filt2, filt3;
1123cabdff1aSopenharmony_ci    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1124cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1125cabdff1aSopenharmony_ci    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1126cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
1127cabdff1aSopenharmony_ci    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
1128cabdff1aSopenharmony_ci    v4i32 weight_vec, rnd_vec;
1129cabdff1aSopenharmony_ci
1130cabdff1aSopenharmony_ci    src -= 3;
1131cabdff1aSopenharmony_ci
1132cabdff1aSopenharmony_ci    weight = weight & 0x0000FFFF;
1133cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
1134cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val);
1135cabdff1aSopenharmony_ci
1136cabdff1aSopenharmony_ci    weight *= 128;
1137cabdff1aSopenharmony_ci    rnd_val -= 6;
1138cabdff1aSopenharmony_ci
1139cabdff1aSopenharmony_ci    weight_vec_h = __msa_fill_h(weight);
1140cabdff1aSopenharmony_ci    offset_vec = __msa_fill_h(offset);
1141cabdff1aSopenharmony_ci    denom_vec = __msa_fill_h(rnd_val);
1142cabdff1aSopenharmony_ci
1143cabdff1aSopenharmony_ci    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
1144cabdff1aSopenharmony_ci    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
1145cabdff1aSopenharmony_ci
1146cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
1147cabdff1aSopenharmony_ci    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1148cabdff1aSopenharmony_ci
1149cabdff1aSopenharmony_ci    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
1150cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
1151cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
1152cabdff1aSopenharmony_ci    mask3 = mask0 + 6;
1153cabdff1aSopenharmony_ci    mask4 = mask0 + 8;
1154cabdff1aSopenharmony_ci    mask5 = mask0 + 10;
1155cabdff1aSopenharmony_ci    mask6 = mask0 + 12;
1156cabdff1aSopenharmony_ci    mask7 = mask0 + 14;
1157cabdff1aSopenharmony_ci
1158cabdff1aSopenharmony_ci    for (loop_cnt = 64; loop_cnt--;) {
1159cabdff1aSopenharmony_ci        LD_SB3(src, 16, src0, src1, src2);
1160cabdff1aSopenharmony_ci        src3 = LD_SB(src + 40);
1161cabdff1aSopenharmony_ci        src += src_stride;
1162cabdff1aSopenharmony_ci        XORI_B4_128_SB(src0, src1, src2, src3);
1163cabdff1aSopenharmony_ci
1164cabdff1aSopenharmony_ci        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1165cabdff1aSopenharmony_ci                   vec0, vec1, vec2, vec3);
1166cabdff1aSopenharmony_ci        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
1167cabdff1aSopenharmony_ci                   vec4, vec5, vec6, vec7);
1168cabdff1aSopenharmony_ci        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1169cabdff1aSopenharmony_ci                   vec8, vec9, vec10, vec11);
1170cabdff1aSopenharmony_ci        VSHF_B4_SB(src1, src2, mask4, mask5, mask6, mask7,
1171cabdff1aSopenharmony_ci                   vec12, vec13, vec14, vec15);
1172cabdff1aSopenharmony_ci        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1173cabdff1aSopenharmony_ci                                 filt3);
1174cabdff1aSopenharmony_ci        dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1175cabdff1aSopenharmony_ci                                 filt3);
1176cabdff1aSopenharmony_ci        dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1177cabdff1aSopenharmony_ci                                 filt3);
1178cabdff1aSopenharmony_ci        dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1179cabdff1aSopenharmony_ci                                 filt2, filt3);
1180cabdff1aSopenharmony_ci
1181cabdff1aSopenharmony_ci        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1182cabdff1aSopenharmony_ci                   vec0, vec1, vec2, vec3);
1183cabdff1aSopenharmony_ci        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1184cabdff1aSopenharmony_ci                   vec4, vec5, vec6, vec7);
1185cabdff1aSopenharmony_ci        dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1186cabdff1aSopenharmony_ci                                 filt3);
1187cabdff1aSopenharmony_ci        dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1188cabdff1aSopenharmony_ci                                 filt3);
1189cabdff1aSopenharmony_ci
1190cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
1191cabdff1aSopenharmony_ci                                       offset_vec, rnd_vec, dst0, dst1, dst2,
1192cabdff1aSopenharmony_ci                                       dst3);
1193cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
1194cabdff1aSopenharmony_ci                                       rnd_vec, dst4, dst5);
1195cabdff1aSopenharmony_ci
1196cabdff1aSopenharmony_ci        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
1197cabdff1aSopenharmony_ci        ST_UB2(out0, out1, dst, 16);
1198cabdff1aSopenharmony_ci        ST_UB(out2, dst + 32);
1199cabdff1aSopenharmony_ci        dst += dst_stride;
1200cabdff1aSopenharmony_ci    }
1201cabdff1aSopenharmony_ci}
1202cabdff1aSopenharmony_ci
1203cabdff1aSopenharmony_cistatic void hevc_hz_uniwgt_8t_64w_msa(uint8_t *src,
1204cabdff1aSopenharmony_ci                                      int32_t src_stride,
1205cabdff1aSopenharmony_ci                                      uint8_t *dst,
1206cabdff1aSopenharmony_ci                                      int32_t dst_stride,
1207cabdff1aSopenharmony_ci                                      const int8_t *filter,
1208cabdff1aSopenharmony_ci                                      int32_t height,
1209cabdff1aSopenharmony_ci                                      int32_t weight,
1210cabdff1aSopenharmony_ci                                      int32_t offset,
1211cabdff1aSopenharmony_ci                                      int32_t rnd_val)
1212cabdff1aSopenharmony_ci{
1213cabdff1aSopenharmony_ci    uint8_t *src_tmp;
1214cabdff1aSopenharmony_ci    uint8_t *dst_tmp;
1215cabdff1aSopenharmony_ci    uint32_t loop_cnt, cnt;
1216cabdff1aSopenharmony_ci    v16u8 out0, out1;
1217cabdff1aSopenharmony_ci    v16i8 src0, src1, src2;
1218cabdff1aSopenharmony_ci    v8i16 filt0, filt1, filt2, filt3;
1219cabdff1aSopenharmony_ci    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1220cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1221cabdff1aSopenharmony_ci    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1222cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3;
1223cabdff1aSopenharmony_ci    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
1224cabdff1aSopenharmony_ci    v4i32 weight_vec, rnd_vec;
1225cabdff1aSopenharmony_ci
1226cabdff1aSopenharmony_ci    src -= 3;
1227cabdff1aSopenharmony_ci
1228cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
1229cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val);
1230cabdff1aSopenharmony_ci
1231cabdff1aSopenharmony_ci    weight *= 128;
1232cabdff1aSopenharmony_ci    rnd_val -= 6;
1233cabdff1aSopenharmony_ci
1234cabdff1aSopenharmony_ci    weight_vec_h = __msa_fill_h(weight);
1235cabdff1aSopenharmony_ci    offset_vec = __msa_fill_h(offset);
1236cabdff1aSopenharmony_ci    denom_vec = __msa_fill_h(rnd_val);
1237cabdff1aSopenharmony_ci
1238cabdff1aSopenharmony_ci    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
1239cabdff1aSopenharmony_ci    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
1240cabdff1aSopenharmony_ci
1241cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
1242cabdff1aSopenharmony_ci    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1243cabdff1aSopenharmony_ci
1244cabdff1aSopenharmony_ci    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
1245cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
1246cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
1247cabdff1aSopenharmony_ci    mask3 = mask0 + 6;
1248cabdff1aSopenharmony_ci    mask4 = mask0 + 8;
1249cabdff1aSopenharmony_ci    mask5 = mask0 + 10;
1250cabdff1aSopenharmony_ci    mask6 = mask0 + 12;
1251cabdff1aSopenharmony_ci    mask7 = mask0 + 14;
1252cabdff1aSopenharmony_ci
1253cabdff1aSopenharmony_ci    for (loop_cnt = height; loop_cnt--;) {
1254cabdff1aSopenharmony_ci        src_tmp = src;
1255cabdff1aSopenharmony_ci        dst_tmp = dst;
1256cabdff1aSopenharmony_ci
1257cabdff1aSopenharmony_ci        for (cnt = 2; cnt--;) {
1258cabdff1aSopenharmony_ci            LD_SB2(src_tmp, 16, src0, src1);
1259cabdff1aSopenharmony_ci            src2 = LD_SB(src_tmp + 24);
1260cabdff1aSopenharmony_ci            src_tmp += 32;
1261cabdff1aSopenharmony_ci            XORI_B3_128_SB(src0, src1, src2);
1262cabdff1aSopenharmony_ci
1263cabdff1aSopenharmony_ci            VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1264cabdff1aSopenharmony_ci                       vec0, vec1, vec2, vec3);
1265cabdff1aSopenharmony_ci            VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
1266cabdff1aSopenharmony_ci                       vec4, vec5, vec6, vec7);
1267cabdff1aSopenharmony_ci            VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1268cabdff1aSopenharmony_ci                       vec8, vec9, vec10, vec11);
1269cabdff1aSopenharmony_ci            VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1270cabdff1aSopenharmony_ci                       vec12, vec13, vec14, vec15);
1271cabdff1aSopenharmony_ci            dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
1272cabdff1aSopenharmony_ci                                     filt2, filt3);
1273cabdff1aSopenharmony_ci            dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1,
1274cabdff1aSopenharmony_ci                                     filt2, filt3);
1275cabdff1aSopenharmony_ci            dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1,
1276cabdff1aSopenharmony_ci                                     filt2, filt3);
1277cabdff1aSopenharmony_ci            dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1278cabdff1aSopenharmony_ci                                     filt2, filt3);
1279cabdff1aSopenharmony_ci
1280cabdff1aSopenharmony_ci            HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
1281cabdff1aSopenharmony_ci                                           offset_vec, rnd_vec, dst0, dst1,
1282cabdff1aSopenharmony_ci                                           dst2, dst3);
1283cabdff1aSopenharmony_ci
1284cabdff1aSopenharmony_ci            PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
1285cabdff1aSopenharmony_ci            ST_UB2(out0, out1, dst_tmp, 16);
1286cabdff1aSopenharmony_ci            dst_tmp += 32;
1287cabdff1aSopenharmony_ci        }
1288cabdff1aSopenharmony_ci
1289cabdff1aSopenharmony_ci        src += src_stride;
1290cabdff1aSopenharmony_ci        dst += dst_stride;
1291cabdff1aSopenharmony_ci    }
1292cabdff1aSopenharmony_ci}
1293cabdff1aSopenharmony_ci
1294cabdff1aSopenharmony_cistatic void hevc_vt_uniwgt_8t_4w_msa(uint8_t *src,
1295cabdff1aSopenharmony_ci                                     int32_t src_stride,
1296cabdff1aSopenharmony_ci                                     uint8_t *dst,
1297cabdff1aSopenharmony_ci                                     int32_t dst_stride,
1298cabdff1aSopenharmony_ci                                     const int8_t *filter,
1299cabdff1aSopenharmony_ci                                     int32_t height,
1300cabdff1aSopenharmony_ci                                     int32_t weight,
1301cabdff1aSopenharmony_ci                                     int32_t offset,
1302cabdff1aSopenharmony_ci                                     int32_t rnd_val)
1303cabdff1aSopenharmony_ci{
1304cabdff1aSopenharmony_ci    int32_t loop_cnt;
1305cabdff1aSopenharmony_ci    v16u8 out0, out1;
1306cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1307cabdff1aSopenharmony_ci    v16i8 src9, src10, src11, src12, src13, src14;
1308cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1309cabdff1aSopenharmony_ci    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1310cabdff1aSopenharmony_ci    v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
1311cabdff1aSopenharmony_ci    v16i8 src2110, src4332, src6554, src8776, src10998;
1312cabdff1aSopenharmony_ci    v16i8 src12111110, src14131312;
1313cabdff1aSopenharmony_ci    v8i16 filter_vec, dst01, dst23, dst45, dst67;
1314cabdff1aSopenharmony_ci    v8i16 filt0, filt1, filt2, filt3;
1315cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, weight_vec_h, offset_vec, denom_vec;
1316cabdff1aSopenharmony_ci    v4i32 weight_vec, rnd_vec;
1317cabdff1aSopenharmony_ci
1318cabdff1aSopenharmony_ci    src -= (3 * src_stride);
1319cabdff1aSopenharmony_ci
1320cabdff1aSopenharmony_ci
1321cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
1322cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val);
1323cabdff1aSopenharmony_ci
1324cabdff1aSopenharmony_ci    weight *= 128;
1325cabdff1aSopenharmony_ci    rnd_val -= 6;
1326cabdff1aSopenharmony_ci
1327cabdff1aSopenharmony_ci    weight_vec_h = __msa_fill_h(weight);
1328cabdff1aSopenharmony_ci    offset_vec = __msa_fill_h(offset);
1329cabdff1aSopenharmony_ci    denom_vec = __msa_fill_h(rnd_val);
1330cabdff1aSopenharmony_ci
1331cabdff1aSopenharmony_ci    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
1332cabdff1aSopenharmony_ci    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
1333cabdff1aSopenharmony_ci
1334cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
1335cabdff1aSopenharmony_ci    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1336cabdff1aSopenharmony_ci
1337cabdff1aSopenharmony_ci    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1338cabdff1aSopenharmony_ci    src += (7 * src_stride);
1339cabdff1aSopenharmony_ci
1340cabdff1aSopenharmony_ci    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1341cabdff1aSopenharmony_ci               src10_r, src32_r, src54_r, src21_r);
1342cabdff1aSopenharmony_ci
1343cabdff1aSopenharmony_ci    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1344cabdff1aSopenharmony_ci
1345cabdff1aSopenharmony_ci    ILVR_D3_SB(src21_r, src10_r, src43_r,
1346cabdff1aSopenharmony_ci               src32_r, src65_r, src54_r, src2110, src4332, src6554);
1347cabdff1aSopenharmony_ci
1348cabdff1aSopenharmony_ci    XORI_B3_128_SB(src2110, src4332, src6554);
1349cabdff1aSopenharmony_ci
1350cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 3); loop_cnt--;) {
1351cabdff1aSopenharmony_ci        LD_SB8(src, src_stride,
1352cabdff1aSopenharmony_ci               src7, src8, src9, src10, src11, src12, src13, src14);
1353cabdff1aSopenharmony_ci        src += (8 * src_stride);
1354cabdff1aSopenharmony_ci        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1355cabdff1aSopenharmony_ci                   src76_r, src87_r, src98_r, src109_r);
1356cabdff1aSopenharmony_ci        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
1357cabdff1aSopenharmony_ci                   src1110_r, src1211_r, src1312_r, src1413_r);
1358cabdff1aSopenharmony_ci        ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
1359cabdff1aSopenharmony_ci                   src1413_r, src1312_r,
1360cabdff1aSopenharmony_ci                   src8776, src10998, src12111110, src14131312);
1361cabdff1aSopenharmony_ci        XORI_B4_128_SB(src8776, src10998, src12111110, src14131312);
1362cabdff1aSopenharmony_ci        dst01 = HEVC_FILT_8TAP_SH(src2110, src4332, src6554, src8776, filt0,
1363cabdff1aSopenharmony_ci                                  filt1, filt2, filt3);
1364cabdff1aSopenharmony_ci        dst23 = HEVC_FILT_8TAP_SH(src4332, src6554, src8776, src10998, filt0,
1365cabdff1aSopenharmony_ci                                  filt1, filt2, filt3);
1366cabdff1aSopenharmony_ci        dst45 = HEVC_FILT_8TAP_SH(src6554, src8776, src10998, src12111110,
1367cabdff1aSopenharmony_ci                                  filt0, filt1, filt2, filt3);
1368cabdff1aSopenharmony_ci        dst67 = HEVC_FILT_8TAP_SH(src8776, src10998, src12111110, src14131312,
1369cabdff1aSopenharmony_ci                                  filt0, filt1, filt2, filt3);
1370cabdff1aSopenharmony_ci
1371cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst01, dst23, dst45, dst67, weight_vec,
1372cabdff1aSopenharmony_ci                                       offset_vec, rnd_vec, dst0, dst1, dst2,
1373cabdff1aSopenharmony_ci                                       dst3);
1374cabdff1aSopenharmony_ci
1375cabdff1aSopenharmony_ci        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
1376cabdff1aSopenharmony_ci        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
1377cabdff1aSopenharmony_ci        dst += (8 * dst_stride);
1378cabdff1aSopenharmony_ci
1379cabdff1aSopenharmony_ci        src2110 = src10998;
1380cabdff1aSopenharmony_ci        src4332 = src12111110;
1381cabdff1aSopenharmony_ci        src6554 = src14131312;
1382cabdff1aSopenharmony_ci        src6 = src14;
1383cabdff1aSopenharmony_ci    }
1384cabdff1aSopenharmony_ci}
1385cabdff1aSopenharmony_ci
1386cabdff1aSopenharmony_cistatic void hevc_vt_uniwgt_8t_8w_msa(uint8_t *src,
1387cabdff1aSopenharmony_ci                                     int32_t src_stride,
1388cabdff1aSopenharmony_ci                                     uint8_t *dst,
1389cabdff1aSopenharmony_ci                                     int32_t dst_stride,
1390cabdff1aSopenharmony_ci                                     const int8_t *filter,
1391cabdff1aSopenharmony_ci                                     int32_t height,
1392cabdff1aSopenharmony_ci                                     int32_t weight,
1393cabdff1aSopenharmony_ci                                     int32_t offset,
1394cabdff1aSopenharmony_ci                                     int32_t rnd_val)
1395cabdff1aSopenharmony_ci{
1396cabdff1aSopenharmony_ci    int32_t loop_cnt;
1397cabdff1aSopenharmony_ci    v16u8 out0, out1;
1398cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1399cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1400cabdff1aSopenharmony_ci    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1401cabdff1aSopenharmony_ci    v8i16 filt0, filt1, filt2, filt3;
1402cabdff1aSopenharmony_ci    v8i16 filter_vec;
1403cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, weight_vec_h, offset_vec, denom_vec;
1404cabdff1aSopenharmony_ci    v4i32 weight_vec, rnd_vec;
1405cabdff1aSopenharmony_ci
1406cabdff1aSopenharmony_ci    src -= (3 * src_stride);
1407cabdff1aSopenharmony_ci
1408cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
1409cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val);
1410cabdff1aSopenharmony_ci
1411cabdff1aSopenharmony_ci    weight *= 128;
1412cabdff1aSopenharmony_ci    rnd_val -= 6;
1413cabdff1aSopenharmony_ci
1414cabdff1aSopenharmony_ci    weight_vec_h = __msa_fill_h(weight);
1415cabdff1aSopenharmony_ci    offset_vec = __msa_fill_h(offset);
1416cabdff1aSopenharmony_ci    denom_vec = __msa_fill_h(rnd_val);
1417cabdff1aSopenharmony_ci
1418cabdff1aSopenharmony_ci    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
1419cabdff1aSopenharmony_ci    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
1420cabdff1aSopenharmony_ci
1421cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
1422cabdff1aSopenharmony_ci    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1423cabdff1aSopenharmony_ci
1424cabdff1aSopenharmony_ci    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1425cabdff1aSopenharmony_ci    src += (7 * src_stride);
1426cabdff1aSopenharmony_ci    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1427cabdff1aSopenharmony_ci
1428cabdff1aSopenharmony_ci    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1429cabdff1aSopenharmony_ci               src10_r, src32_r, src54_r, src21_r);
1430cabdff1aSopenharmony_ci    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1431cabdff1aSopenharmony_ci
1432cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
1433cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src7, src8, src9, src10);
1434cabdff1aSopenharmony_ci        src += (4 * src_stride);
1435cabdff1aSopenharmony_ci        XORI_B4_128_SB(src7, src8, src9, src10);
1436cabdff1aSopenharmony_ci        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1437cabdff1aSopenharmony_ci                   src76_r, src87_r, src98_r, src109_r);
1438cabdff1aSopenharmony_ci        dst0 = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
1439cabdff1aSopenharmony_ci                                 filt1, filt2, filt3);
1440cabdff1aSopenharmony_ci        dst1 = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
1441cabdff1aSopenharmony_ci                                 filt1, filt2, filt3);
1442cabdff1aSopenharmony_ci        dst2 = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
1443cabdff1aSopenharmony_ci                                 filt1, filt2, filt3);
1444cabdff1aSopenharmony_ci        dst3 = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0,
1445cabdff1aSopenharmony_ci                                 filt1, filt2, filt3);
1446cabdff1aSopenharmony_ci
1447cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
1448cabdff1aSopenharmony_ci                                       offset_vec, rnd_vec, dst0, dst1, dst2,
1449cabdff1aSopenharmony_ci                                       dst3);
1450cabdff1aSopenharmony_ci
1451cabdff1aSopenharmony_ci        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
1452cabdff1aSopenharmony_ci        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1453cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
1454cabdff1aSopenharmony_ci
1455cabdff1aSopenharmony_ci        src10_r = src54_r;
1456cabdff1aSopenharmony_ci        src32_r = src76_r;
1457cabdff1aSopenharmony_ci        src54_r = src98_r;
1458cabdff1aSopenharmony_ci        src21_r = src65_r;
1459cabdff1aSopenharmony_ci        src43_r = src87_r;
1460cabdff1aSopenharmony_ci        src65_r = src109_r;
1461cabdff1aSopenharmony_ci        src6 = src10;
1462cabdff1aSopenharmony_ci    }
1463cabdff1aSopenharmony_ci}
1464cabdff1aSopenharmony_ci
1465cabdff1aSopenharmony_cistatic void hevc_vt_uniwgt_8t_12w_msa(uint8_t *src,
1466cabdff1aSopenharmony_ci                                      int32_t src_stride,
1467cabdff1aSopenharmony_ci                                      uint8_t *dst,
1468cabdff1aSopenharmony_ci                                      int32_t dst_stride,
1469cabdff1aSopenharmony_ci                                      const int8_t *filter,
1470cabdff1aSopenharmony_ci                                      int32_t height,
1471cabdff1aSopenharmony_ci                                      int32_t weight,
1472cabdff1aSopenharmony_ci                                      int32_t offset,
1473cabdff1aSopenharmony_ci                                      int32_t rnd_val)
1474cabdff1aSopenharmony_ci{
1475cabdff1aSopenharmony_ci    int32_t loop_cnt;
1476cabdff1aSopenharmony_ci    v16u8 out0, out1, out2;
1477cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1478cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1479cabdff1aSopenharmony_ci    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1480cabdff1aSopenharmony_ci    v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
1481cabdff1aSopenharmony_ci    v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
1482cabdff1aSopenharmony_ci    v16i8 src2110, src4332, src6554, src8776, src10998;
1483cabdff1aSopenharmony_ci    v8i16 filt0, filt1, filt2, filt3;
1484cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
1485cabdff1aSopenharmony_ci    v8i16 weight_vec_h, offset_vec, denom_vec, filter_vec;
1486cabdff1aSopenharmony_ci    v4i32 weight_vec, rnd_vec;
1487cabdff1aSopenharmony_ci
1488cabdff1aSopenharmony_ci    src -= (3 * src_stride);
1489cabdff1aSopenharmony_ci
1490cabdff1aSopenharmony_ci    weight = weight & 0x0000FFFF;
1491cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
1492cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val);
1493cabdff1aSopenharmony_ci
1494cabdff1aSopenharmony_ci    weight *= 128;
1495cabdff1aSopenharmony_ci    rnd_val -= 6;
1496cabdff1aSopenharmony_ci
1497cabdff1aSopenharmony_ci    weight_vec_h = __msa_fill_h(weight);
1498cabdff1aSopenharmony_ci    offset_vec = __msa_fill_h(offset);
1499cabdff1aSopenharmony_ci    denom_vec = __msa_fill_h(rnd_val);
1500cabdff1aSopenharmony_ci
1501cabdff1aSopenharmony_ci    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
1502cabdff1aSopenharmony_ci    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
1503cabdff1aSopenharmony_ci
1504cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
1505cabdff1aSopenharmony_ci    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1506cabdff1aSopenharmony_ci
1507cabdff1aSopenharmony_ci    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1508cabdff1aSopenharmony_ci    src += (7 * src_stride);
1509cabdff1aSopenharmony_ci    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1510cabdff1aSopenharmony_ci
1511cabdff1aSopenharmony_ci    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1512cabdff1aSopenharmony_ci               src10_r, src32_r, src54_r, src21_r);
1513cabdff1aSopenharmony_ci    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1514cabdff1aSopenharmony_ci    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1515cabdff1aSopenharmony_ci               src10_l, src32_l, src54_l, src21_l);
1516cabdff1aSopenharmony_ci    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1517cabdff1aSopenharmony_ci    ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
1518cabdff1aSopenharmony_ci               src2110, src4332, src6554);
1519cabdff1aSopenharmony_ci
1520cabdff1aSopenharmony_ci    for (loop_cnt = 4; loop_cnt--;) {
1521cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src7, src8, src9, src10);
1522cabdff1aSopenharmony_ci        src += (4 * src_stride);
1523cabdff1aSopenharmony_ci        XORI_B4_128_SB(src7, src8, src9, src10);
1524cabdff1aSopenharmony_ci
1525cabdff1aSopenharmony_ci        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1526cabdff1aSopenharmony_ci                   src76_r, src87_r, src98_r, src109_r);
1527cabdff1aSopenharmony_ci        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1528cabdff1aSopenharmony_ci                   src76_l, src87_l, src98_l, src109_l);
1529cabdff1aSopenharmony_ci        ILVR_D2_SB(src87_l, src76_l, src109_l, src98_l, src8776, src10998);
1530cabdff1aSopenharmony_ci
1531cabdff1aSopenharmony_ci        dst0 = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
1532cabdff1aSopenharmony_ci                                 filt1, filt2, filt3);
1533cabdff1aSopenharmony_ci        dst1 = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
1534cabdff1aSopenharmony_ci                                 filt1, filt2, filt3);
1535cabdff1aSopenharmony_ci        dst2 = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
1536cabdff1aSopenharmony_ci                                 filt1, filt2, filt3);
1537cabdff1aSopenharmony_ci        dst3 = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0,
1538cabdff1aSopenharmony_ci                                 filt1, filt2, filt3);
1539cabdff1aSopenharmony_ci        dst4 = HEVC_FILT_8TAP_SH(src2110, src4332, src6554, src8776, filt0,
1540cabdff1aSopenharmony_ci                                 filt1, filt2, filt3);
1541cabdff1aSopenharmony_ci        dst5 = HEVC_FILT_8TAP_SH(src4332, src6554, src8776, src10998, filt0,
1542cabdff1aSopenharmony_ci                                 filt1, filt2, filt3);
1543cabdff1aSopenharmony_ci
1544cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
1545cabdff1aSopenharmony_ci                                       offset_vec, rnd_vec, dst0, dst1, dst2,
1546cabdff1aSopenharmony_ci                                       dst3);
1547cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
1548cabdff1aSopenharmony_ci                                       rnd_vec, dst4, dst5);
1549cabdff1aSopenharmony_ci
1550cabdff1aSopenharmony_ci        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
1551cabdff1aSopenharmony_ci        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1552cabdff1aSopenharmony_ci        ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
1553cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
1554cabdff1aSopenharmony_ci
1555cabdff1aSopenharmony_ci        src10_r = src54_r;
1556cabdff1aSopenharmony_ci        src32_r = src76_r;
1557cabdff1aSopenharmony_ci        src54_r = src98_r;
1558cabdff1aSopenharmony_ci        src21_r = src65_r;
1559cabdff1aSopenharmony_ci        src43_r = src87_r;
1560cabdff1aSopenharmony_ci        src65_r = src109_r;
1561cabdff1aSopenharmony_ci        src2110 = src6554;
1562cabdff1aSopenharmony_ci        src4332 = src8776;
1563cabdff1aSopenharmony_ci        src6554 = src10998;
1564cabdff1aSopenharmony_ci        src6 = src10;
1565cabdff1aSopenharmony_ci    }
1566cabdff1aSopenharmony_ci}
1567cabdff1aSopenharmony_ci
1568cabdff1aSopenharmony_cistatic void hevc_vt_uniwgt_8t_16multx4mult_msa(uint8_t *src,
1569cabdff1aSopenharmony_ci                                               int32_t src_stride,
1570cabdff1aSopenharmony_ci                                               uint8_t *dst,
1571cabdff1aSopenharmony_ci                                               int32_t dst_stride,
1572cabdff1aSopenharmony_ci                                               const int8_t *filter,
1573cabdff1aSopenharmony_ci                                               int32_t height,
1574cabdff1aSopenharmony_ci                                               int32_t weight,
1575cabdff1aSopenharmony_ci                                               int32_t offset,
1576cabdff1aSopenharmony_ci                                               int32_t rnd_val,
1577cabdff1aSopenharmony_ci                                               int32_t weightmul16)
1578cabdff1aSopenharmony_ci{
1579cabdff1aSopenharmony_ci    uint8_t *src_tmp;
1580cabdff1aSopenharmony_ci    uint8_t *dst_tmp;
1581cabdff1aSopenharmony_ci    int32_t loop_cnt, cnt;
1582cabdff1aSopenharmony_ci    v16u8 out0, out1, out2, out3;
1583cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1584cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src54_r, src76_r;
1585cabdff1aSopenharmony_ci    v16i8 src21_r, src43_r, src65_r, src87_r;
1586cabdff1aSopenharmony_ci    v16i8 src10_l, src32_l, src54_l, src76_l;
1587cabdff1aSopenharmony_ci    v16i8 src21_l, src43_l, src65_l, src87_l;
1588cabdff1aSopenharmony_ci    v16i8 src98_r, src109_r, src98_l, src109_l;
1589cabdff1aSopenharmony_ci    v8i16 filt0, filt1, filt2, filt3;
1590cabdff1aSopenharmony_ci    v8i16 filter_vec;
1591cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1592cabdff1aSopenharmony_ci    v8i16 weight_vec_h, offset_vec, denom_vec;
1593cabdff1aSopenharmony_ci    v4i32 weight_vec, rnd_vec;
1594cabdff1aSopenharmony_ci
1595cabdff1aSopenharmony_ci    src -= (3 * src_stride);
1596cabdff1aSopenharmony_ci
1597cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
1598cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val);
1599cabdff1aSopenharmony_ci
1600cabdff1aSopenharmony_ci    weight *= 128;
1601cabdff1aSopenharmony_ci    rnd_val -= 6;
1602cabdff1aSopenharmony_ci
1603cabdff1aSopenharmony_ci    weight_vec_h = __msa_fill_h(weight);
1604cabdff1aSopenharmony_ci    offset_vec = __msa_fill_h(offset);
1605cabdff1aSopenharmony_ci    denom_vec = __msa_fill_h(rnd_val);
1606cabdff1aSopenharmony_ci
1607cabdff1aSopenharmony_ci    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
1608cabdff1aSopenharmony_ci    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
1609cabdff1aSopenharmony_ci
1610cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
1611cabdff1aSopenharmony_ci    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1612cabdff1aSopenharmony_ci
1613cabdff1aSopenharmony_ci    for (cnt = weightmul16; cnt--;) {
1614cabdff1aSopenharmony_ci        src_tmp = src;
1615cabdff1aSopenharmony_ci        dst_tmp = dst;
1616cabdff1aSopenharmony_ci
1617cabdff1aSopenharmony_ci        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1618cabdff1aSopenharmony_ci        src_tmp += (7 * src_stride);
1619cabdff1aSopenharmony_ci        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1620cabdff1aSopenharmony_ci
1621cabdff1aSopenharmony_ci        for (loop_cnt = (height >> 2); loop_cnt--;) {
1622cabdff1aSopenharmony_ci            LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
1623cabdff1aSopenharmony_ci            src_tmp += (4 * src_stride);
1624cabdff1aSopenharmony_ci            XORI_B4_128_SB(src7, src8, src9, src10);
1625cabdff1aSopenharmony_ci
1626cabdff1aSopenharmony_ci            ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1627cabdff1aSopenharmony_ci                       src10_r, src32_r, src54_r, src21_r);
1628cabdff1aSopenharmony_ci            ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1629cabdff1aSopenharmony_ci            ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1630cabdff1aSopenharmony_ci                       src10_l, src32_l, src54_l, src21_l);
1631cabdff1aSopenharmony_ci            ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1632cabdff1aSopenharmony_ci            ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1633cabdff1aSopenharmony_ci                       src76_r, src87_r, src98_r, src109_r);
1634cabdff1aSopenharmony_ci            ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1635cabdff1aSopenharmony_ci                       src76_l, src87_l, src98_l, src109_l);
1636cabdff1aSopenharmony_ci
1637cabdff1aSopenharmony_ci            dst0 = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
1638cabdff1aSopenharmony_ci                                     filt1, filt2, filt3);
1639cabdff1aSopenharmony_ci            dst1 = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l, filt0,
1640cabdff1aSopenharmony_ci                                     filt1, filt2, filt3);
1641cabdff1aSopenharmony_ci            dst2 = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
1642cabdff1aSopenharmony_ci                                     filt1, filt2, filt3);
1643cabdff1aSopenharmony_ci            dst3 = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l, filt0,
1644cabdff1aSopenharmony_ci                                     filt1, filt2, filt3);
1645cabdff1aSopenharmony_ci            dst4 = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
1646cabdff1aSopenharmony_ci                                     filt1, filt2, filt3);
1647cabdff1aSopenharmony_ci            dst5 = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l, filt0,
1648cabdff1aSopenharmony_ci                                     filt1, filt2, filt3);
1649cabdff1aSopenharmony_ci            dst6 = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0,
1650cabdff1aSopenharmony_ci                                     filt1, filt2, filt3);
1651cabdff1aSopenharmony_ci            dst7 = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l, filt0,
1652cabdff1aSopenharmony_ci                                     filt1, filt2, filt3);
1653cabdff1aSopenharmony_ci
1654cabdff1aSopenharmony_ci            HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
1655cabdff1aSopenharmony_ci                                           offset_vec, rnd_vec, dst0, dst1,
1656cabdff1aSopenharmony_ci                                           dst2, dst3);
1657cabdff1aSopenharmony_ci            HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
1658cabdff1aSopenharmony_ci                                           offset_vec, rnd_vec, dst4, dst5,
1659cabdff1aSopenharmony_ci                                           dst6, dst7);
1660cabdff1aSopenharmony_ci            PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
1661cabdff1aSopenharmony_ci            PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
1662cabdff1aSopenharmony_ci            ST_UB4(out0, out1, out2, out3, dst_tmp, dst_stride);
1663cabdff1aSopenharmony_ci            dst_tmp += (4 * dst_stride);
1664cabdff1aSopenharmony_ci
1665cabdff1aSopenharmony_ci            src0 = src4;
1666cabdff1aSopenharmony_ci            src1 = src5;
1667cabdff1aSopenharmony_ci            src2 = src6;
1668cabdff1aSopenharmony_ci            src3 = src7;
1669cabdff1aSopenharmony_ci            src4 = src8;
1670cabdff1aSopenharmony_ci            src5 = src9;
1671cabdff1aSopenharmony_ci            src6 = src10;
1672cabdff1aSopenharmony_ci        }
1673cabdff1aSopenharmony_ci
1674cabdff1aSopenharmony_ci        src += 16;
1675cabdff1aSopenharmony_ci        dst += 16;
1676cabdff1aSopenharmony_ci    }
1677cabdff1aSopenharmony_ci}
1678cabdff1aSopenharmony_ci
/*
 * Vertical 8-tap uni-weighted interpolation for a 16-column block.
 * Forwards every argument to hevc_vt_uniwgt_8t_16multx4mult_msa and asks
 * for a single 16-column strip.
 */
static void hevc_vt_uniwgt_8t_16w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    hevc_vt_uniwgt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
                                       filter, height, weight,
                                       offset, rnd_val, 1);
}
1693cabdff1aSopenharmony_ci
/*
 * Vertical 8-tap uni-weighted interpolation for a 24-column block.
 * Columns 0..15 go through the 16-column strip kernel and columns 16..23
 * through the 8-column kernel.
 *
 * The `height` argument is not forwarded: both calls use a fixed row count
 * of 32.
 * NOTE(review): presumably 24-wide blocks are always 32 rows tall - confirm
 * against the callers.
 */
static void hevc_vt_uniwgt_8t_24w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    /* Left 16 columns: one 16-column strip. */
    hevc_vt_uniwgt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
                                       filter, 32, weight,
                                       offset, rnd_val, 1);

    /* Remaining 8 columns, starting at column 16. */
    hevc_vt_uniwgt_8t_8w_msa(src + 16, src_stride, dst + 16, dst_stride,
                             filter, 32, weight, offset, rnd_val);
}
1711cabdff1aSopenharmony_ci
/*
 * Vertical 8-tap uni-weighted interpolation for a 32-column block.
 * Forwards every argument to hevc_vt_uniwgt_8t_16multx4mult_msa and asks
 * for two 16-column strips.
 */
static void hevc_vt_uniwgt_8t_32w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    hevc_vt_uniwgt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
                                       filter, height, weight,
                                       offset, rnd_val, 2);
}
1726cabdff1aSopenharmony_ci
/*
 * Vertical 8-tap uni-weighted interpolation for a 48-column block.
 * Delegates to hevc_vt_uniwgt_8t_16multx4mult_msa with three 16-column
 * strips.
 *
 * The `height` argument is not forwarded: a fixed row count of 64 is used.
 * NOTE(review): presumably 48-wide blocks are always 64 rows tall - confirm
 * against the callers.
 */
static void hevc_vt_uniwgt_8t_48w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    hevc_vt_uniwgt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
                                       filter, 64, weight,
                                       offset, rnd_val, 3);
}
1741cabdff1aSopenharmony_ci
/*
 * Vertical 8-tap uni-weighted interpolation for a 64-column block.
 * Forwards every argument to hevc_vt_uniwgt_8t_16multx4mult_msa and asks
 * for four 16-column strips.
 */
static void hevc_vt_uniwgt_8t_64w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    hevc_vt_uniwgt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
                                       filter, height, weight,
                                       offset, rnd_val, 4);
}
1756cabdff1aSopenharmony_ci
1757cabdff1aSopenharmony_cistatic void hevc_hv_uniwgt_8t_4w_msa(uint8_t *src,
1758cabdff1aSopenharmony_ci                                     int32_t src_stride,
1759cabdff1aSopenharmony_ci                                     uint8_t *dst,
1760cabdff1aSopenharmony_ci                                     int32_t dst_stride,
1761cabdff1aSopenharmony_ci                                     const int8_t *filter_x,
1762cabdff1aSopenharmony_ci                                     const int8_t *filter_y,
1763cabdff1aSopenharmony_ci                                     int32_t height,
1764cabdff1aSopenharmony_ci                                     int32_t weight,
1765cabdff1aSopenharmony_ci                                     int32_t offset,
1766cabdff1aSopenharmony_ci                                     int32_t rnd_val)
1767cabdff1aSopenharmony_ci{
1768cabdff1aSopenharmony_ci    uint32_t loop_cnt;
1769cabdff1aSopenharmony_ci    v16u8 out;
1770cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1771cabdff1aSopenharmony_ci    v8i16 filt0, filt1, filt2, filt3;
1772cabdff1aSopenharmony_ci    v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
1773cabdff1aSopenharmony_ci    v16i8 mask1, mask2, mask3;
1774cabdff1aSopenharmony_ci    v8i16 filter_vec;
1775cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1776cabdff1aSopenharmony_ci    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1777cabdff1aSopenharmony_ci    v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
1778cabdff1aSopenharmony_ci    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r;
1779cabdff1aSopenharmony_ci    v8i16 dst21_r, dst43_r, dst65_r, dst87_r, dst109_r;
1780cabdff1aSopenharmony_ci    v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
1781cabdff1aSopenharmony_ci    v4i32 weight_vec, offset_vec, rnd_vec, const_128, denom_vec;
1782cabdff1aSopenharmony_ci    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
1783cabdff1aSopenharmony_ci
1784cabdff1aSopenharmony_ci    src -= ((3 * src_stride) + 3);
1785cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_x);
1786cabdff1aSopenharmony_ci    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1787cabdff1aSopenharmony_ci
1788cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_y);
1789cabdff1aSopenharmony_ci    UNPCK_R_SB_SH(filter_vec, filter_vec);
1790cabdff1aSopenharmony_ci
1791cabdff1aSopenharmony_ci    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1792cabdff1aSopenharmony_ci
1793cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
1794cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
1795cabdff1aSopenharmony_ci    mask3 = mask0 + 6;
1796cabdff1aSopenharmony_ci
1797cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
1798cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
1799cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val);
1800cabdff1aSopenharmony_ci    denom_vec = rnd_vec - 6;
1801cabdff1aSopenharmony_ci
1802cabdff1aSopenharmony_ci    const_128 = __msa_ldi_w(128);
1803cabdff1aSopenharmony_ci    const_128 *= weight_vec;
1804cabdff1aSopenharmony_ci    offset_vec += __msa_srar_w(const_128, denom_vec);
1805cabdff1aSopenharmony_ci
1806cabdff1aSopenharmony_ci    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1807cabdff1aSopenharmony_ci    src += (7 * src_stride);
1808cabdff1aSopenharmony_ci    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1809cabdff1aSopenharmony_ci
1810cabdff1aSopenharmony_ci    /* row 0 row 1 row 2 row 3 */
1811cabdff1aSopenharmony_ci    VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1812cabdff1aSopenharmony_ci    VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1813cabdff1aSopenharmony_ci    VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
1814cabdff1aSopenharmony_ci               vec8, vec9, vec10, vec11);
1815cabdff1aSopenharmony_ci    VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
1816cabdff1aSopenharmony_ci               vec12, vec13, vec14, vec15);
1817cabdff1aSopenharmony_ci    dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1818cabdff1aSopenharmony_ci                              filt3);
1819cabdff1aSopenharmony_ci    dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1820cabdff1aSopenharmony_ci                              filt3);
1821cabdff1aSopenharmony_ci    dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1822cabdff1aSopenharmony_ci                              filt3);
1823cabdff1aSopenharmony_ci    dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
1824cabdff1aSopenharmony_ci                              filt3);
1825cabdff1aSopenharmony_ci
1826cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r);
1827cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r);
1828cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r);
1829cabdff1aSopenharmony_ci
1830cabdff1aSopenharmony_ci    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
1831cabdff1aSopenharmony_ci
1832cabdff1aSopenharmony_ci    for (loop_cnt = height >> 2; loop_cnt--;) {
1833cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src7, src8, src9, src10);
1834cabdff1aSopenharmony_ci        src += (4 * src_stride);
1835cabdff1aSopenharmony_ci        XORI_B4_128_SB(src7, src8, src9, src10);
1836cabdff1aSopenharmony_ci
1837cabdff1aSopenharmony_ci        VSHF_B4_SB(src7, src9, mask0, mask1, mask2, mask3,
1838cabdff1aSopenharmony_ci                   vec0, vec1, vec2, vec3);
1839cabdff1aSopenharmony_ci        VSHF_B4_SB(src8, src10, mask0, mask1, mask2, mask3,
1840cabdff1aSopenharmony_ci                   vec4, vec5, vec6, vec7);
1841cabdff1aSopenharmony_ci        dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1842cabdff1aSopenharmony_ci                                  filt3);
1843cabdff1aSopenharmony_ci        dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1844cabdff1aSopenharmony_ci                                   filt3);
1845cabdff1aSopenharmony_ci
1846cabdff1aSopenharmony_ci        dst76_r = __msa_ilvr_h(dst97, dst66);
1847cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst108, dst97, dst87_r, dst109_r);
1848cabdff1aSopenharmony_ci        dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
1849cabdff1aSopenharmony_ci        dst98_r = __msa_ilvr_h(dst66, dst108);
1850cabdff1aSopenharmony_ci
1851cabdff1aSopenharmony_ci        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
1852cabdff1aSopenharmony_ci                                filt_h1, filt_h2, filt_h3);
1853cabdff1aSopenharmony_ci        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
1854cabdff1aSopenharmony_ci                                filt_h1, filt_h2, filt_h3);
1855cabdff1aSopenharmony_ci        dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
1856cabdff1aSopenharmony_ci                                filt_h1, filt_h2, filt_h3);
1857cabdff1aSopenharmony_ci        dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
1858cabdff1aSopenharmony_ci                                filt_h1, filt_h2, filt_h3);
1859cabdff1aSopenharmony_ci
1860cabdff1aSopenharmony_ci        SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
1861cabdff1aSopenharmony_ci        MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
1862cabdff1aSopenharmony_ci        MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
1863cabdff1aSopenharmony_ci        SRAR_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, rnd_vec);
1864cabdff1aSopenharmony_ci        ADD2(dst0_r, offset_vec, dst1_r, offset_vec, dst0_r, dst1_r);
1865cabdff1aSopenharmony_ci        ADD2(dst2_r, offset_vec, dst3_r, offset_vec, dst2_r, dst3_r);
1866cabdff1aSopenharmony_ci        CLIP_SW4_0_255(dst0_r, dst1_r, dst2_r, dst3_r);
1867cabdff1aSopenharmony_ci        PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
1868cabdff1aSopenharmony_ci        out = (v16u8) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
1869cabdff1aSopenharmony_ci        ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
1870cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
1871cabdff1aSopenharmony_ci
1872cabdff1aSopenharmony_ci        dst10_r = dst54_r;
1873cabdff1aSopenharmony_ci        dst32_r = dst76_r;
1874cabdff1aSopenharmony_ci        dst54_r = dst98_r;
1875cabdff1aSopenharmony_ci        dst21_r = dst65_r;
1876cabdff1aSopenharmony_ci        dst43_r = dst87_r;
1877cabdff1aSopenharmony_ci        dst65_r = dst109_r;
1878cabdff1aSopenharmony_ci        dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
1879cabdff1aSopenharmony_ci    }
1880cabdff1aSopenharmony_ci}
1881cabdff1aSopenharmony_ci
1882cabdff1aSopenharmony_cistatic void hevc_hv_uniwgt_8t_8multx2mult_msa(uint8_t *src,
1883cabdff1aSopenharmony_ci                                              int32_t src_stride,
1884cabdff1aSopenharmony_ci                                              uint8_t *dst,
1885cabdff1aSopenharmony_ci                                              int32_t dst_stride,
1886cabdff1aSopenharmony_ci                                              const int8_t *filter_x,
1887cabdff1aSopenharmony_ci                                              const int8_t *filter_y,
1888cabdff1aSopenharmony_ci                                              int32_t height,
1889cabdff1aSopenharmony_ci                                              int32_t weight,
1890cabdff1aSopenharmony_ci                                              int32_t offset,
1891cabdff1aSopenharmony_ci                                              int32_t rnd_val,
1892cabdff1aSopenharmony_ci                                              int32_t width)
1893cabdff1aSopenharmony_ci{
1894cabdff1aSopenharmony_ci    uint32_t loop_cnt, cnt;
1895cabdff1aSopenharmony_ci    uint8_t *src_tmp;
1896cabdff1aSopenharmony_ci    uint8_t *dst_tmp;
1897cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1898cabdff1aSopenharmony_ci    v8i16 filt0, filt1, filt2, filt3;
1899cabdff1aSopenharmony_ci    v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
1900cabdff1aSopenharmony_ci    v16i8 mask1, mask2, mask3;
1901cabdff1aSopenharmony_ci    v8i16 filter_vec;
1902cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1903cabdff1aSopenharmony_ci    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1904cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
1905cabdff1aSopenharmony_ci    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
1906cabdff1aSopenharmony_ci    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
1907cabdff1aSopenharmony_ci    v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
1908cabdff1aSopenharmony_ci    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
1909cabdff1aSopenharmony_ci    v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
1910cabdff1aSopenharmony_ci    v4i32 weight_vec, offset_vec, rnd_vec, const_128, denom_vec;
1911cabdff1aSopenharmony_ci    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
1912cabdff1aSopenharmony_ci
1913cabdff1aSopenharmony_ci    src -= ((3 * src_stride) + 3);
1914cabdff1aSopenharmony_ci
1915cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
1916cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
1917cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val);
1918cabdff1aSopenharmony_ci    denom_vec = rnd_vec - 6;
1919cabdff1aSopenharmony_ci
1920cabdff1aSopenharmony_ci    const_128 = __msa_ldi_w(128);
1921cabdff1aSopenharmony_ci    const_128 *= weight_vec;
1922cabdff1aSopenharmony_ci    offset_vec += __msa_srar_w(const_128, denom_vec);
1923cabdff1aSopenharmony_ci
1924cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_x);
1925cabdff1aSopenharmony_ci    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1926cabdff1aSopenharmony_ci
1927cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_y);
1928cabdff1aSopenharmony_ci    UNPCK_R_SB_SH(filter_vec, filter_vec);
1929cabdff1aSopenharmony_ci    SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1930cabdff1aSopenharmony_ci
1931cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
1932cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
1933cabdff1aSopenharmony_ci    mask3 = mask0 + 6;
1934cabdff1aSopenharmony_ci
1935cabdff1aSopenharmony_ci    for (cnt = width >> 3; cnt--;) {
1936cabdff1aSopenharmony_ci        src_tmp = src;
1937cabdff1aSopenharmony_ci        dst_tmp = dst;
1938cabdff1aSopenharmony_ci
1939cabdff1aSopenharmony_ci        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1940cabdff1aSopenharmony_ci        src_tmp += (7 * src_stride);
1941cabdff1aSopenharmony_ci        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1942cabdff1aSopenharmony_ci
1943cabdff1aSopenharmony_ci        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1944cabdff1aSopenharmony_ci                   vec0, vec1, vec2, vec3);
1945cabdff1aSopenharmony_ci        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1946cabdff1aSopenharmony_ci                   vec4, vec5, vec6, vec7);
1947cabdff1aSopenharmony_ci        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1948cabdff1aSopenharmony_ci                   vec8, vec9, vec10, vec11);
1949cabdff1aSopenharmony_ci        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1950cabdff1aSopenharmony_ci                   vec12, vec13, vec14, vec15);
1951cabdff1aSopenharmony_ci        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1952cabdff1aSopenharmony_ci                                 filt3);
1953cabdff1aSopenharmony_ci        dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1954cabdff1aSopenharmony_ci                                 filt3);
1955cabdff1aSopenharmony_ci        dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1956cabdff1aSopenharmony_ci                                 filt3);
1957cabdff1aSopenharmony_ci        dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1958cabdff1aSopenharmony_ci                                 filt2, filt3);
1959cabdff1aSopenharmony_ci
1960cabdff1aSopenharmony_ci        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
1961cabdff1aSopenharmony_ci                   vec0, vec1, vec2, vec3);
1962cabdff1aSopenharmony_ci        VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
1963cabdff1aSopenharmony_ci                   vec4, vec5, vec6, vec7);
1964cabdff1aSopenharmony_ci        VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
1965cabdff1aSopenharmony_ci                   vec8, vec9, vec10, vec11);
1966cabdff1aSopenharmony_ci        dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1967cabdff1aSopenharmony_ci                                 filt3);
1968cabdff1aSopenharmony_ci        dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1969cabdff1aSopenharmony_ci                                 filt3);
1970cabdff1aSopenharmony_ci        dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1971cabdff1aSopenharmony_ci                                 filt3);
1972cabdff1aSopenharmony_ci
1973cabdff1aSopenharmony_ci        ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
1974cabdff1aSopenharmony_ci                   dst10_r, dst32_r, dst54_r, dst21_r);
1975cabdff1aSopenharmony_ci        ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
1976cabdff1aSopenharmony_ci        ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
1977cabdff1aSopenharmony_ci                   dst10_l, dst32_l, dst54_l, dst21_l);
1978cabdff1aSopenharmony_ci        ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
1979cabdff1aSopenharmony_ci
1980cabdff1aSopenharmony_ci        for (loop_cnt = height >> 1; loop_cnt--;) {
1981cabdff1aSopenharmony_ci            LD_SB2(src_tmp, src_stride, src7, src8);
1982cabdff1aSopenharmony_ci            src_tmp += 2 * src_stride;
1983cabdff1aSopenharmony_ci            XORI_B2_128_SB(src7, src8);
1984cabdff1aSopenharmony_ci
1985cabdff1aSopenharmony_ci            VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
1986cabdff1aSopenharmony_ci                       vec0, vec1, vec2, vec3);
1987cabdff1aSopenharmony_ci            dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
1988cabdff1aSopenharmony_ci                                     filt2, filt3);
1989cabdff1aSopenharmony_ci
1990cabdff1aSopenharmony_ci            ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
1991cabdff1aSopenharmony_ci            dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
1992cabdff1aSopenharmony_ci                                    filt_h0, filt_h1, filt_h2, filt_h3);
1993cabdff1aSopenharmony_ci            dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
1994cabdff1aSopenharmony_ci                                    filt_h0, filt_h1, filt_h2, filt_h3);
1995cabdff1aSopenharmony_ci            dst0_r >>= 6;
1996cabdff1aSopenharmony_ci            dst0_l >>= 6;
1997cabdff1aSopenharmony_ci
1998cabdff1aSopenharmony_ci            /* row 8 */
1999cabdff1aSopenharmony_ci            VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
2000cabdff1aSopenharmony_ci                       vec0, vec1, vec2, vec3);
2001cabdff1aSopenharmony_ci            dst8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
2002cabdff1aSopenharmony_ci                                     filt2, filt3);
2003cabdff1aSopenharmony_ci
2004cabdff1aSopenharmony_ci            ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
2005cabdff1aSopenharmony_ci            dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
2006cabdff1aSopenharmony_ci                                    filt_h0, filt_h1, filt_h2, filt_h3);
2007cabdff1aSopenharmony_ci            dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
2008cabdff1aSopenharmony_ci                                    filt_h0, filt_h1, filt_h2, filt_h3);
2009cabdff1aSopenharmony_ci            dst1_r >>= 6;
2010cabdff1aSopenharmony_ci            dst1_l >>= 6;
2011cabdff1aSopenharmony_ci
2012cabdff1aSopenharmony_ci            MUL2(dst0_r, weight_vec, dst0_l, weight_vec, dst0_r, dst0_l);
2013cabdff1aSopenharmony_ci            MUL2(dst1_r, weight_vec, dst1_l, weight_vec, dst1_r, dst1_l);
2014cabdff1aSopenharmony_ci            SRAR_W4_SW(dst0_r, dst1_r, dst0_l, dst1_l, rnd_vec);
2015cabdff1aSopenharmony_ci            ADD2(dst0_r, offset_vec, dst0_l, offset_vec, dst0_r, dst0_l);
2016cabdff1aSopenharmony_ci            ADD2(dst1_r, offset_vec, dst1_l, offset_vec, dst1_r, dst1_l);
2017cabdff1aSopenharmony_ci            CLIP_SW4_0_255(dst0_r, dst1_r, dst0_l, dst1_l);
2018cabdff1aSopenharmony_ci
2019cabdff1aSopenharmony_ci            PCKEV_H2_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
2020cabdff1aSopenharmony_ci            dst0_r = (v4i32) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
2021cabdff1aSopenharmony_ci            ST_D2(dst0_r, 0, 1, dst_tmp, dst_stride);
2022cabdff1aSopenharmony_ci            dst_tmp += (2 * dst_stride);
2023cabdff1aSopenharmony_ci
2024cabdff1aSopenharmony_ci            dst10_r = dst32_r;
2025cabdff1aSopenharmony_ci            dst32_r = dst54_r;
2026cabdff1aSopenharmony_ci            dst54_r = dst76_r;
2027cabdff1aSopenharmony_ci            dst10_l = dst32_l;
2028cabdff1aSopenharmony_ci            dst32_l = dst54_l;
2029cabdff1aSopenharmony_ci            dst54_l = dst76_l;
2030cabdff1aSopenharmony_ci            dst21_r = dst43_r;
2031cabdff1aSopenharmony_ci            dst43_r = dst65_r;
2032cabdff1aSopenharmony_ci            dst65_r = dst87_r;
2033cabdff1aSopenharmony_ci            dst21_l = dst43_l;
2034cabdff1aSopenharmony_ci            dst43_l = dst65_l;
2035cabdff1aSopenharmony_ci            dst65_l = dst87_l;
2036cabdff1aSopenharmony_ci            dst6 = dst8;
2037cabdff1aSopenharmony_ci        }
2038cabdff1aSopenharmony_ci
2039cabdff1aSopenharmony_ci        src += 8;
2040cabdff1aSopenharmony_ci        dst += 8;
2041cabdff1aSopenharmony_ci    }
2042cabdff1aSopenharmony_ci}
2043cabdff1aSopenharmony_ci
/*
 * 8-column entry point for the 2-D 8-tap uni-weighted filter: forwards
 * every argument unchanged to the strip-based routine with width = 8.
 */
static void hevc_hv_uniwgt_8t_8w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride,
                                      dst, dst_stride,
                                      filter_x, filter_y,
                                      height,
                                      weight, offset, rnd_val,
                                      8);
}
2059cabdff1aSopenharmony_ci
2060cabdff1aSopenharmony_cistatic void hevc_hv_uniwgt_8t_12w_msa(uint8_t *src,
2061cabdff1aSopenharmony_ci                                      int32_t src_stride,
2062cabdff1aSopenharmony_ci                                      uint8_t *dst,
2063cabdff1aSopenharmony_ci                                      int32_t dst_stride,
2064cabdff1aSopenharmony_ci                                      const int8_t *filter_x,
2065cabdff1aSopenharmony_ci                                      const int8_t *filter_y,
2066cabdff1aSopenharmony_ci                                      int32_t height,
2067cabdff1aSopenharmony_ci                                      int32_t weight,
2068cabdff1aSopenharmony_ci                                      int32_t offset,
2069cabdff1aSopenharmony_ci                                      int32_t rnd_val)
2070cabdff1aSopenharmony_ci{
2071cabdff1aSopenharmony_ci    uint32_t loop_cnt;
2072cabdff1aSopenharmony_ci    uint8_t *src_tmp, *dst_tmp;
2073cabdff1aSopenharmony_ci    v16u8 out;
2074cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2075cabdff1aSopenharmony_ci    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
2076cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2077cabdff1aSopenharmony_ci    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
2078cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2079cabdff1aSopenharmony_ci    v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
2080cabdff1aSopenharmony_ci    v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
2081cabdff1aSopenharmony_ci    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst10_l, dst32_l, dst54_l;
2082cabdff1aSopenharmony_ci    v8i16 dst98_r, dst21_r, dst43_r, dst65_r, dst87_r, dst109_r;
2083cabdff1aSopenharmony_ci    v8i16 dst76_l, filter_vec;
2084cabdff1aSopenharmony_ci    v4i32 dst0_r, dst0_l, dst1_r, dst2_r, dst3_r;
2085cabdff1aSopenharmony_ci    v4i32 weight_vec, offset_vec, rnd_vec, const_128, denom_vec;
2086cabdff1aSopenharmony_ci
2087cabdff1aSopenharmony_ci    src -= ((3 * src_stride) + 3);
2088cabdff1aSopenharmony_ci
2089cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_x);
2090cabdff1aSopenharmony_ci    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
2091cabdff1aSopenharmony_ci
2092cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_y);
2093cabdff1aSopenharmony_ci    UNPCK_R_SB_SH(filter_vec, filter_vec);
2094cabdff1aSopenharmony_ci
2095cabdff1aSopenharmony_ci    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
2096cabdff1aSopenharmony_ci
2097cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
2098cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
2099cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val);
2100cabdff1aSopenharmony_ci    denom_vec = rnd_vec - 6;
2101cabdff1aSopenharmony_ci
2102cabdff1aSopenharmony_ci    const_128 = __msa_ldi_w(128);
2103cabdff1aSopenharmony_ci    const_128 *= weight_vec;
2104cabdff1aSopenharmony_ci    offset_vec += __msa_srar_w(const_128, denom_vec);
2105cabdff1aSopenharmony_ci
2106cabdff1aSopenharmony_ci    mask0 = LD_SB(ff_hevc_mask_arr);
2107cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
2108cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
2109cabdff1aSopenharmony_ci    mask3 = mask0 + 6;
2110cabdff1aSopenharmony_ci
2111cabdff1aSopenharmony_ci    src_tmp = src;
2112cabdff1aSopenharmony_ci    dst_tmp = dst;
2113cabdff1aSopenharmony_ci
2114cabdff1aSopenharmony_ci    LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
2115cabdff1aSopenharmony_ci    src_tmp += (7 * src_stride);
2116cabdff1aSopenharmony_ci    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
2117cabdff1aSopenharmony_ci
2118cabdff1aSopenharmony_ci    /* row 0 row 1 row 2 row 3 */
2119cabdff1aSopenharmony_ci    VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
2120cabdff1aSopenharmony_ci    VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
2121cabdff1aSopenharmony_ci    VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
2122cabdff1aSopenharmony_ci               vec11);
2123cabdff1aSopenharmony_ci    VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
2124cabdff1aSopenharmony_ci               vec15);
2125cabdff1aSopenharmony_ci    dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2126cabdff1aSopenharmony_ci                             filt3);
2127cabdff1aSopenharmony_ci    dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2128cabdff1aSopenharmony_ci                             filt3);
2129cabdff1aSopenharmony_ci    dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2130cabdff1aSopenharmony_ci                             filt3);
2131cabdff1aSopenharmony_ci    dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
2132cabdff1aSopenharmony_ci                             filt2, filt3);
2133cabdff1aSopenharmony_ci    VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
2134cabdff1aSopenharmony_ci    VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
2135cabdff1aSopenharmony_ci    VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
2136cabdff1aSopenharmony_ci               vec11);
2137cabdff1aSopenharmony_ci    dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2138cabdff1aSopenharmony_ci                             filt3);
2139cabdff1aSopenharmony_ci    dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2140cabdff1aSopenharmony_ci                             filt3);
2141cabdff1aSopenharmony_ci    dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2142cabdff1aSopenharmony_ci                             filt3);
2143cabdff1aSopenharmony_ci
2144cabdff1aSopenharmony_ci    for (loop_cnt = 16; loop_cnt--;) {
2145cabdff1aSopenharmony_ci        src7 = LD_SB(src_tmp);
2146cabdff1aSopenharmony_ci        src7 = (v16i8) __msa_xori_b((v16u8) src7, 128);
2147cabdff1aSopenharmony_ci        src_tmp += src_stride;
2148cabdff1aSopenharmony_ci
2149cabdff1aSopenharmony_ci        VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
2150cabdff1aSopenharmony_ci                   vec3);
2151cabdff1aSopenharmony_ci        dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2152cabdff1aSopenharmony_ci                                 filt3);
2153cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
2154cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
2155cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
2156cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
2157cabdff1aSopenharmony_ci
2158cabdff1aSopenharmony_ci        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
2159cabdff1aSopenharmony_ci                                filt_h0, filt_h1, filt_h2, filt_h3);
2160cabdff1aSopenharmony_ci        dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
2161cabdff1aSopenharmony_ci                                filt_h0, filt_h1, filt_h2, filt_h3);
2162cabdff1aSopenharmony_ci        dst0_r >>= 6;
2163cabdff1aSopenharmony_ci        dst0_l >>= 6;
2164cabdff1aSopenharmony_ci
2165cabdff1aSopenharmony_ci        MUL2(dst0_r, weight_vec, dst0_l, weight_vec, dst0_r, dst0_l);
2166cabdff1aSopenharmony_ci        SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
2167cabdff1aSopenharmony_ci        ADD2(dst0_r, offset_vec, dst0_l, offset_vec, dst0_r, dst0_l);
2168cabdff1aSopenharmony_ci        CLIP_SW2_0_255(dst0_r, dst0_l);
2169cabdff1aSopenharmony_ci        dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
2170cabdff1aSopenharmony_ci        out = (v16u8) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r);
2171cabdff1aSopenharmony_ci        ST_D1(out, 0, dst_tmp);
2172cabdff1aSopenharmony_ci        dst_tmp += dst_stride;
2173cabdff1aSopenharmony_ci
2174cabdff1aSopenharmony_ci        dst0 = dst1;
2175cabdff1aSopenharmony_ci        dst1 = dst2;
2176cabdff1aSopenharmony_ci        dst2 = dst3;
2177cabdff1aSopenharmony_ci        dst3 = dst4;
2178cabdff1aSopenharmony_ci        dst4 = dst5;
2179cabdff1aSopenharmony_ci        dst5 = dst6;
2180cabdff1aSopenharmony_ci        dst6 = dst7;
2181cabdff1aSopenharmony_ci    }
2182cabdff1aSopenharmony_ci
2183cabdff1aSopenharmony_ci    src += 8;
2184cabdff1aSopenharmony_ci    dst += 8;
2185cabdff1aSopenharmony_ci
2186cabdff1aSopenharmony_ci    mask4 = LD_SB(ff_hevc_mask_arr + 16);
2187cabdff1aSopenharmony_ci    mask5 = mask4 + 2;
2188cabdff1aSopenharmony_ci    mask6 = mask4 + 4;
2189cabdff1aSopenharmony_ci    mask7 = mask4 + 6;
2190cabdff1aSopenharmony_ci
2191cabdff1aSopenharmony_ci    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
2192cabdff1aSopenharmony_ci    src += (7 * src_stride);
2193cabdff1aSopenharmony_ci    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
2194cabdff1aSopenharmony_ci
2195cabdff1aSopenharmony_ci    VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
2196cabdff1aSopenharmony_ci    VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
2197cabdff1aSopenharmony_ci    VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
2198cabdff1aSopenharmony_ci               vec11);
2199cabdff1aSopenharmony_ci    VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14,
2200cabdff1aSopenharmony_ci               vec15);
2201cabdff1aSopenharmony_ci    dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2202cabdff1aSopenharmony_ci                              filt3);
2203cabdff1aSopenharmony_ci    dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2204cabdff1aSopenharmony_ci                              filt3);
2205cabdff1aSopenharmony_ci    dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2206cabdff1aSopenharmony_ci                              filt3);
2207cabdff1aSopenharmony_ci    dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
2208cabdff1aSopenharmony_ci                              filt3);
2209cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r);
2210cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r);
2211cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r);
2212cabdff1aSopenharmony_ci
2213cabdff1aSopenharmony_ci    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
2214cabdff1aSopenharmony_ci
2215cabdff1aSopenharmony_ci    for (loop_cnt = 4; loop_cnt--;) {
2216cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src7, src8, src9, src10);
2217cabdff1aSopenharmony_ci        src += (4 * src_stride);
2218cabdff1aSopenharmony_ci        XORI_B4_128_SB(src7, src8, src9, src10);
2219cabdff1aSopenharmony_ci
2220cabdff1aSopenharmony_ci        VSHF_B4_SB(src7, src9, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
2221cabdff1aSopenharmony_ci                   vec3);
2222cabdff1aSopenharmony_ci        VSHF_B4_SB(src8, src10, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
2223cabdff1aSopenharmony_ci                   vec7);
2224cabdff1aSopenharmony_ci        dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2225cabdff1aSopenharmony_ci                                  filt3);
2226cabdff1aSopenharmony_ci        dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2227cabdff1aSopenharmony_ci                                   filt3);
2228cabdff1aSopenharmony_ci
2229cabdff1aSopenharmony_ci        dst76_r = __msa_ilvr_h(dst97, dst66);
2230cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst108, dst97, dst87_r, dst109_r);
2231cabdff1aSopenharmony_ci        dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
2232cabdff1aSopenharmony_ci        dst98_r = __msa_ilvr_h(dst66, dst108);
2233cabdff1aSopenharmony_ci
2234cabdff1aSopenharmony_ci        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
2235cabdff1aSopenharmony_ci                                filt_h1, filt_h2, filt_h3);
2236cabdff1aSopenharmony_ci        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
2237cabdff1aSopenharmony_ci                                filt_h1, filt_h2, filt_h3);
2238cabdff1aSopenharmony_ci        dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
2239cabdff1aSopenharmony_ci                                filt_h1, filt_h2, filt_h3);
2240cabdff1aSopenharmony_ci        dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
2241cabdff1aSopenharmony_ci                                filt_h1, filt_h2, filt_h3);
2242cabdff1aSopenharmony_ci
2243cabdff1aSopenharmony_ci        SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
2244cabdff1aSopenharmony_ci        MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
2245cabdff1aSopenharmony_ci        MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
2246cabdff1aSopenharmony_ci        SRAR_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, rnd_vec);
2247cabdff1aSopenharmony_ci        ADD2(dst0_r, offset_vec, dst1_r, offset_vec, dst0_r, dst1_r);
2248cabdff1aSopenharmony_ci        ADD2(dst2_r, offset_vec, dst3_r, offset_vec, dst2_r, dst3_r);
2249cabdff1aSopenharmony_ci        CLIP_SW4_0_255(dst0_r, dst1_r, dst2_r, dst3_r);
2250cabdff1aSopenharmony_ci        PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
2251cabdff1aSopenharmony_ci        out = (v16u8) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
2252cabdff1aSopenharmony_ci        ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2253cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
2254cabdff1aSopenharmony_ci
2255cabdff1aSopenharmony_ci        dst10_r = dst54_r;
2256cabdff1aSopenharmony_ci        dst32_r = dst76_r;
2257cabdff1aSopenharmony_ci        dst54_r = dst98_r;
2258cabdff1aSopenharmony_ci        dst21_r = dst65_r;
2259cabdff1aSopenharmony_ci        dst43_r = dst87_r;
2260cabdff1aSopenharmony_ci        dst65_r = dst109_r;
2261cabdff1aSopenharmony_ci        dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
2262cabdff1aSopenharmony_ci    }
2263cabdff1aSopenharmony_ci}
2264cabdff1aSopenharmony_ci
/*
 * 16-column entry point for the 2-D 8-tap uni-weighted filter: forwards
 * every argument unchanged to the strip-based routine with width = 16.
 */
static void hevc_hv_uniwgt_8t_16w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride,
                                      dst, dst_stride,
                                      filter_x, filter_y,
                                      height,
                                      weight, offset, rnd_val,
                                      16);
}
2280cabdff1aSopenharmony_ci
/*
 * 24-column entry point for the 2-D 8-tap uni-weighted filter: forwards
 * every argument unchanged to the strip-based routine with width = 24.
 */
static void hevc_hv_uniwgt_8t_24w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride,
                                      dst, dst_stride,
                                      filter_x, filter_y,
                                      height,
                                      weight, offset, rnd_val,
                                      24);
}
2296cabdff1aSopenharmony_ci
/*
 * 32-column entry point for the 2-D 8-tap uni-weighted filter: forwards
 * every argument unchanged to the strip-based routine with width = 32.
 */
static void hevc_hv_uniwgt_8t_32w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride,
                                      dst, dst_stride,
                                      filter_x, filter_y,
                                      height,
                                      weight, offset, rnd_val,
                                      32);
}
2312cabdff1aSopenharmony_ci
/*
 * 48-column entry point for the 2-D 8-tap uni-weighted filter: forwards
 * every argument unchanged to the strip-based routine with width = 48.
 */
static void hevc_hv_uniwgt_8t_48w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride,
                                      dst, dst_stride,
                                      filter_x, filter_y,
                                      height,
                                      weight, offset, rnd_val,
                                      48);
}
2328cabdff1aSopenharmony_ci
/*
 * 64-column entry point for the 2-D 8-tap uni-weighted filter: forwards
 * every argument unchanged to the strip-based routine with width = 64.
 */
static void hevc_hv_uniwgt_8t_64w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride,
                                      dst, dst_stride,
                                      filter_x, filter_y,
                                      height,
                                      weight, offset, rnd_val,
                                      64);
}
2344cabdff1aSopenharmony_ci
2345cabdff1aSopenharmony_cistatic void hevc_hz_uniwgt_4t_4x2_msa(uint8_t *src,
2346cabdff1aSopenharmony_ci                                      int32_t src_stride,
2347cabdff1aSopenharmony_ci                                      uint8_t *dst,
2348cabdff1aSopenharmony_ci                                      int32_t dst_stride,
2349cabdff1aSopenharmony_ci                                      const int8_t *filter,
2350cabdff1aSopenharmony_ci                                      int32_t weight,
2351cabdff1aSopenharmony_ci                                      int32_t offset,
2352cabdff1aSopenharmony_ci                                      int32_t rnd_val)
2353cabdff1aSopenharmony_ci{
2354cabdff1aSopenharmony_ci    v16u8 out;
2355cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
2356cabdff1aSopenharmony_ci    v16i8 src0, src1, vec0, vec1;
2357cabdff1aSopenharmony_ci    v16i8 mask1;
2358cabdff1aSopenharmony_ci    v8i16 dst0;
2359cabdff1aSopenharmony_ci    v4i32 dst0_r, dst0_l;
2360cabdff1aSopenharmony_ci    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
2361cabdff1aSopenharmony_ci    v4i32 weight_vec, rnd_vec;
2362cabdff1aSopenharmony_ci    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2363cabdff1aSopenharmony_ci
2364cabdff1aSopenharmony_ci    src -= 1;
2365cabdff1aSopenharmony_ci
2366cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
2367cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2368cabdff1aSopenharmony_ci
2369cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
2370cabdff1aSopenharmony_ci
2371cabdff1aSopenharmony_ci    weight = weight & 0x0000FFFF;
2372cabdff1aSopenharmony_ci
2373cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
2374cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val);
2375cabdff1aSopenharmony_ci
2376cabdff1aSopenharmony_ci    weight *= 128;
2377cabdff1aSopenharmony_ci    rnd_val -= 6;
2378cabdff1aSopenharmony_ci
2379cabdff1aSopenharmony_ci    weight_vec_h = __msa_fill_h(weight);
2380cabdff1aSopenharmony_ci    offset_vec = __msa_fill_h(offset);
2381cabdff1aSopenharmony_ci    denom_vec = __msa_fill_h(rnd_val);
2382cabdff1aSopenharmony_ci
2383cabdff1aSopenharmony_ci    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
2384cabdff1aSopenharmony_ci    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
2385cabdff1aSopenharmony_ci
2386cabdff1aSopenharmony_ci    LD_SB2(src, src_stride, src0, src1);
2387cabdff1aSopenharmony_ci    XORI_B2_128_SB(src0, src1);
2388cabdff1aSopenharmony_ci
2389cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
2390cabdff1aSopenharmony_ci    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2391cabdff1aSopenharmony_ci
2392cabdff1aSopenharmony_ci    ILVRL_H2_SW(dst0, dst0, dst0_r, dst0_l);
2393cabdff1aSopenharmony_ci    DOTP_SH2_SW(dst0_r, dst0_l, weight_vec, weight_vec, dst0_r, dst0_l);
2394cabdff1aSopenharmony_ci    SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
2395cabdff1aSopenharmony_ci    dst0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
2396cabdff1aSopenharmony_ci    dst0 = __msa_adds_s_h(dst0, offset_vec);
2397cabdff1aSopenharmony_ci    CLIP_SH_0_255(dst0);
2398cabdff1aSopenharmony_ci    out = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
2399cabdff1aSopenharmony_ci    ST_W2(out, 0, 1, dst, dst_stride);
2400cabdff1aSopenharmony_ci    dst += (4 * dst_stride);
2401cabdff1aSopenharmony_ci}
2402cabdff1aSopenharmony_ci
2403cabdff1aSopenharmony_cistatic void hevc_hz_uniwgt_4t_4x4_msa(uint8_t *src,
2404cabdff1aSopenharmony_ci                                      int32_t src_stride,
2405cabdff1aSopenharmony_ci                                      uint8_t *dst,
2406cabdff1aSopenharmony_ci                                      int32_t dst_stride,
2407cabdff1aSopenharmony_ci                                      const int8_t *filter,
2408cabdff1aSopenharmony_ci                                      int32_t weight,
2409cabdff1aSopenharmony_ci                                      int32_t offset,
2410cabdff1aSopenharmony_ci                                      int32_t rnd_val)
2411cabdff1aSopenharmony_ci{
2412cabdff1aSopenharmony_ci    v16u8 out;
2413cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
2414cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3;
2415cabdff1aSopenharmony_ci    v16i8 mask1, vec0, vec1, vec2, vec3;
2416cabdff1aSopenharmony_ci    v8i16 dst0, dst1;
2417cabdff1aSopenharmony_ci    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
2418cabdff1aSopenharmony_ci    v4i32 weight_vec, rnd_vec;
2419cabdff1aSopenharmony_ci    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2420cabdff1aSopenharmony_ci
2421cabdff1aSopenharmony_ci    src -= 1;
2422cabdff1aSopenharmony_ci
2423cabdff1aSopenharmony_ci    /* rearranging filter */
2424cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
2425cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2426cabdff1aSopenharmony_ci
2427cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
2428cabdff1aSopenharmony_ci
2429cabdff1aSopenharmony_ci    weight = weight & 0x0000FFFF;
2430cabdff1aSopenharmony_ci
2431cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
2432cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val);
2433cabdff1aSopenharmony_ci
2434cabdff1aSopenharmony_ci    weight *= 128;
2435cabdff1aSopenharmony_ci    rnd_val -= 6;
2436cabdff1aSopenharmony_ci
2437cabdff1aSopenharmony_ci    weight_vec_h = __msa_fill_h(weight);
2438cabdff1aSopenharmony_ci    offset_vec = __msa_fill_h(offset);
2439cabdff1aSopenharmony_ci    denom_vec = __msa_fill_h(rnd_val);
2440cabdff1aSopenharmony_ci
2441cabdff1aSopenharmony_ci    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
2442cabdff1aSopenharmony_ci    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
2443cabdff1aSopenharmony_ci
2444cabdff1aSopenharmony_ci    LD_SB4(src, src_stride, src0, src1, src2, src3);
2445cabdff1aSopenharmony_ci    XORI_B4_128_SB(src0, src1, src2, src3);
2446cabdff1aSopenharmony_ci
2447cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
2448cabdff1aSopenharmony_ci    VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec2, vec3);
2449cabdff1aSopenharmony_ci    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2450cabdff1aSopenharmony_ci    dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
2451cabdff1aSopenharmony_ci
2452cabdff1aSopenharmony_ci    HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec, rnd_vec,
2453cabdff1aSopenharmony_ci                                   dst0, dst1);
2454cabdff1aSopenharmony_ci
2455cabdff1aSopenharmony_ci    out = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
2456cabdff1aSopenharmony_ci    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2457cabdff1aSopenharmony_ci    dst += (4 * dst_stride);
2458cabdff1aSopenharmony_ci}
2459cabdff1aSopenharmony_ci
2460cabdff1aSopenharmony_cistatic void hevc_hz_uniwgt_4t_4x8multiple_msa(uint8_t *src,
2461cabdff1aSopenharmony_ci                                              int32_t src_stride,
2462cabdff1aSopenharmony_ci                                              uint8_t *dst,
2463cabdff1aSopenharmony_ci                                              int32_t dst_stride,
2464cabdff1aSopenharmony_ci                                              const int8_t *filter,
2465cabdff1aSopenharmony_ci                                              int32_t height,
2466cabdff1aSopenharmony_ci                                              int32_t weight,
2467cabdff1aSopenharmony_ci                                              int32_t offset,
2468cabdff1aSopenharmony_ci                                              int32_t rnd_val)
2469cabdff1aSopenharmony_ci{
2470cabdff1aSopenharmony_ci    uint32_t loop_cnt;
2471cabdff1aSopenharmony_ci    v16u8 out0, out1;
2472cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
2473cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2474cabdff1aSopenharmony_ci    v16i8 mask1, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2475cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3;
2476cabdff1aSopenharmony_ci    v8i16 filter_vec;
2477cabdff1aSopenharmony_ci    v8i16 weight_vec_h, offset_vec, denom_vec;
2478cabdff1aSopenharmony_ci    v4i32 weight_vec, rnd_vec;
2479cabdff1aSopenharmony_ci    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2480cabdff1aSopenharmony_ci
2481cabdff1aSopenharmony_ci    src -= 1;
2482cabdff1aSopenharmony_ci
2483cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
2484cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2485cabdff1aSopenharmony_ci
2486cabdff1aSopenharmony_ci    weight = weight & 0x0000FFFF;
2487cabdff1aSopenharmony_ci
2488cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
2489cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val);
2490cabdff1aSopenharmony_ci
2491cabdff1aSopenharmony_ci    weight *= 128;
2492cabdff1aSopenharmony_ci    rnd_val -= 6;
2493cabdff1aSopenharmony_ci
2494cabdff1aSopenharmony_ci    weight_vec_h = __msa_fill_h(weight);
2495cabdff1aSopenharmony_ci    offset_vec = __msa_fill_h(offset);
2496cabdff1aSopenharmony_ci    denom_vec = __msa_fill_h(rnd_val);
2497cabdff1aSopenharmony_ci
2498cabdff1aSopenharmony_ci    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
2499cabdff1aSopenharmony_ci    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
2500cabdff1aSopenharmony_ci
2501cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
2502cabdff1aSopenharmony_ci
2503cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 3); loop_cnt--;) {
2504cabdff1aSopenharmony_ci        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2505cabdff1aSopenharmony_ci        src += (8 * src_stride);
2506cabdff1aSopenharmony_ci
2507cabdff1aSopenharmony_ci        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2508cabdff1aSopenharmony_ci
2509cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
2510cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec2, vec3);
2511cabdff1aSopenharmony_ci        VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec4, vec5);
2512cabdff1aSopenharmony_ci        VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec6, vec7);
2513cabdff1aSopenharmony_ci        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2514cabdff1aSopenharmony_ci        dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
2515cabdff1aSopenharmony_ci        dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
2516cabdff1aSopenharmony_ci        dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
2517cabdff1aSopenharmony_ci
2518cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
2519cabdff1aSopenharmony_ci                                       weight_vec, offset_vec, rnd_vec,
2520cabdff1aSopenharmony_ci                                       dst0, dst1, dst2, dst3);
2521cabdff1aSopenharmony_ci
2522cabdff1aSopenharmony_ci        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
2523cabdff1aSopenharmony_ci        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
2524cabdff1aSopenharmony_ci        dst += (8 * dst_stride);
2525cabdff1aSopenharmony_ci    }
2526cabdff1aSopenharmony_ci}
2527cabdff1aSopenharmony_ci
/*
 * Height dispatcher for the 4-pixel-wide horizontal 4-tap uni-weighted
 * filter.
 *
 *   height == 2             -> hevc_hz_uniwgt_4t_4x2_msa()
 *   height == 4             -> hevc_hz_uniwgt_4t_4x4_msa()
 *   positive multiple of 8  -> hevc_hz_uniwgt_4t_4x8multiple_msa()
 *
 * The 8-row kernel iterates (height >> 3) times over groups of 8 rows, so
 * every positive multiple of 8 is forwarded to it. Any other height
 * (including zero and negative values) leaves dst untouched.
 *
 * All remaining arguments are passed through unchanged.
 */
static void hevc_hz_uniwgt_4t_4w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    if (2 == height) {
        hevc_hz_uniwgt_4t_4x2_msa(src, src_stride, dst, dst_stride,
                                  filter, weight, offset, rnd_val);
    } else if (4 == height) {
        hevc_hz_uniwgt_4t_4x4_msa(src, src_stride, dst, dst_stride,
                                  filter, weight, offset, rnd_val);
    } else if (height >= 8 && 0 == (height % 8)) {
        /* height >= 8 keeps the kernel's unsigned loop counter sane */
        hevc_hz_uniwgt_4t_4x8multiple_msa(src, src_stride, dst, dst_stride,
                                          filter, height, weight,
                                          offset, rnd_val);
    }
}
2550cabdff1aSopenharmony_ci
2551cabdff1aSopenharmony_cistatic void hevc_hz_uniwgt_4t_6w_msa(uint8_t *src,
2552cabdff1aSopenharmony_ci                                     int32_t src_stride,
2553cabdff1aSopenharmony_ci                                     uint8_t *dst,
2554cabdff1aSopenharmony_ci                                     int32_t dst_stride,
2555cabdff1aSopenharmony_ci                                     const int8_t *filter,
2556cabdff1aSopenharmony_ci                                     int32_t height,
2557cabdff1aSopenharmony_ci                                     int32_t weight,
2558cabdff1aSopenharmony_ci                                     int32_t offset,
2559cabdff1aSopenharmony_ci                                     int32_t rnd_val)
2560cabdff1aSopenharmony_ci{
2561cabdff1aSopenharmony_ci    v16u8 out0, out1, out2, out3;
2562cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
2563cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2564cabdff1aSopenharmony_ci    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
2565cabdff1aSopenharmony_ci    v16i8 mask1;
2566cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2567cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2568cabdff1aSopenharmony_ci    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
2569cabdff1aSopenharmony_ci    v4i32 weight_vec, rnd_vec;
2570cabdff1aSopenharmony_ci
2571cabdff1aSopenharmony_ci    src -= 1;
2572cabdff1aSopenharmony_ci
2573cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
2574cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2575cabdff1aSopenharmony_ci
2576cabdff1aSopenharmony_ci    weight = weight & 0x0000FFFF;
2577cabdff1aSopenharmony_ci
2578cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
2579cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val);
2580cabdff1aSopenharmony_ci
2581cabdff1aSopenharmony_ci    weight *= 128;
2582cabdff1aSopenharmony_ci    rnd_val -= 6;
2583cabdff1aSopenharmony_ci
2584cabdff1aSopenharmony_ci    weight_vec_h = __msa_fill_h(weight);
2585cabdff1aSopenharmony_ci    offset_vec = __msa_fill_h(offset);
2586cabdff1aSopenharmony_ci    denom_vec = __msa_fill_h(rnd_val);
2587cabdff1aSopenharmony_ci
2588cabdff1aSopenharmony_ci    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
2589cabdff1aSopenharmony_ci    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
2590cabdff1aSopenharmony_ci
2591cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
2592cabdff1aSopenharmony_ci
2593cabdff1aSopenharmony_ci    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2594cabdff1aSopenharmony_ci    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2595cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2596cabdff1aSopenharmony_ci    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
2597cabdff1aSopenharmony_ci    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
2598cabdff1aSopenharmony_ci    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
2599cabdff1aSopenharmony_ci    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2600cabdff1aSopenharmony_ci    dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
2601cabdff1aSopenharmony_ci    dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
2602cabdff1aSopenharmony_ci    dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
2603cabdff1aSopenharmony_ci    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
2604cabdff1aSopenharmony_ci    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec2, vec3);
2605cabdff1aSopenharmony_ci    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec4, vec5);
2606cabdff1aSopenharmony_ci    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec6, vec7);
2607cabdff1aSopenharmony_ci    dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2608cabdff1aSopenharmony_ci    dst5 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
2609cabdff1aSopenharmony_ci    dst6 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
2610cabdff1aSopenharmony_ci    dst7 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
2611cabdff1aSopenharmony_ci
2612cabdff1aSopenharmony_ci    HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
2613cabdff1aSopenharmony_ci                                   weight_vec, offset_vec, rnd_vec,
2614cabdff1aSopenharmony_ci                                   dst0, dst1, dst2, dst3);
2615cabdff1aSopenharmony_ci    HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7,
2616cabdff1aSopenharmony_ci                                   weight_vec, offset_vec, rnd_vec,
2617cabdff1aSopenharmony_ci                                   dst4, dst5, dst6, dst7);
2618cabdff1aSopenharmony_ci
2619cabdff1aSopenharmony_ci    PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
2620cabdff1aSopenharmony_ci    PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
2621cabdff1aSopenharmony_ci    ST_W2(out0, 0, 2, dst, dst_stride);
2622cabdff1aSopenharmony_ci    ST_H2(out0, 2, 6, dst + 4, dst_stride);
2623cabdff1aSopenharmony_ci    ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
2624cabdff1aSopenharmony_ci    ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2625cabdff1aSopenharmony_ci    dst += (4 * dst_stride);
2626cabdff1aSopenharmony_ci    ST_W2(out2, 0, 2, dst, dst_stride);
2627cabdff1aSopenharmony_ci    ST_H2(out2, 2, 6, dst + 4, dst_stride);
2628cabdff1aSopenharmony_ci    ST_W2(out3, 0, 2, dst + 2 * dst_stride, dst_stride);
2629cabdff1aSopenharmony_ci    ST_H2(out3, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2630cabdff1aSopenharmony_ci}
2631cabdff1aSopenharmony_ci
2632cabdff1aSopenharmony_cistatic void hevc_hz_uniwgt_4t_8x2_msa(uint8_t *src,
2633cabdff1aSopenharmony_ci                                      int32_t src_stride,
2634cabdff1aSopenharmony_ci                                      uint8_t *dst,
2635cabdff1aSopenharmony_ci                                      int32_t dst_stride,
2636cabdff1aSopenharmony_ci                                      const int8_t *filter,
2637cabdff1aSopenharmony_ci                                      int32_t weight,
2638cabdff1aSopenharmony_ci                                      int32_t offset,
2639cabdff1aSopenharmony_ci                                      int32_t rnd_val)
2640cabdff1aSopenharmony_ci{
2641cabdff1aSopenharmony_ci    v16u8 out;
2642cabdff1aSopenharmony_ci    v8i16 filt0, filt1, dst0, dst1;
2643cabdff1aSopenharmony_ci    v16i8 src0, src1;
2644cabdff1aSopenharmony_ci    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
2645cabdff1aSopenharmony_ci    v16i8 mask1;
2646cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3;
2647cabdff1aSopenharmony_ci    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
2648cabdff1aSopenharmony_ci    v4i32 weight_vec, rnd_vec;
2649cabdff1aSopenharmony_ci
2650cabdff1aSopenharmony_ci    src -= 1;
2651cabdff1aSopenharmony_ci
2652cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
2653cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2654cabdff1aSopenharmony_ci
2655cabdff1aSopenharmony_ci    weight = weight & 0x0000FFFF;
2656cabdff1aSopenharmony_ci
2657cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
2658cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val);
2659cabdff1aSopenharmony_ci
2660cabdff1aSopenharmony_ci    weight *= 128;
2661cabdff1aSopenharmony_ci    rnd_val -= 6;
2662cabdff1aSopenharmony_ci
2663cabdff1aSopenharmony_ci    weight_vec_h = __msa_fill_h(weight);
2664cabdff1aSopenharmony_ci    offset_vec = __msa_fill_h(offset);
2665cabdff1aSopenharmony_ci    denom_vec = __msa_fill_h(rnd_val);
2666cabdff1aSopenharmony_ci
2667cabdff1aSopenharmony_ci    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
2668cabdff1aSopenharmony_ci    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
2669cabdff1aSopenharmony_ci
2670cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
2671cabdff1aSopenharmony_ci
2672cabdff1aSopenharmony_ci    LD_SB2(src, src_stride, src0, src1);
2673cabdff1aSopenharmony_ci    XORI_B2_128_SB(src0, src1);
2674cabdff1aSopenharmony_ci
2675cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2676cabdff1aSopenharmony_ci    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
2677cabdff1aSopenharmony_ci    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2678cabdff1aSopenharmony_ci    dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
2679cabdff1aSopenharmony_ci
2680cabdff1aSopenharmony_ci    HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec, rnd_vec,
2681cabdff1aSopenharmony_ci                                   dst0, dst1);
2682cabdff1aSopenharmony_ci
2683cabdff1aSopenharmony_ci    out = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
2684cabdff1aSopenharmony_ci    ST_D2(out, 0, 1, dst, dst_stride);
2685cabdff1aSopenharmony_ci}
2686cabdff1aSopenharmony_ci
2687cabdff1aSopenharmony_cistatic void hevc_hz_uniwgt_4t_8x4_msa(uint8_t *src,
2688cabdff1aSopenharmony_ci                                      int32_t src_stride,
2689cabdff1aSopenharmony_ci                                      uint8_t *dst,
2690cabdff1aSopenharmony_ci                                      int32_t dst_stride,
2691cabdff1aSopenharmony_ci                                      const int8_t *filter,
2692cabdff1aSopenharmony_ci                                      int32_t weight,
2693cabdff1aSopenharmony_ci                                      int32_t offset,
2694cabdff1aSopenharmony_ci                                      int32_t rnd_val)
2695cabdff1aSopenharmony_ci{
2696cabdff1aSopenharmony_ci    v16u8 out0, out1;
2697cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3;
2698cabdff1aSopenharmony_ci    v16i8 mask0, mask1, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2699cabdff1aSopenharmony_ci    v8i16 filt0, filt1, dst0, dst1, dst2, dst3;
2700cabdff1aSopenharmony_ci    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
2701cabdff1aSopenharmony_ci    v4i32 weight_vec, rnd_vec;
2702cabdff1aSopenharmony_ci
2703cabdff1aSopenharmony_ci    src -= 1;
2704cabdff1aSopenharmony_ci
2705cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
2706cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2707cabdff1aSopenharmony_ci
2708cabdff1aSopenharmony_ci    weight = weight & 0x0000FFFF;
2709cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
2710cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val);
2711cabdff1aSopenharmony_ci
2712cabdff1aSopenharmony_ci    weight *= 128;
2713cabdff1aSopenharmony_ci    rnd_val -= 6;
2714cabdff1aSopenharmony_ci
2715cabdff1aSopenharmony_ci    weight_vec_h = __msa_fill_h(weight);
2716cabdff1aSopenharmony_ci    offset_vec = __msa_fill_h(offset);
2717cabdff1aSopenharmony_ci    denom_vec = __msa_fill_h(rnd_val);
2718cabdff1aSopenharmony_ci
2719cabdff1aSopenharmony_ci    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
2720cabdff1aSopenharmony_ci    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
2721cabdff1aSopenharmony_ci
2722cabdff1aSopenharmony_ci    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2723cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
2724cabdff1aSopenharmony_ci
2725cabdff1aSopenharmony_ci    LD_SB4(src, src_stride, src0, src1, src2, src3);
2726cabdff1aSopenharmony_ci    XORI_B4_128_SB(src0, src1, src2, src3);
2727cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2728cabdff1aSopenharmony_ci    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
2729cabdff1aSopenharmony_ci    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
2730cabdff1aSopenharmony_ci    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
2731cabdff1aSopenharmony_ci    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2732cabdff1aSopenharmony_ci    dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
2733cabdff1aSopenharmony_ci    dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
2734cabdff1aSopenharmony_ci    dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
2735cabdff1aSopenharmony_ci
2736cabdff1aSopenharmony_ci    HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
2737cabdff1aSopenharmony_ci                                   weight_vec, offset_vec, rnd_vec,
2738cabdff1aSopenharmony_ci                                   dst0, dst1, dst2, dst3);
2739cabdff1aSopenharmony_ci
2740cabdff1aSopenharmony_ci    PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
2741cabdff1aSopenharmony_ci    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
2742cabdff1aSopenharmony_ci}
2743cabdff1aSopenharmony_ci
2744cabdff1aSopenharmony_cistatic void hevc_hz_uniwgt_4t_8x6_msa(uint8_t *src,
2745cabdff1aSopenharmony_ci                                      int32_t src_stride,
2746cabdff1aSopenharmony_ci                                      uint8_t *dst,
2747cabdff1aSopenharmony_ci                                      int32_t dst_stride,
2748cabdff1aSopenharmony_ci                                      const int8_t *filter,
2749cabdff1aSopenharmony_ci                                      int32_t weight,
2750cabdff1aSopenharmony_ci                                      int32_t offset,
2751cabdff1aSopenharmony_ci                                      int32_t rnd_val)
2752cabdff1aSopenharmony_ci{
2753cabdff1aSopenharmony_ci    v16u8 out0, out1, out2;
2754cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
2755cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5;
2756cabdff1aSopenharmony_ci    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2757cabdff1aSopenharmony_ci    v16i8 mask1;
2758cabdff1aSopenharmony_ci    v16i8 vec11;
2759cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
2760cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
2761cabdff1aSopenharmony_ci    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
2762cabdff1aSopenharmony_ci    v4i32 weight_vec, rnd_vec;
2763cabdff1aSopenharmony_ci
2764cabdff1aSopenharmony_ci    src -= 1;
2765cabdff1aSopenharmony_ci
2766cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
2767cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2768cabdff1aSopenharmony_ci
2769cabdff1aSopenharmony_ci    weight = weight & 0x0000FFFF;
2770cabdff1aSopenharmony_ci
2771cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
2772cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val);
2773cabdff1aSopenharmony_ci
2774cabdff1aSopenharmony_ci    weight *= 128;
2775cabdff1aSopenharmony_ci    rnd_val -= 6;
2776cabdff1aSopenharmony_ci
2777cabdff1aSopenharmony_ci    weight_vec_h = __msa_fill_h(weight);
2778cabdff1aSopenharmony_ci    offset_vec = __msa_fill_h(offset);
2779cabdff1aSopenharmony_ci    denom_vec = __msa_fill_h(rnd_val);
2780cabdff1aSopenharmony_ci
2781cabdff1aSopenharmony_ci    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
2782cabdff1aSopenharmony_ci    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
2783cabdff1aSopenharmony_ci
2784cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
2785cabdff1aSopenharmony_ci
2786cabdff1aSopenharmony_ci    LD_SB6(src, src_stride, src0, src1, src2, src3, src4, src5);
2787cabdff1aSopenharmony_ci    XORI_B6_128_SB(src0, src1, src2, src3, src4, src5);
2788cabdff1aSopenharmony_ci
2789cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2790cabdff1aSopenharmony_ci    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
2791cabdff1aSopenharmony_ci    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
2792cabdff1aSopenharmony_ci    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
2793cabdff1aSopenharmony_ci    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
2794cabdff1aSopenharmony_ci    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
2795cabdff1aSopenharmony_ci    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2796cabdff1aSopenharmony_ci    dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
2797cabdff1aSopenharmony_ci    dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
2798cabdff1aSopenharmony_ci    dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
2799cabdff1aSopenharmony_ci    dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
2800cabdff1aSopenharmony_ci    dst5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1);
2801cabdff1aSopenharmony_ci
2802cabdff1aSopenharmony_ci    HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
2803cabdff1aSopenharmony_ci                                   weight_vec, offset_vec, rnd_vec,
2804cabdff1aSopenharmony_ci                                   dst0, dst1, dst2, dst3);
2805cabdff1aSopenharmony_ci
2806cabdff1aSopenharmony_ci    HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec, rnd_vec,
2807cabdff1aSopenharmony_ci                                   dst4, dst5);
2808cabdff1aSopenharmony_ci
2809cabdff1aSopenharmony_ci    PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
2810cabdff1aSopenharmony_ci    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
2811cabdff1aSopenharmony_ci    ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
2812cabdff1aSopenharmony_ci}
2813cabdff1aSopenharmony_ci
2814cabdff1aSopenharmony_cistatic void hevc_hz_uniwgt_4t_8x8multiple_msa(uint8_t *src,
2815cabdff1aSopenharmony_ci                                              int32_t src_stride,
2816cabdff1aSopenharmony_ci                                              uint8_t *dst,
2817cabdff1aSopenharmony_ci                                              int32_t dst_stride,
2818cabdff1aSopenharmony_ci                                              const int8_t *filter,
2819cabdff1aSopenharmony_ci                                              int32_t height,
2820cabdff1aSopenharmony_ci                                              int32_t weight,
2821cabdff1aSopenharmony_ci                                              int32_t offset,
2822cabdff1aSopenharmony_ci                                              int32_t rnd_val)
2823cabdff1aSopenharmony_ci{
2824cabdff1aSopenharmony_ci    uint32_t loop_cnt;
2825cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
2826cabdff1aSopenharmony_ci    v16u8 out0, out1, out2, out3;
2827cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2828cabdff1aSopenharmony_ci    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2829cabdff1aSopenharmony_ci    v16i8 mask1;
2830cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2831cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2832cabdff1aSopenharmony_ci    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
2833cabdff1aSopenharmony_ci    v4i32 weight_vec, rnd_vec;
2834cabdff1aSopenharmony_ci
2835cabdff1aSopenharmony_ci    src -= 1;
2836cabdff1aSopenharmony_ci
2837cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
2838cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2839cabdff1aSopenharmony_ci
2840cabdff1aSopenharmony_ci    weight = weight & 0x0000FFFF;
2841cabdff1aSopenharmony_ci
2842cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
2843cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val);
2844cabdff1aSopenharmony_ci
2845cabdff1aSopenharmony_ci    weight *= 128;
2846cabdff1aSopenharmony_ci    rnd_val -= 6;
2847cabdff1aSopenharmony_ci
2848cabdff1aSopenharmony_ci    weight_vec_h = __msa_fill_h(weight);
2849cabdff1aSopenharmony_ci    offset_vec = __msa_fill_h(offset);
2850cabdff1aSopenharmony_ci    denom_vec = __msa_fill_h(rnd_val);
2851cabdff1aSopenharmony_ci
2852cabdff1aSopenharmony_ci    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
2853cabdff1aSopenharmony_ci    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
2854cabdff1aSopenharmony_ci
2855cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
2856cabdff1aSopenharmony_ci
2857cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 3); loop_cnt--;) {
2858cabdff1aSopenharmony_ci        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2859cabdff1aSopenharmony_ci        src += (8 * src_stride);
2860cabdff1aSopenharmony_ci        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2861cabdff1aSopenharmony_ci
2862cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2863cabdff1aSopenharmony_ci        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
2864cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
2865cabdff1aSopenharmony_ci        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
2866cabdff1aSopenharmony_ci        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2867cabdff1aSopenharmony_ci        dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
2868cabdff1aSopenharmony_ci        dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
2869cabdff1aSopenharmony_ci        dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
2870cabdff1aSopenharmony_ci        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
2871cabdff1aSopenharmony_ci        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec2, vec3);
2872cabdff1aSopenharmony_ci        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec4, vec5);
2873cabdff1aSopenharmony_ci        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec6, vec7);
2874cabdff1aSopenharmony_ci        dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2875cabdff1aSopenharmony_ci        dst5 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
2876cabdff1aSopenharmony_ci        dst6 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
2877cabdff1aSopenharmony_ci        dst7 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
2878cabdff1aSopenharmony_ci
2879cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
2880cabdff1aSopenharmony_ci                                       weight_vec, offset_vec, rnd_vec,
2881cabdff1aSopenharmony_ci                                       dst0, dst1, dst2, dst3);
2882cabdff1aSopenharmony_ci
2883cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7,
2884cabdff1aSopenharmony_ci                                       weight_vec, offset_vec, rnd_vec,
2885cabdff1aSopenharmony_ci                                       dst4, dst5, dst6, dst7);
2886cabdff1aSopenharmony_ci
2887cabdff1aSopenharmony_ci        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
2888cabdff1aSopenharmony_ci        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
2889cabdff1aSopenharmony_ci        ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
2890cabdff1aSopenharmony_ci        dst += (8 * dst_stride);
2891cabdff1aSopenharmony_ci    }
2892cabdff1aSopenharmony_ci}
2893cabdff1aSopenharmony_ci
/*
 * Height dispatcher for the 8-pixel-wide horizontal 4-tap uni-weighted
 * filter: heights 2, 4 and 6 have dedicated routines, every other height is
 * forwarded to the routine that works in groups of 8 rows.
 */
static void hevc_hz_uniwgt_4t_8w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    switch (height) {
    case 2:
        hevc_hz_uniwgt_4t_8x2_msa(src, src_stride, dst, dst_stride,
                                  filter, weight, offset, rnd_val);
        break;
    case 4:
        hevc_hz_uniwgt_4t_8x4_msa(src, src_stride, dst, dst_stride,
                                  filter, weight, offset, rnd_val);
        break;
    case 6:
        hevc_hz_uniwgt_4t_8x6_msa(src, src_stride, dst, dst_stride,
                                  filter, weight, offset, rnd_val);
        break;
    default:
        /* The only callee that needs the row count. */
        hevc_hz_uniwgt_4t_8x8multiple_msa(src, src_stride, dst, dst_stride,
                                          filter, height, weight, offset,
                                          rnd_val);
        break;
    }
}
2919cabdff1aSopenharmony_ci
2920cabdff1aSopenharmony_cistatic void hevc_hz_uniwgt_4t_12w_msa(uint8_t *src,
2921cabdff1aSopenharmony_ci                                      int32_t src_stride,
2922cabdff1aSopenharmony_ci                                      uint8_t *dst,
2923cabdff1aSopenharmony_ci                                      int32_t dst_stride,
2924cabdff1aSopenharmony_ci                                      const int8_t *filter,
2925cabdff1aSopenharmony_ci                                      int32_t height,
2926cabdff1aSopenharmony_ci                                      int32_t weight,
2927cabdff1aSopenharmony_ci                                      int32_t offset,
2928cabdff1aSopenharmony_ci                                      int32_t rnd_val)
2929cabdff1aSopenharmony_ci{
2930cabdff1aSopenharmony_ci    uint32_t loop_cnt;
2931cabdff1aSopenharmony_ci    v16u8 out0, out1, out2;
2932cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
2933cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3;
2934cabdff1aSopenharmony_ci    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2935cabdff1aSopenharmony_ci    v16i8 mask2 = { 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
2936cabdff1aSopenharmony_ci    };
2937cabdff1aSopenharmony_ci    v16i8 mask1;
2938cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
2939cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
2940cabdff1aSopenharmony_ci    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
2941cabdff1aSopenharmony_ci    v16i8 mask3, vec11;
2942cabdff1aSopenharmony_ci    v4i32 weight_vec, rnd_vec;
2943cabdff1aSopenharmony_ci
2944cabdff1aSopenharmony_ci    src -= 1;
2945cabdff1aSopenharmony_ci
2946cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
2947cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2948cabdff1aSopenharmony_ci
2949cabdff1aSopenharmony_ci    weight = weight & 0x0000FFFF;
2950cabdff1aSopenharmony_ci
2951cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
2952cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val);
2953cabdff1aSopenharmony_ci
2954cabdff1aSopenharmony_ci    weight *= 128;
2955cabdff1aSopenharmony_ci    rnd_val -= 6;
2956cabdff1aSopenharmony_ci
2957cabdff1aSopenharmony_ci    weight_vec_h = __msa_fill_h(weight);
2958cabdff1aSopenharmony_ci    offset_vec = __msa_fill_h(offset);
2959cabdff1aSopenharmony_ci    denom_vec = __msa_fill_h(rnd_val);
2960cabdff1aSopenharmony_ci
2961cabdff1aSopenharmony_ci    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
2962cabdff1aSopenharmony_ci    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
2963cabdff1aSopenharmony_ci
2964cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
2965cabdff1aSopenharmony_ci    mask3 = mask2 + 2;
2966cabdff1aSopenharmony_ci
2967cabdff1aSopenharmony_ci    for (loop_cnt = 4; loop_cnt--;) {
2968cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src0, src1, src2, src3);
2969cabdff1aSopenharmony_ci        src += (4 * src_stride);
2970cabdff1aSopenharmony_ci
2971cabdff1aSopenharmony_ci        XORI_B4_128_SB(src0, src1, src2, src3);
2972cabdff1aSopenharmony_ci
2973cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2974cabdff1aSopenharmony_ci        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
2975cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
2976cabdff1aSopenharmony_ci        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
2977cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec8, vec9);
2978cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec10, vec11);
2979cabdff1aSopenharmony_ci        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2980cabdff1aSopenharmony_ci        dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
2981cabdff1aSopenharmony_ci        dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
2982cabdff1aSopenharmony_ci        dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
2983cabdff1aSopenharmony_ci        dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
2984cabdff1aSopenharmony_ci        dst5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1);
2985cabdff1aSopenharmony_ci
2986cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
2987cabdff1aSopenharmony_ci                                       weight_vec, offset_vec, rnd_vec,
2988cabdff1aSopenharmony_ci                                       dst0, dst1, dst2, dst3);
2989cabdff1aSopenharmony_ci
2990cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
2991cabdff1aSopenharmony_ci                                       rnd_vec, dst4, dst5);
2992cabdff1aSopenharmony_ci
2993cabdff1aSopenharmony_ci        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
2994cabdff1aSopenharmony_ci        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
2995cabdff1aSopenharmony_ci        ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
2996cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
2997cabdff1aSopenharmony_ci    }
2998cabdff1aSopenharmony_ci}
2999cabdff1aSopenharmony_ci
3000cabdff1aSopenharmony_cistatic void hevc_hz_uniwgt_4t_16w_msa(uint8_t *src,
3001cabdff1aSopenharmony_ci                                      int32_t src_stride,
3002cabdff1aSopenharmony_ci                                      uint8_t *dst,
3003cabdff1aSopenharmony_ci                                      int32_t dst_stride,
3004cabdff1aSopenharmony_ci                                      const int8_t *filter,
3005cabdff1aSopenharmony_ci                                      int32_t height,
3006cabdff1aSopenharmony_ci                                      int32_t weight,
3007cabdff1aSopenharmony_ci                                      int32_t offset,
3008cabdff1aSopenharmony_ci                                      int32_t rnd_val)
3009cabdff1aSopenharmony_ci{
3010cabdff1aSopenharmony_ci    uint32_t loop_cnt;
3011cabdff1aSopenharmony_ci    v16u8 out0, out1, out2, out3;
3012cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
3013cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
3014cabdff1aSopenharmony_ci    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
3015cabdff1aSopenharmony_ci    v16i8 mask1;
3016cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3017cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3018cabdff1aSopenharmony_ci    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
3019cabdff1aSopenharmony_ci    v4i32 weight_vec, rnd_vec;
3020cabdff1aSopenharmony_ci
3021cabdff1aSopenharmony_ci    src -= 1;
3022cabdff1aSopenharmony_ci
3023cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
3024cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3025cabdff1aSopenharmony_ci
3026cabdff1aSopenharmony_ci    weight = weight & 0x0000FFFF;
3027cabdff1aSopenharmony_ci
3028cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
3029cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val);
3030cabdff1aSopenharmony_ci
3031cabdff1aSopenharmony_ci    weight *= 128;
3032cabdff1aSopenharmony_ci    rnd_val -= 6;
3033cabdff1aSopenharmony_ci
3034cabdff1aSopenharmony_ci    weight_vec_h = __msa_fill_h(weight);
3035cabdff1aSopenharmony_ci    offset_vec = __msa_fill_h(offset);
3036cabdff1aSopenharmony_ci    denom_vec = __msa_fill_h(rnd_val);
3037cabdff1aSopenharmony_ci
3038cabdff1aSopenharmony_ci    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
3039cabdff1aSopenharmony_ci    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
3040cabdff1aSopenharmony_ci
3041cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
3042cabdff1aSopenharmony_ci
3043cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
3044cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src0, src2, src4, src6);
3045cabdff1aSopenharmony_ci        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
3046cabdff1aSopenharmony_ci        src += (4 * src_stride);
3047cabdff1aSopenharmony_ci
3048cabdff1aSopenharmony_ci        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
3049cabdff1aSopenharmony_ci
3050cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3051cabdff1aSopenharmony_ci        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3052cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3053cabdff1aSopenharmony_ci        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
3054cabdff1aSopenharmony_ci        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3055cabdff1aSopenharmony_ci        dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3056cabdff1aSopenharmony_ci        dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3057cabdff1aSopenharmony_ci        dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3058cabdff1aSopenharmony_ci        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3059cabdff1aSopenharmony_ci        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec2, vec3);
3060cabdff1aSopenharmony_ci        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec4, vec5);
3061cabdff1aSopenharmony_ci        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec6, vec7);
3062cabdff1aSopenharmony_ci        dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3063cabdff1aSopenharmony_ci        dst5 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3064cabdff1aSopenharmony_ci        dst6 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3065cabdff1aSopenharmony_ci        dst7 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3066cabdff1aSopenharmony_ci
3067cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
3068cabdff1aSopenharmony_ci                                       weight_vec, offset_vec, rnd_vec,
3069cabdff1aSopenharmony_ci                                       dst0, dst1, dst2, dst3);
3070cabdff1aSopenharmony_ci
3071cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7,
3072cabdff1aSopenharmony_ci                                       weight_vec, offset_vec, rnd_vec,
3073cabdff1aSopenharmony_ci                                       dst4, dst5, dst6, dst7);
3074cabdff1aSopenharmony_ci
3075cabdff1aSopenharmony_ci        PCKEV_B4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
3076cabdff1aSopenharmony_ci                    out0, out1, out2, out3);
3077cabdff1aSopenharmony_ci
3078cabdff1aSopenharmony_ci        ST_UB4(out0, out1, out2, out3, dst, dst_stride);
3079cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
3080cabdff1aSopenharmony_ci    }
3081cabdff1aSopenharmony_ci}
3082cabdff1aSopenharmony_ci
3083cabdff1aSopenharmony_cistatic void hevc_hz_uniwgt_4t_24w_msa(uint8_t *src,
3084cabdff1aSopenharmony_ci                                      int32_t src_stride,
3085cabdff1aSopenharmony_ci                                      uint8_t *dst,
3086cabdff1aSopenharmony_ci                                      int32_t dst_stride,
3087cabdff1aSopenharmony_ci                                      const int8_t *filter,
3088cabdff1aSopenharmony_ci                                      int32_t height,
3089cabdff1aSopenharmony_ci                                      int32_t weight,
3090cabdff1aSopenharmony_ci                                      int32_t offset,
3091cabdff1aSopenharmony_ci                                      int32_t rnd_val)
3092cabdff1aSopenharmony_ci{
3093cabdff1aSopenharmony_ci    uint32_t loop_cnt;
3094cabdff1aSopenharmony_ci    v16u8 out0, out1, out2;
3095cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3;
3096cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
3097cabdff1aSopenharmony_ci    v16i8 mask0, mask1, mask2, mask3;
3098cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3099cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3100cabdff1aSopenharmony_ci    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
3101cabdff1aSopenharmony_ci    v4i32 weight_vec, rnd_vec;
3102cabdff1aSopenharmony_ci
3103cabdff1aSopenharmony_ci    src -= 1;
3104cabdff1aSopenharmony_ci
3105cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
3106cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3107cabdff1aSopenharmony_ci
3108cabdff1aSopenharmony_ci    weight = weight & 0x0000FFFF;
3109cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
3110cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val);
3111cabdff1aSopenharmony_ci
3112cabdff1aSopenharmony_ci    weight *= 128;
3113cabdff1aSopenharmony_ci    rnd_val -= 6;
3114cabdff1aSopenharmony_ci
3115cabdff1aSopenharmony_ci    weight_vec_h = __msa_fill_h(weight);
3116cabdff1aSopenharmony_ci    offset_vec = __msa_fill_h(offset);
3117cabdff1aSopenharmony_ci    denom_vec = __msa_fill_h(rnd_val);
3118cabdff1aSopenharmony_ci
3119cabdff1aSopenharmony_ci    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
3120cabdff1aSopenharmony_ci    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
3121cabdff1aSopenharmony_ci
3122cabdff1aSopenharmony_ci    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
3123cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
3124cabdff1aSopenharmony_ci    mask2 = mask0 + 8;
3125cabdff1aSopenharmony_ci    mask3 = mask0 + 10;
3126cabdff1aSopenharmony_ci
3127cabdff1aSopenharmony_ci    for (loop_cnt = 16; loop_cnt--;) {
3128cabdff1aSopenharmony_ci        LD_SB2(src, src_stride, src0, src2);
3129cabdff1aSopenharmony_ci        LD_SB2(src + 16, src_stride, src1, src3);
3130cabdff1aSopenharmony_ci        src += (2 * src_stride);
3131cabdff1aSopenharmony_ci
3132cabdff1aSopenharmony_ci        XORI_B4_128_SB(src0, src1, src2, src3);
3133cabdff1aSopenharmony_ci
3134cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3135cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec2, vec3);
3136cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3137cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec6, vec7);
3138cabdff1aSopenharmony_ci        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3139cabdff1aSopenharmony_ci        dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3140cabdff1aSopenharmony_ci        dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3141cabdff1aSopenharmony_ci        dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3142cabdff1aSopenharmony_ci        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3143cabdff1aSopenharmony_ci        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec2, vec3);
3144cabdff1aSopenharmony_ci        dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3145cabdff1aSopenharmony_ci        dst5 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3146cabdff1aSopenharmony_ci
3147cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
3148cabdff1aSopenharmony_ci                                       weight_vec, offset_vec, rnd_vec,
3149cabdff1aSopenharmony_ci                                       dst0, dst1, dst2, dst3);
3150cabdff1aSopenharmony_ci
3151cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
3152cabdff1aSopenharmony_ci                                       rnd_vec, dst4, dst5);
3153cabdff1aSopenharmony_ci
3154cabdff1aSopenharmony_ci        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
3155cabdff1aSopenharmony_ci        ST_UB2(out0, out1, dst, dst_stride);
3156cabdff1aSopenharmony_ci        ST_D2(out2, 0, 1, dst + 16, dst_stride);
3157cabdff1aSopenharmony_ci        dst += (2 * dst_stride);
3158cabdff1aSopenharmony_ci    }
3159cabdff1aSopenharmony_ci}
3160cabdff1aSopenharmony_ci
3161cabdff1aSopenharmony_cistatic void hevc_hz_uniwgt_4t_32w_msa(uint8_t *src,
3162cabdff1aSopenharmony_ci                                      int32_t src_stride,
3163cabdff1aSopenharmony_ci                                      uint8_t *dst,
3164cabdff1aSopenharmony_ci                                      int32_t dst_stride,
3165cabdff1aSopenharmony_ci                                      const int8_t *filter,
3166cabdff1aSopenharmony_ci                                      int32_t height,
3167cabdff1aSopenharmony_ci                                      int32_t weight,
3168cabdff1aSopenharmony_ci                                      int32_t offset,
3169cabdff1aSopenharmony_ci                                      int32_t rnd_val)
3170cabdff1aSopenharmony_ci{
3171cabdff1aSopenharmony_ci    uint32_t loop_cnt;
3172cabdff1aSopenharmony_ci    v16u8 out0, out1, out2, out3;
3173cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5;
3174cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
3175cabdff1aSopenharmony_ci    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
3176cabdff1aSopenharmony_ci    v16i8 mask1, mask2, mask3;
3177cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3178cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3179cabdff1aSopenharmony_ci    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
3180cabdff1aSopenharmony_ci    v4i32 weight_vec, rnd_vec;
3181cabdff1aSopenharmony_ci
3182cabdff1aSopenharmony_ci    src -= 1;
3183cabdff1aSopenharmony_ci
3184cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
3185cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3186cabdff1aSopenharmony_ci
3187cabdff1aSopenharmony_ci    weight = weight & 0x0000FFFF;
3188cabdff1aSopenharmony_ci
3189cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
3190cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val);
3191cabdff1aSopenharmony_ci
3192cabdff1aSopenharmony_ci    weight *= 128;
3193cabdff1aSopenharmony_ci    rnd_val -= 6;
3194cabdff1aSopenharmony_ci
3195cabdff1aSopenharmony_ci    weight_vec_h = __msa_fill_h(weight);
3196cabdff1aSopenharmony_ci    offset_vec = __msa_fill_h(offset);
3197cabdff1aSopenharmony_ci    denom_vec = __msa_fill_h(rnd_val);
3198cabdff1aSopenharmony_ci
3199cabdff1aSopenharmony_ci    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
3200cabdff1aSopenharmony_ci    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
3201cabdff1aSopenharmony_ci
3202cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
3203cabdff1aSopenharmony_ci    mask2 = mask0 + 8;
3204cabdff1aSopenharmony_ci    mask3 = mask0 + 10;
3205cabdff1aSopenharmony_ci
3206cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 1); loop_cnt--;) {
3207cabdff1aSopenharmony_ci        LD_SB2(src, 16, src0, src1);
3208cabdff1aSopenharmony_ci        src2 = LD_SB(src + 24);
3209cabdff1aSopenharmony_ci        src += src_stride;
3210cabdff1aSopenharmony_ci        LD_SB2(src, 16, src3, src4);
3211cabdff1aSopenharmony_ci        src5 = LD_SB(src + 24);
3212cabdff1aSopenharmony_ci        src += src_stride;
3213cabdff1aSopenharmony_ci        XORI_B6_128_SB(src0, src1, src2, src3, src4, src5);
3214cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3215cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec2, vec3);
3216cabdff1aSopenharmony_ci        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec4, vec5);
3217cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec6, vec7);
3218cabdff1aSopenharmony_ci        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3219cabdff1aSopenharmony_ci        dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3220cabdff1aSopenharmony_ci        dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3221cabdff1aSopenharmony_ci        dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3222cabdff1aSopenharmony_ci        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3223cabdff1aSopenharmony_ci        VSHF_B2_SB(src3, src4, src3, src4, mask2, mask3, vec2, vec3);
3224cabdff1aSopenharmony_ci        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec4, vec5);
3225cabdff1aSopenharmony_ci        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec6, vec7);
3226cabdff1aSopenharmony_ci        dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3227cabdff1aSopenharmony_ci        dst5 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3228cabdff1aSopenharmony_ci        dst6 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3229cabdff1aSopenharmony_ci        dst7 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3230cabdff1aSopenharmony_ci
3231cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
3232cabdff1aSopenharmony_ci                                       weight_vec, offset_vec, rnd_vec,
3233cabdff1aSopenharmony_ci                                       dst0, dst1, dst2, dst3);
3234cabdff1aSopenharmony_ci
3235cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7,
3236cabdff1aSopenharmony_ci                                       weight_vec, offset_vec, rnd_vec,
3237cabdff1aSopenharmony_ci                                       dst4, dst5, dst6, dst7);
3238cabdff1aSopenharmony_ci
3239cabdff1aSopenharmony_ci        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
3240cabdff1aSopenharmony_ci        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
3241cabdff1aSopenharmony_ci        ST_UB2(out0, out1, dst, 16);
3242cabdff1aSopenharmony_ci        dst += dst_stride;
3243cabdff1aSopenharmony_ci        ST_UB2(out2, out3, dst, 16);
3244cabdff1aSopenharmony_ci        dst += dst_stride;
3245cabdff1aSopenharmony_ci    }
3246cabdff1aSopenharmony_ci}
3247cabdff1aSopenharmony_ci
3248cabdff1aSopenharmony_cistatic void hevc_vt_uniwgt_4t_4x2_msa(uint8_t *src,
3249cabdff1aSopenharmony_ci                                      int32_t src_stride,
3250cabdff1aSopenharmony_ci                                      uint8_t *dst,
3251cabdff1aSopenharmony_ci                                      int32_t dst_stride,
3252cabdff1aSopenharmony_ci                                      const int8_t *filter,
3253cabdff1aSopenharmony_ci                                      int32_t weight,
3254cabdff1aSopenharmony_ci                                      int32_t offset,
3255cabdff1aSopenharmony_ci                                      int32_t rnd_val)
3256cabdff1aSopenharmony_ci{
3257cabdff1aSopenharmony_ci    v16u8 out;
3258cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4;
3259cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src21_r, src43_r;
3260cabdff1aSopenharmony_ci    v16i8 src2110, src4332;
3261cabdff1aSopenharmony_ci    v8i16 dst0;
3262cabdff1aSopenharmony_ci    v4i32 dst0_r, dst0_l;
3263cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
3264cabdff1aSopenharmony_ci    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
3265cabdff1aSopenharmony_ci    v4i32 weight_vec, rnd_vec;
3266cabdff1aSopenharmony_ci
3267cabdff1aSopenharmony_ci    src -= src_stride;
3268cabdff1aSopenharmony_ci
3269cabdff1aSopenharmony_ci    weight = weight & 0x0000FFFF;
3270cabdff1aSopenharmony_ci
3271cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
3272cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val);
3273cabdff1aSopenharmony_ci
3274cabdff1aSopenharmony_ci    weight *= 128;
3275cabdff1aSopenharmony_ci    rnd_val -= 6;
3276cabdff1aSopenharmony_ci
3277cabdff1aSopenharmony_ci    weight_vec_h = __msa_fill_h(weight);
3278cabdff1aSopenharmony_ci    offset_vec = __msa_fill_h(offset);
3279cabdff1aSopenharmony_ci    denom_vec = __msa_fill_h(rnd_val);
3280cabdff1aSopenharmony_ci
3281cabdff1aSopenharmony_ci    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
3282cabdff1aSopenharmony_ci    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
3283cabdff1aSopenharmony_ci
3284cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
3285cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3286cabdff1aSopenharmony_ci
3287cabdff1aSopenharmony_ci    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3288cabdff1aSopenharmony_ci    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3289cabdff1aSopenharmony_ci    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3290cabdff1aSopenharmony_ci    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
3291cabdff1aSopenharmony_ci    XORI_B2_128_SB(src2110, src4332);
3292cabdff1aSopenharmony_ci    dst0 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
3293cabdff1aSopenharmony_ci    ILVRL_H2_SW(dst0, dst0, dst0_r, dst0_l);
3294cabdff1aSopenharmony_ci    DOTP_SH2_SW(dst0_r, dst0_l, weight_vec, weight_vec, dst0_r, dst0_l);
3295cabdff1aSopenharmony_ci    SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
3296cabdff1aSopenharmony_ci    dst0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
3297cabdff1aSopenharmony_ci    dst0 = __msa_adds_s_h(dst0, offset_vec);
3298cabdff1aSopenharmony_ci    CLIP_SH_0_255(dst0);
3299cabdff1aSopenharmony_ci    out = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
3300cabdff1aSopenharmony_ci    ST_W2(out, 0, 1, dst, dst_stride);
3301cabdff1aSopenharmony_ci}
3302cabdff1aSopenharmony_ci
3303cabdff1aSopenharmony_cistatic void hevc_vt_uniwgt_4t_4x4_msa(uint8_t *src,
3304cabdff1aSopenharmony_ci                                      int32_t src_stride,
3305cabdff1aSopenharmony_ci                                      uint8_t *dst,
3306cabdff1aSopenharmony_ci                                      int32_t dst_stride,
3307cabdff1aSopenharmony_ci                                      const int8_t *filter,
3308cabdff1aSopenharmony_ci                                      int32_t weight,
3309cabdff1aSopenharmony_ci                                      int32_t offset,
3310cabdff1aSopenharmony_ci                                      int32_t rnd_val)
3311cabdff1aSopenharmony_ci{
3312cabdff1aSopenharmony_ci    v16u8 out;
3313cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6;
3314cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
3315cabdff1aSopenharmony_ci    v16i8 src2110, src4332, src6554;
3316cabdff1aSopenharmony_ci    v8i16 dst0, dst1;
3317cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
3318cabdff1aSopenharmony_ci    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
3319cabdff1aSopenharmony_ci    v4i32 weight_vec, rnd_vec;
3320cabdff1aSopenharmony_ci
3321cabdff1aSopenharmony_ci    src -= src_stride;
3322cabdff1aSopenharmony_ci
3323cabdff1aSopenharmony_ci    weight = weight & 0x0000FFFF;
3324cabdff1aSopenharmony_ci
3325cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
3326cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val);
3327cabdff1aSopenharmony_ci
3328cabdff1aSopenharmony_ci    weight *= 128;
3329cabdff1aSopenharmony_ci    rnd_val -= 6;
3330cabdff1aSopenharmony_ci
3331cabdff1aSopenharmony_ci    weight_vec_h = __msa_fill_h(weight);
3332cabdff1aSopenharmony_ci    offset_vec = __msa_fill_h(offset);
3333cabdff1aSopenharmony_ci    denom_vec = __msa_fill_h(rnd_val);
3334cabdff1aSopenharmony_ci
3335cabdff1aSopenharmony_ci    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
3336cabdff1aSopenharmony_ci    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
3337cabdff1aSopenharmony_ci
3338cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
3339cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3340cabdff1aSopenharmony_ci
3341cabdff1aSopenharmony_ci    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
3342cabdff1aSopenharmony_ci    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3343cabdff1aSopenharmony_ci    ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3344cabdff1aSopenharmony_ci               src32_r, src43_r, src54_r, src65_r);
3345cabdff1aSopenharmony_ci    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
3346cabdff1aSopenharmony_ci               src2110, src4332, src6554);
3347cabdff1aSopenharmony_ci    XORI_B3_128_SB(src2110, src4332, src6554);
3348cabdff1aSopenharmony_ci    dst0 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
3349cabdff1aSopenharmony_ci    dst1 = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1);
3350cabdff1aSopenharmony_ci    HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec, rnd_vec,
3351cabdff1aSopenharmony_ci                                   dst0, dst1);
3352cabdff1aSopenharmony_ci
3353cabdff1aSopenharmony_ci    out = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
3354cabdff1aSopenharmony_ci    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
3355cabdff1aSopenharmony_ci}
3356cabdff1aSopenharmony_ci
3357cabdff1aSopenharmony_cistatic void hevc_vt_uniwgt_4t_4x8multiple_msa(uint8_t *src,
3358cabdff1aSopenharmony_ci                                              int32_t src_stride,
3359cabdff1aSopenharmony_ci                                              uint8_t *dst,
3360cabdff1aSopenharmony_ci                                              int32_t dst_stride,
3361cabdff1aSopenharmony_ci                                              const int8_t *filter,
3362cabdff1aSopenharmony_ci                                              int32_t height,
3363cabdff1aSopenharmony_ci                                              int32_t weight,
3364cabdff1aSopenharmony_ci                                              int32_t offset,
3365cabdff1aSopenharmony_ci                                              int32_t rnd_val)
3366cabdff1aSopenharmony_ci{
3367cabdff1aSopenharmony_ci    int32_t loop_cnt;
3368cabdff1aSopenharmony_ci    v16u8 out0, out1;
3369cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
3370cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
3371cabdff1aSopenharmony_ci    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
3372cabdff1aSopenharmony_ci    v16i8 src2110, src4332, src6554, src8776;
3373cabdff1aSopenharmony_ci    v16i8 src10998;
3374cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, filt0, filt1;
3375cabdff1aSopenharmony_ci    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
3376cabdff1aSopenharmony_ci    v4i32 weight_vec, rnd_vec;
3377cabdff1aSopenharmony_ci
3378cabdff1aSopenharmony_ci    src -= src_stride;
3379cabdff1aSopenharmony_ci
3380cabdff1aSopenharmony_ci    weight = weight & 0x0000FFFF;
3381cabdff1aSopenharmony_ci
3382cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
3383cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val);
3384cabdff1aSopenharmony_ci
3385cabdff1aSopenharmony_ci    weight *= 128;
3386cabdff1aSopenharmony_ci    rnd_val -= 6;
3387cabdff1aSopenharmony_ci
3388cabdff1aSopenharmony_ci    weight_vec_h = __msa_fill_h(weight);
3389cabdff1aSopenharmony_ci    offset_vec = __msa_fill_h(offset);
3390cabdff1aSopenharmony_ci    denom_vec = __msa_fill_h(rnd_val);
3391cabdff1aSopenharmony_ci
3392cabdff1aSopenharmony_ci    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
3393cabdff1aSopenharmony_ci    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
3394cabdff1aSopenharmony_ci
3395cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
3396cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3397cabdff1aSopenharmony_ci
3398cabdff1aSopenharmony_ci    LD_SB3(src, src_stride, src0, src1, src2);
3399cabdff1aSopenharmony_ci    src += (3 * src_stride);
3400cabdff1aSopenharmony_ci    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3401cabdff1aSopenharmony_ci    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
3402cabdff1aSopenharmony_ci    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3403cabdff1aSopenharmony_ci
3404cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 3); loop_cnt--;) {
3405cabdff1aSopenharmony_ci        LD_SB8(src, src_stride,
3406cabdff1aSopenharmony_ci               src3, src4, src5, src6, src7, src8, src9, src10);
3407cabdff1aSopenharmony_ci        src += (8 * src_stride);
3408cabdff1aSopenharmony_ci        ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3409cabdff1aSopenharmony_ci                   src32_r, src43_r, src54_r, src65_r);
3410cabdff1aSopenharmony_ci        ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3411cabdff1aSopenharmony_ci        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
3412cabdff1aSopenharmony_ci        ILVR_D4_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
3413cabdff1aSopenharmony_ci                   src109_r, src98_r, src4332, src6554, src8776, src10998);
3414cabdff1aSopenharmony_ci        XORI_B4_128_SB(src4332, src6554, src8776, src10998);
3415cabdff1aSopenharmony_ci        dst0 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
3416cabdff1aSopenharmony_ci        dst1 = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1);
3417cabdff1aSopenharmony_ci        dst2 = HEVC_FILT_4TAP_SH(src6554, src8776, filt0, filt1);
3418cabdff1aSopenharmony_ci        dst3 = HEVC_FILT_4TAP_SH(src8776, src10998, filt0, filt1);
3419cabdff1aSopenharmony_ci
3420cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
3421cabdff1aSopenharmony_ci                                       weight_vec, offset_vec, rnd_vec,
3422cabdff1aSopenharmony_ci                                       dst0, dst1, dst2, dst3);
3423cabdff1aSopenharmony_ci
3424cabdff1aSopenharmony_ci        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
3425cabdff1aSopenharmony_ci        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
3426cabdff1aSopenharmony_ci        dst += (8 * dst_stride);
3427cabdff1aSopenharmony_ci
3428cabdff1aSopenharmony_ci        src2 = src10;
3429cabdff1aSopenharmony_ci        src2110 = src10998;
3430cabdff1aSopenharmony_ci    }
3431cabdff1aSopenharmony_ci}
3432cabdff1aSopenharmony_ci
/*
 * Height dispatcher for the 4-pixel-wide vertical 4-tap uni-weighted
 * filter.
 *
 * Heights 2 and 4 go to their dedicated kernels; any height that is a
 * multiple of 8 goes to the batched kernel. For every other height no
 * kernel is called and dst is left untouched.
 */
static void hevc_vt_uniwgt_4t_4w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    switch (height) {
    case 2:
        hevc_vt_uniwgt_4t_4x2_msa(src, src_stride, dst, dst_stride,
                                  filter, weight, offset, rnd_val);
        break;
    case 4:
        hevc_vt_uniwgt_4t_4x4_msa(src, src_stride, dst, dst_stride,
                                  filter, weight, offset, rnd_val);
        break;
    default:
        if (!(height % 8)) {
            hevc_vt_uniwgt_4t_4x8multiple_msa(src, src_stride,
                                              dst, dst_stride,
                                              filter, height,
                                              weight, offset, rnd_val);
        }
        break;
    }
}
3455cabdff1aSopenharmony_ci
3456cabdff1aSopenharmony_cistatic void hevc_vt_uniwgt_4t_6w_msa(uint8_t *src,
3457cabdff1aSopenharmony_ci                                     int32_t src_stride,
3458cabdff1aSopenharmony_ci                                     uint8_t *dst,
3459cabdff1aSopenharmony_ci                                     int32_t dst_stride,
3460cabdff1aSopenharmony_ci                                     const int8_t *filter,
3461cabdff1aSopenharmony_ci                                     int32_t height,
3462cabdff1aSopenharmony_ci                                     int32_t weight,
3463cabdff1aSopenharmony_ci                                     int32_t offset,
3464cabdff1aSopenharmony_ci                                     int32_t rnd_val)
3465cabdff1aSopenharmony_ci{
3466cabdff1aSopenharmony_ci    v16u8 out0, out1, out2, out3;
3467cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
3468cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src21_r, src43_r;
3469cabdff1aSopenharmony_ci    v16i8 src54_r, src65_r, src76_r, src87_r, src98_r, src109_r;
3470cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
3471cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3472cabdff1aSopenharmony_ci    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
3473cabdff1aSopenharmony_ci    v4i32 weight_vec, rnd_vec;
3474cabdff1aSopenharmony_ci
3475cabdff1aSopenharmony_ci    src -= src_stride;
3476cabdff1aSopenharmony_ci
3477cabdff1aSopenharmony_ci    weight = weight & 0x0000FFFF;
3478cabdff1aSopenharmony_ci
3479cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
3480cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val);
3481cabdff1aSopenharmony_ci
3482cabdff1aSopenharmony_ci    weight *= 128;
3483cabdff1aSopenharmony_ci    rnd_val -= 6;
3484cabdff1aSopenharmony_ci
3485cabdff1aSopenharmony_ci    weight_vec_h = __msa_fill_h(weight);
3486cabdff1aSopenharmony_ci    offset_vec = __msa_fill_h(offset);
3487cabdff1aSopenharmony_ci    denom_vec = __msa_fill_h(rnd_val);
3488cabdff1aSopenharmony_ci
3489cabdff1aSopenharmony_ci    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
3490cabdff1aSopenharmony_ci    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
3491cabdff1aSopenharmony_ci
3492cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
3493cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3494cabdff1aSopenharmony_ci
3495cabdff1aSopenharmony_ci    LD_SB3(src, src_stride, src0, src1, src2);
3496cabdff1aSopenharmony_ci    src += (3 * src_stride);
3497cabdff1aSopenharmony_ci    LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
3498cabdff1aSopenharmony_ci    XORI_B3_128_SB(src0, src1, src2);
3499cabdff1aSopenharmony_ci    XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
3500cabdff1aSopenharmony_ci    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3501cabdff1aSopenharmony_ci    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3502cabdff1aSopenharmony_ci    ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
3503cabdff1aSopenharmony_ci    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3504cabdff1aSopenharmony_ci    ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
3505cabdff1aSopenharmony_ci    dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3506cabdff1aSopenharmony_ci    dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
3507cabdff1aSopenharmony_ci    dst2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
3508cabdff1aSopenharmony_ci    dst3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
3509cabdff1aSopenharmony_ci    dst4 = HEVC_FILT_4TAP_SH(src54_r, src76_r, filt0, filt1);
3510cabdff1aSopenharmony_ci    dst5 = HEVC_FILT_4TAP_SH(src65_r, src87_r, filt0, filt1);
3511cabdff1aSopenharmony_ci    dst6 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
3512cabdff1aSopenharmony_ci    dst7 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
3513cabdff1aSopenharmony_ci
3514cabdff1aSopenharmony_ci    HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
3515cabdff1aSopenharmony_ci                                   weight_vec, offset_vec, rnd_vec,
3516cabdff1aSopenharmony_ci                                   dst0, dst1, dst2, dst3);
3517cabdff1aSopenharmony_ci    HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7,
3518cabdff1aSopenharmony_ci                                   weight_vec, offset_vec, rnd_vec,
3519cabdff1aSopenharmony_ci                                   dst4, dst5, dst6, dst7);
3520cabdff1aSopenharmony_ci
3521cabdff1aSopenharmony_ci    PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
3522cabdff1aSopenharmony_ci    PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
3523cabdff1aSopenharmony_ci    ST_W2(out0, 0, 2, dst, dst_stride);
3524cabdff1aSopenharmony_ci    ST_H2(out0, 2, 6, dst + 4, dst_stride);
3525cabdff1aSopenharmony_ci    ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
3526cabdff1aSopenharmony_ci    ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
3527cabdff1aSopenharmony_ci    dst += (4 * dst_stride);
3528cabdff1aSopenharmony_ci    ST_W2(out2, 0, 2, dst, dst_stride);
3529cabdff1aSopenharmony_ci    ST_H2(out2, 2, 6, dst + 4, dst_stride);
3530cabdff1aSopenharmony_ci    ST_W2(out3, 0, 2, dst + 2 * dst_stride, dst_stride);
3531cabdff1aSopenharmony_ci    ST_H2(out3, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
3532cabdff1aSopenharmony_ci}
3533cabdff1aSopenharmony_ci
3534cabdff1aSopenharmony_cistatic void hevc_vt_uniwgt_4t_8x2_msa(uint8_t *src,
3535cabdff1aSopenharmony_ci                                      int32_t src_stride,
3536cabdff1aSopenharmony_ci                                      uint8_t *dst,
3537cabdff1aSopenharmony_ci                                      int32_t dst_stride,
3538cabdff1aSopenharmony_ci                                      const int8_t *filter,
3539cabdff1aSopenharmony_ci                                      int32_t weight,
3540cabdff1aSopenharmony_ci                                      int32_t offset,
3541cabdff1aSopenharmony_ci                                      int32_t rnd_val)
3542cabdff1aSopenharmony_ci{
3543cabdff1aSopenharmony_ci    v16u8 out;
3544cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4;
3545cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src21_r, src43_r;
3546cabdff1aSopenharmony_ci    v8i16 dst0, dst1;
3547cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
3548cabdff1aSopenharmony_ci    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
3549cabdff1aSopenharmony_ci    v4i32 weight_vec, rnd_vec;
3550cabdff1aSopenharmony_ci
3551cabdff1aSopenharmony_ci    src -= src_stride;
3552cabdff1aSopenharmony_ci
3553cabdff1aSopenharmony_ci    weight = weight & 0x0000FFFF;
3554cabdff1aSopenharmony_ci
3555cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
3556cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val);
3557cabdff1aSopenharmony_ci
3558cabdff1aSopenharmony_ci    weight *= 128;
3559cabdff1aSopenharmony_ci    rnd_val -= 6;
3560cabdff1aSopenharmony_ci
3561cabdff1aSopenharmony_ci    weight_vec_h = __msa_fill_h(weight);
3562cabdff1aSopenharmony_ci    offset_vec = __msa_fill_h(offset);
3563cabdff1aSopenharmony_ci    denom_vec = __msa_fill_h(rnd_val);
3564cabdff1aSopenharmony_ci
3565cabdff1aSopenharmony_ci    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
3566cabdff1aSopenharmony_ci    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
3567cabdff1aSopenharmony_ci
3568cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
3569cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3570cabdff1aSopenharmony_ci
3571cabdff1aSopenharmony_ci    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3572cabdff1aSopenharmony_ci    XORI_B5_128_SB(src0, src1, src2, src3, src4);
3573cabdff1aSopenharmony_ci    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3574cabdff1aSopenharmony_ci    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3575cabdff1aSopenharmony_ci    dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3576cabdff1aSopenharmony_ci    dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
3577cabdff1aSopenharmony_ci
3578cabdff1aSopenharmony_ci    HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec, rnd_vec,
3579cabdff1aSopenharmony_ci                                   dst0, dst1);
3580cabdff1aSopenharmony_ci
3581cabdff1aSopenharmony_ci    out = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
3582cabdff1aSopenharmony_ci    ST_D2(out, 0, 1, dst, dst_stride);
3583cabdff1aSopenharmony_ci}
3584cabdff1aSopenharmony_ci
3585cabdff1aSopenharmony_cistatic void hevc_vt_uniwgt_4t_8x4_msa(uint8_t *src,
3586cabdff1aSopenharmony_ci                                      int32_t src_stride,
3587cabdff1aSopenharmony_ci                                      uint8_t *dst,
3588cabdff1aSopenharmony_ci                                      int32_t dst_stride,
3589cabdff1aSopenharmony_ci                                      const int8_t *filter,
3590cabdff1aSopenharmony_ci                                      int32_t weight,
3591cabdff1aSopenharmony_ci                                      int32_t offset,
3592cabdff1aSopenharmony_ci                                      int32_t rnd_val)
3593cabdff1aSopenharmony_ci{
3594cabdff1aSopenharmony_ci    v16u8 out0, out1;
3595cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4;
3596cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src21_r, src43_r;
3597cabdff1aSopenharmony_ci    v16i8 src5, src6, src54_r, src65_r;
3598cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
3599cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3;
3600cabdff1aSopenharmony_ci    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
3601cabdff1aSopenharmony_ci    v4i32 weight_vec, rnd_vec;
3602cabdff1aSopenharmony_ci
3603cabdff1aSopenharmony_ci    src -= src_stride;
3604cabdff1aSopenharmony_ci
3605cabdff1aSopenharmony_ci    weight = weight & 0x0000FFFF;
3606cabdff1aSopenharmony_ci
3607cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
3608cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val);
3609cabdff1aSopenharmony_ci
3610cabdff1aSopenharmony_ci    weight *= 128;
3611cabdff1aSopenharmony_ci    rnd_val -= 6;
3612cabdff1aSopenharmony_ci
3613cabdff1aSopenharmony_ci    weight_vec_h = __msa_fill_h(weight);
3614cabdff1aSopenharmony_ci    offset_vec = __msa_fill_h(offset);
3615cabdff1aSopenharmony_ci    denom_vec = __msa_fill_h(rnd_val);
3616cabdff1aSopenharmony_ci
3617cabdff1aSopenharmony_ci    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
3618cabdff1aSopenharmony_ci    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
3619cabdff1aSopenharmony_ci
3620cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
3621cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3622cabdff1aSopenharmony_ci
3623cabdff1aSopenharmony_ci    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
3624cabdff1aSopenharmony_ci    src += (3 * src_stride);
3625cabdff1aSopenharmony_ci    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
3626cabdff1aSopenharmony_ci    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3627cabdff1aSopenharmony_ci    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3628cabdff1aSopenharmony_ci    ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
3629cabdff1aSopenharmony_ci    dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3630cabdff1aSopenharmony_ci    dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
3631cabdff1aSopenharmony_ci    dst2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
3632cabdff1aSopenharmony_ci    dst3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
3633cabdff1aSopenharmony_ci    HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
3634cabdff1aSopenharmony_ci                                   offset_vec, rnd_vec, dst0, dst1, dst2,
3635cabdff1aSopenharmony_ci                                   dst3);
3636cabdff1aSopenharmony_ci    PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
3637cabdff1aSopenharmony_ci    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
3638cabdff1aSopenharmony_ci}
3639cabdff1aSopenharmony_ci
3640cabdff1aSopenharmony_cistatic void hevc_vt_uniwgt_4t_8x6_msa(uint8_t *src,
3641cabdff1aSopenharmony_ci                                      int32_t src_stride,
3642cabdff1aSopenharmony_ci                                      uint8_t *dst,
3643cabdff1aSopenharmony_ci                                      int32_t dst_stride,
3644cabdff1aSopenharmony_ci                                      const int8_t *filter,
3645cabdff1aSopenharmony_ci                                      int32_t weight,
3646cabdff1aSopenharmony_ci                                      int32_t offset,
3647cabdff1aSopenharmony_ci                                      int32_t rnd_val)
3648cabdff1aSopenharmony_ci{
3649cabdff1aSopenharmony_ci    v16u8 out0, out1, out2;
3650cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3651cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src54_r, src76_r;
3652cabdff1aSopenharmony_ci    v16i8 src21_r, src43_r, src65_r, src87_r;
3653cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3654cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
3655cabdff1aSopenharmony_ci    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
3656cabdff1aSopenharmony_ci    v4i32 weight_vec, rnd_vec;
3657cabdff1aSopenharmony_ci
3658cabdff1aSopenharmony_ci    src -= src_stride;
3659cabdff1aSopenharmony_ci
3660cabdff1aSopenharmony_ci    weight = weight & 0x0000FFFF;
3661cabdff1aSopenharmony_ci
3662cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
3663cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val);
3664cabdff1aSopenharmony_ci
3665cabdff1aSopenharmony_ci    weight *= 128;
3666cabdff1aSopenharmony_ci    rnd_val -= 6;
3667cabdff1aSopenharmony_ci
3668cabdff1aSopenharmony_ci    weight_vec_h = __msa_fill_h(weight);
3669cabdff1aSopenharmony_ci    offset_vec = __msa_fill_h(offset);
3670cabdff1aSopenharmony_ci    denom_vec = __msa_fill_h(rnd_val);
3671cabdff1aSopenharmony_ci
3672cabdff1aSopenharmony_ci    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
3673cabdff1aSopenharmony_ci    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
3674cabdff1aSopenharmony_ci
3675cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
3676cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3677cabdff1aSopenharmony_ci
3678cabdff1aSopenharmony_ci    LD_SB3(src, src_stride, src0, src1, src2);
3679cabdff1aSopenharmony_ci    src += (3 * src_stride);
3680cabdff1aSopenharmony_ci    LD_SB6(src, src_stride, src3, src4, src5, src6, src7, src8);
3681cabdff1aSopenharmony_ci
3682cabdff1aSopenharmony_ci    XORI_B3_128_SB(src0, src1, src2);
3683cabdff1aSopenharmony_ci    XORI_B6_128_SB(src3, src4, src5, src6, src7, src8);
3684cabdff1aSopenharmony_ci    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
3685cabdff1aSopenharmony_ci               src32_r, src43_r);
3686cabdff1aSopenharmony_ci    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
3687cabdff1aSopenharmony_ci               src76_r, src87_r);
3688cabdff1aSopenharmony_ci    dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3689cabdff1aSopenharmony_ci    dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
3690cabdff1aSopenharmony_ci    dst2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
3691cabdff1aSopenharmony_ci    dst3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
3692cabdff1aSopenharmony_ci    dst4 = HEVC_FILT_4TAP_SH(src54_r, src76_r, filt0, filt1);
3693cabdff1aSopenharmony_ci    dst5 = HEVC_FILT_4TAP_SH(src65_r, src87_r, filt0, filt1);
3694cabdff1aSopenharmony_ci    HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
3695cabdff1aSopenharmony_ci                                   offset_vec, rnd_vec, dst0, dst1, dst2, dst3);
3696cabdff1aSopenharmony_ci    HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec, rnd_vec,
3697cabdff1aSopenharmony_ci                                   dst4, dst5);
3698cabdff1aSopenharmony_ci    PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
3699cabdff1aSopenharmony_ci    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
3700cabdff1aSopenharmony_ci    ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
3701cabdff1aSopenharmony_ci}
3702cabdff1aSopenharmony_ci
3703cabdff1aSopenharmony_cistatic void hevc_vt_uniwgt_4t_8x8mult_msa(uint8_t *src,
3704cabdff1aSopenharmony_ci                                          int32_t src_stride,
3705cabdff1aSopenharmony_ci                                          uint8_t *dst,
3706cabdff1aSopenharmony_ci                                          int32_t dst_stride,
3707cabdff1aSopenharmony_ci                                          const int8_t *filter,
3708cabdff1aSopenharmony_ci                                          int32_t height,
3709cabdff1aSopenharmony_ci                                          int32_t weight,
3710cabdff1aSopenharmony_ci                                          int32_t offset,
3711cabdff1aSopenharmony_ci                                          int32_t rnd_val)
3712cabdff1aSopenharmony_ci{
3713cabdff1aSopenharmony_ci    int32_t loop_cnt;
3714cabdff1aSopenharmony_ci    v16u8 out0, out1, out2, out3;
3715cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
3716cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src21_r, src43_r;
3717cabdff1aSopenharmony_ci    v16i8 src54_r, src65_r, src76_r, src87_r, src98_r, src109_r;
3718cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
3719cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3720cabdff1aSopenharmony_ci    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
3721cabdff1aSopenharmony_ci    v4i32 weight_vec, rnd_vec;
3722cabdff1aSopenharmony_ci
3723cabdff1aSopenharmony_ci    src -= src_stride;
3724cabdff1aSopenharmony_ci
3725cabdff1aSopenharmony_ci    weight = weight & 0x0000FFFF;
3726cabdff1aSopenharmony_ci
3727cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
3728cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val);
3729cabdff1aSopenharmony_ci
3730cabdff1aSopenharmony_ci    weight *= 128;
3731cabdff1aSopenharmony_ci    rnd_val -= 6;
3732cabdff1aSopenharmony_ci
3733cabdff1aSopenharmony_ci    weight_vec_h = __msa_fill_h(weight);
3734cabdff1aSopenharmony_ci    offset_vec = __msa_fill_h(offset);
3735cabdff1aSopenharmony_ci    denom_vec = __msa_fill_h(rnd_val);
3736cabdff1aSopenharmony_ci
3737cabdff1aSopenharmony_ci    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
3738cabdff1aSopenharmony_ci    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
3739cabdff1aSopenharmony_ci
3740cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
3741cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3742cabdff1aSopenharmony_ci
3743cabdff1aSopenharmony_ci    LD_SB3(src, src_stride, src0, src1, src2);
3744cabdff1aSopenharmony_ci    src += (3 * src_stride);
3745cabdff1aSopenharmony_ci    XORI_B3_128_SB(src0, src1, src2);
3746cabdff1aSopenharmony_ci    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3747cabdff1aSopenharmony_ci
3748cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 3); loop_cnt--;) {
3749cabdff1aSopenharmony_ci        LD_SB8(src, src_stride,
3750cabdff1aSopenharmony_ci               src3, src4, src5, src6, src7, src8, src9, src10);
3751cabdff1aSopenharmony_ci        src += (8 * src_stride);
3752cabdff1aSopenharmony_ci        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
3753cabdff1aSopenharmony_ci        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3754cabdff1aSopenharmony_ci        ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
3755cabdff1aSopenharmony_ci        ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3756cabdff1aSopenharmony_ci        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
3757cabdff1aSopenharmony_ci        dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3758cabdff1aSopenharmony_ci        dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
3759cabdff1aSopenharmony_ci        dst2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
3760cabdff1aSopenharmony_ci        dst3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
3761cabdff1aSopenharmony_ci        dst4 = HEVC_FILT_4TAP_SH(src54_r, src76_r, filt0, filt1);
3762cabdff1aSopenharmony_ci        dst5 = HEVC_FILT_4TAP_SH(src65_r, src87_r, filt0, filt1);
3763cabdff1aSopenharmony_ci        dst6 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
3764cabdff1aSopenharmony_ci        dst7 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
3765cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
3766cabdff1aSopenharmony_ci                                       offset_vec, rnd_vec, dst0, dst1, dst2,
3767cabdff1aSopenharmony_ci                                       dst3);
3768cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
3769cabdff1aSopenharmony_ci                                       offset_vec, rnd_vec, dst4, dst5, dst6,
3770cabdff1aSopenharmony_ci                                       dst7);
3771cabdff1aSopenharmony_ci        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
3772cabdff1aSopenharmony_ci        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
3773cabdff1aSopenharmony_ci        ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
3774cabdff1aSopenharmony_ci        dst += (8 * dst_stride);
3775cabdff1aSopenharmony_ci
3776cabdff1aSopenharmony_ci        src2 = src10;
3777cabdff1aSopenharmony_ci        src10_r = src98_r;
3778cabdff1aSopenharmony_ci        src21_r = src109_r;
3779cabdff1aSopenharmony_ci    }
3780cabdff1aSopenharmony_ci}
3781cabdff1aSopenharmony_ci
/*
 * Height dispatcher for the 8-pixel-wide vertical 4-tap uni-weighted
 * filter.
 *
 * Heights 2, 4 and 6 go to their dedicated kernels; every other height
 * is forwarded to the batched kernel, which works in groups of eight
 * rows.
 */
static void hevc_vt_uniwgt_4t_8w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    switch (height) {
    case 2:
        hevc_vt_uniwgt_4t_8x2_msa(src, src_stride, dst, dst_stride,
                                  filter, weight, offset, rnd_val);
        break;
    case 4:
        hevc_vt_uniwgt_4t_8x4_msa(src, src_stride, dst, dst_stride,
                                  filter, weight, offset, rnd_val);
        break;
    case 6:
        hevc_vt_uniwgt_4t_8x6_msa(src, src_stride, dst, dst_stride,
                                  filter, weight, offset, rnd_val);
        break;
    default:
        hevc_vt_uniwgt_4t_8x8mult_msa(src, src_stride, dst, dst_stride,
                                      filter, height, weight, offset,
                                      rnd_val);
        break;
    }
}
3807cabdff1aSopenharmony_ci
3808cabdff1aSopenharmony_cistatic void hevc_vt_uniwgt_4t_12w_msa(uint8_t *src,
3809cabdff1aSopenharmony_ci                                      int32_t src_stride,
3810cabdff1aSopenharmony_ci                                      uint8_t *dst,
3811cabdff1aSopenharmony_ci                                      int32_t dst_stride,
3812cabdff1aSopenharmony_ci                                      const int8_t *filter,
3813cabdff1aSopenharmony_ci                                      int32_t height,
3814cabdff1aSopenharmony_ci                                      int32_t weight,
3815cabdff1aSopenharmony_ci                                      int32_t offset,
3816cabdff1aSopenharmony_ci                                      int32_t rnd_val)
3817cabdff1aSopenharmony_ci{
3818cabdff1aSopenharmony_ci    int32_t loop_cnt;
3819cabdff1aSopenharmony_ci    v16u8 out0, out1, out2, out3, out4, out5;
3820cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
3821cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src21_r, src43_r;
3822cabdff1aSopenharmony_ci    v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
3823cabdff1aSopenharmony_ci    v16i8 src2110, src4332;
3824cabdff1aSopenharmony_ci    v16i8 src54_r, src76_r, src98_r, src65_r, src87_r, src109_r;
3825cabdff1aSopenharmony_ci    v16i8 src76_l, src98_l, src87_l, src109_l, src6554, src8776, src10998;
3826cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
3827cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
3828cabdff1aSopenharmony_ci    v8i16 dst9, dst10, dst11, filter_vec, weight_vec_h, offset_vec, denom_vec;
3829cabdff1aSopenharmony_ci    v4i32 weight_vec, rnd_vec;
3830cabdff1aSopenharmony_ci
3831cabdff1aSopenharmony_ci    src -= (1 * src_stride);
3832cabdff1aSopenharmony_ci
3833cabdff1aSopenharmony_ci    weight = weight & 0x0000FFFF;
3834cabdff1aSopenharmony_ci
3835cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
3836cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val);
3837cabdff1aSopenharmony_ci
3838cabdff1aSopenharmony_ci    weight *= 128;
3839cabdff1aSopenharmony_ci    rnd_val -= 6;
3840cabdff1aSopenharmony_ci
3841cabdff1aSopenharmony_ci    weight_vec_h = __msa_fill_h(weight);
3842cabdff1aSopenharmony_ci    offset_vec = __msa_fill_h(offset);
3843cabdff1aSopenharmony_ci    denom_vec = __msa_fill_h(rnd_val);
3844cabdff1aSopenharmony_ci
3845cabdff1aSopenharmony_ci    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
3846cabdff1aSopenharmony_ci    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
3847cabdff1aSopenharmony_ci
3848cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
3849cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3850cabdff1aSopenharmony_ci
3851cabdff1aSopenharmony_ci    LD_SB3(src, src_stride, src0, src1, src2);
3852cabdff1aSopenharmony_ci    src += (3 * src_stride);
3853cabdff1aSopenharmony_ci    XORI_B3_128_SB(src0, src1, src2);
3854cabdff1aSopenharmony_ci    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3855cabdff1aSopenharmony_ci    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
3856cabdff1aSopenharmony_ci    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
3857cabdff1aSopenharmony_ci
3858cabdff1aSopenharmony_ci    for (loop_cnt = 2; loop_cnt--;) {
3859cabdff1aSopenharmony_ci        LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
3860cabdff1aSopenharmony_ci        src += (8 * src_stride);
3861cabdff1aSopenharmony_ci        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
3862cabdff1aSopenharmony_ci        ILVRL_B2_SB(src3, src2, src32_r, src32_l);
3863cabdff1aSopenharmony_ci        ILVRL_B2_SB(src4, src3, src43_r, src43_l);
3864cabdff1aSopenharmony_ci        ILVRL_B2_SB(src5, src4, src54_r, src54_l);
3865cabdff1aSopenharmony_ci        ILVRL_B2_SB(src6, src5, src65_r, src65_l);
3866cabdff1aSopenharmony_ci        src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
3867cabdff1aSopenharmony_ci        src6554 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
3868cabdff1aSopenharmony_ci        dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3869cabdff1aSopenharmony_ci        dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
3870cabdff1aSopenharmony_ci        dst2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
3871cabdff1aSopenharmony_ci        dst3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
3872cabdff1aSopenharmony_ci        dst4 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
3873cabdff1aSopenharmony_ci        dst5 = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1);
3874cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
3875cabdff1aSopenharmony_ci                                       offset_vec, rnd_vec, dst0, dst1, dst2,
3876cabdff1aSopenharmony_ci                                       dst3);
3877cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
3878cabdff1aSopenharmony_ci                                       rnd_vec, dst4, dst5);
3879cabdff1aSopenharmony_ci        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
3880cabdff1aSopenharmony_ci        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
3881cabdff1aSopenharmony_ci        ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
3882cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
3883cabdff1aSopenharmony_ci
3884cabdff1aSopenharmony_ci        ILVRL_B2_SB(src7, src6, src76_r, src76_l);
3885cabdff1aSopenharmony_ci        ILVRL_B2_SB(src8, src7, src87_r, src87_l);
3886cabdff1aSopenharmony_ci        ILVRL_B2_SB(src9, src8, src98_r, src98_l);
3887cabdff1aSopenharmony_ci        ILVRL_B2_SB(src10, src9, src109_r, src109_l);
3888cabdff1aSopenharmony_ci        src8776 = (v16i8) __msa_ilvr_d((v2i64) src87_l, (v2i64) src76_l);
3889cabdff1aSopenharmony_ci        src10998 = (v16i8) __msa_ilvr_d((v2i64) src109_l, (v2i64) src98_l);
3890cabdff1aSopenharmony_ci        dst6 = HEVC_FILT_4TAP_SH(src54_r, src76_r, filt0, filt1);
3891cabdff1aSopenharmony_ci        dst7 = HEVC_FILT_4TAP_SH(src65_r, src87_r, filt0, filt1);
3892cabdff1aSopenharmony_ci        dst8 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
3893cabdff1aSopenharmony_ci        dst9 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
3894cabdff1aSopenharmony_ci        dst10 = HEVC_FILT_4TAP_SH(src6554, src8776, filt0, filt1);
3895cabdff1aSopenharmony_ci        dst11 = HEVC_FILT_4TAP_SH(src8776, src10998, filt0, filt1);
3896cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst6, dst7, dst8, dst9, weight_vec,
3897cabdff1aSopenharmony_ci                                       offset_vec, rnd_vec, dst6, dst7, dst8,
3898cabdff1aSopenharmony_ci                                       dst9);
3899cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst10, dst11, weight_vec, offset_vec,
3900cabdff1aSopenharmony_ci                                       rnd_vec, dst10, dst11);
3901cabdff1aSopenharmony_ci        PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
3902cabdff1aSopenharmony_ci        ST_D4(out3, out4, 0, 1, 0, 1, dst, dst_stride);
3903cabdff1aSopenharmony_ci        ST_W4(out5, 0, 1, 2, 3, dst + 8, dst_stride);
3904cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
3905cabdff1aSopenharmony_ci
3906cabdff1aSopenharmony_ci        src2 = src10;
3907cabdff1aSopenharmony_ci        src10_r = src98_r;
3908cabdff1aSopenharmony_ci        src21_r = src109_r;
3909cabdff1aSopenharmony_ci        src2110 = src10998;
3910cabdff1aSopenharmony_ci    }
3911cabdff1aSopenharmony_ci}
3912cabdff1aSopenharmony_ci
3913cabdff1aSopenharmony_cistatic void hevc_vt_uniwgt_4t_16w_msa(uint8_t *src,
3914cabdff1aSopenharmony_ci                                      int32_t src_stride,
3915cabdff1aSopenharmony_ci                                      uint8_t *dst,
3916cabdff1aSopenharmony_ci                                      int32_t dst_stride,
3917cabdff1aSopenharmony_ci                                      const int8_t *filter,
3918cabdff1aSopenharmony_ci                                      int32_t height,
3919cabdff1aSopenharmony_ci                                      int32_t weight,
3920cabdff1aSopenharmony_ci                                      int32_t offset,
3921cabdff1aSopenharmony_ci                                      int32_t rnd_val)
3922cabdff1aSopenharmony_ci{
3923cabdff1aSopenharmony_ci    int32_t loop_cnt;
3924cabdff1aSopenharmony_ci    v16u8 out0, out1, out2, out3;
3925cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5;
3926cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src21_r, src43_r;
3927cabdff1aSopenharmony_ci    v16i8 src10_l, src32_l, src21_l, src43_l;
3928cabdff1aSopenharmony_ci    v16i8 src54_r, src54_l, src65_r, src65_l, src6;
3929cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
3930cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3931cabdff1aSopenharmony_ci    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
3932cabdff1aSopenharmony_ci    v4i32 weight_vec, rnd_vec;
3933cabdff1aSopenharmony_ci
3934cabdff1aSopenharmony_ci    src -= src_stride;
3935cabdff1aSopenharmony_ci
3936cabdff1aSopenharmony_ci    weight = weight & 0x0000FFFF;
3937cabdff1aSopenharmony_ci
3938cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
3939cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val);
3940cabdff1aSopenharmony_ci
3941cabdff1aSopenharmony_ci    weight *= 128;
3942cabdff1aSopenharmony_ci    rnd_val -= 6;
3943cabdff1aSopenharmony_ci
3944cabdff1aSopenharmony_ci    weight_vec_h = __msa_fill_h(weight);
3945cabdff1aSopenharmony_ci    offset_vec = __msa_fill_h(offset);
3946cabdff1aSopenharmony_ci    denom_vec = __msa_fill_h(rnd_val);
3947cabdff1aSopenharmony_ci
3948cabdff1aSopenharmony_ci    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
3949cabdff1aSopenharmony_ci    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
3950cabdff1aSopenharmony_ci
3951cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
3952cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3953cabdff1aSopenharmony_ci
3954cabdff1aSopenharmony_ci    LD_SB3(src, src_stride, src0, src1, src2);
3955cabdff1aSopenharmony_ci    src += (3 * src_stride);
3956cabdff1aSopenharmony_ci    XORI_B3_128_SB(src0, src1, src2);
3957cabdff1aSopenharmony_ci    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3958cabdff1aSopenharmony_ci    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
3959cabdff1aSopenharmony_ci
3960cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
3961cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src3, src4, src5, src6);
3962cabdff1aSopenharmony_ci        src += (4 * src_stride);
3963cabdff1aSopenharmony_ci        XORI_B4_128_SB(src3, src4, src5, src6);
3964cabdff1aSopenharmony_ci        ILVRL_B2_SB(src3, src2, src32_r, src32_l);
3965cabdff1aSopenharmony_ci        ILVRL_B2_SB(src4, src3, src43_r, src43_l);
3966cabdff1aSopenharmony_ci        ILVRL_B2_SB(src5, src4, src54_r, src54_l);
3967cabdff1aSopenharmony_ci        ILVRL_B2_SB(src6, src5, src65_r, src65_l);
3968cabdff1aSopenharmony_ci        dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3969cabdff1aSopenharmony_ci        dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
3970cabdff1aSopenharmony_ci        dst2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
3971cabdff1aSopenharmony_ci        dst3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
3972cabdff1aSopenharmony_ci        dst4 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
3973cabdff1aSopenharmony_ci        dst5 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
3974cabdff1aSopenharmony_ci        dst6 = HEVC_FILT_4TAP_SH(src32_l, src54_l, filt0, filt1);
3975cabdff1aSopenharmony_ci        dst7 = HEVC_FILT_4TAP_SH(src43_l, src65_l, filt0, filt1);
3976cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
3977cabdff1aSopenharmony_ci                                       offset_vec, rnd_vec, dst0, dst1, dst2,
3978cabdff1aSopenharmony_ci                                       dst3);
3979cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
3980cabdff1aSopenharmony_ci                                       offset_vec, rnd_vec, dst4, dst5, dst6,
3981cabdff1aSopenharmony_ci                                       dst7);
3982cabdff1aSopenharmony_ci        PCKEV_B4_UB(dst4, dst0, dst5, dst1, dst6, dst2, dst7, dst3, out0, out1,
3983cabdff1aSopenharmony_ci                    out2, out3);
3984cabdff1aSopenharmony_ci        ST_UB4(out0, out1, out2, out3, dst, dst_stride);
3985cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
3986cabdff1aSopenharmony_ci
3987cabdff1aSopenharmony_ci        src2 = src6;
3988cabdff1aSopenharmony_ci        src10_r = src54_r;
3989cabdff1aSopenharmony_ci        src21_r = src65_r;
3990cabdff1aSopenharmony_ci        src10_l = src54_l;
3991cabdff1aSopenharmony_ci        src21_l = src65_l;
3992cabdff1aSopenharmony_ci    }
3993cabdff1aSopenharmony_ci}
3994cabdff1aSopenharmony_ci
3995cabdff1aSopenharmony_cistatic void hevc_vt_uniwgt_4t_24w_msa(uint8_t *src,
3996cabdff1aSopenharmony_ci                                      int32_t src_stride,
3997cabdff1aSopenharmony_ci                                      uint8_t *dst,
3998cabdff1aSopenharmony_ci                                      int32_t dst_stride,
3999cabdff1aSopenharmony_ci                                      const int8_t *filter,
4000cabdff1aSopenharmony_ci                                      int32_t height,
4001cabdff1aSopenharmony_ci                                      int32_t weight,
4002cabdff1aSopenharmony_ci                                      int32_t offset,
4003cabdff1aSopenharmony_ci                                      int32_t rnd_val)
4004cabdff1aSopenharmony_ci{
4005cabdff1aSopenharmony_ci    uint32_t loop_cnt;
4006cabdff1aSopenharmony_ci    v16u8 out0, out1, out2, out3, out4, out5;
4007cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5;
4008cabdff1aSopenharmony_ci    v16i8 src6, src7, src8, src9, src10, src11, src12, src13;
4009cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
4010cabdff1aSopenharmony_ci    v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
4011cabdff1aSopenharmony_ci    v16i8 src87_r, src98_r, src109_r, src1110_r, src1211_r, src1312_r;
4012cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
4013cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
4014cabdff1aSopenharmony_ci    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec, dst11;
4015cabdff1aSopenharmony_ci    v4i32 weight_vec, rnd_vec;
4016cabdff1aSopenharmony_ci
4017cabdff1aSopenharmony_ci    src -= src_stride;
4018cabdff1aSopenharmony_ci
4019cabdff1aSopenharmony_ci    weight = weight & 0x0000FFFF;
4020cabdff1aSopenharmony_ci
4021cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
4022cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val);
4023cabdff1aSopenharmony_ci
4024cabdff1aSopenharmony_ci    weight *= 128;
4025cabdff1aSopenharmony_ci    rnd_val -= 6;
4026cabdff1aSopenharmony_ci
4027cabdff1aSopenharmony_ci    weight_vec_h = __msa_fill_h(weight);
4028cabdff1aSopenharmony_ci    offset_vec = __msa_fill_h(offset);
4029cabdff1aSopenharmony_ci    denom_vec = __msa_fill_h(rnd_val);
4030cabdff1aSopenharmony_ci
4031cabdff1aSopenharmony_ci    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
4032cabdff1aSopenharmony_ci    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
4033cabdff1aSopenharmony_ci
4034cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
4035cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4036cabdff1aSopenharmony_ci
4037cabdff1aSopenharmony_ci    LD_SB3(src, src_stride, src0, src1, src2);
4038cabdff1aSopenharmony_ci    LD_SB3(src + 16, src_stride, src7, src8, src9);
4039cabdff1aSopenharmony_ci    src += (3 * src_stride);
4040cabdff1aSopenharmony_ci    XORI_B3_128_SB(src0, src1, src2);
4041cabdff1aSopenharmony_ci    XORI_B3_128_SB(src7, src8, src9);
4042cabdff1aSopenharmony_ci    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
4043cabdff1aSopenharmony_ci    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
4044cabdff1aSopenharmony_ci    ILVR_B2_SB(src8, src7, src9, src8, src87_r, src98_r);
4045cabdff1aSopenharmony_ci
4046cabdff1aSopenharmony_ci    for (loop_cnt = 8; loop_cnt--;) {
4047cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src3, src4, src5, src6);
4048cabdff1aSopenharmony_ci        LD_SB4(src + 16, src_stride, src10, src11, src12, src13);
4049cabdff1aSopenharmony_ci        src += (4 * src_stride);
4050cabdff1aSopenharmony_ci        XORI_B4_128_SB(src3, src4, src5, src6);
4051cabdff1aSopenharmony_ci        XORI_B4_128_SB(src10, src11, src12, src13);
4052cabdff1aSopenharmony_ci        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4053cabdff1aSopenharmony_ci        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
4054cabdff1aSopenharmony_ci        ILVRL_B2_SB(src5, src4, src54_r, src54_l);
4055cabdff1aSopenharmony_ci        ILVRL_B2_SB(src6, src5, src65_r, src65_l);
4056cabdff1aSopenharmony_ci        ILVR_B2_SB(src10, src9, src11, src10, src109_r, src1110_r);
4057cabdff1aSopenharmony_ci        ILVR_B2_SB(src12, src11, src13, src12, src1211_r, src1312_r);
4058cabdff1aSopenharmony_ci        dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
4059cabdff1aSopenharmony_ci        dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
4060cabdff1aSopenharmony_ci        dst2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
4061cabdff1aSopenharmony_ci        dst3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
4062cabdff1aSopenharmony_ci        dst4 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
4063cabdff1aSopenharmony_ci        dst5 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
4064cabdff1aSopenharmony_ci        dst6 = HEVC_FILT_4TAP_SH(src32_l, src54_l, filt0, filt1);
4065cabdff1aSopenharmony_ci        dst7 = HEVC_FILT_4TAP_SH(src43_l, src65_l, filt0, filt1);
4066cabdff1aSopenharmony_ci        dst8 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
4067cabdff1aSopenharmony_ci        dst9 = HEVC_FILT_4TAP_SH(src98_r, src1110_r, filt0, filt1);
4068cabdff1aSopenharmony_ci        dst10 = HEVC_FILT_4TAP_SH(src109_r, src1211_r, filt0, filt1);
4069cabdff1aSopenharmony_ci        dst11 = HEVC_FILT_4TAP_SH(src1110_r, src1312_r, filt0, filt1);
4070cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
4071cabdff1aSopenharmony_ci                                       offset_vec, rnd_vec, dst0, dst1, dst2,
4072cabdff1aSopenharmony_ci                                       dst3);
4073cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
4074cabdff1aSopenharmony_ci                                       offset_vec, rnd_vec, dst4, dst5, dst6,
4075cabdff1aSopenharmony_ci                                       dst7);
4076cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst8, dst9, dst10, dst11, weight_vec,
4077cabdff1aSopenharmony_ci                                       offset_vec, rnd_vec, dst8, dst9, dst10,
4078cabdff1aSopenharmony_ci                                       dst11);
4079cabdff1aSopenharmony_ci        PCKEV_B4_UB(dst4, dst0, dst5, dst1, dst6, dst2, dst7, dst3, out0, out1,
4080cabdff1aSopenharmony_ci                    out2, out3);
4081cabdff1aSopenharmony_ci        PCKEV_B2_UB(dst9, dst8, dst11, dst10, out4, out5);
4082cabdff1aSopenharmony_ci        ST_UB4(out0, out1, out2, out3, dst, dst_stride);
4083cabdff1aSopenharmony_ci        ST_D4(out4, out5, 0, 1, 0, 1, dst + 16, dst_stride);
4084cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
4085cabdff1aSopenharmony_ci
4086cabdff1aSopenharmony_ci        src2 = src6;
4087cabdff1aSopenharmony_ci        src9 = src13;
4088cabdff1aSopenharmony_ci        src10_r = src54_r;
4089cabdff1aSopenharmony_ci        src21_r = src65_r;
4090cabdff1aSopenharmony_ci        src10_l = src54_l;
4091cabdff1aSopenharmony_ci        src21_l = src65_l;
4092cabdff1aSopenharmony_ci        src87_r = src1211_r;
4093cabdff1aSopenharmony_ci        src98_r = src1312_r;
4094cabdff1aSopenharmony_ci    }
4095cabdff1aSopenharmony_ci}
4096cabdff1aSopenharmony_ci
4097cabdff1aSopenharmony_cistatic void hevc_vt_uniwgt_4t_32w_msa(uint8_t *src,
4098cabdff1aSopenharmony_ci                                      int32_t src_stride,
4099cabdff1aSopenharmony_ci                                      uint8_t *dst,
4100cabdff1aSopenharmony_ci                                      int32_t dst_stride,
4101cabdff1aSopenharmony_ci                                      const int8_t *filter,
4102cabdff1aSopenharmony_ci                                      int32_t height,
4103cabdff1aSopenharmony_ci                                      int32_t weight,
4104cabdff1aSopenharmony_ci                                      int32_t offset,
4105cabdff1aSopenharmony_ci                                      int32_t rnd_val)
4106cabdff1aSopenharmony_ci{
4107cabdff1aSopenharmony_ci    uint32_t loop_cnt;
4108cabdff1aSopenharmony_ci    v16u8 out0, out1, out2, out3;
4109cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
4110cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src76_r, src98_r;
4111cabdff1aSopenharmony_ci    v16i8 src21_r, src43_r, src65_r, src87_r;
4112cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
4113cabdff1aSopenharmony_ci    v16i8 src10_l, src32_l, src76_l, src98_l;
4114cabdff1aSopenharmony_ci    v16i8 src21_l, src43_l, src65_l, src87_l;
4115cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
4116cabdff1aSopenharmony_ci    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
4117cabdff1aSopenharmony_ci    v4i32 weight_vec, rnd_vec;
4118cabdff1aSopenharmony_ci
4119cabdff1aSopenharmony_ci    src -= src_stride;
4120cabdff1aSopenharmony_ci
4121cabdff1aSopenharmony_ci    weight = weight & 0x0000FFFF;
4122cabdff1aSopenharmony_ci
4123cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
4124cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val);
4125cabdff1aSopenharmony_ci
4126cabdff1aSopenharmony_ci    weight *= 128;
4127cabdff1aSopenharmony_ci    rnd_val -= 6;
4128cabdff1aSopenharmony_ci
4129cabdff1aSopenharmony_ci    weight_vec_h = __msa_fill_h(weight);
4130cabdff1aSopenharmony_ci    offset_vec = __msa_fill_h(offset);
4131cabdff1aSopenharmony_ci    denom_vec = __msa_fill_h(rnd_val);
4132cabdff1aSopenharmony_ci
4133cabdff1aSopenharmony_ci    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
4134cabdff1aSopenharmony_ci    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
4135cabdff1aSopenharmony_ci
4136cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
4137cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4138cabdff1aSopenharmony_ci
4139cabdff1aSopenharmony_ci    LD_SB3(src, src_stride, src0, src1, src2);
4140cabdff1aSopenharmony_ci    LD_SB3(src + 16, src_stride, src5, src6, src7);
4141cabdff1aSopenharmony_ci    src += (3 * src_stride);
4142cabdff1aSopenharmony_ci    XORI_B6_128_SB(src0, src1, src2, src5, src6, src7);
4143cabdff1aSopenharmony_ci    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
4144cabdff1aSopenharmony_ci    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
4145cabdff1aSopenharmony_ci    ILVR_B2_SB(src6, src5, src7, src6, src65_r, src76_r);
4146cabdff1aSopenharmony_ci    ILVL_B2_SB(src6, src5, src7, src6, src65_l, src76_l);
4147cabdff1aSopenharmony_ci
4148cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 1); loop_cnt--;) {
4149cabdff1aSopenharmony_ci        LD_SB2(src, src_stride, src3, src4);
4150cabdff1aSopenharmony_ci        LD_SB2(src + 16, src_stride, src8, src9);
4151cabdff1aSopenharmony_ci        src += (2 * src_stride);
4152cabdff1aSopenharmony_ci        XORI_B4_128_SB(src3, src4, src8, src9);
4153cabdff1aSopenharmony_ci        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4154cabdff1aSopenharmony_ci        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
4155cabdff1aSopenharmony_ci        ILVRL_B2_SB(src8, src7, src87_r, src87_l);
4156cabdff1aSopenharmony_ci        ILVRL_B2_SB(src9, src8, src98_r, src98_l);
4157cabdff1aSopenharmony_ci        dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
4158cabdff1aSopenharmony_ci        dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
4159cabdff1aSopenharmony_ci        dst2 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
4160cabdff1aSopenharmony_ci        dst3 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
4161cabdff1aSopenharmony_ci        dst4 = HEVC_FILT_4TAP_SH(src65_r, src87_r, filt0, filt1);
4162cabdff1aSopenharmony_ci        dst5 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
4163cabdff1aSopenharmony_ci        dst6 = HEVC_FILT_4TAP_SH(src65_l, src87_l, filt0, filt1);
4164cabdff1aSopenharmony_ci        dst7 = HEVC_FILT_4TAP_SH(src76_l, src98_l, filt0, filt1);
4165cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
4166cabdff1aSopenharmony_ci                                       offset_vec, rnd_vec, dst0, dst1, dst2,
4167cabdff1aSopenharmony_ci                                       dst3);
4168cabdff1aSopenharmony_ci        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
4169cabdff1aSopenharmony_ci                                       offset_vec, rnd_vec, dst4, dst5, dst6,
4170cabdff1aSopenharmony_ci                                       dst7);
4171cabdff1aSopenharmony_ci        PCKEV_B4_UB(dst2, dst0, dst3, dst1, dst6, dst4, dst7, dst5, out0, out1,
4172cabdff1aSopenharmony_ci                    out2, out3);
4173cabdff1aSopenharmony_ci        ST_UB2(out0, out2, dst, 16);
4174cabdff1aSopenharmony_ci        dst += dst_stride;
4175cabdff1aSopenharmony_ci        ST_UB2(out1, out3, dst, 16);
4176cabdff1aSopenharmony_ci        dst += dst_stride;
4177cabdff1aSopenharmony_ci
4178cabdff1aSopenharmony_ci        src2 = src4;
4179cabdff1aSopenharmony_ci        src7 = src9;
4180cabdff1aSopenharmony_ci        src10_r = src32_r;
4181cabdff1aSopenharmony_ci        src21_r = src43_r;
4182cabdff1aSopenharmony_ci        src10_l = src32_l;
4183cabdff1aSopenharmony_ci        src21_l = src43_l;
4184cabdff1aSopenharmony_ci        src65_r = src87_r;
4185cabdff1aSopenharmony_ci        src76_r = src98_r;
4186cabdff1aSopenharmony_ci        src65_l = src87_l;
4187cabdff1aSopenharmony_ci        src76_l = src98_l;
4188cabdff1aSopenharmony_ci    }
4189cabdff1aSopenharmony_ci}
4190cabdff1aSopenharmony_ci
4191cabdff1aSopenharmony_cistatic void hevc_hv_uniwgt_4t_4x2_msa(uint8_t *src,
4192cabdff1aSopenharmony_ci                                      int32_t src_stride,
4193cabdff1aSopenharmony_ci                                      uint8_t *dst,
4194cabdff1aSopenharmony_ci                                      int32_t dst_stride,
4195cabdff1aSopenharmony_ci                                      const int8_t *filter_x,
4196cabdff1aSopenharmony_ci                                      const int8_t *filter_y,
4197cabdff1aSopenharmony_ci                                      int32_t weight,
4198cabdff1aSopenharmony_ci                                      int32_t offset,
4199cabdff1aSopenharmony_ci                                      int32_t rnd_val)
4200cabdff1aSopenharmony_ci{
4201cabdff1aSopenharmony_ci    v16u8 out;
4202cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4;
4203cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
4204cabdff1aSopenharmony_ci    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
4205cabdff1aSopenharmony_ci    v16i8 mask1;
4206cabdff1aSopenharmony_ci    v8i16 filt_h0, filt_h1, filter_vec, tmp;
4207cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
4208cabdff1aSopenharmony_ci    v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43;
4209cabdff1aSopenharmony_ci    v8i16 offset_vec, const_128, denom_vec;
4210cabdff1aSopenharmony_ci    v4i32 dst0, dst1, weight_vec, rnd_vec;
4211cabdff1aSopenharmony_ci
4212cabdff1aSopenharmony_ci    src -= (src_stride + 1);
4213cabdff1aSopenharmony_ci
4214cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_x);
4215cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4216cabdff1aSopenharmony_ci
4217cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_y);
4218cabdff1aSopenharmony_ci    UNPCK_R_SB_SH(filter_vec, filter_vec);
4219cabdff1aSopenharmony_ci
4220cabdff1aSopenharmony_ci    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4221cabdff1aSopenharmony_ci
4222cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
4223cabdff1aSopenharmony_ci
4224cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
4225cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val);
4226cabdff1aSopenharmony_ci
4227cabdff1aSopenharmony_ci    offset_vec = __msa_fill_h(offset);
4228cabdff1aSopenharmony_ci    denom_vec = __msa_fill_h(rnd_val - 6);
4229cabdff1aSopenharmony_ci    const_128 = __msa_fill_h((128 * weight));
4230cabdff1aSopenharmony_ci    offset_vec += __msa_srar_h(const_128, denom_vec);
4231cabdff1aSopenharmony_ci
4232cabdff1aSopenharmony_ci    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
4233cabdff1aSopenharmony_ci    XORI_B5_128_SB(src0, src1, src2, src3, src4);
4234cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1);
4235cabdff1aSopenharmony_ci    VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3);
4236cabdff1aSopenharmony_ci    VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);
4237cabdff1aSopenharmony_ci    dst20 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4238cabdff1aSopenharmony_ci    dst31 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4239cabdff1aSopenharmony_ci    dst42 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4240cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst31, dst20, dst10, dst32);
4241cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst42, dst31, dst21, dst43);
4242cabdff1aSopenharmony_ci    dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
4243cabdff1aSopenharmony_ci    dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
4244cabdff1aSopenharmony_ci    dst0 >>= 6;
4245cabdff1aSopenharmony_ci    dst1 >>= 6;
4246cabdff1aSopenharmony_ci    MUL2(dst0, weight_vec, dst1, weight_vec, dst0, dst1);
4247cabdff1aSopenharmony_ci    SRAR_W2_SW(dst0, dst1, rnd_vec);
4248cabdff1aSopenharmony_ci    tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
4249cabdff1aSopenharmony_ci    tmp += offset_vec;
4250cabdff1aSopenharmony_ci    CLIP_SH_0_255(tmp);
4251cabdff1aSopenharmony_ci    out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
4252cabdff1aSopenharmony_ci    ST_W2(out, 0, 1, dst, dst_stride);
4253cabdff1aSopenharmony_ci}
4254cabdff1aSopenharmony_ci
4255cabdff1aSopenharmony_cistatic void hevc_hv_uniwgt_4t_4x4_msa(uint8_t *src,
4256cabdff1aSopenharmony_ci                                      int32_t src_stride,
4257cabdff1aSopenharmony_ci                                      uint8_t *dst,
4258cabdff1aSopenharmony_ci                                      int32_t dst_stride,
4259cabdff1aSopenharmony_ci                                      const int8_t *filter_x,
4260cabdff1aSopenharmony_ci                                      const int8_t *filter_y,
4261cabdff1aSopenharmony_ci                                      int32_t weight,
4262cabdff1aSopenharmony_ci                                      int32_t offset,
4263cabdff1aSopenharmony_ci                                      int32_t rnd_val)
4264cabdff1aSopenharmony_ci{
4265cabdff1aSopenharmony_ci    v16u8 out;
4266cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6;
4267cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
4268cabdff1aSopenharmony_ci    v8i16 filt_h0, filt_h1, filter_vec, tmp0, tmp1;
4269cabdff1aSopenharmony_ci    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
4270cabdff1aSopenharmony_ci    v16i8 mask1;
4271cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4272cabdff1aSopenharmony_ci    v8i16 dst30, dst41, dst52, dst63, dst10, dst32, dst54, dst21, dst43, dst65;
4273cabdff1aSopenharmony_ci    v8i16 offset_vec, const_128, denom_vec;
4274cabdff1aSopenharmony_ci    v4i32 dst0, dst1, dst2, dst3, weight_vec, rnd_vec;
4275cabdff1aSopenharmony_ci
4276cabdff1aSopenharmony_ci    src -= (src_stride + 1);
4277cabdff1aSopenharmony_ci
4278cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_x);
4279cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4280cabdff1aSopenharmony_ci
4281cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_y);
4282cabdff1aSopenharmony_ci    UNPCK_R_SB_SH(filter_vec, filter_vec);
4283cabdff1aSopenharmony_ci
4284cabdff1aSopenharmony_ci    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4285cabdff1aSopenharmony_ci
4286cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
4287cabdff1aSopenharmony_ci
4288cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
4289cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val);
4290cabdff1aSopenharmony_ci
4291cabdff1aSopenharmony_ci    offset_vec = __msa_fill_h(offset);
4292cabdff1aSopenharmony_ci    denom_vec = __msa_fill_h(rnd_val - 6);
4293cabdff1aSopenharmony_ci    const_128 = __msa_fill_h((128 * weight));
4294cabdff1aSopenharmony_ci    offset_vec += __msa_srar_h(const_128, denom_vec);
4295cabdff1aSopenharmony_ci
4296cabdff1aSopenharmony_ci    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
4297cabdff1aSopenharmony_ci    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
4298cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1);
4299cabdff1aSopenharmony_ci    VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3);
4300cabdff1aSopenharmony_ci    VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5);
4301cabdff1aSopenharmony_ci    VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);
4302cabdff1aSopenharmony_ci    dst30 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4303cabdff1aSopenharmony_ci    dst41 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4304cabdff1aSopenharmony_ci    dst52 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4305cabdff1aSopenharmony_ci    dst63 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4306cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst41, dst30, dst10, dst43);
4307cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst52, dst41, dst21, dst54);
4308cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst63, dst52, dst32, dst65);
4309cabdff1aSopenharmony_ci    dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
4310cabdff1aSopenharmony_ci    dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
4311cabdff1aSopenharmony_ci    dst2 = HEVC_FILT_4TAP(dst32, dst54, filt_h0, filt_h1);
4312cabdff1aSopenharmony_ci    dst3 = HEVC_FILT_4TAP(dst43, dst65, filt_h0, filt_h1);
4313cabdff1aSopenharmony_ci    SRA_4V(dst0, dst1, dst2, dst3, 6);
4314cabdff1aSopenharmony_ci    MUL2(dst0, weight_vec, dst1, weight_vec, dst0, dst1);
4315cabdff1aSopenharmony_ci    MUL2(dst2, weight_vec, dst3, weight_vec, dst2, dst3);
4316cabdff1aSopenharmony_ci    SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
4317cabdff1aSopenharmony_ci    PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
4318cabdff1aSopenharmony_ci    ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
4319cabdff1aSopenharmony_ci    CLIP_SH2_0_255(tmp0, tmp1);
4320cabdff1aSopenharmony_ci    out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
4321cabdff1aSopenharmony_ci    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
4322cabdff1aSopenharmony_ci}
4323cabdff1aSopenharmony_ci
4324cabdff1aSopenharmony_cistatic void hevc_hv_uniwgt_4t_4multx8mult_msa(uint8_t *src,
4325cabdff1aSopenharmony_ci                                              int32_t src_stride,
4326cabdff1aSopenharmony_ci                                              uint8_t *dst,
4327cabdff1aSopenharmony_ci                                              int32_t dst_stride,
4328cabdff1aSopenharmony_ci                                              const int8_t *filter_x,
4329cabdff1aSopenharmony_ci                                              const int8_t *filter_y,
4330cabdff1aSopenharmony_ci                                              int32_t height,
4331cabdff1aSopenharmony_ci                                              int32_t weight,
4332cabdff1aSopenharmony_ci                                              int32_t offset,
4333cabdff1aSopenharmony_ci                                              int32_t rnd_val)
4334cabdff1aSopenharmony_ci{
4335cabdff1aSopenharmony_ci    uint32_t loop_cnt;
4336cabdff1aSopenharmony_ci    v16u8 out0, out1;
4337cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4338cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
4339cabdff1aSopenharmony_ci    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
4340cabdff1aSopenharmony_ci    v16i8 mask1;
4341cabdff1aSopenharmony_ci    v8i16 filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3;
4342cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4343cabdff1aSopenharmony_ci    v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
4344cabdff1aSopenharmony_ci    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
4345cabdff1aSopenharmony_ci    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
4346cabdff1aSopenharmony_ci    v8i16 dst98_r, dst109_r, offset_vec, const_128, denom_vec;
4347cabdff1aSopenharmony_ci    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, weight_vec, rnd_vec;
4348cabdff1aSopenharmony_ci
4349cabdff1aSopenharmony_ci    src -= (src_stride + 1);
4350cabdff1aSopenharmony_ci
4351cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_x);
4352cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4353cabdff1aSopenharmony_ci
4354cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_y);
4355cabdff1aSopenharmony_ci    UNPCK_R_SB_SH(filter_vec, filter_vec);
4356cabdff1aSopenharmony_ci
4357cabdff1aSopenharmony_ci    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4358cabdff1aSopenharmony_ci
4359cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
4360cabdff1aSopenharmony_ci
4361cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
4362cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val);
4363cabdff1aSopenharmony_ci
4364cabdff1aSopenharmony_ci    offset_vec = __msa_fill_h(offset);
4365cabdff1aSopenharmony_ci    denom_vec = __msa_fill_h(rnd_val - 6);
4366cabdff1aSopenharmony_ci    const_128 = __msa_fill_h((128 * weight));
4367cabdff1aSopenharmony_ci    offset_vec += __msa_srar_h(const_128, denom_vec);
4368cabdff1aSopenharmony_ci
4369cabdff1aSopenharmony_ci    LD_SB3(src, src_stride, src0, src1, src2);
4370cabdff1aSopenharmony_ci    src += (3 * src_stride);
4371cabdff1aSopenharmony_ci    XORI_B3_128_SB(src0, src1, src2);
4372cabdff1aSopenharmony_ci
4373cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
4374cabdff1aSopenharmony_ci    VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3);
4375cabdff1aSopenharmony_ci    dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4376cabdff1aSopenharmony_ci    dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4377cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
4378cabdff1aSopenharmony_ci    dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
4379cabdff1aSopenharmony_ci
4380cabdff1aSopenharmony_ci    for (loop_cnt = height >> 3; loop_cnt--;) {
4381cabdff1aSopenharmony_ci        LD_SB8(src, src_stride,
4382cabdff1aSopenharmony_ci               src3, src4, src5, src6, src7, src8, src9, src10);
4383cabdff1aSopenharmony_ci        src += (8 * src_stride);
4384cabdff1aSopenharmony_ci        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
4385cabdff1aSopenharmony_ci
4386cabdff1aSopenharmony_ci        VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
4387cabdff1aSopenharmony_ci        VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
4388cabdff1aSopenharmony_ci        VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
4389cabdff1aSopenharmony_ci        VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);
4390cabdff1aSopenharmony_ci        dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4391cabdff1aSopenharmony_ci        dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4392cabdff1aSopenharmony_ci        dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4393cabdff1aSopenharmony_ci        dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4394cabdff1aSopenharmony_ci        dst32_r = __msa_ilvr_h(dst73, dst22);
4395cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
4396cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
4397cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
4398cabdff1aSopenharmony_ci        dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
4399cabdff1aSopenharmony_ci        dst76_r = __msa_ilvr_h(dst22, dst106);
4400cabdff1aSopenharmony_ci        dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4401cabdff1aSopenharmony_ci        dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4402cabdff1aSopenharmony_ci        dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4403cabdff1aSopenharmony_ci        dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4404cabdff1aSopenharmony_ci        dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
4405cabdff1aSopenharmony_ci        dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
4406cabdff1aSopenharmony_ci        dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
4407cabdff1aSopenharmony_ci        dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
4408cabdff1aSopenharmony_ci        SRA_4V(dst0, dst1, dst2, dst3, 6);
4409cabdff1aSopenharmony_ci        SRA_4V(dst4, dst5, dst6, dst7, 6);
4410cabdff1aSopenharmony_ci        MUL2(dst0, weight_vec, dst1, weight_vec, dst0, dst1);
4411cabdff1aSopenharmony_ci        MUL2(dst2, weight_vec, dst3, weight_vec, dst2, dst3);
4412cabdff1aSopenharmony_ci        MUL2(dst4, weight_vec, dst5, weight_vec, dst4, dst5);
4413cabdff1aSopenharmony_ci        MUL2(dst6, weight_vec, dst7, weight_vec, dst6, dst7);
4414cabdff1aSopenharmony_ci        SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
4415cabdff1aSopenharmony_ci        SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
4416cabdff1aSopenharmony_ci        PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
4417cabdff1aSopenharmony_ci                    tmp2, tmp3);
4418cabdff1aSopenharmony_ci        ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
4419cabdff1aSopenharmony_ci        ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
4420cabdff1aSopenharmony_ci        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
4421cabdff1aSopenharmony_ci        PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
4422cabdff1aSopenharmony_ci        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
4423cabdff1aSopenharmony_ci        dst += (8 * dst_stride);
4424cabdff1aSopenharmony_ci
4425cabdff1aSopenharmony_ci        dst10_r = dst98_r;
4426cabdff1aSopenharmony_ci        dst21_r = dst109_r;
4427cabdff1aSopenharmony_ci        dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
4428cabdff1aSopenharmony_ci    }
4429cabdff1aSopenharmony_ci}
4430cabdff1aSopenharmony_ci
/*
 * Height dispatcher for the 4-pixel-wide 2-D 4-tap uni-weighted kernels.
 *
 *   height == 2        -> hevc_hv_uniwgt_4t_4x2_msa
 *   height == 4        -> hevc_hv_uniwgt_4t_4x4_msa
 *   height % 8 == 0    -> hevc_hv_uniwgt_4t_4multx8mult_msa
 *
 * Any other height matches no branch and nothing is written to dst.
 * All remaining arguments are forwarded unchanged to the selected kernel.
 */
static void hevc_hv_uniwgt_4t_4w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    if (2 == height) {
        hevc_hv_uniwgt_4t_4x2_msa(src, src_stride, dst, dst_stride,
                                  filter_x, filter_y, weight,
                                  offset, rnd_val);
    } else if (4 == height) {
        hevc_hv_uniwgt_4t_4x4_msa(src, src_stride, dst, dst_stride,
                                  filter_x, filter_y, weight,
                                  offset, rnd_val);
    } else if (0 == (height % 8)) {
        hevc_hv_uniwgt_4t_4multx8mult_msa(src, src_stride, dst, dst_stride,
                                          filter_x, filter_y, height, weight,
                                          offset, rnd_val);
    }
}
4456cabdff1aSopenharmony_ci
4457cabdff1aSopenharmony_cistatic void hevc_hv_uniwgt_4t_6w_msa(uint8_t *src,
4458cabdff1aSopenharmony_ci                                     int32_t src_stride,
4459cabdff1aSopenharmony_ci                                     uint8_t *dst,
4460cabdff1aSopenharmony_ci                                     int32_t dst_stride,
4461cabdff1aSopenharmony_ci                                     const int8_t *filter_x,
4462cabdff1aSopenharmony_ci                                     const int8_t *filter_y,
4463cabdff1aSopenharmony_ci                                     int32_t height,
4464cabdff1aSopenharmony_ci                                     int32_t weight,
4465cabdff1aSopenharmony_ci                                     int32_t offset,
4466cabdff1aSopenharmony_ci                                     int32_t rnd_val)
4467cabdff1aSopenharmony_ci{
4468cabdff1aSopenharmony_ci    v16u8 out0, out1, out2;
4469cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4470cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
4471cabdff1aSopenharmony_ci    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
4472cabdff1aSopenharmony_ci    v16i8 mask1;
4473cabdff1aSopenharmony_ci    v8i16 filt_h0, filt_h1, filter_vec;
4474cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4475cabdff1aSopenharmony_ci    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
4476cabdff1aSopenharmony_ci    v8i16 dsth10, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
4477cabdff1aSopenharmony_ci    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst21_r, dst43_r;
4478cabdff1aSopenharmony_ci    v8i16 dst65_r, dst87_r, dst109_r, dst10_l, dst32_l, dst54_l, dst76_l;
4479cabdff1aSopenharmony_ci    v8i16 dst98_l, dst21_l, dst43_l, dst65_l, dst87_l, dst109_l;
4480cabdff1aSopenharmony_ci    v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
4481cabdff1aSopenharmony_ci    v8i16 offset_vec, const_128, denom_vec;
4482cabdff1aSopenharmony_ci    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
4483cabdff1aSopenharmony_ci    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, weight_vec, rnd_vec;
4484cabdff1aSopenharmony_ci
4485cabdff1aSopenharmony_ci    src -= (src_stride + 1);
4486cabdff1aSopenharmony_ci
4487cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_x);
4488cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4489cabdff1aSopenharmony_ci
4490cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_y);
4491cabdff1aSopenharmony_ci    UNPCK_R_SB_SH(filter_vec, filter_vec);
4492cabdff1aSopenharmony_ci
4493cabdff1aSopenharmony_ci    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4494cabdff1aSopenharmony_ci
4495cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
4496cabdff1aSopenharmony_ci
4497cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
4498cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val);
4499cabdff1aSopenharmony_ci
4500cabdff1aSopenharmony_ci    offset_vec = __msa_fill_h(offset);
4501cabdff1aSopenharmony_ci    denom_vec = __msa_fill_h(rnd_val - 6);
4502cabdff1aSopenharmony_ci    const_128 = __msa_fill_h((128 * weight));
4503cabdff1aSopenharmony_ci    offset_vec += __msa_srar_h(const_128, denom_vec);
4504cabdff1aSopenharmony_ci
4505cabdff1aSopenharmony_ci    LD_SB3(src, src_stride, src0, src1, src2);
4506cabdff1aSopenharmony_ci    src += (3 * src_stride);
4507cabdff1aSopenharmony_ci    XORI_B3_128_SB(src0, src1, src2);
4508cabdff1aSopenharmony_ci
4509cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4510cabdff1aSopenharmony_ci    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4511cabdff1aSopenharmony_ci    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4512cabdff1aSopenharmony_ci    dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4513cabdff1aSopenharmony_ci    dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4514cabdff1aSopenharmony_ci    dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4515cabdff1aSopenharmony_ci    ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
4516cabdff1aSopenharmony_ci    ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
4517cabdff1aSopenharmony_ci
4518cabdff1aSopenharmony_ci    LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
4519cabdff1aSopenharmony_ci    XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
4520cabdff1aSopenharmony_ci    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4521cabdff1aSopenharmony_ci    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
4522cabdff1aSopenharmony_ci    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
4523cabdff1aSopenharmony_ci    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
4524cabdff1aSopenharmony_ci    dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4525cabdff1aSopenharmony_ci    dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4526cabdff1aSopenharmony_ci    dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4527cabdff1aSopenharmony_ci    dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4528cabdff1aSopenharmony_ci    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
4529cabdff1aSopenharmony_ci    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
4530cabdff1aSopenharmony_ci    VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
4531cabdff1aSopenharmony_ci    VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);
4532cabdff1aSopenharmony_ci    dsth7 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4533cabdff1aSopenharmony_ci    dsth8 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4534cabdff1aSopenharmony_ci    dsth9 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4535cabdff1aSopenharmony_ci    dsth10 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4536cabdff1aSopenharmony_ci    ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
4537cabdff1aSopenharmony_ci    ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
4538cabdff1aSopenharmony_ci    ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
4539cabdff1aSopenharmony_ci    ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
4540cabdff1aSopenharmony_ci    ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
4541cabdff1aSopenharmony_ci    ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
4542cabdff1aSopenharmony_ci    ILVRL_H2_SH(dsth9, dsth8, dst98_r, dst98_l);
4543cabdff1aSopenharmony_ci    ILVRL_H2_SH(dsth10, dsth9, dst109_r, dst109_l);
4544cabdff1aSopenharmony_ci    PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
4545cabdff1aSopenharmony_ci    PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
4546cabdff1aSopenharmony_ci    dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);
4547cabdff1aSopenharmony_ci    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4548cabdff1aSopenharmony_ci    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4549cabdff1aSopenharmony_ci    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4550cabdff1aSopenharmony_ci    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4551cabdff1aSopenharmony_ci    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
4552cabdff1aSopenharmony_ci    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
4553cabdff1aSopenharmony_ci    dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
4554cabdff1aSopenharmony_ci    dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
4555cabdff1aSopenharmony_ci    dst0_l = HEVC_FILT_4TAP(dst1021_l, dst3243_l, filt_h0, filt_h1);
4556cabdff1aSopenharmony_ci    dst1_l = HEVC_FILT_4TAP(dst3243_l, dst5465_l, filt_h0, filt_h1);
4557cabdff1aSopenharmony_ci    dst2_l = HEVC_FILT_4TAP(dst5465_l, dst7687_l, filt_h0, filt_h1);
4558cabdff1aSopenharmony_ci    dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
4559cabdff1aSopenharmony_ci    SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
4560cabdff1aSopenharmony_ci    SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
4561cabdff1aSopenharmony_ci    SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
4562cabdff1aSopenharmony_ci    MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
4563cabdff1aSopenharmony_ci    MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
4564cabdff1aSopenharmony_ci    MUL2(dst4_r, weight_vec, dst5_r, weight_vec, dst4_r, dst5_r);
4565cabdff1aSopenharmony_ci    MUL2(dst6_r, weight_vec, dst7_r, weight_vec, dst6_r, dst7_r);
4566cabdff1aSopenharmony_ci    MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
4567cabdff1aSopenharmony_ci    MUL2(dst2_l, weight_vec, dst3_l, weight_vec, dst2_l, dst3_l);
4568cabdff1aSopenharmony_ci    SRAR_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, rnd_vec);
4569cabdff1aSopenharmony_ci    SRAR_W4_SW(dst4_r, dst5_r, dst6_r, dst7_r, rnd_vec);
4570cabdff1aSopenharmony_ci    SRAR_W4_SW(dst0_l, dst1_l, dst2_l, dst3_l, rnd_vec);
4571cabdff1aSopenharmony_ci    PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
4572cabdff1aSopenharmony_ci    PCKEV_H2_SH(dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3);
4573cabdff1aSopenharmony_ci    PCKEV_H2_SH(dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5);
4574cabdff1aSopenharmony_ci    ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
4575cabdff1aSopenharmony_ci    ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
4576cabdff1aSopenharmony_ci    ADD2(tmp4, offset_vec, tmp5, offset_vec, tmp4, tmp5);
4577cabdff1aSopenharmony_ci    CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
4578cabdff1aSopenharmony_ci    CLIP_SH2_0_255(tmp4, tmp5);
4579cabdff1aSopenharmony_ci    PCKEV_B3_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, out0, out1, out2);
4580cabdff1aSopenharmony_ci    ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
4581cabdff1aSopenharmony_ci    ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride);
4582cabdff1aSopenharmony_ci}
4583cabdff1aSopenharmony_ci
4584cabdff1aSopenharmony_cistatic void hevc_hv_uniwgt_4t_8x2_msa(uint8_t *src,
4585cabdff1aSopenharmony_ci                                      int32_t src_stride,
4586cabdff1aSopenharmony_ci                                      uint8_t *dst,
4587cabdff1aSopenharmony_ci                                      int32_t dst_stride,
4588cabdff1aSopenharmony_ci                                      const int8_t *filter_x,
4589cabdff1aSopenharmony_ci                                      const int8_t *filter_y,
4590cabdff1aSopenharmony_ci                                      int32_t weight,
4591cabdff1aSopenharmony_ci                                      int32_t offset,
4592cabdff1aSopenharmony_ci                                      int32_t rnd_val)
4593cabdff1aSopenharmony_ci{
4594cabdff1aSopenharmony_ci    v16u8 out;
4595cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4;
4596cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
4597cabdff1aSopenharmony_ci    v8i16 filt_h0, filt_h1, filter_vec;
4598cabdff1aSopenharmony_ci    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
4599cabdff1aSopenharmony_ci    v16i8 mask1;
4600cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
4601cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, dst4;
4602cabdff1aSopenharmony_ci    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
4603cabdff1aSopenharmony_ci    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
4604cabdff1aSopenharmony_ci    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
4605cabdff1aSopenharmony_ci    v8i16 tmp0, tmp1;
4606cabdff1aSopenharmony_ci    v8i16 offset_vec, const_128, denom_vec;
4607cabdff1aSopenharmony_ci    v4i32 weight_vec, rnd_vec;
4608cabdff1aSopenharmony_ci
4609cabdff1aSopenharmony_ci    src -= (src_stride + 1);
4610cabdff1aSopenharmony_ci
4611cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_x);
4612cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4613cabdff1aSopenharmony_ci
4614cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_y);
4615cabdff1aSopenharmony_ci    UNPCK_R_SB_SH(filter_vec, filter_vec);
4616cabdff1aSopenharmony_ci
4617cabdff1aSopenharmony_ci    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4618cabdff1aSopenharmony_ci
4619cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
4620cabdff1aSopenharmony_ci
4621cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
4622cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val);
4623cabdff1aSopenharmony_ci
4624cabdff1aSopenharmony_ci    offset_vec = __msa_fill_h(offset);
4625cabdff1aSopenharmony_ci    denom_vec = __msa_fill_h(rnd_val - 6);
4626cabdff1aSopenharmony_ci    const_128 = __msa_fill_h((128 * weight));
4627cabdff1aSopenharmony_ci    offset_vec += __msa_srar_h(const_128, denom_vec);
4628cabdff1aSopenharmony_ci
4629cabdff1aSopenharmony_ci    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
4630cabdff1aSopenharmony_ci    XORI_B5_128_SB(src0, src1, src2, src3, src4);
4631cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4632cabdff1aSopenharmony_ci    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4633cabdff1aSopenharmony_ci    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4634cabdff1aSopenharmony_ci    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
4635cabdff1aSopenharmony_ci    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
4636cabdff1aSopenharmony_ci    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4637cabdff1aSopenharmony_ci    dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4638cabdff1aSopenharmony_ci    dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4639cabdff1aSopenharmony_ci    dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4640cabdff1aSopenharmony_ci    dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
4641cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
4642cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
4643cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
4644cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
4645cabdff1aSopenharmony_ci    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4646cabdff1aSopenharmony_ci    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
4647cabdff1aSopenharmony_ci    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4648cabdff1aSopenharmony_ci    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
4649cabdff1aSopenharmony_ci    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
4650cabdff1aSopenharmony_ci    MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
4651cabdff1aSopenharmony_ci    MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
4652cabdff1aSopenharmony_ci    SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
4653cabdff1aSopenharmony_ci    PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
4654cabdff1aSopenharmony_ci    ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
4655cabdff1aSopenharmony_ci    CLIP_SH2_0_255(tmp0, tmp1);
4656cabdff1aSopenharmony_ci    out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
4657cabdff1aSopenharmony_ci    ST_D2(out, 0, 1, dst, dst_stride);
4658cabdff1aSopenharmony_ci}
4659cabdff1aSopenharmony_ci
4660cabdff1aSopenharmony_cistatic void hevc_hv_uniwgt_4t_8multx4_msa(uint8_t *src,
4661cabdff1aSopenharmony_ci                                          int32_t src_stride,
4662cabdff1aSopenharmony_ci                                          uint8_t *dst,
4663cabdff1aSopenharmony_ci                                          int32_t dst_stride,
4664cabdff1aSopenharmony_ci                                          const int8_t *filter_x,
4665cabdff1aSopenharmony_ci                                          const int8_t *filter_y,
4666cabdff1aSopenharmony_ci                                          int32_t width8mult,
4667cabdff1aSopenharmony_ci                                          int32_t weight,
4668cabdff1aSopenharmony_ci                                          int32_t offset,
4669cabdff1aSopenharmony_ci                                          int32_t rnd_val)
4670cabdff1aSopenharmony_ci{
4671cabdff1aSopenharmony_ci    uint32_t cnt;
4672cabdff1aSopenharmony_ci    v16u8 out0, out1;
4673cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
4674cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4675cabdff1aSopenharmony_ci    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec;
4676cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
4677cabdff1aSopenharmony_ci    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
4678cabdff1aSopenharmony_ci    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
4679cabdff1aSopenharmony_ci    v8i16 offset_vec, const_128, denom_vec;
4680cabdff1aSopenharmony_ci    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4681cabdff1aSopenharmony_ci    v4i32 weight_vec, rnd_vec;
4682cabdff1aSopenharmony_ci
4683cabdff1aSopenharmony_ci    src -= (src_stride + 1);
4684cabdff1aSopenharmony_ci
4685cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_x);
4686cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4687cabdff1aSopenharmony_ci
4688cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_y);
4689cabdff1aSopenharmony_ci    UNPCK_R_SB_SH(filter_vec, filter_vec);
4690cabdff1aSopenharmony_ci
4691cabdff1aSopenharmony_ci    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4692cabdff1aSopenharmony_ci
4693cabdff1aSopenharmony_ci    mask0 = LD_SB(ff_hevc_mask_arr);
4694cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
4695cabdff1aSopenharmony_ci
4696cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
4697cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val);
4698cabdff1aSopenharmony_ci
4699cabdff1aSopenharmony_ci    offset_vec = __msa_fill_h(offset);
4700cabdff1aSopenharmony_ci    denom_vec = __msa_fill_h(rnd_val - 6);
4701cabdff1aSopenharmony_ci    const_128 = __msa_fill_h((128 * weight));
4702cabdff1aSopenharmony_ci    offset_vec += __msa_srar_h(const_128, denom_vec);
4703cabdff1aSopenharmony_ci
4704cabdff1aSopenharmony_ci    for (cnt = width8mult; cnt--;) {
4705cabdff1aSopenharmony_ci        LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
4706cabdff1aSopenharmony_ci        src += 8;
4707cabdff1aSopenharmony_ci        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
4708cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4709cabdff1aSopenharmony_ci        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4710cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4711cabdff1aSopenharmony_ci        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4712cabdff1aSopenharmony_ci        dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4713cabdff1aSopenharmony_ci        dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4714cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
4715cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
4716cabdff1aSopenharmony_ci        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4717cabdff1aSopenharmony_ci        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
4718cabdff1aSopenharmony_ci        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
4719cabdff1aSopenharmony_ci        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
4720cabdff1aSopenharmony_ci        dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4721cabdff1aSopenharmony_ci        dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4722cabdff1aSopenharmony_ci        dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4723cabdff1aSopenharmony_ci        dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4724cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
4725cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
4726cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
4727cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
4728cabdff1aSopenharmony_ci        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4729cabdff1aSopenharmony_ci        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
4730cabdff1aSopenharmony_ci        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4731cabdff1aSopenharmony_ci        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
4732cabdff1aSopenharmony_ci        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4733cabdff1aSopenharmony_ci        dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
4734cabdff1aSopenharmony_ci        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4735cabdff1aSopenharmony_ci        dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
4736cabdff1aSopenharmony_ci        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
4737cabdff1aSopenharmony_ci        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
4738cabdff1aSopenharmony_ci        MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
4739cabdff1aSopenharmony_ci        MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
4740cabdff1aSopenharmony_ci        MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
4741cabdff1aSopenharmony_ci        MUL2(dst2_l, weight_vec, dst3_l, weight_vec, dst2_l, dst3_l);
4742cabdff1aSopenharmony_ci        SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
4743cabdff1aSopenharmony_ci        SRAR_W4_SW(dst2_r, dst2_l, dst3_r, dst3_l, rnd_vec);
4744cabdff1aSopenharmony_ci        PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
4745cabdff1aSopenharmony_ci                    dst3_r, tmp0, tmp1, tmp2, tmp3);
4746cabdff1aSopenharmony_ci        ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
4747cabdff1aSopenharmony_ci        ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
4748cabdff1aSopenharmony_ci        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
4749cabdff1aSopenharmony_ci        PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
4750cabdff1aSopenharmony_ci        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
4751cabdff1aSopenharmony_ci        dst += 8;
4752cabdff1aSopenharmony_ci    }
4753cabdff1aSopenharmony_ci}
4754cabdff1aSopenharmony_ci
4755cabdff1aSopenharmony_cistatic void hevc_hv_uniwgt_4t_8x6_msa(uint8_t *src,
4756cabdff1aSopenharmony_ci                                      int32_t src_stride,
4757cabdff1aSopenharmony_ci                                      uint8_t *dst,
4758cabdff1aSopenharmony_ci                                      int32_t dst_stride,
4759cabdff1aSopenharmony_ci                                      const int8_t *filter_x,
4760cabdff1aSopenharmony_ci                                      const int8_t *filter_y,
4761cabdff1aSopenharmony_ci                                      int32_t weight,
4762cabdff1aSopenharmony_ci                                      int32_t offset,
4763cabdff1aSopenharmony_ci                                      int32_t rnd_val)
4764cabdff1aSopenharmony_ci{
4765cabdff1aSopenharmony_ci    v16u8 out0, out1, out2;
4766cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
4767cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
4768cabdff1aSopenharmony_ci    v8i16 filt_h0, filt_h1, filter_vec;
4769cabdff1aSopenharmony_ci    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
4770cabdff1aSopenharmony_ci    v16i8 mask1;
4771cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
4772cabdff1aSopenharmony_ci    v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
4773cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
4774cabdff1aSopenharmony_ci    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4775cabdff1aSopenharmony_ci    v4i32 dst4_r, dst4_l, dst5_r, dst5_l, weight_vec, rnd_vec;
4776cabdff1aSopenharmony_ci    v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
4777cabdff1aSopenharmony_ci    v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
4778cabdff1aSopenharmony_ci    v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
4779cabdff1aSopenharmony_ci    v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
4780cabdff1aSopenharmony_ci    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
4781cabdff1aSopenharmony_ci    v8i16 offset_vec, const_128, denom_vec;
4782cabdff1aSopenharmony_ci
4783cabdff1aSopenharmony_ci    src -= (src_stride + 1);
4784cabdff1aSopenharmony_ci
4785cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_x);
4786cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4787cabdff1aSopenharmony_ci
4788cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_y);
4789cabdff1aSopenharmony_ci    UNPCK_R_SB_SH(filter_vec, filter_vec);
4790cabdff1aSopenharmony_ci
4791cabdff1aSopenharmony_ci    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4792cabdff1aSopenharmony_ci
4793cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
4794cabdff1aSopenharmony_ci
4795cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
4796cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val);
4797cabdff1aSopenharmony_ci
4798cabdff1aSopenharmony_ci    offset_vec = __msa_fill_h(offset);
4799cabdff1aSopenharmony_ci    denom_vec = __msa_fill_h(rnd_val - 6);
4800cabdff1aSopenharmony_ci    const_128 = __msa_fill_h((128 * weight));
4801cabdff1aSopenharmony_ci    offset_vec += __msa_srar_h(const_128, denom_vec);
4802cabdff1aSopenharmony_ci
4803cabdff1aSopenharmony_ci    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
4804cabdff1aSopenharmony_ci    src += (5 * src_stride);
4805cabdff1aSopenharmony_ci    LD_SB4(src, src_stride, src5, src6, src7, src8);
4806cabdff1aSopenharmony_ci    XORI_B5_128_SB(src0, src1, src2, src3, src4);
4807cabdff1aSopenharmony_ci    XORI_B4_128_SB(src5, src6, src7, src8);
4808cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4809cabdff1aSopenharmony_ci    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4810cabdff1aSopenharmony_ci    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4811cabdff1aSopenharmony_ci    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
4812cabdff1aSopenharmony_ci    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
4813cabdff1aSopenharmony_ci    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
4814cabdff1aSopenharmony_ci    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
4815cabdff1aSopenharmony_ci    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
4816cabdff1aSopenharmony_ci    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);
4817cabdff1aSopenharmony_ci    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4818cabdff1aSopenharmony_ci    dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4819cabdff1aSopenharmony_ci    dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4820cabdff1aSopenharmony_ci    dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4821cabdff1aSopenharmony_ci    dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
4822cabdff1aSopenharmony_ci    dst5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1);
4823cabdff1aSopenharmony_ci    dst6 = HEVC_FILT_4TAP_SH(vec12, vec13, filt0, filt1);
4824cabdff1aSopenharmony_ci    dst7 = HEVC_FILT_4TAP_SH(vec14, vec15, filt0, filt1);
4825cabdff1aSopenharmony_ci    dst8 = HEVC_FILT_4TAP_SH(vec16, vec17, filt0, filt1);
4826cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
4827cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
4828cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
4829cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
4830cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
4831cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
4832cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
4833cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
4834cabdff1aSopenharmony_ci    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4835cabdff1aSopenharmony_ci    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
4836cabdff1aSopenharmony_ci    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4837cabdff1aSopenharmony_ci    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
4838cabdff1aSopenharmony_ci    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4839cabdff1aSopenharmony_ci    dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
4840cabdff1aSopenharmony_ci    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4841cabdff1aSopenharmony_ci    dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
4842cabdff1aSopenharmony_ci    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
4843cabdff1aSopenharmony_ci    dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
4844cabdff1aSopenharmony_ci    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
4845cabdff1aSopenharmony_ci    dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
4846cabdff1aSopenharmony_ci    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
4847cabdff1aSopenharmony_ci    SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
4848cabdff1aSopenharmony_ci    SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
4849cabdff1aSopenharmony_ci    MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
4850cabdff1aSopenharmony_ci    MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
4851cabdff1aSopenharmony_ci    MUL2(dst4_r, weight_vec, dst5_r, weight_vec, dst4_r, dst5_r);
4852cabdff1aSopenharmony_ci    MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
4853cabdff1aSopenharmony_ci    MUL2(dst2_l, weight_vec, dst3_l, weight_vec, dst2_l, dst3_l);
4854cabdff1aSopenharmony_ci    MUL2(dst4_l, weight_vec, dst5_l, weight_vec, dst4_l, dst5_l);
4855cabdff1aSopenharmony_ci    SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
4856cabdff1aSopenharmony_ci    SRAR_W4_SW(dst2_r, dst2_l, dst3_r, dst3_l, rnd_vec);
4857cabdff1aSopenharmony_ci    SRAR_W4_SW(dst4_r, dst4_l, dst5_r, dst5_l, rnd_vec);
4858cabdff1aSopenharmony_ci    PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l, dst3_r,
4859cabdff1aSopenharmony_ci                tmp0, tmp1, tmp2, tmp3);
4860cabdff1aSopenharmony_ci    PCKEV_H2_SH(dst4_l, dst4_r, dst5_l, dst5_r, tmp4, tmp5);
4861cabdff1aSopenharmony_ci    ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
4862cabdff1aSopenharmony_ci    ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
4863cabdff1aSopenharmony_ci    ADD2(tmp4, offset_vec, tmp5, offset_vec, tmp4, tmp5);
4864cabdff1aSopenharmony_ci    CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
4865cabdff1aSopenharmony_ci    CLIP_SH2_0_255(tmp4, tmp5);
4866cabdff1aSopenharmony_ci    PCKEV_B3_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, out0, out1, out2);
4867cabdff1aSopenharmony_ci    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
4868cabdff1aSopenharmony_ci    ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
4869cabdff1aSopenharmony_ci}
4870cabdff1aSopenharmony_ci
4871cabdff1aSopenharmony_cistatic void hevc_hv_uniwgt_4t_8multx4mult_msa(uint8_t *src,
4872cabdff1aSopenharmony_ci                                              int32_t src_stride,
4873cabdff1aSopenharmony_ci                                              uint8_t *dst,
4874cabdff1aSopenharmony_ci                                              int32_t dst_stride,
4875cabdff1aSopenharmony_ci                                              const int8_t *filter_x,
4876cabdff1aSopenharmony_ci                                              const int8_t *filter_y,
4877cabdff1aSopenharmony_ci                                              int32_t height,
4878cabdff1aSopenharmony_ci                                              int32_t weight,
4879cabdff1aSopenharmony_ci                                              int32_t offset,
4880cabdff1aSopenharmony_ci                                              int32_t rnd_val,
4881cabdff1aSopenharmony_ci                                              int32_t width8mult)
4882cabdff1aSopenharmony_ci{
4883cabdff1aSopenharmony_ci    uint32_t loop_cnt, cnt;
4884cabdff1aSopenharmony_ci    uint8_t *src_tmp;
4885cabdff1aSopenharmony_ci    uint8_t *dst_tmp;
4886cabdff1aSopenharmony_ci    v16u8 out0, out1;
4887cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6;
4888cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
4889cabdff1aSopenharmony_ci    v8i16 filt_h0, filt_h1, filter_vec;
4890cabdff1aSopenharmony_ci    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
4891cabdff1aSopenharmony_ci    v16i8 mask1;
4892cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4893cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
4894cabdff1aSopenharmony_ci    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
4895cabdff1aSopenharmony_ci    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
4896cabdff1aSopenharmony_ci    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
4897cabdff1aSopenharmony_ci    v8i16 offset_vec, const_128, denom_vec;
4898cabdff1aSopenharmony_ci    v4i32 dst2_r, dst2_l, dst3_r, dst3_l;
4899cabdff1aSopenharmony_ci    v4i32 weight_vec, rnd_vec;
4900cabdff1aSopenharmony_ci
4901cabdff1aSopenharmony_ci    src -= (src_stride + 1);
4902cabdff1aSopenharmony_ci
4903cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_x);
4904cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4905cabdff1aSopenharmony_ci
4906cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_y);
4907cabdff1aSopenharmony_ci    UNPCK_R_SB_SH(filter_vec, filter_vec);
4908cabdff1aSopenharmony_ci
4909cabdff1aSopenharmony_ci    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4910cabdff1aSopenharmony_ci
4911cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
4912cabdff1aSopenharmony_ci
4913cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
4914cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val);
4915cabdff1aSopenharmony_ci
4916cabdff1aSopenharmony_ci    offset_vec = __msa_fill_h(offset);
4917cabdff1aSopenharmony_ci    denom_vec = __msa_fill_h(rnd_val - 6);
4918cabdff1aSopenharmony_ci    const_128 = __msa_fill_h((128 * weight));
4919cabdff1aSopenharmony_ci    offset_vec += __msa_srar_h(const_128, denom_vec);
4920cabdff1aSopenharmony_ci
4921cabdff1aSopenharmony_ci    for (cnt = width8mult; cnt--;) {
4922cabdff1aSopenharmony_ci        src_tmp = src;
4923cabdff1aSopenharmony_ci        dst_tmp = dst;
4924cabdff1aSopenharmony_ci
4925cabdff1aSopenharmony_ci        LD_SB3(src_tmp, src_stride, src0, src1, src2);
4926cabdff1aSopenharmony_ci        src_tmp += (3 * src_stride);
4927cabdff1aSopenharmony_ci        XORI_B3_128_SB(src0, src1, src2);
4928cabdff1aSopenharmony_ci
4929cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4930cabdff1aSopenharmony_ci        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4931cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4932cabdff1aSopenharmony_ci        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4933cabdff1aSopenharmony_ci        dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4934cabdff1aSopenharmony_ci        dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4935cabdff1aSopenharmony_ci
4936cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
4937cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
4938cabdff1aSopenharmony_ci
4939cabdff1aSopenharmony_ci        for (loop_cnt = height >> 2; loop_cnt--;) {
4940cabdff1aSopenharmony_ci            LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
4941cabdff1aSopenharmony_ci            src_tmp += (4 * src_stride);
4942cabdff1aSopenharmony_ci            XORI_B4_128_SB(src3, src4, src5, src6);
4943cabdff1aSopenharmony_ci
4944cabdff1aSopenharmony_ci            VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4945cabdff1aSopenharmony_ci            VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
4946cabdff1aSopenharmony_ci            VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
4947cabdff1aSopenharmony_ci            VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
4948cabdff1aSopenharmony_ci            dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4949cabdff1aSopenharmony_ci            dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4950cabdff1aSopenharmony_ci            dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4951cabdff1aSopenharmony_ci            dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4952cabdff1aSopenharmony_ci            ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
4953cabdff1aSopenharmony_ci            ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
4954cabdff1aSopenharmony_ci            ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
4955cabdff1aSopenharmony_ci            ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
4956cabdff1aSopenharmony_ci            dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4957cabdff1aSopenharmony_ci            dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
4958cabdff1aSopenharmony_ci            dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4959cabdff1aSopenharmony_ci            dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
4960cabdff1aSopenharmony_ci            dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4961cabdff1aSopenharmony_ci            dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
4962cabdff1aSopenharmony_ci            dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4963cabdff1aSopenharmony_ci            dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
4964cabdff1aSopenharmony_ci            SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
4965cabdff1aSopenharmony_ci            SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
4966cabdff1aSopenharmony_ci            MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
4967cabdff1aSopenharmony_ci            MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
4968cabdff1aSopenharmony_ci            MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
4969cabdff1aSopenharmony_ci            MUL2(dst2_l, weight_vec, dst3_l, weight_vec, dst2_l, dst3_l);
4970cabdff1aSopenharmony_ci            SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
4971cabdff1aSopenharmony_ci            SRAR_W4_SW(dst2_r, dst2_l, dst3_r, dst3_l, rnd_vec);
4972cabdff1aSopenharmony_ci            PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
4973cabdff1aSopenharmony_ci                        dst3_r, tmp0, tmp1, tmp2, tmp3);
4974cabdff1aSopenharmony_ci            ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
4975cabdff1aSopenharmony_ci            ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
4976cabdff1aSopenharmony_ci            CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
4977cabdff1aSopenharmony_ci            PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
4978cabdff1aSopenharmony_ci            ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
4979cabdff1aSopenharmony_ci            dst_tmp += (4 * dst_stride);
4980cabdff1aSopenharmony_ci
4981cabdff1aSopenharmony_ci            dst10_r = dst54_r;
4982cabdff1aSopenharmony_ci            dst10_l = dst54_l;
4983cabdff1aSopenharmony_ci            dst21_r = dst65_r;
4984cabdff1aSopenharmony_ci            dst21_l = dst65_l;
4985cabdff1aSopenharmony_ci            dst2 = dst6;
4986cabdff1aSopenharmony_ci        }
4987cabdff1aSopenharmony_ci
4988cabdff1aSopenharmony_ci        src += 8;
4989cabdff1aSopenharmony_ci        dst += 8;
4990cabdff1aSopenharmony_ci    }
4991cabdff1aSopenharmony_ci}
4992cabdff1aSopenharmony_ci
/*
 * Height dispatcher for 8-column uni-weighted 4-tap HV prediction.
 *
 * Heights 2, 4 and 6 go to their dedicated kernels; any other height that is
 * a multiple of 4 goes to the generic 4-rows-per-pass kernel with a single
 * 8-column strip. Every other height is left untouched (nothing is written).
 */
static void hevc_hv_uniwgt_4t_8w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    switch (height) {
    case 2:
        hevc_hv_uniwgt_4t_8x2_msa(src, src_stride, dst, dst_stride,
                                  filter_x, filter_y, weight,
                                  offset, rnd_val);
        break;
    case 4:
        /* One 8-column strip, four rows. */
        hevc_hv_uniwgt_4t_8multx4_msa(src, src_stride, dst, dst_stride,
                                      filter_x, filter_y, 1, weight,
                                      offset, rnd_val);
        break;
    case 6:
        hevc_hv_uniwgt_4t_8x6_msa(src, src_stride, dst, dst_stride,
                                  filter_x, filter_y, weight,
                                  offset, rnd_val);
        break;
    default:
        if (0 != (height % 4))
            break;
        hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                                          filter_x, filter_y, height, weight,
                                          offset, rnd_val, 1);
        break;
    }
}
5023cabdff1aSopenharmony_ci
5024cabdff1aSopenharmony_cistatic void hevc_hv_uniwgt_4t_12w_msa(uint8_t *src,
5025cabdff1aSopenharmony_ci                                      int32_t src_stride,
5026cabdff1aSopenharmony_ci                                      uint8_t *dst,
5027cabdff1aSopenharmony_ci                                      int32_t dst_stride,
5028cabdff1aSopenharmony_ci                                      const int8_t *filter_x,
5029cabdff1aSopenharmony_ci                                      const int8_t *filter_y,
5030cabdff1aSopenharmony_ci                                      int32_t height,
5031cabdff1aSopenharmony_ci                                      int32_t weight,
5032cabdff1aSopenharmony_ci                                      int32_t offset,
5033cabdff1aSopenharmony_ci                                      int32_t rnd_val)
5034cabdff1aSopenharmony_ci{
5035cabdff1aSopenharmony_ci    uint32_t loop_cnt;
5036cabdff1aSopenharmony_ci    uint8_t *src_tmp, *dst_tmp;
5037cabdff1aSopenharmony_ci    v16u8 out0, out1;
5038cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
5039cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
5040cabdff1aSopenharmony_ci    v16i8 mask0, mask1, mask2, mask3;
5041cabdff1aSopenharmony_ci    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3;
5042cabdff1aSopenharmony_ci    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
5043cabdff1aSopenharmony_ci    v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
5044cabdff1aSopenharmony_ci    v8i16 dst76_r, dst98_r, dst87_r, dst109_r;
5045cabdff1aSopenharmony_ci    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
5046cabdff1aSopenharmony_ci    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
5047cabdff1aSopenharmony_ci    v8i16 offset_vec, const_128, denom_vec;
5048cabdff1aSopenharmony_ci    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
5049cabdff1aSopenharmony_ci    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, weight_vec, rnd_vec;
5050cabdff1aSopenharmony_ci
5051cabdff1aSopenharmony_ci    src -= (src_stride + 1);
5052cabdff1aSopenharmony_ci
5053cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_x);
5054cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
5055cabdff1aSopenharmony_ci
5056cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_y);
5057cabdff1aSopenharmony_ci    UNPCK_R_SB_SH(filter_vec, filter_vec);
5058cabdff1aSopenharmony_ci
5059cabdff1aSopenharmony_ci    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
5060cabdff1aSopenharmony_ci
5061cabdff1aSopenharmony_ci    mask0 = LD_SB(ff_hevc_mask_arr);
5062cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
5063cabdff1aSopenharmony_ci
5064cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
5065cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val);
5066cabdff1aSopenharmony_ci
5067cabdff1aSopenharmony_ci    offset_vec = __msa_fill_h(offset);
5068cabdff1aSopenharmony_ci    denom_vec = __msa_fill_h(rnd_val - 6);
5069cabdff1aSopenharmony_ci    const_128 = __msa_fill_h((128 * weight));
5070cabdff1aSopenharmony_ci    offset_vec += __msa_srar_h(const_128, denom_vec);
5071cabdff1aSopenharmony_ci
5072cabdff1aSopenharmony_ci    src_tmp = src;
5073cabdff1aSopenharmony_ci    dst_tmp = dst;
5074cabdff1aSopenharmony_ci
5075cabdff1aSopenharmony_ci    LD_SB3(src_tmp, src_stride, src0, src1, src2);
5076cabdff1aSopenharmony_ci    src_tmp += (3 * src_stride);
5077cabdff1aSopenharmony_ci    XORI_B3_128_SB(src0, src1, src2);
5078cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
5079cabdff1aSopenharmony_ci    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
5080cabdff1aSopenharmony_ci    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5081cabdff1aSopenharmony_ci    dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5082cabdff1aSopenharmony_ci    dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5083cabdff1aSopenharmony_ci    dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5084cabdff1aSopenharmony_ci    ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
5085cabdff1aSopenharmony_ci    ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
5086cabdff1aSopenharmony_ci
5087cabdff1aSopenharmony_ci    for (loop_cnt = 4; loop_cnt--;) {
5088cabdff1aSopenharmony_ci        LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
5089cabdff1aSopenharmony_ci        src_tmp += (4 * src_stride);
5090cabdff1aSopenharmony_ci        XORI_B4_128_SB(src3, src4, src5, src6);
5091cabdff1aSopenharmony_ci        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
5092cabdff1aSopenharmony_ci        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
5093cabdff1aSopenharmony_ci        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
5094cabdff1aSopenharmony_ci        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
5095cabdff1aSopenharmony_ci        dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5096cabdff1aSopenharmony_ci        dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5097cabdff1aSopenharmony_ci        dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5098cabdff1aSopenharmony_ci        dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
5099cabdff1aSopenharmony_ci        ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
5100cabdff1aSopenharmony_ci        ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
5101cabdff1aSopenharmony_ci        ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
5102cabdff1aSopenharmony_ci        ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
5103cabdff1aSopenharmony_ci        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
5104cabdff1aSopenharmony_ci        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
5105cabdff1aSopenharmony_ci        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
5106cabdff1aSopenharmony_ci        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
5107cabdff1aSopenharmony_ci        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
5108cabdff1aSopenharmony_ci        dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
5109cabdff1aSopenharmony_ci        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
5110cabdff1aSopenharmony_ci        dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
5111cabdff1aSopenharmony_ci        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
5112cabdff1aSopenharmony_ci        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
5113cabdff1aSopenharmony_ci        MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
5114cabdff1aSopenharmony_ci        MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
5115cabdff1aSopenharmony_ci        MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
5116cabdff1aSopenharmony_ci        MUL2(dst2_l, weight_vec, dst3_l, weight_vec, dst2_l, dst3_l);
5117cabdff1aSopenharmony_ci        SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
5118cabdff1aSopenharmony_ci        SRAR_W4_SW(dst2_r, dst2_l, dst3_r, dst3_l, rnd_vec);
5119cabdff1aSopenharmony_ci        PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
5120cabdff1aSopenharmony_ci                    dst3_r, tmp0, tmp1, tmp2, tmp3);
5121cabdff1aSopenharmony_ci        ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
5122cabdff1aSopenharmony_ci        ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
5123cabdff1aSopenharmony_ci        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
5124cabdff1aSopenharmony_ci        PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
5125cabdff1aSopenharmony_ci        ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
5126cabdff1aSopenharmony_ci        dst_tmp += (4 * dst_stride);
5127cabdff1aSopenharmony_ci
5128cabdff1aSopenharmony_ci        dst10_r = dst54_r;
5129cabdff1aSopenharmony_ci        dst10_l = dst54_l;
5130cabdff1aSopenharmony_ci        dst21_r = dst65_r;
5131cabdff1aSopenharmony_ci        dst21_l = dst65_l;
5132cabdff1aSopenharmony_ci        dsth2 = dsth6;
5133cabdff1aSopenharmony_ci    }
5134cabdff1aSopenharmony_ci
5135cabdff1aSopenharmony_ci    src += 8;
5136cabdff1aSopenharmony_ci    dst += 8;
5137cabdff1aSopenharmony_ci
5138cabdff1aSopenharmony_ci    mask2 = LD_SB(ff_hevc_mask_arr + 16);
5139cabdff1aSopenharmony_ci    mask3 = mask2 + 2;
5140cabdff1aSopenharmony_ci
5141cabdff1aSopenharmony_ci    LD_SB3(src, src_stride, src0, src1, src2);
5142cabdff1aSopenharmony_ci    src += (3 * src_stride);
5143cabdff1aSopenharmony_ci    XORI_B3_128_SB(src0, src1, src2);
5144cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
5145cabdff1aSopenharmony_ci    VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3);
5146cabdff1aSopenharmony_ci    dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5147cabdff1aSopenharmony_ci    dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5148cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
5149cabdff1aSopenharmony_ci    dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
5150cabdff1aSopenharmony_ci
5151cabdff1aSopenharmony_ci    for (loop_cnt = 2; loop_cnt--;) {
5152cabdff1aSopenharmony_ci        LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9,
5153cabdff1aSopenharmony_ci               src10);
5154cabdff1aSopenharmony_ci        src += (8 * src_stride);
5155cabdff1aSopenharmony_ci        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
5156cabdff1aSopenharmony_ci        VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
5157cabdff1aSopenharmony_ci        VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
5158cabdff1aSopenharmony_ci        VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
5159cabdff1aSopenharmony_ci        VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);
5160cabdff1aSopenharmony_ci        dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5161cabdff1aSopenharmony_ci        dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5162cabdff1aSopenharmony_ci        dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5163cabdff1aSopenharmony_ci        dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
5164cabdff1aSopenharmony_ci        dst32_r = __msa_ilvr_h(dst73, dst22);
5165cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
5166cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
5167cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
5168cabdff1aSopenharmony_ci        dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
5169cabdff1aSopenharmony_ci        dst76_r = __msa_ilvr_h(dst22, dst106);
5170cabdff1aSopenharmony_ci        dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
5171cabdff1aSopenharmony_ci        dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
5172cabdff1aSopenharmony_ci        dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
5173cabdff1aSopenharmony_ci        dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
5174cabdff1aSopenharmony_ci        dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
5175cabdff1aSopenharmony_ci        dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
5176cabdff1aSopenharmony_ci        dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
5177cabdff1aSopenharmony_ci        dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
5178cabdff1aSopenharmony_ci        SRA_4V(dst0, dst1, dst2, dst3, 6);
5179cabdff1aSopenharmony_ci        SRA_4V(dst4, dst5, dst6, dst7, 6);
5180cabdff1aSopenharmony_ci        MUL2(dst0, weight_vec, dst1, weight_vec, dst0, dst1);
5181cabdff1aSopenharmony_ci        MUL2(dst2, weight_vec, dst3, weight_vec, dst2, dst3);
5182cabdff1aSopenharmony_ci        MUL2(dst4, weight_vec, dst5, weight_vec, dst4, dst5);
5183cabdff1aSopenharmony_ci        MUL2(dst6, weight_vec, dst7, weight_vec, dst6, dst7);
5184cabdff1aSopenharmony_ci        SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5185cabdff1aSopenharmony_ci        SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
5186cabdff1aSopenharmony_ci        PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
5187cabdff1aSopenharmony_ci                    tmp2, tmp3);
5188cabdff1aSopenharmony_ci        ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
5189cabdff1aSopenharmony_ci        ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
5190cabdff1aSopenharmony_ci        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
5191cabdff1aSopenharmony_ci        PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
5192cabdff1aSopenharmony_ci        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
5193cabdff1aSopenharmony_ci        dst += (8 * dst_stride);
5194cabdff1aSopenharmony_ci
5195cabdff1aSopenharmony_ci        dst10_r = dst98_r;
5196cabdff1aSopenharmony_ci        dst21_r = dst109_r;
5197cabdff1aSopenharmony_ci        dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
5198cabdff1aSopenharmony_ci    }
5199cabdff1aSopenharmony_ci}
5200cabdff1aSopenharmony_ci
/*
 * 16w member of the hv / uniwgt / 4t helper family.
 *
 * This function does no filtering itself; it only picks a worker:
 *   - height == 4 : hevc_hv_uniwgt_4t_8multx4_msa()
 *   - otherwise   : hevc_hv_uniwgt_4t_8multx4mult_msa()
 *
 * Both workers are handed the constant 2 (16 / 8) -- presumably the number
 * of 8-pixel-wide columns to process; confirm against the workers.  The two
 * workers take that constant in different argument positions, and the
 * height-4 worker is not passed height at all.
 *
 * All remaining arguments are forwarded unchanged.
 */
static void hevc_hv_uniwgt_4t_16w_msa(uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
                                      int32_t height, int32_t weight,
                                      int32_t offset, int32_t rnd_val)
{
    const int32_t cols8 = 16 / 8;

    if (height != 4) {
        /* General path: height is forwarded, column count goes last. */
        hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                                          filter_x, filter_y, height, weight,
                                          offset, rnd_val, cols8);
        return;
    }

    /* Height-4 path: the column count sits where height would otherwise be. */
    hevc_hv_uniwgt_4t_8multx4_msa(src, src_stride, dst, dst_stride,
                                  filter_x, filter_y, cols8, weight, offset,
                                  rnd_val);
}
5222cabdff1aSopenharmony_ci
/*
 * 24w member of the hv / uniwgt / 4t helper family.
 *
 * Forwards every argument unchanged to hevc_hv_uniwgt_4t_8multx4mult_msa()
 * and appends the constant 3 (24 / 8) -- presumably the number of
 * 8-pixel-wide columns to process; confirm against the worker.
 */
static void hevc_hv_uniwgt_4t_24w_msa(uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
                                      int32_t height, int32_t weight,
                                      int32_t offset, int32_t rnd_val)
{
    const int32_t cols8 = 24 / 8;

    hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                                      filter_x, filter_y, height, weight,
                                      offset, rnd_val, cols8);
}
5238cabdff1aSopenharmony_ci
/*
 * 32w member of the hv / uniwgt / 4t helper family.
 *
 * Forwards every argument unchanged to hevc_hv_uniwgt_4t_8multx4mult_msa()
 * and appends the constant 4 (32 / 8) -- presumably the number of
 * 8-pixel-wide columns to process; confirm against the worker.
 */
static void hevc_hv_uniwgt_4t_32w_msa(uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
                                      int32_t height, int32_t weight,
                                      int32_t offset, int32_t rnd_val)
{
    const int32_t cols8 = 32 / 8;

    hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                                      filter_x, filter_y, height, weight,
                                      offset, rnd_val, cols8);
}
5254cabdff1aSopenharmony_ci
/*
 * UNIWGT_MC_COPY(WIDTH) emits the non-static function
 * ff_hevc_put_hevc_uni_w_pel_pixels<WIDTH>_8_msa().
 *
 * The generated function forwards to hevc_uniwgt_copy_<WIDTH>w_msa(),
 * reordering the pointers from (dst, src) to (src, dst) and passing
 * denom + 14 - 8 as the final (shift) argument.
 * NOTE(review): 14 - 8 is presumably the intermediate precision minus the
 * 8-bit sample depth -- confirm.
 *
 * mx, my and width are part of the signature but are not read.
 */
#define UNIWGT_MC_COPY(WIDTH)                                                \
void ff_hevc_put_hevc_uni_w_pel_pixels##WIDTH##_8_msa(                       \
    uint8_t *dst, ptrdiff_t dst_stride,                                      \
    uint8_t *src, ptrdiff_t src_stride,                                      \
    int height, int denom, int weight, int offset,                           \
    intptr_t mx, intptr_t my, int width)                                     \
{                                                                            \
    hevc_uniwgt_copy_##WIDTH##w_msa(src, src_stride, dst, dst_stride,        \
                                    height, weight, offset,                  \
                                    denom + 14 - 8);                         \
}

/* One copy entry point per supported block width. */
UNIWGT_MC_COPY(4);
UNIWGT_MC_COPY(6);
UNIWGT_MC_COPY(8);
UNIWGT_MC_COPY(12);
UNIWGT_MC_COPY(16);
UNIWGT_MC_COPY(24);
UNIWGT_MC_COPY(32);
UNIWGT_MC_COPY(48);
UNIWGT_MC_COPY(64);

#undef UNIWGT_MC_COPY
5284cabdff1aSopenharmony_ci
5285cabdff1aSopenharmony_ci#define UNI_W_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                        \
5286cabdff1aSopenharmony_civoid ff_hevc_put_hevc_uni_w_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,        \
5287cabdff1aSopenharmony_ci                                                         ptrdiff_t            \
5288cabdff1aSopenharmony_ci                                                         dst_stride,          \
5289cabdff1aSopenharmony_ci                                                         uint8_t *src,        \
5290cabdff1aSopenharmony_ci                                                         ptrdiff_t            \
5291cabdff1aSopenharmony_ci                                                         src_stride,          \
5292cabdff1aSopenharmony_ci                                                         int height,          \
5293cabdff1aSopenharmony_ci                                                         int denom,           \
5294cabdff1aSopenharmony_ci                                                         int weight,          \
5295cabdff1aSopenharmony_ci                                                         int offset,          \
5296cabdff1aSopenharmony_ci                                                         intptr_t mx,         \
5297cabdff1aSopenharmony_ci                                                         intptr_t my,         \
5298cabdff1aSopenharmony_ci                                                         int width)           \
5299cabdff1aSopenharmony_ci{                                                                             \
5300cabdff1aSopenharmony_ci    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];             \
5301cabdff1aSopenharmony_ci    int shift = denom + 14 - 8;                                               \
5302cabdff1aSopenharmony_ci                                                                              \
5303cabdff1aSopenharmony_ci    hevc_##DIR1##_uniwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, dst,        \
5304cabdff1aSopenharmony_ci                                                 dst_stride, filter, height,  \
5305cabdff1aSopenharmony_ci                                                 weight, offset, shift);      \
5306cabdff1aSopenharmony_ci}
5307cabdff1aSopenharmony_ci
5308cabdff1aSopenharmony_ciUNI_W_MC(qpel, h, 4, 8, hz, mx);
5309cabdff1aSopenharmony_ciUNI_W_MC(qpel, h, 8, 8, hz, mx);
5310cabdff1aSopenharmony_ciUNI_W_MC(qpel, h, 12, 8, hz, mx);
5311cabdff1aSopenharmony_ciUNI_W_MC(qpel, h, 16, 8, hz, mx);
5312cabdff1aSopenharmony_ciUNI_W_MC(qpel, h, 24, 8, hz, mx);
5313cabdff1aSopenharmony_ciUNI_W_MC(qpel, h, 32, 8, hz, mx);
5314cabdff1aSopenharmony_ciUNI_W_MC(qpel, h, 48, 8, hz, mx);
5315cabdff1aSopenharmony_ciUNI_W_MC(qpel, h, 64, 8, hz, mx);
5316cabdff1aSopenharmony_ci
5317cabdff1aSopenharmony_ciUNI_W_MC(qpel, v, 4, 8, vt, my);
5318cabdff1aSopenharmony_ciUNI_W_MC(qpel, v, 8, 8, vt, my);
5319cabdff1aSopenharmony_ciUNI_W_MC(qpel, v, 12, 8, vt, my);
5320cabdff1aSopenharmony_ciUNI_W_MC(qpel, v, 16, 8, vt, my);
5321cabdff1aSopenharmony_ciUNI_W_MC(qpel, v, 24, 8, vt, my);
5322cabdff1aSopenharmony_ciUNI_W_MC(qpel, v, 32, 8, vt, my);
5323cabdff1aSopenharmony_ciUNI_W_MC(qpel, v, 48, 8, vt, my);
5324cabdff1aSopenharmony_ciUNI_W_MC(qpel, v, 64, 8, vt, my);
5325cabdff1aSopenharmony_ci
5326cabdff1aSopenharmony_ciUNI_W_MC(epel, h, 4, 4, hz, mx);
5327cabdff1aSopenharmony_ciUNI_W_MC(epel, h, 6, 4, hz, mx);
5328cabdff1aSopenharmony_ciUNI_W_MC(epel, h, 8, 4, hz, mx);
5329cabdff1aSopenharmony_ciUNI_W_MC(epel, h, 12, 4, hz, mx);
5330cabdff1aSopenharmony_ciUNI_W_MC(epel, h, 16, 4, hz, mx);
5331cabdff1aSopenharmony_ciUNI_W_MC(epel, h, 24, 4, hz, mx);
5332cabdff1aSopenharmony_ciUNI_W_MC(epel, h, 32, 4, hz, mx);
5333cabdff1aSopenharmony_ci
5334cabdff1aSopenharmony_ciUNI_W_MC(epel, v, 4, 4, vt, my);
5335cabdff1aSopenharmony_ciUNI_W_MC(epel, v, 6, 4, vt, my);
5336cabdff1aSopenharmony_ciUNI_W_MC(epel, v, 8, 4, vt, my);
5337cabdff1aSopenharmony_ciUNI_W_MC(epel, v, 12, 4, vt, my);
5338cabdff1aSopenharmony_ciUNI_W_MC(epel, v, 16, 4, vt, my);
5339cabdff1aSopenharmony_ciUNI_W_MC(epel, v, 24, 4, vt, my);
5340cabdff1aSopenharmony_ciUNI_W_MC(epel, v, 32, 4, vt, my);
5341cabdff1aSopenharmony_ci
5342cabdff1aSopenharmony_ci#undef UNI_W_MC
5343cabdff1aSopenharmony_ci
5344cabdff1aSopenharmony_ci#define UNI_W_MC_HV(PEL, WIDTH, TAP)                                          \
5345cabdff1aSopenharmony_civoid ff_hevc_put_hevc_uni_w_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst,           \
5346cabdff1aSopenharmony_ci                                                      ptrdiff_t dst_stride,   \
5347cabdff1aSopenharmony_ci                                                      uint8_t *src,           \
5348cabdff1aSopenharmony_ci                                                      ptrdiff_t src_stride,   \
5349cabdff1aSopenharmony_ci                                                      int height,             \
5350cabdff1aSopenharmony_ci                                                      int denom,              \
5351cabdff1aSopenharmony_ci                                                      int weight,             \
5352cabdff1aSopenharmony_ci                                                      int offset,             \
5353cabdff1aSopenharmony_ci                                                      intptr_t mx,            \
5354cabdff1aSopenharmony_ci                                                      intptr_t my,            \
5355cabdff1aSopenharmony_ci                                                      int width)              \
5356cabdff1aSopenharmony_ci{                                                                             \
5357cabdff1aSopenharmony_ci    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];                 \
5358cabdff1aSopenharmony_ci    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];                 \
5359cabdff1aSopenharmony_ci    int shift = denom + 14 - 8;                                               \
5360cabdff1aSopenharmony_ci                                                                              \
5361cabdff1aSopenharmony_ci    hevc_hv_uniwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride,  \
5362cabdff1aSopenharmony_ci                                           filter_x, filter_y,  height,       \
5363cabdff1aSopenharmony_ci                                           weight, offset, shift);            \
5364cabdff1aSopenharmony_ci}
5365cabdff1aSopenharmony_ci
5366cabdff1aSopenharmony_ciUNI_W_MC_HV(qpel, 4, 8);
5367cabdff1aSopenharmony_ciUNI_W_MC_HV(qpel, 8, 8);
5368cabdff1aSopenharmony_ciUNI_W_MC_HV(qpel, 12, 8);
5369cabdff1aSopenharmony_ciUNI_W_MC_HV(qpel, 16, 8);
5370cabdff1aSopenharmony_ciUNI_W_MC_HV(qpel, 24, 8);
5371cabdff1aSopenharmony_ciUNI_W_MC_HV(qpel, 32, 8);
5372cabdff1aSopenharmony_ciUNI_W_MC_HV(qpel, 48, 8);
5373cabdff1aSopenharmony_ciUNI_W_MC_HV(qpel, 64, 8);
5374cabdff1aSopenharmony_ci
5375cabdff1aSopenharmony_ciUNI_W_MC_HV(epel, 4, 4);
5376cabdff1aSopenharmony_ciUNI_W_MC_HV(epel, 6, 4);
5377cabdff1aSopenharmony_ciUNI_W_MC_HV(epel, 8, 4);
5378cabdff1aSopenharmony_ciUNI_W_MC_HV(epel, 12, 4);
5379cabdff1aSopenharmony_ciUNI_W_MC_HV(epel, 16, 4);
5380cabdff1aSopenharmony_ciUNI_W_MC_HV(epel, 24, 4);
5381cabdff1aSopenharmony_ciUNI_W_MC_HV(epel, 32, 4);
5382cabdff1aSopenharmony_ci
5383cabdff1aSopenharmony_ci#undef UNI_W_MC_HV
5384