/*
 * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
20cabdff1aSopenharmony_ci
21cabdff1aSopenharmony_ci#include "libavutil/mips/generic_macros_msa.h"
22cabdff1aSopenharmony_ci#include "libavcodec/mips/hevcdsp_mips.h"
23cabdff1aSopenharmony_ci#include "libavcodec/mips/hevc_macros_msa.h"
24cabdff1aSopenharmony_ci
/*
 * Byte-shuffle index table, two rows of 16 entries each, 64-byte aligned.
 * Each row lists overlapping index pairs (i, i + 1).
 * NOTE(review): indices >= 16 in the second row presumably select bytes from
 * a second source vector of a two-vector shuffle - confirm against the users
 * of this table.
 */
static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
    /* 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4,
    4, 5, 5, 6, 6, 7, 7, 8,
    /* second row: first half as above, second half starts at index 16 */
    0, 1, 1, 2, 2, 3, 3, 4,
    16, 17, 17, 18, 18, 19, 19, 20
};
30cabdff1aSopenharmony_ci
/*
 * Bi-weighted combine of two halfword vector pairs.
 *
 * (in0, vec0) and (in1, vec1) are interleaved halfword-wise, each interleaved
 * pair is dot-multiplied with the halfword pairs of 'wgt' and accumulated on
 * top of 'offset', the 32-bit sums are shifted right with rounding by 'rnd',
 * packed back to halfwords and passed through CLIP_SH2_0_255.
 * Results are written to out0 / out1. All inputs are read before the outputs
 * are written, so an output may name the same variable as an input.
 */
#define HEVC_BIW_RND_CLIP2(in0, in1, vec0, vec1, wgt, rnd, offset,  \
                           out0, out1)                              \
{                                                                   \
    v4i32 biw0_r, biw0_l, biw1_r, biw1_l;                           \
                                                                    \
    ILVR_H2_SW(in0, vec0, in1, vec1, biw0_r, biw1_r);               \
    ILVL_H2_SW(in0, vec0, in1, vec1, biw0_l, biw1_l);               \
                                                                    \
    biw0_r = __msa_dpadd_s_w(offset, (v8i16) biw0_r, (v8i16) wgt);  \
    biw0_l = __msa_dpadd_s_w(offset, (v8i16) biw0_l, (v8i16) wgt);  \
    biw1_r = __msa_dpadd_s_w(offset, (v8i16) biw1_r, (v8i16) wgt);  \
    biw1_l = __msa_dpadd_s_w(offset, (v8i16) biw1_l, (v8i16) wgt);  \
                                                                    \
    SRAR_W4_SW(biw0_r, biw0_l, biw1_r, biw1_l, rnd);                \
    PCKEV_H2_SH(biw0_l, biw0_r, biw1_l, biw1_r, out0, out1);        \
    CLIP_SH2_0_255(out0, out1);                                     \
}
48cabdff1aSopenharmony_ci
/*
 * Four-vector variant of HEVC_BIW_RND_CLIP2: processes (in0, in1) with
 * (vec0, vec1) and then (in2, in3) with (vec2, vec3), sharing the same
 * weight, rounding and offset vectors.
 */
#define HEVC_BIW_RND_CLIP4(in0, in1, in2, in3, vec0, vec1, vec2, vec3,  \
                           wgt, rnd, offset, out0, out1, out2, out3)    \
{                                                                       \
    HEVC_BIW_RND_CLIP2(in0, in1, vec0, vec1,                            \
                       wgt, rnd, offset, out0, out1);                   \
    HEVC_BIW_RND_CLIP2(in2, in3, vec2, vec3,                            \
                       wgt, rnd, offset, out2, out3);                   \
}
55cabdff1aSopenharmony_ci
/*
 * Bi-weighted combine of two halfword vector pairs: interleave, dot-product
 * accumulate with 'wgt' on top of 'offset', rounding shift by 'rnd', pack to
 * halfwords and clip through CLIP_SH2_0_255.
 * The sequence of operations is exactly that of HEVC_BIW_RND_CLIP2, so this
 * macro simply forwards to it.
 */
#define HEVC_BIW_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, wgt, rnd,  \
                                    offset, out0, out1)              \
{                                                                    \
    HEVC_BIW_RND_CLIP2(in0, in1, vec0, vec1, wgt, rnd, offset,       \
                       out0, out1);                                  \
}
71cabdff1aSopenharmony_ci
/*
 * Four-vector variant of HEVC_BIW_RND_CLIP2_MAX_SATU: the first invocation
 * handles (in0, in1) with (vec0, vec1), the second (in2, in3) with
 * (vec2, vec3); weight, rounding and offset vectors are shared.
 */
#define HEVC_BIW_RND_CLIP4_MAX_SATU(in0, in1, in2, in3,                  \
                                    vec0, vec1, vec2, vec3,              \
                                    wgt, rnd, offset,                    \
                                    out0, out1, out2, out3)              \
{                                                                        \
    HEVC_BIW_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1,                    \
                                wgt, rnd, offset, out0, out1);           \
    HEVC_BIW_RND_CLIP2_MAX_SATU(in2, in3, vec2, vec3,                    \
                                wgt, rnd, offset, out2, out3);           \
}
81cabdff1aSopenharmony_ci
82cabdff1aSopenharmony_cistatic void hevc_biwgt_copy_4w_msa(uint8_t *src0_ptr,
83cabdff1aSopenharmony_ci                                   int32_t src_stride,
84cabdff1aSopenharmony_ci                                   int16_t *src1_ptr,
85cabdff1aSopenharmony_ci                                   int32_t src2_stride,
86cabdff1aSopenharmony_ci                                   uint8_t *dst,
87cabdff1aSopenharmony_ci                                   int32_t dst_stride,
88cabdff1aSopenharmony_ci                                   int32_t height,
89cabdff1aSopenharmony_ci                                   int32_t weight0,
90cabdff1aSopenharmony_ci                                   int32_t weight1,
91cabdff1aSopenharmony_ci                                   int32_t offset0,
92cabdff1aSopenharmony_ci                                   int32_t offset1,
93cabdff1aSopenharmony_ci                                   int32_t rnd_val)
94cabdff1aSopenharmony_ci{
95cabdff1aSopenharmony_ci    uint32_t loop_cnt, tp0, tp1, tp2, tp3;
96cabdff1aSopenharmony_ci    uint64_t tpd0, tpd1, tpd2, tpd3;
97cabdff1aSopenharmony_ci    int32_t offset, weight;
98cabdff1aSopenharmony_ci    v16u8 out0, out1;
99cabdff1aSopenharmony_ci    v16i8 zero = { 0 };
100cabdff1aSopenharmony_ci    v16i8 src0 = { 0 }, src1 = { 0 };
101cabdff1aSopenharmony_ci    v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
102cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, weight_vec;
103cabdff1aSopenharmony_ci    v4i32 dst0_r, dst0_l, offset_vec, rnd_vec;
104cabdff1aSopenharmony_ci
105cabdff1aSopenharmony_ci    offset = (offset0 + offset1) << rnd_val;
106cabdff1aSopenharmony_ci    weight0 = weight0 & 0x0000FFFF;
107cabdff1aSopenharmony_ci    weight = weight0 | (weight1 << 16);
108cabdff1aSopenharmony_ci
109cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
110cabdff1aSopenharmony_ci    weight_vec = (v8i16) __msa_fill_w(weight);
111cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val + 1);
112cabdff1aSopenharmony_ci
113cabdff1aSopenharmony_ci    if (2 == height) {
114cabdff1aSopenharmony_ci        LW2(src0_ptr, src_stride, tp0, tp1);
115cabdff1aSopenharmony_ci        INSERT_W2_SB(tp0, tp1, src0);
116cabdff1aSopenharmony_ci        LD2(src1_ptr, src2_stride, tpd0, tpd1);
117cabdff1aSopenharmony_ci        INSERT_D2_SH(tpd0, tpd1, in0);
118cabdff1aSopenharmony_ci
119cabdff1aSopenharmony_ci        dst0 = (v8i16) __msa_ilvr_b(zero, src0);
120cabdff1aSopenharmony_ci        dst0 <<= 6;
121cabdff1aSopenharmony_ci
122cabdff1aSopenharmony_ci        ILVRL_H2_SW(dst0, in0, dst0_r, dst0_l);
123cabdff1aSopenharmony_ci        dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, weight_vec);
124cabdff1aSopenharmony_ci        dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, weight_vec);
125cabdff1aSopenharmony_ci        SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
126cabdff1aSopenharmony_ci        dst0 = (v8i16) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
127cabdff1aSopenharmony_ci        CLIP_SH_0_255(dst0);
128cabdff1aSopenharmony_ci        out0 = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
129cabdff1aSopenharmony_ci        ST_W2(out0, 0, 1, dst, dst_stride);
130cabdff1aSopenharmony_ci    } else if (4 == height) {
131cabdff1aSopenharmony_ci        LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
132cabdff1aSopenharmony_ci        INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
133cabdff1aSopenharmony_ci        LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
134cabdff1aSopenharmony_ci        INSERT_D2_SH(tpd0, tpd1, in0);
135cabdff1aSopenharmony_ci        INSERT_D2_SH(tpd2, tpd3, in1);
136cabdff1aSopenharmony_ci        ILVRL_B2_SH(zero, src0, dst0, dst1);
137cabdff1aSopenharmony_ci        SLLI_2V(dst0, dst1, 6);
138cabdff1aSopenharmony_ci        HEVC_BIW_RND_CLIP2_MAX_SATU(dst0, dst1, in0, in1, weight_vec, rnd_vec,
139cabdff1aSopenharmony_ci                                    offset_vec, dst0, dst1);
140cabdff1aSopenharmony_ci        out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
141cabdff1aSopenharmony_ci        ST_W4(out0, 0, 1, 2, 3, dst, dst_stride);
142cabdff1aSopenharmony_ci    } else if (0 == height % 8) {
143cabdff1aSopenharmony_ci        for (loop_cnt = (height >> 3); loop_cnt--;) {
144cabdff1aSopenharmony_ci            LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
145cabdff1aSopenharmony_ci            src0_ptr += 4 * src_stride;
146cabdff1aSopenharmony_ci            INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
147cabdff1aSopenharmony_ci            LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
148cabdff1aSopenharmony_ci            src0_ptr += 4 * src_stride;
149cabdff1aSopenharmony_ci            INSERT_W4_SB(tp0, tp1, tp2, tp3, src1);
150cabdff1aSopenharmony_ci            LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
151cabdff1aSopenharmony_ci            src1_ptr += (4 * src2_stride);
152cabdff1aSopenharmony_ci            INSERT_D2_SH(tpd0, tpd1, in0);
153cabdff1aSopenharmony_ci            INSERT_D2_SH(tpd2, tpd3, in1);
154cabdff1aSopenharmony_ci            LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
155cabdff1aSopenharmony_ci            src1_ptr += (4 * src2_stride);
156cabdff1aSopenharmony_ci            INSERT_D2_SH(tpd0, tpd1, in2);
157cabdff1aSopenharmony_ci            INSERT_D2_SH(tpd2, tpd3, in3);
158cabdff1aSopenharmony_ci            ILVRL_B2_SH(zero, src0, dst0, dst1);
159cabdff1aSopenharmony_ci            ILVRL_B2_SH(zero, src1, dst2, dst3);
160cabdff1aSopenharmony_ci            SLLI_4V(dst0, dst1, dst2, dst3, 6);
161cabdff1aSopenharmony_ci            HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2,
162cabdff1aSopenharmony_ci                                        in3, weight_vec, rnd_vec, offset_vec,
163cabdff1aSopenharmony_ci                                        dst0, dst1, dst2, dst3);
164cabdff1aSopenharmony_ci            PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
165cabdff1aSopenharmony_ci            ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
166cabdff1aSopenharmony_ci            dst += (8 * dst_stride);
167cabdff1aSopenharmony_ci        }
168cabdff1aSopenharmony_ci    }
169cabdff1aSopenharmony_ci}
170cabdff1aSopenharmony_ci
171cabdff1aSopenharmony_cistatic void hevc_biwgt_copy_6w_msa(uint8_t *src0_ptr,
172cabdff1aSopenharmony_ci                                   int32_t src_stride,
173cabdff1aSopenharmony_ci                                   int16_t *src1_ptr,
174cabdff1aSopenharmony_ci                                   int32_t src2_stride,
175cabdff1aSopenharmony_ci                                   uint8_t *dst,
176cabdff1aSopenharmony_ci                                   int32_t dst_stride,
177cabdff1aSopenharmony_ci                                   int32_t height,
178cabdff1aSopenharmony_ci                                   int32_t weight0,
179cabdff1aSopenharmony_ci                                   int32_t weight1,
180cabdff1aSopenharmony_ci                                   int32_t offset0,
181cabdff1aSopenharmony_ci                                   int32_t offset1,
182cabdff1aSopenharmony_ci                                   int32_t rnd_val)
183cabdff1aSopenharmony_ci{
184cabdff1aSopenharmony_ci    uint32_t loop_cnt;
185cabdff1aSopenharmony_ci    int32_t offset, weight;
186cabdff1aSopenharmony_ci    uint64_t tp0, tp1, tp2, tp3;
187cabdff1aSopenharmony_ci    v16u8 out0, out1;
188cabdff1aSopenharmony_ci    v16i8 zero = { 0 };
189cabdff1aSopenharmony_ci    v16i8 src0 = { 0 }, src1 = { 0 };
190cabdff1aSopenharmony_ci    v8i16 in0, in1, in2, in3;
191cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3;
192cabdff1aSopenharmony_ci    v4i32 offset_vec, weight_vec, rnd_vec;
193cabdff1aSopenharmony_ci
194cabdff1aSopenharmony_ci    offset = (offset0 + offset1) << rnd_val;
195cabdff1aSopenharmony_ci    weight0 = weight0 & 0x0000FFFF;
196cabdff1aSopenharmony_ci    weight = weight0 | (weight1 << 16);
197cabdff1aSopenharmony_ci
198cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
199cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
200cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val + 1);
201cabdff1aSopenharmony_ci
202cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
203cabdff1aSopenharmony_ci        LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
204cabdff1aSopenharmony_ci        src0_ptr += (4 * src_stride);
205cabdff1aSopenharmony_ci        INSERT_D2_SB(tp0, tp1, src0);
206cabdff1aSopenharmony_ci        INSERT_D2_SB(tp2, tp3, src1);
207cabdff1aSopenharmony_ci        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
208cabdff1aSopenharmony_ci        src1_ptr += (4 * src2_stride);
209cabdff1aSopenharmony_ci        ILVRL_B2_SH(zero, src0, dst0, dst1);
210cabdff1aSopenharmony_ci        ILVRL_B2_SH(zero, src1, dst2, dst3);
211cabdff1aSopenharmony_ci        SLLI_4V(dst0, dst1, dst2, dst3, 6);
212cabdff1aSopenharmony_ci        HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3,
213cabdff1aSopenharmony_ci                                    in0, in1, in2, in3,
214cabdff1aSopenharmony_ci                                    weight_vec, rnd_vec, offset_vec,
215cabdff1aSopenharmony_ci                                    dst0, dst1, dst2, dst3);
216cabdff1aSopenharmony_ci        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
217cabdff1aSopenharmony_ci        ST_W2(out0, 0, 2, dst, dst_stride);
218cabdff1aSopenharmony_ci        ST_H2(out0, 2, 6, dst + 4, dst_stride);
219cabdff1aSopenharmony_ci        ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
220cabdff1aSopenharmony_ci        ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
221cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
222cabdff1aSopenharmony_ci    }
223cabdff1aSopenharmony_ci}
224cabdff1aSopenharmony_ci
225cabdff1aSopenharmony_cistatic void hevc_biwgt_copy_8w_msa(uint8_t *src0_ptr,
226cabdff1aSopenharmony_ci                                   int32_t src_stride,
227cabdff1aSopenharmony_ci                                   int16_t *src1_ptr,
228cabdff1aSopenharmony_ci                                   int32_t src2_stride,
229cabdff1aSopenharmony_ci                                   uint8_t *dst,
230cabdff1aSopenharmony_ci                                   int32_t dst_stride,
231cabdff1aSopenharmony_ci                                   int32_t height,
232cabdff1aSopenharmony_ci                                   int32_t weight0,
233cabdff1aSopenharmony_ci                                   int32_t weight1,
234cabdff1aSopenharmony_ci                                   int32_t offset0,
235cabdff1aSopenharmony_ci                                   int32_t offset1,
236cabdff1aSopenharmony_ci                                   int32_t rnd_val)
237cabdff1aSopenharmony_ci{
238cabdff1aSopenharmony_ci    uint64_t tp0, tp1, tp2, tp3;
239cabdff1aSopenharmony_ci    int32_t offset, weight;
240cabdff1aSopenharmony_ci    v16u8 out0, out1, out2;
241cabdff1aSopenharmony_ci    v16i8 zero = { 0 };
242cabdff1aSopenharmony_ci    v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 };
243cabdff1aSopenharmony_ci    v8i16 in0, in1, in2, in3, in4, in5;
244cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
245cabdff1aSopenharmony_ci    v4i32 offset_vec, weight_vec, rnd_vec;
246cabdff1aSopenharmony_ci
247cabdff1aSopenharmony_ci    offset = (offset0 + offset1) << rnd_val;
248cabdff1aSopenharmony_ci    weight0 = weight0 & 0x0000FFFF;
249cabdff1aSopenharmony_ci    weight = weight0 | (weight1 << 16);
250cabdff1aSopenharmony_ci
251cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
252cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
253cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val + 1);
254cabdff1aSopenharmony_ci
255cabdff1aSopenharmony_ci    if (2 == height) {
256cabdff1aSopenharmony_ci        LD2(src0_ptr, src_stride, tp0, tp1);
257cabdff1aSopenharmony_ci        INSERT_D2_SB(tp0, tp1, src0);
258cabdff1aSopenharmony_ci        LD_SH2(src1_ptr, src2_stride, in0, in1);
259cabdff1aSopenharmony_ci        ILVRL_B2_SH(zero, src0, dst0, dst1);
260cabdff1aSopenharmony_ci        SLLI_2V(dst0, dst1, 6);
261cabdff1aSopenharmony_ci
262cabdff1aSopenharmony_ci        HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
263cabdff1aSopenharmony_ci                           weight_vec, rnd_vec, offset_vec,
264cabdff1aSopenharmony_ci                           dst0, dst1);
265cabdff1aSopenharmony_ci
266cabdff1aSopenharmony_ci        out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
267cabdff1aSopenharmony_ci        ST_D2(out0, 0, 1, dst, dst_stride);
268cabdff1aSopenharmony_ci    } else if (6 == height) {
269cabdff1aSopenharmony_ci        LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
270cabdff1aSopenharmony_ci        src0_ptr += 4 * src_stride;
271cabdff1aSopenharmony_ci        INSERT_D2_SB(tp0, tp1, src0);
272cabdff1aSopenharmony_ci        INSERT_D2_SB(tp2, tp3, src1);
273cabdff1aSopenharmony_ci        LD2(src0_ptr, src_stride, tp0, tp1);
274cabdff1aSopenharmony_ci        INSERT_D2_SB(tp0, tp1, src2);
275cabdff1aSopenharmony_ci        ILVRL_B2_SH(zero, src0, dst0, dst1);
276cabdff1aSopenharmony_ci        ILVRL_B2_SH(zero, src1, dst2, dst3);
277cabdff1aSopenharmony_ci        ILVRL_B2_SH(zero, src2, dst4, dst5);
278cabdff1aSopenharmony_ci        LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
279cabdff1aSopenharmony_ci        SLLI_4V(dst0, dst1, dst2, dst3, 6);
280cabdff1aSopenharmony_ci        SLLI_2V(dst4, dst5, 6);
281cabdff1aSopenharmony_ci        HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
282cabdff1aSopenharmony_ci                                    weight_vec, rnd_vec, offset_vec, dst0, dst1,
283cabdff1aSopenharmony_ci                                    dst2, dst3);
284cabdff1aSopenharmony_ci        HEVC_BIW_RND_CLIP2_MAX_SATU(dst4, dst5, in4, in5, weight_vec, rnd_vec,
285cabdff1aSopenharmony_ci                                    offset_vec, dst4, dst5);
286cabdff1aSopenharmony_ci        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
287cabdff1aSopenharmony_ci        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
288cabdff1aSopenharmony_ci        ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
289cabdff1aSopenharmony_ci    } else if (0 == height % 4) {
290cabdff1aSopenharmony_ci        uint32_t loop_cnt;
291cabdff1aSopenharmony_ci
292cabdff1aSopenharmony_ci        for (loop_cnt = (height >> 2); loop_cnt--;) {
293cabdff1aSopenharmony_ci            LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
294cabdff1aSopenharmony_ci            src0_ptr += (4 * src_stride);
295cabdff1aSopenharmony_ci            INSERT_D2_SB(tp0, tp1, src0);
296cabdff1aSopenharmony_ci            INSERT_D2_SB(tp2, tp3, src1);
297cabdff1aSopenharmony_ci            ILVRL_B2_SH(zero, src0, dst0, dst1);
298cabdff1aSopenharmony_ci            ILVRL_B2_SH(zero, src1, dst2, dst3);
299cabdff1aSopenharmony_ci            LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
300cabdff1aSopenharmony_ci            src1_ptr += (4 * src2_stride);
301cabdff1aSopenharmony_ci
302cabdff1aSopenharmony_ci            SLLI_4V(dst0, dst1, dst2, dst3, 6);
303cabdff1aSopenharmony_ci            HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2,
304cabdff1aSopenharmony_ci                                        in3, weight_vec, rnd_vec, offset_vec,
305cabdff1aSopenharmony_ci                                        dst0, dst1, dst2, dst3);
306cabdff1aSopenharmony_ci            PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
307cabdff1aSopenharmony_ci            ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
308cabdff1aSopenharmony_ci            dst += (4 * dst_stride);
309cabdff1aSopenharmony_ci        }
310cabdff1aSopenharmony_ci    }
311cabdff1aSopenharmony_ci}
312cabdff1aSopenharmony_ci
313cabdff1aSopenharmony_cistatic void hevc_biwgt_copy_12w_msa(uint8_t *src0_ptr,
314cabdff1aSopenharmony_ci                                    int32_t src_stride,
315cabdff1aSopenharmony_ci                                    int16_t *src1_ptr,
316cabdff1aSopenharmony_ci                                    int32_t src2_stride,
317cabdff1aSopenharmony_ci                                    uint8_t *dst,
318cabdff1aSopenharmony_ci                                    int32_t dst_stride,
319cabdff1aSopenharmony_ci                                    int32_t height,
320cabdff1aSopenharmony_ci                                    int32_t weight0,
321cabdff1aSopenharmony_ci                                    int32_t weight1,
322cabdff1aSopenharmony_ci                                    int32_t offset0,
323cabdff1aSopenharmony_ci                                    int32_t offset1,
324cabdff1aSopenharmony_ci                                    int32_t rnd_val)
325cabdff1aSopenharmony_ci{
326cabdff1aSopenharmony_ci    uint32_t loop_cnt;
327cabdff1aSopenharmony_ci    int32_t offset, weight;
328cabdff1aSopenharmony_ci    v16i8 zero = { 0 };
329cabdff1aSopenharmony_ci    v16u8 out0, out1, out2;
330cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3;
331cabdff1aSopenharmony_ci    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
332cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
333cabdff1aSopenharmony_ci    v4i32 offset_vec, weight_vec, rnd_vec;
334cabdff1aSopenharmony_ci
335cabdff1aSopenharmony_ci    offset = (offset0 + offset1) << rnd_val;
336cabdff1aSopenharmony_ci    weight0 = weight0 & 0x0000FFFF;
337cabdff1aSopenharmony_ci    weight = weight0 | (weight1 << 16);
338cabdff1aSopenharmony_ci
339cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
340cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
341cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val + 1);
342cabdff1aSopenharmony_ci
343cabdff1aSopenharmony_ci    for (loop_cnt = (16 >> 2); loop_cnt--;) {
344cabdff1aSopenharmony_ci        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
345cabdff1aSopenharmony_ci        src0_ptr += (4 * src_stride);
346cabdff1aSopenharmony_ci        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
347cabdff1aSopenharmony_ci        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
348cabdff1aSopenharmony_ci        src1_ptr += (4 * src2_stride);
349cabdff1aSopenharmony_ci
350cabdff1aSopenharmony_ci        ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
351cabdff1aSopenharmony_ci        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
352cabdff1aSopenharmony_ci                   dst0, dst1, dst2, dst3);
353cabdff1aSopenharmony_ci
354cabdff1aSopenharmony_ci        SLLI_4V(dst0, dst1, dst2, dst3, 6);
355cabdff1aSopenharmony_ci        ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
356cabdff1aSopenharmony_ci        ILVR_B2_SH(zero, src0, zero, src1, dst4, dst5);
357cabdff1aSopenharmony_ci
358cabdff1aSopenharmony_ci        dst4 <<= 6;
359cabdff1aSopenharmony_ci        dst5 <<= 6;
360cabdff1aSopenharmony_ci        HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
361cabdff1aSopenharmony_ci                                    weight_vec, rnd_vec, offset_vec, dst0, dst1,
362cabdff1aSopenharmony_ci                                    dst2, dst3);
363cabdff1aSopenharmony_ci        HEVC_BIW_RND_CLIP2_MAX_SATU(dst4, dst5, in4, in5, weight_vec, rnd_vec,
364cabdff1aSopenharmony_ci                                    offset_vec, dst4, dst5);
365cabdff1aSopenharmony_ci        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
366cabdff1aSopenharmony_ci        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
367cabdff1aSopenharmony_ci        ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
368cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
369cabdff1aSopenharmony_ci    }
370cabdff1aSopenharmony_ci}
371cabdff1aSopenharmony_ci
372cabdff1aSopenharmony_cistatic void hevc_biwgt_copy_16w_msa(uint8_t *src0_ptr,
373cabdff1aSopenharmony_ci                                    int32_t src_stride,
374cabdff1aSopenharmony_ci                                    int16_t *src1_ptr,
375cabdff1aSopenharmony_ci                                    int32_t src2_stride,
376cabdff1aSopenharmony_ci                                    uint8_t *dst,
377cabdff1aSopenharmony_ci                                    int32_t dst_stride,
378cabdff1aSopenharmony_ci                                    int32_t height,
379cabdff1aSopenharmony_ci                                    int32_t weight0,
380cabdff1aSopenharmony_ci                                    int32_t weight1,
381cabdff1aSopenharmony_ci                                    int32_t offset0,
382cabdff1aSopenharmony_ci                                    int32_t offset1,
383cabdff1aSopenharmony_ci                                    int32_t rnd_val)
384cabdff1aSopenharmony_ci{
385cabdff1aSopenharmony_ci    uint32_t loop_cnt;
386cabdff1aSopenharmony_ci    int32_t offset, weight;
387cabdff1aSopenharmony_ci    v16u8 out0, out1, out2, out3;
388cabdff1aSopenharmony_ci    v16i8 zero = { 0 };
389cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3;
390cabdff1aSopenharmony_ci    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
391cabdff1aSopenharmony_ci    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
392cabdff1aSopenharmony_ci    v4i32 offset_vec, weight_vec, rnd_vec;
393cabdff1aSopenharmony_ci
394cabdff1aSopenharmony_ci    offset = (offset0 + offset1) << rnd_val;
395cabdff1aSopenharmony_ci    weight0 = weight0 & 0x0000FFFF;
396cabdff1aSopenharmony_ci    weight = weight0 | (weight1 << 16);
397cabdff1aSopenharmony_ci
398cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
399cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
400cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val + 1);
401cabdff1aSopenharmony_ci
402cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
403cabdff1aSopenharmony_ci        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
404cabdff1aSopenharmony_ci        src0_ptr += (4 * src_stride);
405cabdff1aSopenharmony_ci        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
406cabdff1aSopenharmony_ci        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
407cabdff1aSopenharmony_ci        src1_ptr += (4 * src2_stride);
408cabdff1aSopenharmony_ci        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp0, tmp1,
409cabdff1aSopenharmony_ci                   tmp2, tmp3);
410cabdff1aSopenharmony_ci        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp4, tmp5,
411cabdff1aSopenharmony_ci                   tmp6, tmp7);
412cabdff1aSopenharmony_ci        SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
413cabdff1aSopenharmony_ci        SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6);
414cabdff1aSopenharmony_ci        HEVC_BIW_RND_CLIP4_MAX_SATU(tmp0, tmp1, tmp4, tmp5, in0, in1, in4, in5,
415cabdff1aSopenharmony_ci                                    weight_vec, rnd_vec, offset_vec, tmp0, tmp1,
416cabdff1aSopenharmony_ci                                    tmp4, tmp5);
417cabdff1aSopenharmony_ci        HEVC_BIW_RND_CLIP4_MAX_SATU(tmp2, tmp3, tmp6, tmp7, in2, in3, in6, in7,
418cabdff1aSopenharmony_ci                                    weight_vec, rnd_vec, offset_vec, tmp2, tmp3,
419cabdff1aSopenharmony_ci                                    tmp6, tmp7);
420cabdff1aSopenharmony_ci        PCKEV_B2_UB(tmp4, tmp0, tmp5, tmp1, out0, out1);
421cabdff1aSopenharmony_ci        PCKEV_B2_UB(tmp6, tmp2, tmp7, tmp3, out2, out3);
422cabdff1aSopenharmony_ci        ST_UB4(out0, out1, out2, out3, dst, dst_stride);
423cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
424cabdff1aSopenharmony_ci    }
425cabdff1aSopenharmony_ci}
426cabdff1aSopenharmony_ci
427cabdff1aSopenharmony_cistatic void hevc_biwgt_copy_24w_msa(uint8_t *src0_ptr,
428cabdff1aSopenharmony_ci                                    int32_t src_stride,
429cabdff1aSopenharmony_ci                                    int16_t *src1_ptr,
430cabdff1aSopenharmony_ci                                    int32_t src2_stride,
431cabdff1aSopenharmony_ci                                    uint8_t *dst,
432cabdff1aSopenharmony_ci                                    int32_t dst_stride,
433cabdff1aSopenharmony_ci                                    int32_t height,
434cabdff1aSopenharmony_ci                                    int32_t weight0,
435cabdff1aSopenharmony_ci                                    int32_t weight1,
436cabdff1aSopenharmony_ci                                    int32_t offset0,
437cabdff1aSopenharmony_ci                                    int32_t offset1,
438cabdff1aSopenharmony_ci                                    int32_t rnd_val)
439cabdff1aSopenharmony_ci{
440cabdff1aSopenharmony_ci    uint32_t loop_cnt;
441cabdff1aSopenharmony_ci    int32_t offset, weight;
442cabdff1aSopenharmony_ci    v16u8 out0, out1, out2, out3, out4, out5;
443cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, zero = { 0 };
444cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
445cabdff1aSopenharmony_ci    v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, dst11;
446cabdff1aSopenharmony_ci    v4i32 offset_vec, weight_vec, rnd_vec;
447cabdff1aSopenharmony_ci
448cabdff1aSopenharmony_ci    offset = (offset0 + offset1) << rnd_val;
449cabdff1aSopenharmony_ci    weight0 = weight0 & 0x0000FFFF;
450cabdff1aSopenharmony_ci    weight = weight0 | (weight1 << 16);
451cabdff1aSopenharmony_ci
452cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
453cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
454cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val + 1);
455cabdff1aSopenharmony_ci
456cabdff1aSopenharmony_ci    for (loop_cnt = 8; loop_cnt--;) {
457cabdff1aSopenharmony_ci        LD_SB4(src0_ptr, src_stride, src0, src1, src4, src5);
458cabdff1aSopenharmony_ci        LD_SB4(src0_ptr + 16, src_stride, src2, src3, src6, src7);
459cabdff1aSopenharmony_ci        src0_ptr += (4 * src_stride);
460cabdff1aSopenharmony_ci        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
461cabdff1aSopenharmony_ci        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
462cabdff1aSopenharmony_ci        LD_SH4(src1_ptr + 16, src2_stride, in8, in9, in10, in11);
463cabdff1aSopenharmony_ci        src1_ptr += (4 * src2_stride);
464cabdff1aSopenharmony_ci
465cabdff1aSopenharmony_ci        ILVRL_B2_SH(zero, src0, dst0, dst1);
466cabdff1aSopenharmony_ci        ILVRL_B2_SH(zero, src1, dst2, dst3);
467cabdff1aSopenharmony_ci        ILVR_B2_SH(zero, src2, zero, src3, dst4, dst5);
468cabdff1aSopenharmony_ci        ILVRL_B2_SH(zero, src4, dst6, dst7);
469cabdff1aSopenharmony_ci        ILVRL_B2_SH(zero, src5, dst8, dst9);
470cabdff1aSopenharmony_ci        ILVR_B2_SH(zero, src6, zero, src7, dst10, dst11);
471cabdff1aSopenharmony_ci        SLLI_4V(dst0, dst1, dst2, dst3, 6);
472cabdff1aSopenharmony_ci        SLLI_4V(dst4, dst5, dst6, dst7, 6);
473cabdff1aSopenharmony_ci        SLLI_4V(dst8, dst9, dst10, dst11, 6);
474cabdff1aSopenharmony_ci        HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in4, in1, in5,
475cabdff1aSopenharmony_ci                                    weight_vec, rnd_vec, offset_vec, dst0, dst1,
476cabdff1aSopenharmony_ci                                    dst2, dst3);
477cabdff1aSopenharmony_ci        HEVC_BIW_RND_CLIP4_MAX_SATU(dst4, dst5, dst6, dst7, in8, in9, in2, in6,
478cabdff1aSopenharmony_ci                                    weight_vec, rnd_vec, offset_vec, dst4, dst5,
479cabdff1aSopenharmony_ci                                    dst6, dst7);
480cabdff1aSopenharmony_ci        HEVC_BIW_RND_CLIP4_MAX_SATU(dst8, dst9, dst10, dst11, in3, in7, in10,
481cabdff1aSopenharmony_ci                                    in11, weight_vec, rnd_vec, offset_vec,
482cabdff1aSopenharmony_ci                                    dst8, dst9, dst10, dst11);
483cabdff1aSopenharmony_ci        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
484cabdff1aSopenharmony_ci        PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
485cabdff1aSopenharmony_ci        ST_UB4(out0, out1, out3, out4, dst, dst_stride);
486cabdff1aSopenharmony_ci        ST_D4(out2, out5, 0, 1, 0, 1, dst + 16, dst_stride);
487cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
488cabdff1aSopenharmony_ci    }
489cabdff1aSopenharmony_ci}
490cabdff1aSopenharmony_ci
491cabdff1aSopenharmony_cistatic void hevc_biwgt_copy_32w_msa(uint8_t *src0_ptr,
492cabdff1aSopenharmony_ci                                    int32_t src_stride,
493cabdff1aSopenharmony_ci                                    int16_t *src1_ptr,
494cabdff1aSopenharmony_ci                                    int32_t src2_stride,
495cabdff1aSopenharmony_ci                                    uint8_t *dst,
496cabdff1aSopenharmony_ci                                    int32_t dst_stride,
497cabdff1aSopenharmony_ci                                    int32_t height,
498cabdff1aSopenharmony_ci                                    int32_t weight0,
499cabdff1aSopenharmony_ci                                    int32_t weight1,
500cabdff1aSopenharmony_ci                                    int32_t offset0,
501cabdff1aSopenharmony_ci                                    int32_t offset1,
502cabdff1aSopenharmony_ci                                    int32_t rnd_val)
503cabdff1aSopenharmony_ci{
504cabdff1aSopenharmony_ci    uint32_t loop_cnt;
505cabdff1aSopenharmony_ci    int32_t offset, weight;
506cabdff1aSopenharmony_ci    v16u8 out0, out1, out2, out3;
507cabdff1aSopenharmony_ci    v16i8 zero = { 0 };
508cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3;
509cabdff1aSopenharmony_ci    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
510cabdff1aSopenharmony_ci    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
511cabdff1aSopenharmony_ci    v4i32 offset_vec, weight_vec, rnd_vec;
512cabdff1aSopenharmony_ci
513cabdff1aSopenharmony_ci    offset = (offset0 + offset1) << rnd_val;
514cabdff1aSopenharmony_ci    weight0 = weight0 & 0x0000FFFF;
515cabdff1aSopenharmony_ci    weight = weight0 | (weight1 << 16);
516cabdff1aSopenharmony_ci
517cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
518cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
519cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val + 1);
520cabdff1aSopenharmony_ci
521cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 1); loop_cnt--;) {
522cabdff1aSopenharmony_ci        LD_SB2(src0_ptr, 16, src0, src1);
523cabdff1aSopenharmony_ci        src0_ptr += src_stride;
524cabdff1aSopenharmony_ci        LD_SB2(src0_ptr, 16, src2, src3);
525cabdff1aSopenharmony_ci        src0_ptr += src_stride;
526cabdff1aSopenharmony_ci        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
527cabdff1aSopenharmony_ci        src1_ptr += src2_stride;
528cabdff1aSopenharmony_ci        LD_SH4(src1_ptr, 8, in4, in5, in6, in7);
529cabdff1aSopenharmony_ci        src1_ptr += src2_stride;
530cabdff1aSopenharmony_ci
531cabdff1aSopenharmony_ci        ILVRL_B2_SH(zero, src0, tmp0, tmp4);
532cabdff1aSopenharmony_ci        ILVRL_B2_SH(zero, src1, tmp1, tmp5);
533cabdff1aSopenharmony_ci        ILVRL_B2_SH(zero, src2, tmp2, tmp6);
534cabdff1aSopenharmony_ci        ILVRL_B2_SH(zero, src3, tmp3, tmp7);
535cabdff1aSopenharmony_ci        SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
536cabdff1aSopenharmony_ci        SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6);
537cabdff1aSopenharmony_ci        HEVC_BIW_RND_CLIP4_MAX_SATU(tmp0, tmp4, tmp1, tmp5, in0, in1, in2, in3,
538cabdff1aSopenharmony_ci                                    weight_vec, rnd_vec, offset_vec, tmp0, tmp4,
539cabdff1aSopenharmony_ci                                    tmp1, tmp5);
540cabdff1aSopenharmony_ci        HEVC_BIW_RND_CLIP4_MAX_SATU(tmp2, tmp6, tmp3, tmp7, in4, in5, in6, in7,
541cabdff1aSopenharmony_ci                                    weight_vec, rnd_vec, offset_vec, tmp2, tmp6,
542cabdff1aSopenharmony_ci                                    tmp3, tmp7);
543cabdff1aSopenharmony_ci        PCKEV_B2_UB(tmp4, tmp0, tmp5, tmp1, out0, out1);
544cabdff1aSopenharmony_ci        PCKEV_B2_UB(tmp6, tmp2, tmp7, tmp3, out2, out3);
545cabdff1aSopenharmony_ci        ST_UB2(out0, out1, dst, 16);
546cabdff1aSopenharmony_ci        dst += dst_stride;
547cabdff1aSopenharmony_ci        ST_UB2(out2, out3, dst, 16);
548cabdff1aSopenharmony_ci        dst += dst_stride;
549cabdff1aSopenharmony_ci    }
550cabdff1aSopenharmony_ci}
551cabdff1aSopenharmony_ci
552cabdff1aSopenharmony_cistatic void hevc_biwgt_copy_48w_msa(uint8_t *src0_ptr,
553cabdff1aSopenharmony_ci                                    int32_t src_stride,
554cabdff1aSopenharmony_ci                                    int16_t *src1_ptr,
555cabdff1aSopenharmony_ci                                    int32_t src2_stride,
556cabdff1aSopenharmony_ci                                    uint8_t *dst,
557cabdff1aSopenharmony_ci                                    int32_t dst_stride,
558cabdff1aSopenharmony_ci                                    int32_t height,
559cabdff1aSopenharmony_ci                                    int32_t weight0,
560cabdff1aSopenharmony_ci                                    int32_t weight1,
561cabdff1aSopenharmony_ci                                    int32_t offset0,
562cabdff1aSopenharmony_ci                                    int32_t offset1,
563cabdff1aSopenharmony_ci                                    int32_t rnd_val)
564cabdff1aSopenharmony_ci{
565cabdff1aSopenharmony_ci    uint32_t loop_cnt;
566cabdff1aSopenharmony_ci    int32_t offset, weight;
567cabdff1aSopenharmony_ci    v16u8 out0, out1, out2;
568cabdff1aSopenharmony_ci    v16i8 src0, src1, src2;
569cabdff1aSopenharmony_ci    v16i8 zero = { 0 };
570cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, in0, in1, in2, in3, in4, in5;
571cabdff1aSopenharmony_ci    v4i32 offset_vec, weight_vec, rnd_vec;
572cabdff1aSopenharmony_ci
573cabdff1aSopenharmony_ci    offset = (offset0 + offset1) << rnd_val;
574cabdff1aSopenharmony_ci    weight0 = weight0 & 0x0000FFFF;
575cabdff1aSopenharmony_ci    weight = weight0 | (weight1 << 16);
576cabdff1aSopenharmony_ci
577cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
578cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
579cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val + 1);
580cabdff1aSopenharmony_ci
581cabdff1aSopenharmony_ci    for (loop_cnt = 64; loop_cnt--;) {
582cabdff1aSopenharmony_ci        LD_SB3(src0_ptr, 16, src0, src1, src2);
583cabdff1aSopenharmony_ci        src0_ptr += src_stride;
584cabdff1aSopenharmony_ci        LD_SH6(src1_ptr, 8, in0, in1, in2, in3, in4, in5);
585cabdff1aSopenharmony_ci        src1_ptr += src2_stride;
586cabdff1aSopenharmony_ci
587cabdff1aSopenharmony_ci        ILVRL_B2_SH(zero, src0, dst0, dst1);
588cabdff1aSopenharmony_ci        ILVRL_B2_SH(zero, src1, dst2, dst3);
589cabdff1aSopenharmony_ci        ILVRL_B2_SH(zero, src2, dst4, dst5);
590cabdff1aSopenharmony_ci        SLLI_4V(dst0, dst1, dst2, dst3, 6);
591cabdff1aSopenharmony_ci        SLLI_2V(dst4, dst5, 6);
592cabdff1aSopenharmony_ci        HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
593cabdff1aSopenharmony_ci                                    weight_vec, rnd_vec, offset_vec, dst0, dst1,
594cabdff1aSopenharmony_ci                                    dst2, dst3);
595cabdff1aSopenharmony_ci        HEVC_BIW_RND_CLIP2_MAX_SATU(dst4, dst5, in4, in5, weight_vec, rnd_vec,
596cabdff1aSopenharmony_ci                                    offset_vec, dst4, dst5);
597cabdff1aSopenharmony_ci        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
598cabdff1aSopenharmony_ci        ST_UB2(out0, out1, dst, 16);
599cabdff1aSopenharmony_ci        ST_UB(out2, dst + 32);
600cabdff1aSopenharmony_ci        dst += dst_stride;
601cabdff1aSopenharmony_ci    }
602cabdff1aSopenharmony_ci}
603cabdff1aSopenharmony_ci
604cabdff1aSopenharmony_cistatic void hevc_biwgt_copy_64w_msa(uint8_t *src0_ptr,
605cabdff1aSopenharmony_ci                                    int32_t src_stride,
606cabdff1aSopenharmony_ci                                    int16_t *src1_ptr,
607cabdff1aSopenharmony_ci                                    int32_t src2_stride,
608cabdff1aSopenharmony_ci                                    uint8_t *dst,
609cabdff1aSopenharmony_ci                                    int32_t dst_stride,
610cabdff1aSopenharmony_ci                                    int32_t height,
611cabdff1aSopenharmony_ci                                    int32_t weight0,
612cabdff1aSopenharmony_ci                                    int32_t weight1,
613cabdff1aSopenharmony_ci                                    int32_t offset0,
614cabdff1aSopenharmony_ci                                    int32_t offset1,
615cabdff1aSopenharmony_ci                                    int32_t rnd_val)
616cabdff1aSopenharmony_ci{
617cabdff1aSopenharmony_ci    uint32_t loop_cnt;
618cabdff1aSopenharmony_ci    int32_t offset, weight;
619cabdff1aSopenharmony_ci    v16u8 out0, out1, out2, out3;
620cabdff1aSopenharmony_ci    v16i8 zero = { 0 };
621cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3;
622cabdff1aSopenharmony_ci    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
623cabdff1aSopenharmony_ci    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
624cabdff1aSopenharmony_ci    v4i32 offset_vec, weight_vec, rnd_vec;
625cabdff1aSopenharmony_ci
626cabdff1aSopenharmony_ci    offset = (offset0 + offset1) << rnd_val;
627cabdff1aSopenharmony_ci    weight0 = weight0 & 0x0000FFFF;
628cabdff1aSopenharmony_ci    weight = weight0 | (weight1 << 16);
629cabdff1aSopenharmony_ci
630cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
631cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
632cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val + 1);
633cabdff1aSopenharmony_ci
634cabdff1aSopenharmony_ci    for (loop_cnt = height; loop_cnt--;) {
635cabdff1aSopenharmony_ci        LD_SB4(src0_ptr, 16, src0, src1, src2, src3);
636cabdff1aSopenharmony_ci        src0_ptr += src_stride;
637cabdff1aSopenharmony_ci        LD_SH8(src1_ptr, 8, in0, in1, in2, in3, in4, in5, in6, in7);
638cabdff1aSopenharmony_ci        src1_ptr += src2_stride;
639cabdff1aSopenharmony_ci
640cabdff1aSopenharmony_ci        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp0, tmp1,
641cabdff1aSopenharmony_ci                   tmp2, tmp3);
642cabdff1aSopenharmony_ci        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp4, tmp5,
643cabdff1aSopenharmony_ci                   tmp6, tmp7);
644cabdff1aSopenharmony_ci        SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
645cabdff1aSopenharmony_ci        SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6);
646cabdff1aSopenharmony_ci        HEVC_BIW_RND_CLIP4_MAX_SATU(tmp0, tmp4, tmp1, tmp5, in0, in1, in2, in3,
647cabdff1aSopenharmony_ci                                    weight_vec, rnd_vec, offset_vec, tmp0, tmp4,
648cabdff1aSopenharmony_ci                                    tmp1, tmp5);
649cabdff1aSopenharmony_ci        HEVC_BIW_RND_CLIP4_MAX_SATU(tmp2, tmp6, tmp3, tmp7, in4, in5, in6, in7,
650cabdff1aSopenharmony_ci                                    weight_vec, rnd_vec, offset_vec, tmp2, tmp6,
651cabdff1aSopenharmony_ci                                    tmp3, tmp7);
652cabdff1aSopenharmony_ci        PCKEV_B2_UB(tmp4, tmp0, tmp5, tmp1, out0, out1);
653cabdff1aSopenharmony_ci        PCKEV_B2_UB(tmp6, tmp2, tmp7, tmp3, out2, out3);
654cabdff1aSopenharmony_ci        ST_UB4(out0, out1, out2, out3, dst, 16);
655cabdff1aSopenharmony_ci        dst += dst_stride;
656cabdff1aSopenharmony_ci    }
657cabdff1aSopenharmony_ci}
658cabdff1aSopenharmony_ci
659cabdff1aSopenharmony_cistatic void hevc_hz_biwgt_8t_4w_msa(uint8_t *src0_ptr,
660cabdff1aSopenharmony_ci                                    int32_t src_stride,
661cabdff1aSopenharmony_ci                                    int16_t *src1_ptr,
662cabdff1aSopenharmony_ci                                    int32_t src2_stride,
663cabdff1aSopenharmony_ci                                    uint8_t *dst,
664cabdff1aSopenharmony_ci                                    int32_t dst_stride,
665cabdff1aSopenharmony_ci                                    const int8_t *filter,
666cabdff1aSopenharmony_ci                                    int32_t height,
667cabdff1aSopenharmony_ci                                    int32_t weight0,
668cabdff1aSopenharmony_ci                                    int32_t weight1,
669cabdff1aSopenharmony_ci                                    int32_t offset0,
670cabdff1aSopenharmony_ci                                    int32_t offset1,
671cabdff1aSopenharmony_ci                                    int32_t rnd_val)
672cabdff1aSopenharmony_ci{
673cabdff1aSopenharmony_ci    uint32_t loop_cnt;
674cabdff1aSopenharmony_ci    int32_t offset, weight, constant;
675cabdff1aSopenharmony_ci    v8i16 filt0, filt1, filt2, filt3;
676cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3;
677cabdff1aSopenharmony_ci    v16i8 mask1, mask2, mask3;
678cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3;
679cabdff1aSopenharmony_ci    v8i16 dst0, dst1;
680cabdff1aSopenharmony_ci    v8i16 in0, in1, in2, in3;
681cabdff1aSopenharmony_ci    v8i16 filter_vec, out0, out1;
682cabdff1aSopenharmony_ci    v4i32 weight_vec, offset_vec, rnd_vec;
683cabdff1aSopenharmony_ci    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
684cabdff1aSopenharmony_ci
685cabdff1aSopenharmony_ci    src0_ptr -= 3;
686cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
687cabdff1aSopenharmony_ci    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
688cabdff1aSopenharmony_ci
689cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
690cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
691cabdff1aSopenharmony_ci    mask3 = mask0 + 6;
692cabdff1aSopenharmony_ci
693cabdff1aSopenharmony_ci    offset = (offset0 + offset1) << rnd_val;
694cabdff1aSopenharmony_ci    weight0 = weight0 & 0x0000FFFF;
695cabdff1aSopenharmony_ci    weight = weight0 | (weight1 << 16);
696cabdff1aSopenharmony_ci    constant = 128 * weight1;
697cabdff1aSopenharmony_ci    constant <<= 6;
698cabdff1aSopenharmony_ci    offset += constant;
699cabdff1aSopenharmony_ci
700cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
701cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
702cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val + 1);
703cabdff1aSopenharmony_ci
704cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
705cabdff1aSopenharmony_ci        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
706cabdff1aSopenharmony_ci        src0_ptr += (4 * src_stride);
707cabdff1aSopenharmony_ci        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
708cabdff1aSopenharmony_ci        src1_ptr += (4 * src2_stride);
709cabdff1aSopenharmony_ci        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
710cabdff1aSopenharmony_ci        XORI_B4_128_SB(src0, src1, src2, src3);
711cabdff1aSopenharmony_ci
712cabdff1aSopenharmony_ci        VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,
713cabdff1aSopenharmony_ci                   vec0, vec1, vec2, vec3);
714cabdff1aSopenharmony_ci        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
715cabdff1aSopenharmony_ci                                 filt3);
716cabdff1aSopenharmony_ci        VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3,
717cabdff1aSopenharmony_ci                   vec0, vec1, vec2, vec3);
718cabdff1aSopenharmony_ci        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
719cabdff1aSopenharmony_ci                                 filt3);
720cabdff1aSopenharmony_ci
721cabdff1aSopenharmony_ci        HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
722cabdff1aSopenharmony_ci                           weight_vec, rnd_vec, offset_vec,
723cabdff1aSopenharmony_ci                           out0, out1);
724cabdff1aSopenharmony_ci
725cabdff1aSopenharmony_ci        out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0);
726cabdff1aSopenharmony_ci        ST_W4(out0, 0, 1, 2, 3, dst, dst_stride);
727cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
728cabdff1aSopenharmony_ci    }
729cabdff1aSopenharmony_ci}
730cabdff1aSopenharmony_ci
731cabdff1aSopenharmony_cistatic void hevc_hz_biwgt_8t_8w_msa(uint8_t *src0_ptr,
732cabdff1aSopenharmony_ci                                    int32_t src_stride,
733cabdff1aSopenharmony_ci                                    int16_t *src1_ptr,
734cabdff1aSopenharmony_ci                                    int32_t src2_stride,
735cabdff1aSopenharmony_ci                                    uint8_t *dst,
736cabdff1aSopenharmony_ci                                    int32_t dst_stride,
737cabdff1aSopenharmony_ci                                    const int8_t *filter,
738cabdff1aSopenharmony_ci                                    int32_t height,
739cabdff1aSopenharmony_ci                                    int32_t weight0,
740cabdff1aSopenharmony_ci                                    int32_t weight1,
741cabdff1aSopenharmony_ci                                    int32_t offset0,
742cabdff1aSopenharmony_ci                                    int32_t offset1,
743cabdff1aSopenharmony_ci                                    int32_t rnd_val)
744cabdff1aSopenharmony_ci{
745cabdff1aSopenharmony_ci    uint32_t loop_cnt;
746cabdff1aSopenharmony_ci    int32_t offset, weight, constant;
747cabdff1aSopenharmony_ci    v8i16 filt0, filt1, filt2, filt3;
748cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3;
749cabdff1aSopenharmony_ci    v16i8 mask1, mask2, mask3;
750cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3;
751cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3;
752cabdff1aSopenharmony_ci    v8i16 in0, in1, in2, in3;
753cabdff1aSopenharmony_ci    v8i16 filter_vec, out0, out1, out2, out3;
754cabdff1aSopenharmony_ci    v4i32 weight_vec, offset_vec, rnd_vec;
755cabdff1aSopenharmony_ci    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
756cabdff1aSopenharmony_ci
757cabdff1aSopenharmony_ci    src0_ptr -= 3;
758cabdff1aSopenharmony_ci    offset = (offset0 + offset1) << rnd_val;
759cabdff1aSopenharmony_ci    weight0 = weight0 & 0x0000FFFF;
760cabdff1aSopenharmony_ci    weight = weight0 | (weight1 << 16);
761cabdff1aSopenharmony_ci    constant = 128 * weight1;
762cabdff1aSopenharmony_ci    constant <<= 6;
763cabdff1aSopenharmony_ci    offset += constant;
764cabdff1aSopenharmony_ci
765cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
766cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
767cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val + 1);
768cabdff1aSopenharmony_ci
769cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
770cabdff1aSopenharmony_ci    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
771cabdff1aSopenharmony_ci
772cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
773cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
774cabdff1aSopenharmony_ci    mask3 = mask0 + 6;
775cabdff1aSopenharmony_ci
776cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
777cabdff1aSopenharmony_ci        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
778cabdff1aSopenharmony_ci        src0_ptr += (4 * src_stride);
779cabdff1aSopenharmony_ci        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
780cabdff1aSopenharmony_ci        src1_ptr += (4 * src2_stride);
781cabdff1aSopenharmony_ci        XORI_B4_128_SB(src0, src1, src2, src3);
782cabdff1aSopenharmony_ci
783cabdff1aSopenharmony_ci        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
784cabdff1aSopenharmony_ci                   vec0, vec1, vec2, vec3);
785cabdff1aSopenharmony_ci        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
786cabdff1aSopenharmony_ci                                 filt3);
787cabdff1aSopenharmony_ci        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
788cabdff1aSopenharmony_ci                   vec0, vec1, vec2, vec3);
789cabdff1aSopenharmony_ci        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
790cabdff1aSopenharmony_ci                                 filt3);
791cabdff1aSopenharmony_ci        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
792cabdff1aSopenharmony_ci                   vec0, vec1, vec2, vec3);
793cabdff1aSopenharmony_ci        dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
794cabdff1aSopenharmony_ci                                 filt3);
795cabdff1aSopenharmony_ci        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
796cabdff1aSopenharmony_ci                   vec0, vec1, vec2, vec3);
797cabdff1aSopenharmony_ci        dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
798cabdff1aSopenharmony_ci                                 filt3);
799cabdff1aSopenharmony_ci
800cabdff1aSopenharmony_ci        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
801cabdff1aSopenharmony_ci                           in0, in1, in2, in3,
802cabdff1aSopenharmony_ci                           weight_vec, rnd_vec, offset_vec,
803cabdff1aSopenharmony_ci                           out0, out1, out2, out3);
804cabdff1aSopenharmony_ci
805cabdff1aSopenharmony_ci        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
806cabdff1aSopenharmony_ci        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
807cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
808cabdff1aSopenharmony_ci    }
809cabdff1aSopenharmony_ci}
810cabdff1aSopenharmony_ci
811cabdff1aSopenharmony_cistatic void hevc_hz_biwgt_8t_12w_msa(uint8_t *src0_ptr,
812cabdff1aSopenharmony_ci                                     int32_t src_stride,
813cabdff1aSopenharmony_ci                                     int16_t *src1_ptr,
814cabdff1aSopenharmony_ci                                     int32_t src2_stride,
815cabdff1aSopenharmony_ci                                     uint8_t *dst,
816cabdff1aSopenharmony_ci                                     int32_t dst_stride,
817cabdff1aSopenharmony_ci                                     const int8_t *filter,
818cabdff1aSopenharmony_ci                                     int32_t height,
819cabdff1aSopenharmony_ci                                     int32_t weight0,
820cabdff1aSopenharmony_ci                                     int32_t weight1,
821cabdff1aSopenharmony_ci                                     int32_t offset0,
822cabdff1aSopenharmony_ci                                     int32_t offset1,
823cabdff1aSopenharmony_ci                                     int32_t rnd_val)
824cabdff1aSopenharmony_ci{
825cabdff1aSopenharmony_ci    uint32_t loop_cnt;
826cabdff1aSopenharmony_ci    int32_t offset, weight, constant;
827cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, vec0, vec1, vec2, vec3;
828cabdff1aSopenharmony_ci    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
829cabdff1aSopenharmony_ci    v8i16 filt0, filt1, filt2, filt3, out0, out1, out2, out3;
830cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, in0, in1, in2, in3, filter_vec;
831cabdff1aSopenharmony_ci    v4i32 weight_vec, offset_vec, rnd_vec;
832cabdff1aSopenharmony_ci
833cabdff1aSopenharmony_ci    src0_ptr -= 3;
834cabdff1aSopenharmony_ci
835cabdff1aSopenharmony_ci    weight0 = weight0 & 0x0000FFFF;
836cabdff1aSopenharmony_ci    weight = weight0 | (weight1 << 16);
837cabdff1aSopenharmony_ci    constant = 128 * weight1;
838cabdff1aSopenharmony_ci    constant <<= 6;
839cabdff1aSopenharmony_ci    offset = (offset0 + offset1) << rnd_val;
840cabdff1aSopenharmony_ci    offset += constant;
841cabdff1aSopenharmony_ci
842cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
843cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
844cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val + 1);
845cabdff1aSopenharmony_ci
846cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
847cabdff1aSopenharmony_ci    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
848cabdff1aSopenharmony_ci
849cabdff1aSopenharmony_ci    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
850cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
851cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
852cabdff1aSopenharmony_ci    mask3 = mask0 + 6;
853cabdff1aSopenharmony_ci    mask4 = LD_SB(&ff_hevc_mask_arr[16]);
854cabdff1aSopenharmony_ci    mask5 = mask4 + 2;
855cabdff1aSopenharmony_ci    mask6 = mask4 + 4;
856cabdff1aSopenharmony_ci    mask7 = mask4 + 6;
857cabdff1aSopenharmony_ci
858cabdff1aSopenharmony_ci    for (loop_cnt = 4; loop_cnt--;) {
859cabdff1aSopenharmony_ci        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
860cabdff1aSopenharmony_ci        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
861cabdff1aSopenharmony_ci        XORI_B4_128_SB(src0, src1, src2, src3);
862cabdff1aSopenharmony_ci        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
863cabdff1aSopenharmony_ci                   vec3);
864cabdff1aSopenharmony_ci        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
865cabdff1aSopenharmony_ci                                 filt3);
866cabdff1aSopenharmony_ci        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
867cabdff1aSopenharmony_ci                   vec3);
868cabdff1aSopenharmony_ci        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
869cabdff1aSopenharmony_ci                                 filt3);
870cabdff1aSopenharmony_ci        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
871cabdff1aSopenharmony_ci                   vec3);
872cabdff1aSopenharmony_ci        dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
873cabdff1aSopenharmony_ci                                 filt3);
874cabdff1aSopenharmony_ci        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
875cabdff1aSopenharmony_ci                   vec3);
876cabdff1aSopenharmony_ci        dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
877cabdff1aSopenharmony_ci                                 filt3);
878cabdff1aSopenharmony_ci        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
879cabdff1aSopenharmony_ci                           weight_vec, rnd_vec, offset_vec, out0, out1, out2,
880cabdff1aSopenharmony_ci                           out3);
881cabdff1aSopenharmony_ci        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
882cabdff1aSopenharmony_ci        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
883cabdff1aSopenharmony_ci
884cabdff1aSopenharmony_ci        LD_SB4(src0_ptr + 8, src_stride, src0, src1, src2, src3);
885cabdff1aSopenharmony_ci        src0_ptr += (4 * src_stride);
886cabdff1aSopenharmony_ci        LD_SH4(src1_ptr + 8, src2_stride, in0, in1, in2, in3);
887cabdff1aSopenharmony_ci        src1_ptr += (4 * src2_stride);
888cabdff1aSopenharmony_ci        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
889cabdff1aSopenharmony_ci        XORI_B4_128_SB(src0, src1, src2, src3);
890cabdff1aSopenharmony_ci        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
891cabdff1aSopenharmony_ci                   vec3);
892cabdff1aSopenharmony_ci        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
893cabdff1aSopenharmony_ci                                 filt3);
894cabdff1aSopenharmony_ci        VSHF_B4_SB(src2, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
895cabdff1aSopenharmony_ci                   vec3);
896cabdff1aSopenharmony_ci        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
897cabdff1aSopenharmony_ci                                 filt3);
898cabdff1aSopenharmony_ci        HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1, weight_vec, rnd_vec,
899cabdff1aSopenharmony_ci                           offset_vec, out0, out1);
900cabdff1aSopenharmony_ci        out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0);
901cabdff1aSopenharmony_ci        ST_W4(out0, 0, 1, 2, 3, dst + 8, dst_stride);
902cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
903cabdff1aSopenharmony_ci    }
904cabdff1aSopenharmony_ci}
905cabdff1aSopenharmony_ci
906cabdff1aSopenharmony_cistatic void hevc_hz_biwgt_8t_16w_msa(uint8_t *src0_ptr,
907cabdff1aSopenharmony_ci                                     int32_t src_stride,
908cabdff1aSopenharmony_ci                                     int16_t *src1_ptr,
909cabdff1aSopenharmony_ci                                     int32_t src2_stride,
910cabdff1aSopenharmony_ci                                     uint8_t *dst,
911cabdff1aSopenharmony_ci                                     int32_t dst_stride,
912cabdff1aSopenharmony_ci                                     const int8_t *filter,
913cabdff1aSopenharmony_ci                                     int32_t height,
914cabdff1aSopenharmony_ci                                     int32_t weight0,
915cabdff1aSopenharmony_ci                                     int32_t weight1,
916cabdff1aSopenharmony_ci                                     int32_t offset0,
917cabdff1aSopenharmony_ci                                     int32_t offset1,
918cabdff1aSopenharmony_ci                                     int32_t rnd_val)
919cabdff1aSopenharmony_ci{
920cabdff1aSopenharmony_ci    uint32_t loop_cnt;
921cabdff1aSopenharmony_ci    int32_t offset, weight, constant;
922cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3;
923cabdff1aSopenharmony_ci    v8i16 in0, in1, in2, in3;
924cabdff1aSopenharmony_ci    v8i16 filt0, filt1, filt2, filt3;
925cabdff1aSopenharmony_ci    v16i8 mask1, mask2, mask3;
926cabdff1aSopenharmony_ci    v8i16 filter_vec, out0, out1, out2, out3;
927cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3;
928cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3;
929cabdff1aSopenharmony_ci    v4i32 weight_vec, offset_vec, rnd_vec;
930cabdff1aSopenharmony_ci    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
931cabdff1aSopenharmony_ci
932cabdff1aSopenharmony_ci    src0_ptr -= 3;
933cabdff1aSopenharmony_ci    offset = (offset0 + offset1) << rnd_val;
934cabdff1aSopenharmony_ci    weight0 = weight0 & 0x0000FFFF;
935cabdff1aSopenharmony_ci    weight = weight0 | (weight1 << 16);
936cabdff1aSopenharmony_ci    constant = 128 * weight1;
937cabdff1aSopenharmony_ci    constant <<= 6;
938cabdff1aSopenharmony_ci    offset += constant;
939cabdff1aSopenharmony_ci
940cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
941cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
942cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val + 1);
943cabdff1aSopenharmony_ci
944cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
945cabdff1aSopenharmony_ci    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
946cabdff1aSopenharmony_ci
947cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
948cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
949cabdff1aSopenharmony_ci    mask3 = mask0 + 6;
950cabdff1aSopenharmony_ci
951cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 1); loop_cnt--;) {
952cabdff1aSopenharmony_ci        LD_SB2(src0_ptr, 8, src0, src1);
953cabdff1aSopenharmony_ci        src0_ptr += src_stride;
954cabdff1aSopenharmony_ci        LD_SB2(src0_ptr, 8, src2, src3);
955cabdff1aSopenharmony_ci        src0_ptr += src_stride;
956cabdff1aSopenharmony_ci        LD_SH2(src1_ptr, 8, in0, in1);
957cabdff1aSopenharmony_ci        src1_ptr += src2_stride;
958cabdff1aSopenharmony_ci        LD_SH2(src1_ptr, 8, in2, in3);
959cabdff1aSopenharmony_ci        src1_ptr += src2_stride;
960cabdff1aSopenharmony_ci        XORI_B4_128_SB(src0, src1, src2, src3);
961cabdff1aSopenharmony_ci
962cabdff1aSopenharmony_ci        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
963cabdff1aSopenharmony_ci                   vec0, vec1, vec2, vec3);
964cabdff1aSopenharmony_ci        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
965cabdff1aSopenharmony_ci                                 filt3);
966cabdff1aSopenharmony_ci        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
967cabdff1aSopenharmony_ci                   vec0, vec1, vec2, vec3);
968cabdff1aSopenharmony_ci        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
969cabdff1aSopenharmony_ci                                 filt3);
970cabdff1aSopenharmony_ci        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
971cabdff1aSopenharmony_ci                   vec0, vec1, vec2, vec3);
972cabdff1aSopenharmony_ci        dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
973cabdff1aSopenharmony_ci                                 filt3);
974cabdff1aSopenharmony_ci        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
975cabdff1aSopenharmony_ci                   vec0, vec1, vec2, vec3);
976cabdff1aSopenharmony_ci        dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
977cabdff1aSopenharmony_ci                                 filt3);
978cabdff1aSopenharmony_ci
979cabdff1aSopenharmony_ci        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
980cabdff1aSopenharmony_ci                           in0, in1, in2, in3,
981cabdff1aSopenharmony_ci                           weight_vec, rnd_vec, offset_vec,
982cabdff1aSopenharmony_ci                           out0, out1, out2, out3);
983cabdff1aSopenharmony_ci
984cabdff1aSopenharmony_ci        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
985cabdff1aSopenharmony_ci        ST_SH2(out0, out1, dst, dst_stride);
986cabdff1aSopenharmony_ci        dst += (2 * dst_stride);
987cabdff1aSopenharmony_ci    }
988cabdff1aSopenharmony_ci}
989cabdff1aSopenharmony_ci
990cabdff1aSopenharmony_cistatic void hevc_hz_biwgt_8t_24w_msa(uint8_t *src0_ptr,
991cabdff1aSopenharmony_ci                                     int32_t src_stride,
992cabdff1aSopenharmony_ci                                     int16_t *src1_ptr,
993cabdff1aSopenharmony_ci                                     int32_t src2_stride,
994cabdff1aSopenharmony_ci                                     uint8_t *dst,
995cabdff1aSopenharmony_ci                                     int32_t dst_stride,
996cabdff1aSopenharmony_ci                                     const int8_t *filter,
997cabdff1aSopenharmony_ci                                     int32_t height,
998cabdff1aSopenharmony_ci                                     int32_t weight0,
999cabdff1aSopenharmony_ci                                     int32_t weight1,
1000cabdff1aSopenharmony_ci                                     int32_t offset0,
1001cabdff1aSopenharmony_ci                                     int32_t offset1,
1002cabdff1aSopenharmony_ci                                     int32_t rnd_val)
1003cabdff1aSopenharmony_ci{
1004cabdff1aSopenharmony_ci    uint32_t loop_cnt;
1005cabdff1aSopenharmony_ci    uint64_t dst_val0;
1006cabdff1aSopenharmony_ci    int32_t offset, weight, constant;
1007cabdff1aSopenharmony_ci    v16i8 src0, src1;
1008cabdff1aSopenharmony_ci    v8i16 in0, in1, in2;
1009cabdff1aSopenharmony_ci    v8i16 filt0, filt1, filt2, filt3;
1010cabdff1aSopenharmony_ci    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1011cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3;
1012cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2;
1013cabdff1aSopenharmony_ci    v4i32 dst2_r, dst2_l;
1014cabdff1aSopenharmony_ci    v8i16 filter_vec, out0, out1, out2;
1015cabdff1aSopenharmony_ci    v4i32 weight_vec, offset_vec, rnd_vec;
1016cabdff1aSopenharmony_ci    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
1017cabdff1aSopenharmony_ci
1018cabdff1aSopenharmony_ci    src0_ptr = src0_ptr - 3;
1019cabdff1aSopenharmony_ci    offset = (offset0 + offset1) << rnd_val;
1020cabdff1aSopenharmony_ci    weight0 = weight0 & 0x0000FFFF;
1021cabdff1aSopenharmony_ci    weight = weight0 | (weight1 << 16);
1022cabdff1aSopenharmony_ci    constant = 128 * weight1;
1023cabdff1aSopenharmony_ci    constant <<= 6;
1024cabdff1aSopenharmony_ci    offset += constant;
1025cabdff1aSopenharmony_ci
1026cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
1027cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
1028cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val + 1);
1029cabdff1aSopenharmony_ci
1030cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
1031cabdff1aSopenharmony_ci    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1032cabdff1aSopenharmony_ci
1033cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
1034cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
1035cabdff1aSopenharmony_ci    mask3 = mask0 + 6;
1036cabdff1aSopenharmony_ci    mask4 = mask0 + 8;
1037cabdff1aSopenharmony_ci    mask5 = mask0 + 10;
1038cabdff1aSopenharmony_ci    mask6 = mask0 + 12;
1039cabdff1aSopenharmony_ci    mask7 = mask0 + 14;
1040cabdff1aSopenharmony_ci
1041cabdff1aSopenharmony_ci    LD_SB2(src0_ptr, 16, src0, src1);
1042cabdff1aSopenharmony_ci    src0_ptr += src_stride;
1043cabdff1aSopenharmony_ci    LD_SH2(src1_ptr, 8, in0, in1);
1044cabdff1aSopenharmony_ci    in2 = LD_SH(src1_ptr + 16);
1045cabdff1aSopenharmony_ci    src1_ptr += src2_stride;
1046cabdff1aSopenharmony_ci    XORI_B2_128_SB(src0, src1);
1047cabdff1aSopenharmony_ci
1048cabdff1aSopenharmony_ci    for (loop_cnt = 31; loop_cnt--;) {
1049cabdff1aSopenharmony_ci        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1050cabdff1aSopenharmony_ci                   vec0, vec1, vec2, vec3);
1051cabdff1aSopenharmony_ci        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1052cabdff1aSopenharmony_ci                                 filt3);
1053cabdff1aSopenharmony_ci        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
1054cabdff1aSopenharmony_ci                   vec0, vec1, vec2, vec3);
1055cabdff1aSopenharmony_ci        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1056cabdff1aSopenharmony_ci                                 filt3);
1057cabdff1aSopenharmony_ci        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1058cabdff1aSopenharmony_ci                   vec0, vec1, vec2, vec3);
1059cabdff1aSopenharmony_ci        dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1060cabdff1aSopenharmony_ci                                 filt3);
1061cabdff1aSopenharmony_ci
1062cabdff1aSopenharmony_ci        HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
1063cabdff1aSopenharmony_ci                           weight_vec, rnd_vec, offset_vec,
1064cabdff1aSopenharmony_ci                           out0, out1);
1065cabdff1aSopenharmony_ci
1066cabdff1aSopenharmony_ci        ILVRL_H2_SW(dst2, in2, dst2_r, dst2_l);
1067cabdff1aSopenharmony_ci        dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r,
1068cabdff1aSopenharmony_ci                                 (v8i16) weight_vec);
1069cabdff1aSopenharmony_ci        dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l,
1070cabdff1aSopenharmony_ci                                 (v8i16) weight_vec);
1071cabdff1aSopenharmony_ci        SRAR_W2_SW(dst2_r, dst2_l, rnd_vec);
1072cabdff1aSopenharmony_ci        out2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
1073cabdff1aSopenharmony_ci        CLIP_SH_0_255(out2);
1074cabdff1aSopenharmony_ci
1075cabdff1aSopenharmony_ci        LD_SB2(src0_ptr, 16, src0, src1);
1076cabdff1aSopenharmony_ci        src0_ptr += src_stride;
1077cabdff1aSopenharmony_ci        LD_SH2(src1_ptr, 8, in0, in1);
1078cabdff1aSopenharmony_ci        in2 = LD_SH(src1_ptr + 16);
1079cabdff1aSopenharmony_ci        src1_ptr += src2_stride;
1080cabdff1aSopenharmony_ci        XORI_B2_128_SB(src0, src1);
1081cabdff1aSopenharmony_ci        PCKEV_B2_SH(out1, out0, out2, out2, out0, out2);
1082cabdff1aSopenharmony_ci        dst_val0 = __msa_copy_u_d((v2i64) out2, 0);
1083cabdff1aSopenharmony_ci        ST_SH(out0, dst);
1084cabdff1aSopenharmony_ci        SD(dst_val0, dst + 16);
1085cabdff1aSopenharmony_ci        dst += dst_stride;
1086cabdff1aSopenharmony_ci    }
1087cabdff1aSopenharmony_ci
1088cabdff1aSopenharmony_ci    VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1089cabdff1aSopenharmony_ci    dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1090cabdff1aSopenharmony_ci                             filt3);
1091cabdff1aSopenharmony_ci    VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
1092cabdff1aSopenharmony_ci    dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1093cabdff1aSopenharmony_ci                             filt3);
1094cabdff1aSopenharmony_ci    VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1095cabdff1aSopenharmony_ci    dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1096cabdff1aSopenharmony_ci                             filt3);
1097cabdff1aSopenharmony_ci    HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1, weight_vec, rnd_vec, offset_vec,
1098cabdff1aSopenharmony_ci                       out0, out1);
1099cabdff1aSopenharmony_ci    ILVRL_H2_SW(dst2, in2, dst2_r, dst2_l);
1100cabdff1aSopenharmony_ci    dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r, (v8i16) weight_vec);
1101cabdff1aSopenharmony_ci    dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l, (v8i16) weight_vec);
1102cabdff1aSopenharmony_ci    SRAR_W2_SW(dst2_r, dst2_l, rnd_vec);
1103cabdff1aSopenharmony_ci    out2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
1104cabdff1aSopenharmony_ci    CLIP_SH_0_255(out2);
1105cabdff1aSopenharmony_ci    PCKEV_B2_SH(out1, out0, out2, out2, out0, out2);
1106cabdff1aSopenharmony_ci    dst_val0 = __msa_copy_u_d((v2i64) out2, 0);
1107cabdff1aSopenharmony_ci    ST_SH(out0, dst);
1108cabdff1aSopenharmony_ci    SD(dst_val0, dst + 16);
1109cabdff1aSopenharmony_ci    dst += dst_stride;
1110cabdff1aSopenharmony_ci}
1111cabdff1aSopenharmony_ci
1112cabdff1aSopenharmony_cistatic void hevc_hz_biwgt_8t_32w_msa(uint8_t *src0_ptr,
1113cabdff1aSopenharmony_ci                                     int32_t src_stride,
1114cabdff1aSopenharmony_ci                                     int16_t *src1_ptr,
1115cabdff1aSopenharmony_ci                                     int32_t src2_stride,
1116cabdff1aSopenharmony_ci                                     uint8_t *dst,
1117cabdff1aSopenharmony_ci                                     int32_t dst_stride,
1118cabdff1aSopenharmony_ci                                     const int8_t *filter,
1119cabdff1aSopenharmony_ci                                     int32_t height,
1120cabdff1aSopenharmony_ci                                     int32_t weight0,
1121cabdff1aSopenharmony_ci                                     int32_t weight1,
1122cabdff1aSopenharmony_ci                                     int32_t offset0,
1123cabdff1aSopenharmony_ci                                     int32_t offset1,
1124cabdff1aSopenharmony_ci                                     int32_t rnd_val)
1125cabdff1aSopenharmony_ci{
1126cabdff1aSopenharmony_ci    uint32_t loop_cnt;
1127cabdff1aSopenharmony_ci    int32_t offset, weight, constant;
1128cabdff1aSopenharmony_ci    v16i8 src0, src1, src2;
1129cabdff1aSopenharmony_ci    v8i16 in0, in1, in2, in3;
1130cabdff1aSopenharmony_ci    v8i16 filt0, filt1, filt2, filt3;
1131cabdff1aSopenharmony_ci    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
1132cabdff1aSopenharmony_ci    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1133cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3;
1134cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3;
1135cabdff1aSopenharmony_ci    v8i16 filter_vec, out0, out1, out2, out3;
1136cabdff1aSopenharmony_ci    v4i32 weight_vec, offset_vec, rnd_vec;
1137cabdff1aSopenharmony_ci
1138cabdff1aSopenharmony_ci    src0_ptr -= 3;
1139cabdff1aSopenharmony_ci    offset = (offset0 + offset1) << rnd_val;
1140cabdff1aSopenharmony_ci    weight0 = weight0 & 0x0000FFFF;
1141cabdff1aSopenharmony_ci    weight = weight0 | (weight1 << 16);
1142cabdff1aSopenharmony_ci    constant = 128 * weight1;
1143cabdff1aSopenharmony_ci    constant <<= 6;
1144cabdff1aSopenharmony_ci    offset += constant;
1145cabdff1aSopenharmony_ci
1146cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
1147cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
1148cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val + 1);
1149cabdff1aSopenharmony_ci
1150cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
1151cabdff1aSopenharmony_ci    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1152cabdff1aSopenharmony_ci
1153cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
1154cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
1155cabdff1aSopenharmony_ci    mask3 = mask0 + 6;
1156cabdff1aSopenharmony_ci    mask4 = mask0 + 8;
1157cabdff1aSopenharmony_ci    mask5 = mask0 + 10;
1158cabdff1aSopenharmony_ci    mask6 = mask0 + 12;
1159cabdff1aSopenharmony_ci    mask7 = mask0 + 14;
1160cabdff1aSopenharmony_ci
1161cabdff1aSopenharmony_ci    for (loop_cnt = height; loop_cnt--;) {
1162cabdff1aSopenharmony_ci        LD_SB2(src0_ptr, 16, src0, src1);
1163cabdff1aSopenharmony_ci        src2 = LD_SB(src0_ptr + 24);
1164cabdff1aSopenharmony_ci        src0_ptr += src_stride;
1165cabdff1aSopenharmony_ci        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
1166cabdff1aSopenharmony_ci        src1_ptr += src2_stride;
1167cabdff1aSopenharmony_ci
1168cabdff1aSopenharmony_ci        XORI_B3_128_SB(src0, src1, src2);
1169cabdff1aSopenharmony_ci
1170cabdff1aSopenharmony_ci        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1171cabdff1aSopenharmony_ci                   vec0, vec1, vec2, vec3);
1172cabdff1aSopenharmony_ci        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1173cabdff1aSopenharmony_ci                                 filt3);
1174cabdff1aSopenharmony_ci        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
1175cabdff1aSopenharmony_ci                   vec0, vec1, vec2, vec3);
1176cabdff1aSopenharmony_ci        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1177cabdff1aSopenharmony_ci                                 filt3);
1178cabdff1aSopenharmony_ci        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1179cabdff1aSopenharmony_ci                   vec0, vec1, vec2, vec3);
1180cabdff1aSopenharmony_ci        dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1181cabdff1aSopenharmony_ci                                 filt3);
1182cabdff1aSopenharmony_ci        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1183cabdff1aSopenharmony_ci                   vec0, vec1, vec2, vec3);
1184cabdff1aSopenharmony_ci        dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1185cabdff1aSopenharmony_ci                                 filt3);
1186cabdff1aSopenharmony_ci
1187cabdff1aSopenharmony_ci        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
1188cabdff1aSopenharmony_ci                           in0, in1, in2, in3,
1189cabdff1aSopenharmony_ci                           weight_vec, rnd_vec, offset_vec,
1190cabdff1aSopenharmony_ci                           out0, out1, out2, out3);
1191cabdff1aSopenharmony_ci
1192cabdff1aSopenharmony_ci        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
1193cabdff1aSopenharmony_ci        ST_SH2(out0, out1, dst, 16);
1194cabdff1aSopenharmony_ci        dst += dst_stride;
1195cabdff1aSopenharmony_ci    }
1196cabdff1aSopenharmony_ci}
1197cabdff1aSopenharmony_ci
1198cabdff1aSopenharmony_cistatic void hevc_hz_biwgt_8t_48w_msa(uint8_t *src0_ptr,
1199cabdff1aSopenharmony_ci                                     int32_t src_stride,
1200cabdff1aSopenharmony_ci                                     int16_t *src1_ptr,
1201cabdff1aSopenharmony_ci                                     int32_t src2_stride,
1202cabdff1aSopenharmony_ci                                     uint8_t *dst,
1203cabdff1aSopenharmony_ci                                     int32_t dst_stride,
1204cabdff1aSopenharmony_ci                                     const int8_t *filter,
1205cabdff1aSopenharmony_ci                                     int32_t height,
1206cabdff1aSopenharmony_ci                                     int32_t weight0,
1207cabdff1aSopenharmony_ci                                     int32_t weight1,
1208cabdff1aSopenharmony_ci                                     int32_t offset0,
1209cabdff1aSopenharmony_ci                                     int32_t offset1,
1210cabdff1aSopenharmony_ci                                     int32_t rnd_val)
1211cabdff1aSopenharmony_ci{
1212cabdff1aSopenharmony_ci    uint32_t loop_cnt;
1213cabdff1aSopenharmony_ci    int32_t offset, weight, constant;
1214cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4;
1215cabdff1aSopenharmony_ci    v8i16 in0, in1, in2, in3;
1216cabdff1aSopenharmony_ci    v8i16 filt0, filt1, filt2, filt3;
1217cabdff1aSopenharmony_ci    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
1218cabdff1aSopenharmony_ci    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1219cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3;
1220cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3;
1221cabdff1aSopenharmony_ci    v8i16 filter_vec, out0, out1, out2, out3;
1222cabdff1aSopenharmony_ci    v4i32 weight_vec, offset_vec, rnd_vec;
1223cabdff1aSopenharmony_ci
1224cabdff1aSopenharmony_ci    src0_ptr -= 3;
1225cabdff1aSopenharmony_ci    offset = (offset0 + offset1) << rnd_val;
1226cabdff1aSopenharmony_ci    weight0 = weight0 & 0x0000FFFF;
1227cabdff1aSopenharmony_ci    weight = weight0 | (weight1 << 16);
1228cabdff1aSopenharmony_ci    constant = 128 * weight1;
1229cabdff1aSopenharmony_ci    constant <<= 6;
1230cabdff1aSopenharmony_ci    offset += constant;
1231cabdff1aSopenharmony_ci
1232cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
1233cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
1234cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val + 1);
1235cabdff1aSopenharmony_ci
1236cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
1237cabdff1aSopenharmony_ci    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1238cabdff1aSopenharmony_ci
1239cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
1240cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
1241cabdff1aSopenharmony_ci    mask3 = mask0 + 6;
1242cabdff1aSopenharmony_ci    mask4 = mask0 + 8;
1243cabdff1aSopenharmony_ci    mask5 = mask0 + 10;
1244cabdff1aSopenharmony_ci    mask6 = mask0 + 12;
1245cabdff1aSopenharmony_ci    mask7 = mask0 + 14;
1246cabdff1aSopenharmony_ci
1247cabdff1aSopenharmony_ci    for (loop_cnt = 64; loop_cnt--;) {
1248cabdff1aSopenharmony_ci        LD_SB2(src0_ptr, 16, src0, src1);
1249cabdff1aSopenharmony_ci        src2 = LD_SB(src0_ptr + 24);
1250cabdff1aSopenharmony_ci        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
1251cabdff1aSopenharmony_ci        XORI_B3_128_SB(src0, src1, src2);
1252cabdff1aSopenharmony_ci        LD_SB2(src0_ptr + 32, 8, src3, src4);
1253cabdff1aSopenharmony_ci        src0_ptr += src_stride;
1254cabdff1aSopenharmony_ci        XORI_B2_128_SB(src3, src4);
1255cabdff1aSopenharmony_ci
1256cabdff1aSopenharmony_ci        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1257cabdff1aSopenharmony_ci                   vec0, vec1, vec2, vec3);
1258cabdff1aSopenharmony_ci        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1259cabdff1aSopenharmony_ci                                 filt3);
1260cabdff1aSopenharmony_ci        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
1261cabdff1aSopenharmony_ci                   vec0, vec1, vec2, vec3);
1262cabdff1aSopenharmony_ci        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1263cabdff1aSopenharmony_ci                                 filt3);
1264cabdff1aSopenharmony_ci        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1265cabdff1aSopenharmony_ci                   vec0, vec1, vec2, vec3);
1266cabdff1aSopenharmony_ci        dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1267cabdff1aSopenharmony_ci                                 filt3);
1268cabdff1aSopenharmony_ci        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1269cabdff1aSopenharmony_ci                   vec0, vec1, vec2, vec3);
1270cabdff1aSopenharmony_ci        dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1271cabdff1aSopenharmony_ci                                 filt3);
1272cabdff1aSopenharmony_ci
1273cabdff1aSopenharmony_ci        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
1274cabdff1aSopenharmony_ci                           weight_vec, rnd_vec, offset_vec,
1275cabdff1aSopenharmony_ci                           out0, out1, out2, out3);
1276cabdff1aSopenharmony_ci
1277cabdff1aSopenharmony_ci        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
1278cabdff1aSopenharmony_ci        ST_SH2(out0, out1, dst, 16);
1279cabdff1aSopenharmony_ci
1280cabdff1aSopenharmony_ci        LD_SH2(src1_ptr + 32, 8, in2, in3);
1281cabdff1aSopenharmony_ci        src1_ptr += src2_stride;
1282cabdff1aSopenharmony_ci
1283cabdff1aSopenharmony_ci        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1284cabdff1aSopenharmony_ci                   vec0, vec1, vec2, vec3);
1285cabdff1aSopenharmony_ci        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1286cabdff1aSopenharmony_ci                                 filt3);
1287cabdff1aSopenharmony_ci        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
1288cabdff1aSopenharmony_ci                   vec0, vec1, vec2, vec3);
1289cabdff1aSopenharmony_ci        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1290cabdff1aSopenharmony_ci                                 filt3);
1291cabdff1aSopenharmony_ci
1292cabdff1aSopenharmony_ci        HEVC_BIW_RND_CLIP2(dst0, dst1, in2, in3,
1293cabdff1aSopenharmony_ci                           weight_vec, rnd_vec, offset_vec,
1294cabdff1aSopenharmony_ci                           out0, out1);
1295cabdff1aSopenharmony_ci
1296cabdff1aSopenharmony_ci        out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0);
1297cabdff1aSopenharmony_ci        ST_SH(out0, dst + 32);
1298cabdff1aSopenharmony_ci        dst += dst_stride;
1299cabdff1aSopenharmony_ci    }
1300cabdff1aSopenharmony_ci}
1301cabdff1aSopenharmony_ci
1302cabdff1aSopenharmony_cistatic void hevc_hz_biwgt_8t_64w_msa(uint8_t *src0_ptr,
1303cabdff1aSopenharmony_ci                                     int32_t src_stride,
1304cabdff1aSopenharmony_ci                                     int16_t *src1_ptr,
1305cabdff1aSopenharmony_ci                                     int32_t src2_stride,
1306cabdff1aSopenharmony_ci                                     uint8_t *dst,
1307cabdff1aSopenharmony_ci                                     int32_t dst_stride,
1308cabdff1aSopenharmony_ci                                     const int8_t *filter,
1309cabdff1aSopenharmony_ci                                     int32_t height,
1310cabdff1aSopenharmony_ci                                     int32_t weight0,
1311cabdff1aSopenharmony_ci                                     int32_t weight1,
1312cabdff1aSopenharmony_ci                                     int32_t offset0,
1313cabdff1aSopenharmony_ci                                     int32_t offset1,
1314cabdff1aSopenharmony_ci                                     int32_t rnd_val)
1315cabdff1aSopenharmony_ci{
1316cabdff1aSopenharmony_ci    uint8_t *src0_ptr_tmp;
1317cabdff1aSopenharmony_ci    uint8_t *dst_tmp;
1318cabdff1aSopenharmony_ci    int16_t *src1_ptr_tmp;
1319cabdff1aSopenharmony_ci    uint32_t loop_cnt, cnt;
1320cabdff1aSopenharmony_ci    int32_t offset, weight, constant;
1321cabdff1aSopenharmony_ci    v16i8 src0, src1, src2;
1322cabdff1aSopenharmony_ci    v8i16 in0, in1, in2, in3;
1323cabdff1aSopenharmony_ci    v8i16 filt0, filt1, filt2, filt3;
1324cabdff1aSopenharmony_ci    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
1325cabdff1aSopenharmony_ci    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1326cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3;
1327cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3;
1328cabdff1aSopenharmony_ci    v8i16 filter_vec, out0, out1, out2, out3;
1329cabdff1aSopenharmony_ci    v4i32 weight_vec, offset_vec, rnd_vec;
1330cabdff1aSopenharmony_ci
1331cabdff1aSopenharmony_ci    src0_ptr -= 3;
1332cabdff1aSopenharmony_ci    offset = (offset0 + offset1) << rnd_val;
1333cabdff1aSopenharmony_ci    weight0 = weight0 & 0x0000FFFF;
1334cabdff1aSopenharmony_ci    weight = weight0 | (weight1 << 16);
1335cabdff1aSopenharmony_ci    constant = 128 * weight1;
1336cabdff1aSopenharmony_ci    constant <<= 6;
1337cabdff1aSopenharmony_ci    offset += constant;
1338cabdff1aSopenharmony_ci
1339cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
1340cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
1341cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val + 1);
1342cabdff1aSopenharmony_ci
1343cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
1344cabdff1aSopenharmony_ci    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1345cabdff1aSopenharmony_ci
1346cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
1347cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
1348cabdff1aSopenharmony_ci    mask3 = mask0 + 6;
1349cabdff1aSopenharmony_ci    mask4 = mask0 + 8;
1350cabdff1aSopenharmony_ci    mask5 = mask0 + 10;
1351cabdff1aSopenharmony_ci    mask6 = mask0 + 12;
1352cabdff1aSopenharmony_ci    mask7 = mask0 + 14;
1353cabdff1aSopenharmony_ci
1354cabdff1aSopenharmony_ci    for (loop_cnt = height; loop_cnt--;) {
1355cabdff1aSopenharmony_ci        src0_ptr_tmp = src0_ptr;
1356cabdff1aSopenharmony_ci        dst_tmp = dst;
1357cabdff1aSopenharmony_ci        src1_ptr_tmp = src1_ptr;
1358cabdff1aSopenharmony_ci
1359cabdff1aSopenharmony_ci        for (cnt = 2; cnt--;) {
1360cabdff1aSopenharmony_ci            LD_SB2(src0_ptr_tmp, 16, src0, src1);
1361cabdff1aSopenharmony_ci            src2 = LD_SB(src0_ptr_tmp + 24);
1362cabdff1aSopenharmony_ci            src0_ptr_tmp += 32;
1363cabdff1aSopenharmony_ci            LD_SH4(src1_ptr_tmp, 8, in0, in1, in2, in3);
1364cabdff1aSopenharmony_ci            src1_ptr_tmp += 32;
1365cabdff1aSopenharmony_ci            XORI_B3_128_SB(src0, src1, src2);
1366cabdff1aSopenharmony_ci
1367cabdff1aSopenharmony_ci            VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1368cabdff1aSopenharmony_ci                       vec0, vec1, vec2, vec3);
1369cabdff1aSopenharmony_ci            dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
1370cabdff1aSopenharmony_ci                                     filt2, filt3);
1371cabdff1aSopenharmony_ci            VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
1372cabdff1aSopenharmony_ci                       vec0, vec1, vec2, vec3);
1373cabdff1aSopenharmony_ci            dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
1374cabdff1aSopenharmony_ci                                     filt2, filt3);
1375cabdff1aSopenharmony_ci            VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1376cabdff1aSopenharmony_ci                       vec0, vec1, vec2, vec3);
1377cabdff1aSopenharmony_ci            dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
1378cabdff1aSopenharmony_ci                                     filt2, filt3);
1379cabdff1aSopenharmony_ci            VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1380cabdff1aSopenharmony_ci                       vec0, vec1, vec2, vec3);
1381cabdff1aSopenharmony_ci            dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
1382cabdff1aSopenharmony_ci                                     filt2, filt3);
1383cabdff1aSopenharmony_ci
1384cabdff1aSopenharmony_ci            HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
1385cabdff1aSopenharmony_ci                               in0, in1, in2, in3,
1386cabdff1aSopenharmony_ci                               weight_vec, rnd_vec, offset_vec,
1387cabdff1aSopenharmony_ci                               out0, out1, out2, out3);
1388cabdff1aSopenharmony_ci
1389cabdff1aSopenharmony_ci            PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
1390cabdff1aSopenharmony_ci            ST_SH2(out0, out1, dst_tmp, 16);
1391cabdff1aSopenharmony_ci            dst_tmp += 32;
1392cabdff1aSopenharmony_ci        }
1393cabdff1aSopenharmony_ci
1394cabdff1aSopenharmony_ci        src0_ptr += src_stride;
1395cabdff1aSopenharmony_ci        src1_ptr += src2_stride;
1396cabdff1aSopenharmony_ci        dst += dst_stride;
1397cabdff1aSopenharmony_ci
1398cabdff1aSopenharmony_ci    }
1399cabdff1aSopenharmony_ci}
1400cabdff1aSopenharmony_ci
1401cabdff1aSopenharmony_cistatic void hevc_vt_biwgt_8t_4w_msa(uint8_t *src0_ptr,
1402cabdff1aSopenharmony_ci                                    int32_t src_stride,
1403cabdff1aSopenharmony_ci                                    int16_t *src1_ptr,
1404cabdff1aSopenharmony_ci                                    int32_t src2_stride,
1405cabdff1aSopenharmony_ci                                    uint8_t *dst,
1406cabdff1aSopenharmony_ci                                    int32_t dst_stride,
1407cabdff1aSopenharmony_ci                                    const int8_t *filter,
1408cabdff1aSopenharmony_ci                                    int32_t height,
1409cabdff1aSopenharmony_ci                                    int32_t weight0,
1410cabdff1aSopenharmony_ci                                    int32_t weight1,
1411cabdff1aSopenharmony_ci                                    int32_t offset0,
1412cabdff1aSopenharmony_ci                                    int32_t offset1,
1413cabdff1aSopenharmony_ci                                    int32_t rnd_val)
1414cabdff1aSopenharmony_ci{
1415cabdff1aSopenharmony_ci    uint32_t loop_cnt;
1416cabdff1aSopenharmony_ci    int32_t offset, weight;
1417cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1418cabdff1aSopenharmony_ci    v16i8 src11, src12, src13, src14;
1419cabdff1aSopenharmony_ci    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
1420cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1421cabdff1aSopenharmony_ci    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1422cabdff1aSopenharmony_ci    v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
1423cabdff1aSopenharmony_ci    v16i8 src2110, src4332, src6554, src8776, src10998;
1424cabdff1aSopenharmony_ci    v16i8 src12111110, src14131312;
1425cabdff1aSopenharmony_ci    v8i16 dst10, dst32, dst54, dst76;
1426cabdff1aSopenharmony_ci    v8i16 filt0, filt1, filt2, filt3;
1427cabdff1aSopenharmony_ci    v8i16 filter_vec, out0, out1, out2, out3;
1428cabdff1aSopenharmony_ci    v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;
1429cabdff1aSopenharmony_ci
1430cabdff1aSopenharmony_ci    src0_ptr -= (3 * src_stride);
1431cabdff1aSopenharmony_ci    offset = (offset0 + offset1) << rnd_val;
1432cabdff1aSopenharmony_ci    weight0 = weight0 & 0x0000FFFF;
1433cabdff1aSopenharmony_ci    weight = weight0 | (weight1 << 16);
1434cabdff1aSopenharmony_ci
1435cabdff1aSopenharmony_ci    const_vec = __msa_ldi_w(128);
1436cabdff1aSopenharmony_ci    const_vec <<= 6;
1437cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
1438cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
1439cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val + 1);
1440cabdff1aSopenharmony_ci    weight1_vec = __msa_fill_w(weight1);
1441cabdff1aSopenharmony_ci    offset_vec += const_vec * weight1_vec;
1442cabdff1aSopenharmony_ci
1443cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
1444cabdff1aSopenharmony_ci    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1445cabdff1aSopenharmony_ci
1446cabdff1aSopenharmony_ci    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1447cabdff1aSopenharmony_ci    src0_ptr += (7 * src_stride);
1448cabdff1aSopenharmony_ci
1449cabdff1aSopenharmony_ci    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1450cabdff1aSopenharmony_ci               src10_r, src32_r, src54_r, src21_r);
1451cabdff1aSopenharmony_ci    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1452cabdff1aSopenharmony_ci    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
1453cabdff1aSopenharmony_ci               src2110, src4332, src6554);
1454cabdff1aSopenharmony_ci    XORI_B3_128_SB(src2110, src4332, src6554);
1455cabdff1aSopenharmony_ci
1456cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 3); loop_cnt--;) {
1457cabdff1aSopenharmony_ci        LD_SB8(src0_ptr, src_stride,
1458cabdff1aSopenharmony_ci               src7, src8, src9, src10, src11, src12, src13, src14);
1459cabdff1aSopenharmony_ci        src0_ptr += (8 * src_stride);
1460cabdff1aSopenharmony_ci        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
1461cabdff1aSopenharmony_ci        src1_ptr += (8 * src2_stride);
1462cabdff1aSopenharmony_ci
1463cabdff1aSopenharmony_ci        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
1464cabdff1aSopenharmony_ci        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
1465cabdff1aSopenharmony_ci        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1466cabdff1aSopenharmony_ci                   src76_r, src87_r, src98_r, src109_r);
1467cabdff1aSopenharmony_ci        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
1468cabdff1aSopenharmony_ci                   src1110_r, src1211_r, src1312_r, src1413_r);
1469cabdff1aSopenharmony_ci        ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
1470cabdff1aSopenharmony_ci                   src1413_r, src1312_r,
1471cabdff1aSopenharmony_ci                   src8776, src10998, src12111110, src14131312);
1472cabdff1aSopenharmony_ci        XORI_B4_128_SB(src8776, src10998, src12111110, src14131312);
1473cabdff1aSopenharmony_ci
1474cabdff1aSopenharmony_ci        DOTP_SB4_SH(src2110, src4332, src6554, src8776, filt0, filt0, filt0,
1475cabdff1aSopenharmony_ci                    filt0, dst10, dst32, dst54, dst76);
1476cabdff1aSopenharmony_ci        DPADD_SB4_SH(src4332, src6554, src8776, src10998, filt1, filt1, filt1,
1477cabdff1aSopenharmony_ci                     filt1, dst10, dst32, dst54, dst76);
1478cabdff1aSopenharmony_ci        DPADD_SB4_SH(src6554, src8776, src10998, src12111110, filt2, filt2,
1479cabdff1aSopenharmony_ci                     filt2, filt2, dst10, dst32, dst54, dst76);
1480cabdff1aSopenharmony_ci        DPADD_SB4_SH(src8776, src10998, src12111110, src14131312, filt3, filt3,
1481cabdff1aSopenharmony_ci                     filt3, filt3, dst10, dst32, dst54, dst76);
1482cabdff1aSopenharmony_ci
1483cabdff1aSopenharmony_ci        HEVC_BIW_RND_CLIP4(dst10, dst32, dst54, dst76,
1484cabdff1aSopenharmony_ci                           in0, in1, in2, in3,
1485cabdff1aSopenharmony_ci                           weight_vec, rnd_vec, offset_vec,
1486cabdff1aSopenharmony_ci                           out0, out1, out2, out3);
1487cabdff1aSopenharmony_ci
1488cabdff1aSopenharmony_ci        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
1489cabdff1aSopenharmony_ci        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
1490cabdff1aSopenharmony_ci        dst += (8 * dst_stride);
1491cabdff1aSopenharmony_ci
1492cabdff1aSopenharmony_ci        src2110 = src10998;
1493cabdff1aSopenharmony_ci        src4332 = src12111110;
1494cabdff1aSopenharmony_ci        src6554 = src14131312;
1495cabdff1aSopenharmony_ci        src6 = src14;
1496cabdff1aSopenharmony_ci    }
1497cabdff1aSopenharmony_ci}
1498cabdff1aSopenharmony_ci
1499cabdff1aSopenharmony_cistatic void hevc_vt_biwgt_8t_8w_msa(uint8_t *src0_ptr,
1500cabdff1aSopenharmony_ci                                    int32_t src_stride,
1501cabdff1aSopenharmony_ci                                    int16_t *src1_ptr,
1502cabdff1aSopenharmony_ci                                    int32_t src2_stride,
1503cabdff1aSopenharmony_ci                                    uint8_t *dst,
1504cabdff1aSopenharmony_ci                                    int32_t dst_stride,
1505cabdff1aSopenharmony_ci                                    const int8_t *filter,
1506cabdff1aSopenharmony_ci                                    int32_t height,
1507cabdff1aSopenharmony_ci                                    int32_t weight0,
1508cabdff1aSopenharmony_ci                                    int32_t weight1,
1509cabdff1aSopenharmony_ci                                    int32_t offset0,
1510cabdff1aSopenharmony_ci                                    int32_t offset1,
1511cabdff1aSopenharmony_ci                                    int32_t rnd_val)
1512cabdff1aSopenharmony_ci{
1513cabdff1aSopenharmony_ci    uint32_t loop_cnt;
1514cabdff1aSopenharmony_ci    int32_t offset, weight;
1515cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5;
1516cabdff1aSopenharmony_ci    v16i8 src6, src7, src8, src9, src10;
1517cabdff1aSopenharmony_ci    v8i16 in0, in1, in2, in3;
1518cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1519cabdff1aSopenharmony_ci    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1520cabdff1aSopenharmony_ci    v8i16 tmp0, tmp1, tmp2, tmp3;
1521cabdff1aSopenharmony_ci    v8i16 filt0, filt1, filt2, filt3;
1522cabdff1aSopenharmony_ci    v8i16 filter_vec, out0, out1, out2, out3;
1523cabdff1aSopenharmony_ci    v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;
1524cabdff1aSopenharmony_ci
1525cabdff1aSopenharmony_ci    src0_ptr -= (3 * src_stride);
1526cabdff1aSopenharmony_ci    offset = (offset0 + offset1) << rnd_val;
1527cabdff1aSopenharmony_ci    weight0 = weight0 & 0x0000FFFF;
1528cabdff1aSopenharmony_ci    weight = weight0 | (weight1 << 16);
1529cabdff1aSopenharmony_ci
1530cabdff1aSopenharmony_ci    const_vec = __msa_ldi_w(128);
1531cabdff1aSopenharmony_ci    const_vec <<= 6;
1532cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
1533cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
1534cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val + 1);
1535cabdff1aSopenharmony_ci    weight1_vec = __msa_fill_w(weight1);
1536cabdff1aSopenharmony_ci    offset_vec += const_vec * weight1_vec;
1537cabdff1aSopenharmony_ci
1538cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
1539cabdff1aSopenharmony_ci    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1540cabdff1aSopenharmony_ci
1541cabdff1aSopenharmony_ci    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1542cabdff1aSopenharmony_ci    src0_ptr += (7 * src_stride);
1543cabdff1aSopenharmony_ci    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1544cabdff1aSopenharmony_ci
1545cabdff1aSopenharmony_ci    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1546cabdff1aSopenharmony_ci               src10_r, src32_r, src54_r, src21_r);
1547cabdff1aSopenharmony_ci    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1548cabdff1aSopenharmony_ci
1549cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
1550cabdff1aSopenharmony_ci        LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
1551cabdff1aSopenharmony_ci        src0_ptr += (4 * src_stride);
1552cabdff1aSopenharmony_ci        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
1553cabdff1aSopenharmony_ci        src1_ptr += (4 * src2_stride);
1554cabdff1aSopenharmony_ci
1555cabdff1aSopenharmony_ci        XORI_B4_128_SB(src7, src8, src9, src10);
1556cabdff1aSopenharmony_ci        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1557cabdff1aSopenharmony_ci                   src76_r, src87_r, src98_r, src109_r);
1558cabdff1aSopenharmony_ci
1559cabdff1aSopenharmony_ci        DOTP_SB4_SH(src10_r, src21_r, src32_r, src43_r, filt0, filt0, filt0,
1560cabdff1aSopenharmony_ci                    filt0, tmp0, tmp1, tmp2, tmp3);
1561cabdff1aSopenharmony_ci        DPADD_SB4_SH(src32_r, src43_r, src54_r, src65_r, filt1, filt1, filt1,
1562cabdff1aSopenharmony_ci                     filt1, tmp0, tmp1, tmp2, tmp3);
1563cabdff1aSopenharmony_ci        DPADD_SB4_SH(src54_r, src65_r, src76_r, src87_r, filt2, filt2, filt2,
1564cabdff1aSopenharmony_ci                     filt2, tmp0, tmp1, tmp2, tmp3);
1565cabdff1aSopenharmony_ci        DPADD_SB4_SH(src76_r, src87_r, src98_r, src109_r, filt3, filt3, filt3,
1566cabdff1aSopenharmony_ci                     filt3, tmp0, tmp1, tmp2, tmp3);
1567cabdff1aSopenharmony_ci
1568cabdff1aSopenharmony_ci        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
1569cabdff1aSopenharmony_ci                           in0, in1, in2, in3,
1570cabdff1aSopenharmony_ci                           weight_vec, rnd_vec, offset_vec,
1571cabdff1aSopenharmony_ci                           out0, out1, out2, out3);
1572cabdff1aSopenharmony_ci
1573cabdff1aSopenharmony_ci        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
1574cabdff1aSopenharmony_ci        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1575cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
1576cabdff1aSopenharmony_ci
1577cabdff1aSopenharmony_ci        src10_r = src54_r;
1578cabdff1aSopenharmony_ci        src32_r = src76_r;
1579cabdff1aSopenharmony_ci        src54_r = src98_r;
1580cabdff1aSopenharmony_ci        src21_r = src65_r;
1581cabdff1aSopenharmony_ci        src43_r = src87_r;
1582cabdff1aSopenharmony_ci        src65_r = src109_r;
1583cabdff1aSopenharmony_ci        src6 = src10;
1584cabdff1aSopenharmony_ci    }
1585cabdff1aSopenharmony_ci}
1586cabdff1aSopenharmony_ci
1587cabdff1aSopenharmony_cistatic void hevc_vt_biwgt_8t_12w_msa(uint8_t *src0_ptr,
1588cabdff1aSopenharmony_ci                                     int32_t src_stride,
1589cabdff1aSopenharmony_ci                                     int16_t *src1_ptr,
1590cabdff1aSopenharmony_ci                                     int32_t src2_stride,
1591cabdff1aSopenharmony_ci                                     uint8_t *dst,
1592cabdff1aSopenharmony_ci                                     int32_t dst_stride,
1593cabdff1aSopenharmony_ci                                     const int8_t *filter,
1594cabdff1aSopenharmony_ci                                     int32_t height,
1595cabdff1aSopenharmony_ci                                     int32_t weight0,
1596cabdff1aSopenharmony_ci                                     int32_t weight1,
1597cabdff1aSopenharmony_ci                                     int32_t offset0,
1598cabdff1aSopenharmony_ci                                     int32_t offset1,
1599cabdff1aSopenharmony_ci                                     int32_t rnd_val)
1600cabdff1aSopenharmony_ci{
1601cabdff1aSopenharmony_ci    uint32_t loop_cnt;
1602cabdff1aSopenharmony_ci    int32_t offset, weight;
1603cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1604cabdff1aSopenharmony_ci    v8i16 in0, in1, in2, in3;
1605cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src54_r, src76_r;
1606cabdff1aSopenharmony_ci    v16i8 src21_r, src43_r, src65_r, src87_r;
1607cabdff1aSopenharmony_ci    v8i16 tmp0, tmp1, tmp2;
1608cabdff1aSopenharmony_ci    v16i8 src10_l, src32_l, src54_l, src76_l;
1609cabdff1aSopenharmony_ci    v16i8 src21_l, src43_l, src65_l, src87_l;
1610cabdff1aSopenharmony_ci    v16i8 src2110, src4332, src6554, src8776;
1611cabdff1aSopenharmony_ci    v8i16 filt0, filt1, filt2, filt3;
1612cabdff1aSopenharmony_ci    v8i16 out0, out1, out2, filter_vec;
1613cabdff1aSopenharmony_ci    v4i32 dst2_r, dst2_l;
1614cabdff1aSopenharmony_ci    v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;
1615cabdff1aSopenharmony_ci
1616cabdff1aSopenharmony_ci    src0_ptr -= (3 * src_stride);
1617cabdff1aSopenharmony_ci    offset = (offset0 + offset1) << rnd_val;
1618cabdff1aSopenharmony_ci    weight0 = weight0 & 0x0000FFFF;
1619cabdff1aSopenharmony_ci    weight = weight0 | (weight1 << 16);
1620cabdff1aSopenharmony_ci
1621cabdff1aSopenharmony_ci    const_vec = __msa_ldi_w(128);
1622cabdff1aSopenharmony_ci    const_vec <<= 6;
1623cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
1624cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
1625cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val + 1);
1626cabdff1aSopenharmony_ci    weight1_vec = __msa_fill_w(weight1);
1627cabdff1aSopenharmony_ci    offset_vec += const_vec * weight1_vec;
1628cabdff1aSopenharmony_ci
1629cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
1630cabdff1aSopenharmony_ci    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1631cabdff1aSopenharmony_ci
1632cabdff1aSopenharmony_ci    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1633cabdff1aSopenharmony_ci    src0_ptr += (7 * src_stride);
1634cabdff1aSopenharmony_ci    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1635cabdff1aSopenharmony_ci
1636cabdff1aSopenharmony_ci    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1637cabdff1aSopenharmony_ci               src10_r, src32_r, src54_r, src21_r);
1638cabdff1aSopenharmony_ci    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1639cabdff1aSopenharmony_ci    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1640cabdff1aSopenharmony_ci               src10_l, src32_l, src54_l, src21_l);
1641cabdff1aSopenharmony_ci    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1642cabdff1aSopenharmony_ci    ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
1643cabdff1aSopenharmony_ci               src2110, src4332, src6554);
1644cabdff1aSopenharmony_ci
1645cabdff1aSopenharmony_ci    for (loop_cnt = 8; loop_cnt--;) {
1646cabdff1aSopenharmony_ci        LD_SB2(src0_ptr, src_stride, src7, src8);
1647cabdff1aSopenharmony_ci        src0_ptr += (2 * src_stride);
1648cabdff1aSopenharmony_ci        LD_SH2(src1_ptr, src2_stride, in0, in1);
1649cabdff1aSopenharmony_ci        LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
1650cabdff1aSopenharmony_ci        src1_ptr += (2 * src2_stride);
1651cabdff1aSopenharmony_ci        in2 = (v8i16) __msa_ilvr_d((v2i64) in3, (v2i64) in2);
1652cabdff1aSopenharmony_ci        XORI_B2_128_SB(src7, src8);
1653cabdff1aSopenharmony_ci
1654cabdff1aSopenharmony_ci        ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
1655cabdff1aSopenharmony_ci        ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
1656cabdff1aSopenharmony_ci        src8776 = (v16i8) __msa_ilvr_d((v2i64) src87_l, (v2i64) src76_l);
1657cabdff1aSopenharmony_ci
1658cabdff1aSopenharmony_ci        DOTP_SB3_SH(src10_r, src21_r, src2110, filt0, filt0, filt0,
1659cabdff1aSopenharmony_ci                    tmp0, tmp1, tmp2);
1660cabdff1aSopenharmony_ci        DPADD_SB2_SH(src32_r, src43_r, filt1, filt1, tmp0, tmp1);
1661cabdff1aSopenharmony_ci        tmp2 = __msa_dpadd_s_h(tmp2, src4332, (v16i8) filt1);
1662cabdff1aSopenharmony_ci        DPADD_SB2_SH(src54_r, src65_r, filt2, filt2, tmp0, tmp1);
1663cabdff1aSopenharmony_ci        tmp2 = __msa_dpadd_s_h(tmp2, src6554, (v16i8) filt2);
1664cabdff1aSopenharmony_ci        DPADD_SB2_SH(src76_r, src87_r, filt3, filt3, tmp0, tmp1);
1665cabdff1aSopenharmony_ci        tmp2 = __msa_dpadd_s_h(tmp2, src8776, (v16i8) filt3);
1666cabdff1aSopenharmony_ci
1667cabdff1aSopenharmony_ci        HEVC_BIW_RND_CLIP2(tmp0, tmp1, in0, in1,
1668cabdff1aSopenharmony_ci                           weight_vec, rnd_vec, offset_vec,
1669cabdff1aSopenharmony_ci                           out0, out1);
1670cabdff1aSopenharmony_ci
1671cabdff1aSopenharmony_ci        ILVRL_H2_SW(tmp2, in2, dst2_r, dst2_l);
1672cabdff1aSopenharmony_ci        dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r,
1673cabdff1aSopenharmony_ci                                 (v8i16) weight_vec);
1674cabdff1aSopenharmony_ci        dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l,
1675cabdff1aSopenharmony_ci                                 (v8i16) weight_vec);
1676cabdff1aSopenharmony_ci        SRAR_W2_SW(dst2_r, dst2_l, rnd_vec);
1677cabdff1aSopenharmony_ci        out2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
1678cabdff1aSopenharmony_ci        CLIP_SH_0_255(out2);
1679cabdff1aSopenharmony_ci        PCKEV_B2_SH(out1, out0, out2, out2, out0, out2);
1680cabdff1aSopenharmony_ci        ST_D2(out0, 0, 1, dst, dst_stride);
1681cabdff1aSopenharmony_ci        ST_W2(out2, 0, 1, dst + 8, dst_stride);
1682cabdff1aSopenharmony_ci        dst += (2 * dst_stride);
1683cabdff1aSopenharmony_ci
1684cabdff1aSopenharmony_ci        src10_r = src32_r;
1685cabdff1aSopenharmony_ci        src32_r = src54_r;
1686cabdff1aSopenharmony_ci        src54_r = src76_r;
1687cabdff1aSopenharmony_ci        src21_r = src43_r;
1688cabdff1aSopenharmony_ci        src43_r = src65_r;
1689cabdff1aSopenharmony_ci        src65_r = src87_r;
1690cabdff1aSopenharmony_ci        src2110 = src4332;
1691cabdff1aSopenharmony_ci        src4332 = src6554;
1692cabdff1aSopenharmony_ci        src6554 = src8776;
1693cabdff1aSopenharmony_ci        src6 = src8;
1694cabdff1aSopenharmony_ci    }
1695cabdff1aSopenharmony_ci}
1696cabdff1aSopenharmony_ci
1697cabdff1aSopenharmony_cistatic void hevc_vt_biwgt_8t_16multx2mult_msa(uint8_t *src0_ptr,
1698cabdff1aSopenharmony_ci                                              int32_t src_stride,
1699cabdff1aSopenharmony_ci                                              int16_t *src1_ptr,
1700cabdff1aSopenharmony_ci                                              int32_t src2_stride,
1701cabdff1aSopenharmony_ci                                              uint8_t *dst,
1702cabdff1aSopenharmony_ci                                              int32_t dst_stride,
1703cabdff1aSopenharmony_ci                                              const int8_t *filter,
1704cabdff1aSopenharmony_ci                                              int32_t height,
1705cabdff1aSopenharmony_ci                                              int32_t weight0,
1706cabdff1aSopenharmony_ci                                              int32_t weight1,
1707cabdff1aSopenharmony_ci                                              int32_t offset0,
1708cabdff1aSopenharmony_ci                                              int32_t offset1,
1709cabdff1aSopenharmony_ci                                              int32_t rnd_val,
1710cabdff1aSopenharmony_ci                                              int32_t width)
1711cabdff1aSopenharmony_ci{
1712cabdff1aSopenharmony_ci    uint8_t *src0_ptr_tmp;
1713cabdff1aSopenharmony_ci    int16_t *src1_ptr_tmp;
1714cabdff1aSopenharmony_ci    uint8_t *dst_tmp;
1715cabdff1aSopenharmony_ci    uint32_t loop_cnt, cnt;
1716cabdff1aSopenharmony_ci    int32_t offset, weight;
1717cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1718cabdff1aSopenharmony_ci    v8i16 in0, in1, in2, in3;
1719cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src54_r, src76_r;
1720cabdff1aSopenharmony_ci    v16i8 src21_r, src43_r, src65_r, src87_r;
1721cabdff1aSopenharmony_ci    v16i8 src10_l, src32_l, src54_l, src76_l;
1722cabdff1aSopenharmony_ci    v16i8 src21_l, src43_l, src65_l, src87_l;
1723cabdff1aSopenharmony_ci    v8i16 tmp0, tmp1, tmp2, tmp3;
1724cabdff1aSopenharmony_ci    v8i16 filt0, filt1, filt2, filt3;
1725cabdff1aSopenharmony_ci    v8i16 filter_vec;
1726cabdff1aSopenharmony_ci    v8i16 out0, out1, out2, out3;
1727cabdff1aSopenharmony_ci    v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;
1728cabdff1aSopenharmony_ci
1729cabdff1aSopenharmony_ci    src0_ptr -= (3 * src_stride);
1730cabdff1aSopenharmony_ci
1731cabdff1aSopenharmony_ci    offset = (offset0 + offset1) << rnd_val;
1732cabdff1aSopenharmony_ci    weight0 = weight0 & 0x0000FFFF;
1733cabdff1aSopenharmony_ci    weight = weight0 | (weight1 << 16);
1734cabdff1aSopenharmony_ci
1735cabdff1aSopenharmony_ci    const_vec = __msa_ldi_w(128);
1736cabdff1aSopenharmony_ci    const_vec <<= 6;
1737cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
1738cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
1739cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val + 1);
1740cabdff1aSopenharmony_ci    weight1_vec = __msa_fill_w(weight1);
1741cabdff1aSopenharmony_ci    offset_vec += const_vec * weight1_vec;
1742cabdff1aSopenharmony_ci
1743cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
1744cabdff1aSopenharmony_ci    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1745cabdff1aSopenharmony_ci
1746cabdff1aSopenharmony_ci    for (cnt = (width >> 4); cnt--;) {
1747cabdff1aSopenharmony_ci        src0_ptr_tmp = src0_ptr;
1748cabdff1aSopenharmony_ci        src1_ptr_tmp = src1_ptr;
1749cabdff1aSopenharmony_ci        dst_tmp = dst;
1750cabdff1aSopenharmony_ci
1751cabdff1aSopenharmony_ci        LD_SB7(src0_ptr_tmp, src_stride,
1752cabdff1aSopenharmony_ci               src0, src1, src2, src3, src4, src5, src6);
1753cabdff1aSopenharmony_ci        src0_ptr_tmp += (7 * src_stride);
1754cabdff1aSopenharmony_ci
1755cabdff1aSopenharmony_ci        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1756cabdff1aSopenharmony_ci        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1757cabdff1aSopenharmony_ci                   src10_r, src32_r, src54_r, src21_r);
1758cabdff1aSopenharmony_ci        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1759cabdff1aSopenharmony_ci        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1760cabdff1aSopenharmony_ci                   src10_l, src32_l, src54_l, src21_l);
1761cabdff1aSopenharmony_ci        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1762cabdff1aSopenharmony_ci
1763cabdff1aSopenharmony_ci        for (loop_cnt = (height >> 1); loop_cnt--;) {
1764cabdff1aSopenharmony_ci            LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
1765cabdff1aSopenharmony_ci            src0_ptr_tmp += (2 * src_stride);
1766cabdff1aSopenharmony_ci            LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
1767cabdff1aSopenharmony_ci            LD_SH2((src1_ptr_tmp + 8), src2_stride, in2, in3);
1768cabdff1aSopenharmony_ci            src1_ptr_tmp += (2 * src2_stride);
1769cabdff1aSopenharmony_ci
1770cabdff1aSopenharmony_ci            XORI_B2_128_SB(src7, src8);
1771cabdff1aSopenharmony_ci            ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
1772cabdff1aSopenharmony_ci            ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
1773cabdff1aSopenharmony_ci
1774cabdff1aSopenharmony_ci            DOTP_SB4_SH(src10_r, src21_r, src10_l, src21_l, filt0, filt0,
1775cabdff1aSopenharmony_ci                        filt0, filt0, tmp0, tmp1, tmp2, tmp3);
1776cabdff1aSopenharmony_ci            DPADD_SB4_SH(src32_r, src43_r, src32_l, src43_l, filt1, filt1,
1777cabdff1aSopenharmony_ci                         filt1, filt1, tmp0, tmp1, tmp2, tmp3);
1778cabdff1aSopenharmony_ci            DPADD_SB4_SH(src54_r, src65_r, src54_l, src65_l, filt2, filt2,
1779cabdff1aSopenharmony_ci                         filt2, filt2, tmp0, tmp1, tmp2, tmp3);
1780cabdff1aSopenharmony_ci            DPADD_SB4_SH(src76_r, src87_r, src76_l, src87_l, filt3, filt3,
1781cabdff1aSopenharmony_ci                         filt3, filt3, tmp0, tmp1, tmp2, tmp3);
1782cabdff1aSopenharmony_ci
1783cabdff1aSopenharmony_ci            HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
1784cabdff1aSopenharmony_ci                               in0, in1, in2, in3,
1785cabdff1aSopenharmony_ci                               weight_vec, rnd_vec, offset_vec,
1786cabdff1aSopenharmony_ci                               out0, out1, out2, out3);
1787cabdff1aSopenharmony_ci
1788cabdff1aSopenharmony_ci            PCKEV_B2_SH(out2, out0, out3, out1, out0, out1);
1789cabdff1aSopenharmony_ci            ST_SH2(out0, out1, dst_tmp, dst_stride);
1790cabdff1aSopenharmony_ci            dst_tmp += (2 * dst_stride);
1791cabdff1aSopenharmony_ci
1792cabdff1aSopenharmony_ci            src10_r = src32_r;
1793cabdff1aSopenharmony_ci            src32_r = src54_r;
1794cabdff1aSopenharmony_ci            src54_r = src76_r;
1795cabdff1aSopenharmony_ci            src21_r = src43_r;
1796cabdff1aSopenharmony_ci            src43_r = src65_r;
1797cabdff1aSopenharmony_ci            src65_r = src87_r;
1798cabdff1aSopenharmony_ci            src10_l = src32_l;
1799cabdff1aSopenharmony_ci            src32_l = src54_l;
1800cabdff1aSopenharmony_ci            src54_l = src76_l;
1801cabdff1aSopenharmony_ci            src21_l = src43_l;
1802cabdff1aSopenharmony_ci            src43_l = src65_l;
1803cabdff1aSopenharmony_ci            src65_l = src87_l;
1804cabdff1aSopenharmony_ci            src6 = src8;
1805cabdff1aSopenharmony_ci        }
1806cabdff1aSopenharmony_ci
1807cabdff1aSopenharmony_ci        src0_ptr += 16;
1808cabdff1aSopenharmony_ci        src1_ptr += 16;
1809cabdff1aSopenharmony_ci        dst += 16;
1810cabdff1aSopenharmony_ci    }
1811cabdff1aSopenharmony_ci}
1812cabdff1aSopenharmony_ci
/*
 * 16-pixel-wide vertical 8-tap bi-weighted prediction: forwards all
 * arguments to the generic 16-column-strip routine with a width of 16.
 */
static void hevc_vt_biwgt_8t_16w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    const int32_t block_width = 16;

    hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
                                      src1_ptr, src2_stride,
                                      dst, dst_stride,
                                      filter, height,
                                      weight0, weight1,
                                      offset0, offset1,
                                      rnd_val, block_width);
}
1833cabdff1aSopenharmony_ci
/*
 * 24-pixel-wide vertical 8-tap bi-weighted prediction, built from two calls:
 * the generic strip routine handles columns 0..15 and the 8-wide routine
 * handles columns 16..23.
 */
static void hevc_vt_biwgt_8t_24w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    const int32_t strip_width = 16;

    /* Left 16 columns. */
    hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
                                      src1_ptr, src2_stride,
                                      dst, dst_stride,
                                      filter, height,
                                      weight0, weight1,
                                      offset0, offset1,
                                      rnd_val, strip_width);

    /* Remaining 8 columns, starting right after the first strip. */
    hevc_vt_biwgt_8t_8w_msa(src0_ptr + strip_width, src_stride,
                            src1_ptr + strip_width, src2_stride,
                            dst + strip_width, dst_stride,
                            filter, height,
                            weight0, weight1,
                            offset0, offset1,
                            rnd_val);
}
1858cabdff1aSopenharmony_ci
/*
 * 32-pixel-wide vertical 8-tap bi-weighted prediction: forwards all
 * arguments to the generic 16-column-strip routine with a width of 32.
 */
static void hevc_vt_biwgt_8t_32w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    const int32_t block_width = 32;

    hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
                                      src1_ptr, src2_stride,
                                      dst, dst_stride,
                                      filter, height,
                                      weight0, weight1,
                                      offset0, offset1,
                                      rnd_val, block_width);
}
1879cabdff1aSopenharmony_ci
/*
 * Vertical 8-tap bi-weighted prediction entry point for the 48-wide case.
 *
 * Thin wrapper: every argument is forwarded unchanged to
 * hevc_vt_biwgt_8t_16multx2mult_msa(), with the constant 48 appended as the
 * trailing argument (presumably the block width in pixels, going by the
 * function names -- the helper is defined elsewhere in this file; confirm
 * there).  The meaning of the individual parameters is whatever that helper
 * defines.
 */
static void hevc_vt_biwgt_8t_48w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
                                      src1_ptr, src2_stride,
                                      dst, dst_stride, filter, height,
                                      weight0, weight1, offset0, offset1,
                                      rnd_val, 48);
}
1900cabdff1aSopenharmony_ci
/*
 * Vertical 8-tap bi-weighted prediction entry point for the 64-wide case.
 *
 * Thin wrapper: every argument is forwarded unchanged to
 * hevc_vt_biwgt_8t_16multx2mult_msa(), with the constant 64 appended as the
 * trailing argument (presumably the block width in pixels, going by the
 * function names -- the helper is defined elsewhere in this file; confirm
 * there).  The meaning of the individual parameters is whatever that helper
 * defines.
 */
static void hevc_vt_biwgt_8t_64w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
                                      src1_ptr, src2_stride,
                                      dst, dst_stride, filter, height,
                                      weight0, weight1, offset0, offset1,
                                      rnd_val, 64);
}
1921cabdff1aSopenharmony_ci
1922cabdff1aSopenharmony_cistatic void hevc_hv_biwgt_8t_4w_msa(uint8_t *src0_ptr,
1923cabdff1aSopenharmony_ci                                    int32_t src_stride,
1924cabdff1aSopenharmony_ci                                    int16_t *src1_ptr,
1925cabdff1aSopenharmony_ci                                    int32_t src2_stride,
1926cabdff1aSopenharmony_ci                                    uint8_t *dst,
1927cabdff1aSopenharmony_ci                                    int32_t dst_stride,
1928cabdff1aSopenharmony_ci                                    const int8_t *filter_x,
1929cabdff1aSopenharmony_ci                                    const int8_t *filter_y,
1930cabdff1aSopenharmony_ci                                    int32_t height,
1931cabdff1aSopenharmony_ci                                    int32_t weight0,
1932cabdff1aSopenharmony_ci                                    int32_t weight1,
1933cabdff1aSopenharmony_ci                                    int32_t offset0,
1934cabdff1aSopenharmony_ci                                    int32_t offset1,
1935cabdff1aSopenharmony_ci                                    int32_t rnd_val)
1936cabdff1aSopenharmony_ci{
1937cabdff1aSopenharmony_ci    uint32_t loop_cnt;
1938cabdff1aSopenharmony_ci    uint64_t tp0, tp1;
1939cabdff1aSopenharmony_ci    int32_t offset, weight;
1940cabdff1aSopenharmony_ci    v16u8 out;
1941cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1942cabdff1aSopenharmony_ci    v8i16 in0 = { 0 }, in1 = { 0 };
1943cabdff1aSopenharmony_ci    v8i16 filt0, filt1, filt2, filt3;
1944cabdff1aSopenharmony_ci    v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
1945cabdff1aSopenharmony_ci    v16i8 mask1, mask2, mask3;
1946cabdff1aSopenharmony_ci    v8i16 filter_vec, weight_vec;
1947cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1948cabdff1aSopenharmony_ci    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1949cabdff1aSopenharmony_ci    v8i16 dst30, dst41, dst52, dst63, dst66, dst87;
1950cabdff1aSopenharmony_ci    v8i16 tmp0, tmp1, tmp2, tmp3;
1951cabdff1aSopenharmony_ci    v8i16 dst10, dst32, dst54, dst76;
1952cabdff1aSopenharmony_ci    v8i16 dst21, dst43, dst65, dst97, dst108, dst109, dst98;
1953cabdff1aSopenharmony_ci    v4i32 offset_vec, rnd_vec, const_vec, dst0, dst1, dst2, dst3;
1954cabdff1aSopenharmony_ci    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
1955cabdff1aSopenharmony_ci
1956cabdff1aSopenharmony_ci    src0_ptr -= ((3 * src_stride) + 3);
1957cabdff1aSopenharmony_ci
1958cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_x);
1959cabdff1aSopenharmony_ci    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1960cabdff1aSopenharmony_ci
1961cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_y);
1962cabdff1aSopenharmony_ci    UNPCK_R_SB_SH(filter_vec, filter_vec);
1963cabdff1aSopenharmony_ci
1964cabdff1aSopenharmony_ci    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1965cabdff1aSopenharmony_ci
1966cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
1967cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
1968cabdff1aSopenharmony_ci    mask3 = mask0 + 6;
1969cabdff1aSopenharmony_ci
1970cabdff1aSopenharmony_ci    offset = (offset0 + offset1) << rnd_val;
1971cabdff1aSopenharmony_ci    weight0 = weight0 & 0x0000FFFF;
1972cabdff1aSopenharmony_ci    weight = weight0 | (weight1 << 16);
1973cabdff1aSopenharmony_ci
1974cabdff1aSopenharmony_ci    const_vec = __msa_fill_w((128 * weight1));
1975cabdff1aSopenharmony_ci    const_vec <<= 6;
1976cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
1977cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val + 1);
1978cabdff1aSopenharmony_ci    offset_vec += const_vec;
1979cabdff1aSopenharmony_ci    weight_vec = (v8i16) __msa_fill_w(weight);
1980cabdff1aSopenharmony_ci
1981cabdff1aSopenharmony_ci    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1982cabdff1aSopenharmony_ci    src0_ptr += (7 * src_stride);
1983cabdff1aSopenharmony_ci
1984cabdff1aSopenharmony_ci    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1985cabdff1aSopenharmony_ci
1986cabdff1aSopenharmony_ci    VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1987cabdff1aSopenharmony_ci    VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1988cabdff1aSopenharmony_ci    VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
1989cabdff1aSopenharmony_ci               vec8, vec9, vec10, vec11);
1990cabdff1aSopenharmony_ci    VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
1991cabdff1aSopenharmony_ci               vec12, vec13, vec14, vec15);
1992cabdff1aSopenharmony_ci
1993cabdff1aSopenharmony_ci    dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1994cabdff1aSopenharmony_ci                              filt3);
1995cabdff1aSopenharmony_ci    dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1996cabdff1aSopenharmony_ci                              filt3);
1997cabdff1aSopenharmony_ci    dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1998cabdff1aSopenharmony_ci                              filt3);
1999cabdff1aSopenharmony_ci    dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
2000cabdff1aSopenharmony_ci                              filt3);
2001cabdff1aSopenharmony_ci
2002cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst41, dst30, dst10, dst43);
2003cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst52, dst41, dst21, dst54);
2004cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst63, dst52, dst32, dst65);
2005cabdff1aSopenharmony_ci
2006cabdff1aSopenharmony_ci    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
2007cabdff1aSopenharmony_ci
2008cabdff1aSopenharmony_ci    for (loop_cnt = height >> 2; loop_cnt--;) {
2009cabdff1aSopenharmony_ci        LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
2010cabdff1aSopenharmony_ci        src0_ptr += (4 * src_stride);
2011cabdff1aSopenharmony_ci        XORI_B4_128_SB(src7, src8, src9, src10);
2012cabdff1aSopenharmony_ci
2013cabdff1aSopenharmony_ci        LD2(src1_ptr, src2_stride, tp0, tp1);
2014cabdff1aSopenharmony_ci        INSERT_D2_SH(tp0, tp1, in0);
2015cabdff1aSopenharmony_ci        src1_ptr += (2 * src2_stride);
2016cabdff1aSopenharmony_ci        LD2(src1_ptr, src2_stride, tp0, tp1);
2017cabdff1aSopenharmony_ci        INSERT_D2_SH(tp0, tp1, in1);
2018cabdff1aSopenharmony_ci        src1_ptr += (2 * src2_stride);
2019cabdff1aSopenharmony_ci
2020cabdff1aSopenharmony_ci        VSHF_B4_SB(src7, src9, mask0, mask1, mask2, mask3,
2021cabdff1aSopenharmony_ci                   vec0, vec1, vec2, vec3);
2022cabdff1aSopenharmony_ci        VSHF_B4_SB(src8, src10, mask0, mask1, mask2, mask3,
2023cabdff1aSopenharmony_ci                   vec4, vec5, vec6, vec7);
2024cabdff1aSopenharmony_ci        dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2025cabdff1aSopenharmony_ci                                  filt3);
2026cabdff1aSopenharmony_ci        dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2027cabdff1aSopenharmony_ci                                   filt3);
2028cabdff1aSopenharmony_ci
2029cabdff1aSopenharmony_ci        dst76 = __msa_ilvr_h(dst97, dst66);
2030cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst108, dst97, dst87, dst109);
2031cabdff1aSopenharmony_ci        dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
2032cabdff1aSopenharmony_ci        dst98 = __msa_ilvr_h(dst66, dst108);
2033cabdff1aSopenharmony_ci
2034cabdff1aSopenharmony_ci        dst0 = HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1,
2035cabdff1aSopenharmony_ci                              filt_h2, filt_h3);
2036cabdff1aSopenharmony_ci        dst1 = HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1,
2037cabdff1aSopenharmony_ci                              filt_h2, filt_h3);
2038cabdff1aSopenharmony_ci        dst2 = HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1,
2039cabdff1aSopenharmony_ci                              filt_h2, filt_h3);
2040cabdff1aSopenharmony_ci        dst3 = HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1,
2041cabdff1aSopenharmony_ci                              filt_h2, filt_h3);
2042cabdff1aSopenharmony_ci        SRA_4V(dst0, dst1, dst2, dst3, 6);
2043cabdff1aSopenharmony_ci        PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp1, tmp3);
2044cabdff1aSopenharmony_ci        ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
2045cabdff1aSopenharmony_ci        ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
2046cabdff1aSopenharmony_ci        dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
2047cabdff1aSopenharmony_ci        dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
2048cabdff1aSopenharmony_ci        dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
2049cabdff1aSopenharmony_ci        dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
2050cabdff1aSopenharmony_ci        SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
2051cabdff1aSopenharmony_ci        CLIP_SW4_0_255(dst0, dst1, dst2, dst3);
2052cabdff1aSopenharmony_ci        PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
2053cabdff1aSopenharmony_ci        out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2054cabdff1aSopenharmony_ci        ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2055cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
2056cabdff1aSopenharmony_ci
2057cabdff1aSopenharmony_ci        dst10 = dst54;
2058cabdff1aSopenharmony_ci        dst32 = dst76;
2059cabdff1aSopenharmony_ci        dst54 = dst98;
2060cabdff1aSopenharmony_ci        dst21 = dst65;
2061cabdff1aSopenharmony_ci        dst43 = dst87;
2062cabdff1aSopenharmony_ci        dst65 = dst109;
2063cabdff1aSopenharmony_ci        dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
2064cabdff1aSopenharmony_ci    }
2065cabdff1aSopenharmony_ci}
2066cabdff1aSopenharmony_ci
2067cabdff1aSopenharmony_cistatic void hevc_hv_biwgt_8t_8multx2mult_msa(uint8_t *src0_ptr,
2068cabdff1aSopenharmony_ci                                             int32_t src_stride,
2069cabdff1aSopenharmony_ci                                             int16_t *src1_ptr,
2070cabdff1aSopenharmony_ci                                             int32_t src2_stride,
2071cabdff1aSopenharmony_ci                                             uint8_t *dst,
2072cabdff1aSopenharmony_ci                                             int32_t dst_stride,
2073cabdff1aSopenharmony_ci                                             const int8_t *filter_x,
2074cabdff1aSopenharmony_ci                                             const int8_t *filter_y,
2075cabdff1aSopenharmony_ci                                             int32_t height,
2076cabdff1aSopenharmony_ci                                             int32_t weight0,
2077cabdff1aSopenharmony_ci                                             int32_t weight1,
2078cabdff1aSopenharmony_ci                                             int32_t offset0,
2079cabdff1aSopenharmony_ci                                             int32_t offset1,
2080cabdff1aSopenharmony_ci                                             int32_t rnd_val,
2081cabdff1aSopenharmony_ci                                             int32_t width8mult)
2082cabdff1aSopenharmony_ci{
2083cabdff1aSopenharmony_ci    uint32_t loop_cnt, cnt;
2084cabdff1aSopenharmony_ci    int32_t offset, weight;
2085cabdff1aSopenharmony_ci    uint8_t *src0_ptr_tmp;
2086cabdff1aSopenharmony_ci    int16_t *src1_ptr_tmp;
2087cabdff1aSopenharmony_ci    uint8_t *dst_tmp;
2088cabdff1aSopenharmony_ci    v16u8 out;
2089cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2090cabdff1aSopenharmony_ci    v8i16 in0, in1;
2091cabdff1aSopenharmony_ci    v8i16 filt0, filt1, filt2, filt3;
2092cabdff1aSopenharmony_ci    v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
2093cabdff1aSopenharmony_ci    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
2094cabdff1aSopenharmony_ci    v16i8 mask1, mask2, mask3;
2095cabdff1aSopenharmony_ci    v8i16 filter_vec, weight_vec;
2096cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2097cabdff1aSopenharmony_ci    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
2098cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
2099cabdff1aSopenharmony_ci    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
2100cabdff1aSopenharmony_ci    v8i16 tmp0, tmp1, tmp2, tmp3;
2101cabdff1aSopenharmony_ci    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
2102cabdff1aSopenharmony_ci    v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
2103cabdff1aSopenharmony_ci    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
2104cabdff1aSopenharmony_ci    v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
2105cabdff1aSopenharmony_ci    v4i32 offset_vec, rnd_vec, const_vec;
2106cabdff1aSopenharmony_ci
2107cabdff1aSopenharmony_ci    src0_ptr -= ((3 * src_stride) + 3);
2108cabdff1aSopenharmony_ci
2109cabdff1aSopenharmony_ci    offset = (offset0 + offset1) << rnd_val;
2110cabdff1aSopenharmony_ci    weight0 = weight0 & 0x0000FFFF;
2111cabdff1aSopenharmony_ci    weight = weight0 | (weight1 << 16);
2112cabdff1aSopenharmony_ci
2113cabdff1aSopenharmony_ci    const_vec = __msa_fill_w((128 * weight1));
2114cabdff1aSopenharmony_ci    const_vec <<= 6;
2115cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
2116cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val + 1);
2117cabdff1aSopenharmony_ci    offset_vec += const_vec;
2118cabdff1aSopenharmony_ci    weight_vec = (v8i16) __msa_fill_w(weight);
2119cabdff1aSopenharmony_ci
2120cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_x);
2121cabdff1aSopenharmony_ci    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
2122cabdff1aSopenharmony_ci
2123cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_y);
2124cabdff1aSopenharmony_ci    UNPCK_R_SB_SH(filter_vec, filter_vec);
2125cabdff1aSopenharmony_ci
2126cabdff1aSopenharmony_ci    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
2127cabdff1aSopenharmony_ci
2128cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
2129cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
2130cabdff1aSopenharmony_ci    mask3 = mask0 + 6;
2131cabdff1aSopenharmony_ci
2132cabdff1aSopenharmony_ci    for (cnt = width8mult; cnt--;) {
2133cabdff1aSopenharmony_ci        src0_ptr_tmp = src0_ptr;
2134cabdff1aSopenharmony_ci        src1_ptr_tmp = src1_ptr;
2135cabdff1aSopenharmony_ci        dst_tmp = dst;
2136cabdff1aSopenharmony_ci
2137cabdff1aSopenharmony_ci        LD_SB7(src0_ptr_tmp, src_stride,
2138cabdff1aSopenharmony_ci               src0, src1, src2, src3, src4, src5, src6);
2139cabdff1aSopenharmony_ci        src0_ptr_tmp += (7 * src_stride);
2140cabdff1aSopenharmony_ci
2141cabdff1aSopenharmony_ci        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
2142cabdff1aSopenharmony_ci
2143cabdff1aSopenharmony_ci        /* row 0 row 1 row 2 row 3 */
2144cabdff1aSopenharmony_ci        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
2145cabdff1aSopenharmony_ci                   vec0, vec1, vec2, vec3);
2146cabdff1aSopenharmony_ci        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
2147cabdff1aSopenharmony_ci                   vec4, vec5, vec6, vec7);
2148cabdff1aSopenharmony_ci        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
2149cabdff1aSopenharmony_ci                   vec8, vec9, vec10, vec11);
2150cabdff1aSopenharmony_ci        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
2151cabdff1aSopenharmony_ci                   vec12, vec13, vec14, vec15);
2152cabdff1aSopenharmony_ci
2153cabdff1aSopenharmony_ci        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2154cabdff1aSopenharmony_ci                                 filt3);
2155cabdff1aSopenharmony_ci        dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2156cabdff1aSopenharmony_ci                                 filt3);
2157cabdff1aSopenharmony_ci        dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2158cabdff1aSopenharmony_ci                                 filt3);
2159cabdff1aSopenharmony_ci        dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
2160cabdff1aSopenharmony_ci                                 filt2, filt3);
2161cabdff1aSopenharmony_ci
2162cabdff1aSopenharmony_ci        /* row 4 row 5 row 6 */
2163cabdff1aSopenharmony_ci        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
2164cabdff1aSopenharmony_ci                   vec0, vec1, vec2, vec3);
2165cabdff1aSopenharmony_ci        VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
2166cabdff1aSopenharmony_ci                   vec4, vec5, vec6, vec7);
2167cabdff1aSopenharmony_ci        VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
2168cabdff1aSopenharmony_ci                   vec8, vec9, vec10, vec11);
2169cabdff1aSopenharmony_ci
2170cabdff1aSopenharmony_ci        dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2171cabdff1aSopenharmony_ci                                 filt3);
2172cabdff1aSopenharmony_ci        dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2173cabdff1aSopenharmony_ci                                 filt3);
2174cabdff1aSopenharmony_ci        dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2175cabdff1aSopenharmony_ci                                 filt3);
2176cabdff1aSopenharmony_ci
2177cabdff1aSopenharmony_ci        for (loop_cnt = height >> 1; loop_cnt--;) {
2178cabdff1aSopenharmony_ci            LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
2179cabdff1aSopenharmony_ci            XORI_B2_128_SB(src7, src8);
2180cabdff1aSopenharmony_ci            src0_ptr_tmp += 2 * src_stride;
2181cabdff1aSopenharmony_ci
2182cabdff1aSopenharmony_ci            LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
2183cabdff1aSopenharmony_ci            src1_ptr_tmp += (2 * src2_stride);
2184cabdff1aSopenharmony_ci
2185cabdff1aSopenharmony_ci            ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_r,
2186cabdff1aSopenharmony_ci                       dst32_r, dst54_r, dst21_r);
2187cabdff1aSopenharmony_ci            ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_l,
2188cabdff1aSopenharmony_ci                       dst32_l, dst54_l, dst21_l);
2189cabdff1aSopenharmony_ci            ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
2190cabdff1aSopenharmony_ci            ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
2191cabdff1aSopenharmony_ci
2192cabdff1aSopenharmony_ci            VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
2193cabdff1aSopenharmony_ci                       vec0, vec1, vec2, vec3);
2194cabdff1aSopenharmony_ci            dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
2195cabdff1aSopenharmony_ci                                     filt2, filt3);
2196cabdff1aSopenharmony_ci
2197cabdff1aSopenharmony_ci            ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
2198cabdff1aSopenharmony_ci            dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
2199cabdff1aSopenharmony_ci                                    filt_h0, filt_h1, filt_h2, filt_h3);
2200cabdff1aSopenharmony_ci            dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
2201cabdff1aSopenharmony_ci                                    filt_h0, filt_h1, filt_h2, filt_h3);
2202cabdff1aSopenharmony_ci
2203cabdff1aSopenharmony_ci            dst0_r >>= 6;
2204cabdff1aSopenharmony_ci            dst0_l >>= 6;
2205cabdff1aSopenharmony_ci
2206cabdff1aSopenharmony_ci            /* row 8 */
2207cabdff1aSopenharmony_ci            VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
2208cabdff1aSopenharmony_ci                       vec0, vec1, vec2, vec3);
2209cabdff1aSopenharmony_ci            dst8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
2210cabdff1aSopenharmony_ci                                     filt2, filt3);
2211cabdff1aSopenharmony_ci
2212cabdff1aSopenharmony_ci            ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
2213cabdff1aSopenharmony_ci            dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
2214cabdff1aSopenharmony_ci                                    filt_h0, filt_h1, filt_h2, filt_h3);
2215cabdff1aSopenharmony_ci            dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
2216cabdff1aSopenharmony_ci                                    filt_h0, filt_h1, filt_h2, filt_h3);
2217cabdff1aSopenharmony_ci
2218cabdff1aSopenharmony_ci            dst1_r >>= 6;
2219cabdff1aSopenharmony_ci            dst1_l >>= 6;
2220cabdff1aSopenharmony_ci
2221cabdff1aSopenharmony_ci            PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp1, tmp3);
2222cabdff1aSopenharmony_ci            ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
2223cabdff1aSopenharmony_ci            ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
2224cabdff1aSopenharmony_ci            dst0_r = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
2225cabdff1aSopenharmony_ci            dst0_l = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
2226cabdff1aSopenharmony_ci            dst1_r = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
2227cabdff1aSopenharmony_ci            dst1_l = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
2228cabdff1aSopenharmony_ci            SRAR_W4_SW(dst0_l, dst0_r, dst1_l, dst1_r, rnd_vec);
2229cabdff1aSopenharmony_ci            CLIP_SW4_0_255(dst0_l, dst0_r, dst1_l, dst1_r);
2230cabdff1aSopenharmony_ci            PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
2231cabdff1aSopenharmony_ci            out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2232cabdff1aSopenharmony_ci            ST_D2(out, 0, 1, dst_tmp, dst_stride);
2233cabdff1aSopenharmony_ci            dst_tmp += (2 * dst_stride);
2234cabdff1aSopenharmony_ci
2235cabdff1aSopenharmony_ci            dst0 = dst2;
2236cabdff1aSopenharmony_ci            dst1 = dst3;
2237cabdff1aSopenharmony_ci            dst2 = dst4;
2238cabdff1aSopenharmony_ci            dst3 = dst5;
2239cabdff1aSopenharmony_ci            dst4 = dst6;
2240cabdff1aSopenharmony_ci            dst5 = dst7;
2241cabdff1aSopenharmony_ci            dst6 = dst8;
2242cabdff1aSopenharmony_ci        }
2243cabdff1aSopenharmony_ci
2244cabdff1aSopenharmony_ci        src0_ptr += 8;
2245cabdff1aSopenharmony_ci        src1_ptr += 8;
2246cabdff1aSopenharmony_ci        dst += 8;
2247cabdff1aSopenharmony_ci    }
2248cabdff1aSopenharmony_ci}
2249cabdff1aSopenharmony_ci
/*
 * Horizontal + vertical 8-tap bi-weighted prediction entry point for the
 * 8-wide case.
 *
 * Thin wrapper: every argument is forwarded unchanged to
 * hevc_hv_biwgt_8t_8multx2mult_msa(), with width8mult fixed to 1 so that
 * exactly one 8-pixel-wide column is processed.
 */
static void hevc_hv_biwgt_8t_8w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter_x,
                                    const int8_t *filter_y,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
                                     src1_ptr, src2_stride,
                                     dst, dst_stride, filter_x, filter_y,
                                     height, weight0, weight1, offset0,
                                     offset1, rnd_val, 1);
}
2271cabdff1aSopenharmony_ci
2272cabdff1aSopenharmony_cistatic void hevc_hv_biwgt_8t_12w_msa(uint8_t *src0_ptr,
2273cabdff1aSopenharmony_ci                                     int32_t src_stride,
2274cabdff1aSopenharmony_ci                                     int16_t *src1_ptr,
2275cabdff1aSopenharmony_ci                                     int32_t src2_stride,
2276cabdff1aSopenharmony_ci                                     uint8_t *dst,
2277cabdff1aSopenharmony_ci                                     int32_t dst_stride,
2278cabdff1aSopenharmony_ci                                     const int8_t *filter_x,
2279cabdff1aSopenharmony_ci                                     const int8_t *filter_y,
2280cabdff1aSopenharmony_ci                                     int32_t height,
2281cabdff1aSopenharmony_ci                                     int32_t weight0,
2282cabdff1aSopenharmony_ci                                     int32_t weight1,
2283cabdff1aSopenharmony_ci                                     int32_t offset0,
2284cabdff1aSopenharmony_ci                                     int32_t offset1,
2285cabdff1aSopenharmony_ci                                     int32_t rnd_val)
2286cabdff1aSopenharmony_ci{
2287cabdff1aSopenharmony_ci    uint32_t loop_cnt;
2288cabdff1aSopenharmony_ci    uint8_t *src0_ptr_tmp, *dst_tmp;
2289cabdff1aSopenharmony_ci    int16_t *src1_ptr_tmp;
2290cabdff1aSopenharmony_ci    int32_t offset, weight;
2291cabdff1aSopenharmony_ci    uint64_t tp0, tp1;
2292cabdff1aSopenharmony_ci    v16u8 out;
2293cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2294cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2295cabdff1aSopenharmony_ci    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
2296cabdff1aSopenharmony_ci    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
2297cabdff1aSopenharmony_ci    v8i16 in0 = { 0 }, in1 = { 0 };
2298cabdff1aSopenharmony_ci    v8i16 filter_vec, weight_vec, tmp0, tmp1, tmp2, tmp3;
2299cabdff1aSopenharmony_ci    v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
2300cabdff1aSopenharmony_ci    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8;
2301cabdff1aSopenharmony_ci    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst21_r, dst43_r, dst65_r;
2302cabdff1aSopenharmony_ci    v8i16 dst10_l, dst32_l, dst54_l, dst76_l, dst21_l, dst43_l, dst65_l;
2303cabdff1aSopenharmony_ci    v8i16 dst30, dst41, dst52, dst63, dst66, dst87, dst10, dst32, dst54, dst76;
2304cabdff1aSopenharmony_ci    v8i16 dst21, dst43, dst65, dst97, dst108, dst109, dst98, dst87_r, dst87_l;
2305cabdff1aSopenharmony_ci    v4i32 offset_vec, rnd_vec, const_vec, dst0, dst1, dst2, dst3;
2306cabdff1aSopenharmony_ci
2307cabdff1aSopenharmony_ci    src0_ptr -= ((3 * src_stride) + 3);
2308cabdff1aSopenharmony_ci
2309cabdff1aSopenharmony_ci    offset = (offset0 + offset1) << rnd_val;
2310cabdff1aSopenharmony_ci    weight0 = weight0 & 0x0000FFFF;
2311cabdff1aSopenharmony_ci    weight = weight0 | (weight1 << 16);
2312cabdff1aSopenharmony_ci
2313cabdff1aSopenharmony_ci    const_vec = __msa_fill_w((128 * weight1));
2314cabdff1aSopenharmony_ci    const_vec <<= 6;
2315cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
2316cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val + 1);
2317cabdff1aSopenharmony_ci    offset_vec += const_vec;
2318cabdff1aSopenharmony_ci    weight_vec = (v8i16) __msa_fill_w(weight);
2319cabdff1aSopenharmony_ci
2320cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_x);
2321cabdff1aSopenharmony_ci    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
2322cabdff1aSopenharmony_ci
2323cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_y);
2324cabdff1aSopenharmony_ci    UNPCK_R_SB_SH(filter_vec, filter_vec);
2325cabdff1aSopenharmony_ci
2326cabdff1aSopenharmony_ci    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
2327cabdff1aSopenharmony_ci
2328cabdff1aSopenharmony_ci    mask0 = LD_SB(ff_hevc_mask_arr);
2329cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
2330cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
2331cabdff1aSopenharmony_ci    mask3 = mask0 + 6;
2332cabdff1aSopenharmony_ci
2333cabdff1aSopenharmony_ci    src0_ptr_tmp = src0_ptr;
2334cabdff1aSopenharmony_ci    src1_ptr_tmp = src1_ptr;
2335cabdff1aSopenharmony_ci    dst_tmp = dst;
2336cabdff1aSopenharmony_ci
2337cabdff1aSopenharmony_ci    LD_SB7(src0_ptr_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
2338cabdff1aSopenharmony_ci    src0_ptr_tmp += (7 * src_stride);
2339cabdff1aSopenharmony_ci    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
2340cabdff1aSopenharmony_ci
2341cabdff1aSopenharmony_ci    VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
2342cabdff1aSopenharmony_ci    VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
2343cabdff1aSopenharmony_ci    VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
2344cabdff1aSopenharmony_ci               vec11);
2345cabdff1aSopenharmony_ci    VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
2346cabdff1aSopenharmony_ci               vec15);
2347cabdff1aSopenharmony_ci    dsth0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2348cabdff1aSopenharmony_ci                              filt3);
2349cabdff1aSopenharmony_ci    dsth1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2350cabdff1aSopenharmony_ci                              filt3);
2351cabdff1aSopenharmony_ci    dsth2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2352cabdff1aSopenharmony_ci                              filt3);
2353cabdff1aSopenharmony_ci    dsth3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
2354cabdff1aSopenharmony_ci                              filt2, filt3);
2355cabdff1aSopenharmony_ci    VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
2356cabdff1aSopenharmony_ci    VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
2357cabdff1aSopenharmony_ci    VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
2358cabdff1aSopenharmony_ci               vec11);
2359cabdff1aSopenharmony_ci    dsth4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2360cabdff1aSopenharmony_ci                              filt3);
2361cabdff1aSopenharmony_ci    dsth5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2362cabdff1aSopenharmony_ci                              filt3);
2363cabdff1aSopenharmony_ci    dsth6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2364cabdff1aSopenharmony_ci                              filt3);
2365cabdff1aSopenharmony_ci
2366cabdff1aSopenharmony_ci    for (loop_cnt = 8; loop_cnt--;) {
2367cabdff1aSopenharmony_ci        LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
2368cabdff1aSopenharmony_ci        src0_ptr_tmp += (2 * src_stride);
2369cabdff1aSopenharmony_ci        XORI_B2_128_SB(src7, src8);
2370cabdff1aSopenharmony_ci
2371cabdff1aSopenharmony_ci        LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
2372cabdff1aSopenharmony_ci        src1_ptr_tmp += (2 * src2_stride);
2373cabdff1aSopenharmony_ci
2374cabdff1aSopenharmony_ci        ILVR_H4_SH(dsth1, dsth0, dsth3, dsth2, dsth5, dsth4, dsth2, dsth1,
2375cabdff1aSopenharmony_ci                   dst10_r, dst32_r, dst54_r, dst21_r);
2376cabdff1aSopenharmony_ci        ILVL_H4_SH(dsth1, dsth0, dsth3, dsth2, dsth5, dsth4, dsth2, dsth1,
2377cabdff1aSopenharmony_ci                   dst10_l, dst32_l, dst54_l, dst21_l);
2378cabdff1aSopenharmony_ci        ILVR_H2_SH(dsth4, dsth3, dsth6, dsth5, dst43_r, dst65_r);
2379cabdff1aSopenharmony_ci        ILVL_H2_SH(dsth4, dsth3, dsth6, dsth5, dst43_l, dst65_l);
2380cabdff1aSopenharmony_ci
2381cabdff1aSopenharmony_ci        VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
2382cabdff1aSopenharmony_ci                   vec3);
2383cabdff1aSopenharmony_ci        dsth7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2384cabdff1aSopenharmony_ci                                  filt3);
2385cabdff1aSopenharmony_ci
2386cabdff1aSopenharmony_ci        ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
2387cabdff1aSopenharmony_ci        dst0 = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
2388cabdff1aSopenharmony_ci                              filt_h1, filt_h2, filt_h3);
2389cabdff1aSopenharmony_ci        dst1 = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l, filt_h0,
2390cabdff1aSopenharmony_ci                              filt_h1, filt_h2, filt_h3);
2391cabdff1aSopenharmony_ci        dst0 >>= 6;
2392cabdff1aSopenharmony_ci        dst1 >>= 6;
2393cabdff1aSopenharmony_ci
2394cabdff1aSopenharmony_ci        VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
2395cabdff1aSopenharmony_ci                   vec3);
2396cabdff1aSopenharmony_ci        dsth8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2397cabdff1aSopenharmony_ci                                  filt3);
2398cabdff1aSopenharmony_ci
2399cabdff1aSopenharmony_ci        ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
2400cabdff1aSopenharmony_ci        dst2 = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
2401cabdff1aSopenharmony_ci                              filt_h1, filt_h2, filt_h3);
2402cabdff1aSopenharmony_ci        dst3 = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l, filt_h0,
2403cabdff1aSopenharmony_ci                              filt_h1, filt_h2, filt_h3);
2404cabdff1aSopenharmony_ci        dst2 >>= 6;
2405cabdff1aSopenharmony_ci        dst3 >>= 6;
2406cabdff1aSopenharmony_ci
2407cabdff1aSopenharmony_ci        PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp1, tmp3);
2408cabdff1aSopenharmony_ci        ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
2409cabdff1aSopenharmony_ci        ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
2410cabdff1aSopenharmony_ci        dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
2411cabdff1aSopenharmony_ci        dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
2412cabdff1aSopenharmony_ci        dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
2413cabdff1aSopenharmony_ci        dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
2414cabdff1aSopenharmony_ci        SRAR_W4_SW(dst1, dst0, dst3, dst2, rnd_vec);
2415cabdff1aSopenharmony_ci        CLIP_SW4_0_255(dst1, dst0, dst3, dst2);
2416cabdff1aSopenharmony_ci        PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
2417cabdff1aSopenharmony_ci        out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2418cabdff1aSopenharmony_ci        ST_D2(out, 0, 1, dst_tmp, dst_stride);
2419cabdff1aSopenharmony_ci        dst_tmp += (2 * dst_stride);
2420cabdff1aSopenharmony_ci
2421cabdff1aSopenharmony_ci        dsth0 = dsth2;
2422cabdff1aSopenharmony_ci        dsth1 = dsth3;
2423cabdff1aSopenharmony_ci        dsth2 = dsth4;
2424cabdff1aSopenharmony_ci        dsth3 = dsth5;
2425cabdff1aSopenharmony_ci        dsth4 = dsth6;
2426cabdff1aSopenharmony_ci        dsth5 = dsth7;
2427cabdff1aSopenharmony_ci        dsth6 = dsth8;
2428cabdff1aSopenharmony_ci    }
2429cabdff1aSopenharmony_ci
2430cabdff1aSopenharmony_ci    src0_ptr += 8;
2431cabdff1aSopenharmony_ci    src1_ptr += 8;
2432cabdff1aSopenharmony_ci    dst += 8;
2433cabdff1aSopenharmony_ci
2434cabdff1aSopenharmony_ci    mask4 = LD_SB(ff_hevc_mask_arr + 16);
2435cabdff1aSopenharmony_ci    mask5 = mask4 + 2;
2436cabdff1aSopenharmony_ci    mask6 = mask4 + 4;
2437cabdff1aSopenharmony_ci    mask7 = mask4 + 6;
2438cabdff1aSopenharmony_ci
2439cabdff1aSopenharmony_ci    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
2440cabdff1aSopenharmony_ci    src0_ptr += (7 * src_stride);
2441cabdff1aSopenharmony_ci    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
2442cabdff1aSopenharmony_ci
2443cabdff1aSopenharmony_ci    VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
2444cabdff1aSopenharmony_ci    VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
2445cabdff1aSopenharmony_ci    VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
2446cabdff1aSopenharmony_ci               vec11);
2447cabdff1aSopenharmony_ci    VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14,
2448cabdff1aSopenharmony_ci               vec15);
2449cabdff1aSopenharmony_ci    dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2450cabdff1aSopenharmony_ci                              filt3);
2451cabdff1aSopenharmony_ci    dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2452cabdff1aSopenharmony_ci                              filt3);
2453cabdff1aSopenharmony_ci    dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2454cabdff1aSopenharmony_ci                              filt3);
2455cabdff1aSopenharmony_ci    dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
2456cabdff1aSopenharmony_ci                              filt3);
2457cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst41, dst30, dst10, dst43);
2458cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst52, dst41, dst21, dst54);
2459cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst63, dst52, dst32, dst65);
2460cabdff1aSopenharmony_ci
2461cabdff1aSopenharmony_ci    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
2462cabdff1aSopenharmony_ci
2463cabdff1aSopenharmony_ci    for (loop_cnt = 4; loop_cnt--;) {
2464cabdff1aSopenharmony_ci        LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
2465cabdff1aSopenharmony_ci        src0_ptr += (4 * src_stride);
2466cabdff1aSopenharmony_ci        XORI_B4_128_SB(src7, src8, src9, src10);
2467cabdff1aSopenharmony_ci
2468cabdff1aSopenharmony_ci        LD2(src1_ptr, src2_stride, tp0, tp1);
2469cabdff1aSopenharmony_ci        INSERT_D2_SH(tp0, tp1, in0);
2470cabdff1aSopenharmony_ci        src1_ptr += (2 * src2_stride);
2471cabdff1aSopenharmony_ci        LD2(src1_ptr, src2_stride, tp0, tp1);
2472cabdff1aSopenharmony_ci        INSERT_D2_SH(tp0, tp1, in1);
2473cabdff1aSopenharmony_ci        src1_ptr += (2 * src2_stride);
2474cabdff1aSopenharmony_ci
2475cabdff1aSopenharmony_ci        VSHF_B4_SB(src7, src9, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
2476cabdff1aSopenharmony_ci                   vec3);
2477cabdff1aSopenharmony_ci        VSHF_B4_SB(src8, src10, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
2478cabdff1aSopenharmony_ci                   vec7);
2479cabdff1aSopenharmony_ci        dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2480cabdff1aSopenharmony_ci                                  filt3);
2481cabdff1aSopenharmony_ci        dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2482cabdff1aSopenharmony_ci                                   filt3);
2483cabdff1aSopenharmony_ci
2484cabdff1aSopenharmony_ci        dst76 = __msa_ilvr_h(dst97, dst66);
2485cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst108, dst97, dst87, dst109);
2486cabdff1aSopenharmony_ci        dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
2487cabdff1aSopenharmony_ci        dst98 = __msa_ilvr_h(dst66, dst108);
2488cabdff1aSopenharmony_ci
2489cabdff1aSopenharmony_ci        dst0 = HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1,
2490cabdff1aSopenharmony_ci                              filt_h2, filt_h3);
2491cabdff1aSopenharmony_ci        dst1 = HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1,
2492cabdff1aSopenharmony_ci                              filt_h2, filt_h3);
2493cabdff1aSopenharmony_ci        dst2 = HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1,
2494cabdff1aSopenharmony_ci                              filt_h2, filt_h3);
2495cabdff1aSopenharmony_ci        dst3 = HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1,
2496cabdff1aSopenharmony_ci                              filt_h2, filt_h3);
2497cabdff1aSopenharmony_ci        SRA_4V(dst0, dst1, dst2, dst3, 6);
2498cabdff1aSopenharmony_ci        PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp1, tmp3);
2499cabdff1aSopenharmony_ci        ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
2500cabdff1aSopenharmony_ci        ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
2501cabdff1aSopenharmony_ci        dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
2502cabdff1aSopenharmony_ci        dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
2503cabdff1aSopenharmony_ci        dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
2504cabdff1aSopenharmony_ci        dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
2505cabdff1aSopenharmony_ci        SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
2506cabdff1aSopenharmony_ci        CLIP_SW4_0_255(dst0, dst1, dst2, dst3);
2507cabdff1aSopenharmony_ci        PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
2508cabdff1aSopenharmony_ci        out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2509cabdff1aSopenharmony_ci        ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2510cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
2511cabdff1aSopenharmony_ci
2512cabdff1aSopenharmony_ci        dst10 = dst54;
2513cabdff1aSopenharmony_ci        dst32 = dst76;
2514cabdff1aSopenharmony_ci        dst54 = dst98;
2515cabdff1aSopenharmony_ci        dst21 = dst65;
2516cabdff1aSopenharmony_ci        dst43 = dst87;
2517cabdff1aSopenharmony_ci        dst65 = dst109;
2518cabdff1aSopenharmony_ci        dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
2519cabdff1aSopenharmony_ci    }
2520cabdff1aSopenharmony_ci}
2521cabdff1aSopenharmony_ci
/*
 * 16-wide variant: hands every argument, unchanged and in the same order,
 * to hevc_hv_biwgt_8t_8multx2mult_msa() and appends 2 as the last argument.
 * NOTE(review): the trailing value is presumably the block width expressed
 * in 8-pixel columns (16 / 8) -- confirm against the helper's definition.
 */
static void hevc_hv_biwgt_8t_16w_msa(uint8_t *src0_ptr, int32_t src_stride,
                                     int16_t *src1_ptr, int32_t src2_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight0, int32_t weight1,
                                     int32_t offset0, int32_t offset1,
                                     int32_t rnd_val)
{
    const int32_t blocks_8w = 16 / 8;

    hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
                                     src1_ptr, src2_stride,
                                     dst, dst_stride,
                                     filter_x, filter_y, height,
                                     weight0, weight1,
                                     offset0, offset1,
                                     rnd_val, blocks_8w);
}
2543cabdff1aSopenharmony_ci
/*
 * 24-wide variant: hands every argument, unchanged and in the same order,
 * to hevc_hv_biwgt_8t_8multx2mult_msa() and appends 3 as the last argument.
 * NOTE(review): the trailing value is presumably the block width expressed
 * in 8-pixel columns (24 / 8) -- confirm against the helper's definition.
 */
static void hevc_hv_biwgt_8t_24w_msa(uint8_t *src0_ptr, int32_t src_stride,
                                     int16_t *src1_ptr, int32_t src2_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight0, int32_t weight1,
                                     int32_t offset0, int32_t offset1,
                                     int32_t rnd_val)
{
    const int32_t blocks_8w = 24 / 8;

    hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
                                     src1_ptr, src2_stride,
                                     dst, dst_stride,
                                     filter_x, filter_y, height,
                                     weight0, weight1,
                                     offset0, offset1,
                                     rnd_val, blocks_8w);
}
2565cabdff1aSopenharmony_ci
/*
 * 32-wide variant: hands every argument, unchanged and in the same order,
 * to hevc_hv_biwgt_8t_8multx2mult_msa() and appends 4 as the last argument.
 * NOTE(review): the trailing value is presumably the block width expressed
 * in 8-pixel columns (32 / 8) -- confirm against the helper's definition.
 */
static void hevc_hv_biwgt_8t_32w_msa(uint8_t *src0_ptr, int32_t src_stride,
                                     int16_t *src1_ptr, int32_t src2_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight0, int32_t weight1,
                                     int32_t offset0, int32_t offset1,
                                     int32_t rnd_val)
{
    const int32_t blocks_8w = 32 / 8;

    hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
                                     src1_ptr, src2_stride,
                                     dst, dst_stride,
                                     filter_x, filter_y, height,
                                     weight0, weight1,
                                     offset0, offset1,
                                     rnd_val, blocks_8w);
}
2587cabdff1aSopenharmony_ci
/*
 * 48-wide variant: hands every argument, unchanged and in the same order,
 * to hevc_hv_biwgt_8t_8multx2mult_msa() and appends 6 as the last argument.
 * NOTE(review): the trailing value is presumably the block width expressed
 * in 8-pixel columns (48 / 8) -- confirm against the helper's definition.
 */
static void hevc_hv_biwgt_8t_48w_msa(uint8_t *src0_ptr, int32_t src_stride,
                                     int16_t *src1_ptr, int32_t src2_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight0, int32_t weight1,
                                     int32_t offset0, int32_t offset1,
                                     int32_t rnd_val)
{
    const int32_t blocks_8w = 48 / 8;

    hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
                                     src1_ptr, src2_stride,
                                     dst, dst_stride,
                                     filter_x, filter_y, height,
                                     weight0, weight1,
                                     offset0, offset1,
                                     rnd_val, blocks_8w);
}
2609cabdff1aSopenharmony_ci
/*
 * 64-wide variant: hands every argument, unchanged and in the same order,
 * to hevc_hv_biwgt_8t_8multx2mult_msa() and appends 8 as the last argument.
 * NOTE(review): the trailing value is presumably the block width expressed
 * in 8-pixel columns (64 / 8) -- confirm against the helper's definition.
 */
static void hevc_hv_biwgt_8t_64w_msa(uint8_t *src0_ptr, int32_t src_stride,
                                     int16_t *src1_ptr, int32_t src2_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight0, int32_t weight1,
                                     int32_t offset0, int32_t offset1,
                                     int32_t rnd_val)
{
    const int32_t blocks_8w = 64 / 8;

    hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
                                     src1_ptr, src2_stride,
                                     dst, dst_stride,
                                     filter_x, filter_y, height,
                                     weight0, weight1,
                                     offset0, offset1,
                                     rnd_val, blocks_8w);
}
2631cabdff1aSopenharmony_ci
2632cabdff1aSopenharmony_cistatic void hevc_hz_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
2633cabdff1aSopenharmony_ci                                     int32_t src_stride,
2634cabdff1aSopenharmony_ci                                     int16_t *src1_ptr,
2635cabdff1aSopenharmony_ci                                     int32_t src2_stride,
2636cabdff1aSopenharmony_ci                                     uint8_t *dst,
2637cabdff1aSopenharmony_ci                                     int32_t dst_stride,
2638cabdff1aSopenharmony_ci                                     const int8_t *filter,
2639cabdff1aSopenharmony_ci                                     int32_t weight0,
2640cabdff1aSopenharmony_ci                                     int32_t weight1,
2641cabdff1aSopenharmony_ci                                     int32_t offset0,
2642cabdff1aSopenharmony_ci                                     int32_t offset1,
2643cabdff1aSopenharmony_ci                                     int32_t rnd_val)
2644cabdff1aSopenharmony_ci{
2645cabdff1aSopenharmony_ci    int32_t offset, weight, constant;
2646cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
2647cabdff1aSopenharmony_ci    v16i8 src0, src1;
2648cabdff1aSopenharmony_ci    v8i16 in0, in1;
2649cabdff1aSopenharmony_ci    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2650cabdff1aSopenharmony_ci    v16i8 mask1, vec0, vec1;
2651cabdff1aSopenharmony_ci    v8i16 dst0;
2652cabdff1aSopenharmony_ci    v4i32 dst0_r, dst0_l;
2653cabdff1aSopenharmony_ci    v8i16 out0, filter_vec;
2654cabdff1aSopenharmony_ci    v4i32 weight_vec, offset_vec, rnd_vec;
2655cabdff1aSopenharmony_ci
2656cabdff1aSopenharmony_ci    src0_ptr -= 1;
2657cabdff1aSopenharmony_ci
2658cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
2659cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2660cabdff1aSopenharmony_ci
2661cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
2662cabdff1aSopenharmony_ci
2663cabdff1aSopenharmony_ci    offset = (offset0 + offset1) << rnd_val;
2664cabdff1aSopenharmony_ci    weight0 = weight0 & 0x0000FFFF;
2665cabdff1aSopenharmony_ci    weight = weight0 | (weight1 << 16);
2666cabdff1aSopenharmony_ci    constant = 128 * weight1;
2667cabdff1aSopenharmony_ci    constant <<= 6;
2668cabdff1aSopenharmony_ci    offset += constant;
2669cabdff1aSopenharmony_ci
2670cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
2671cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
2672cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val + 1);
2673cabdff1aSopenharmony_ci
2674cabdff1aSopenharmony_ci    LD_SB2(src0_ptr, src_stride, src0, src1);
2675cabdff1aSopenharmony_ci    LD_SH2(src1_ptr, src2_stride, in0, in1);
2676cabdff1aSopenharmony_ci    in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
2677cabdff1aSopenharmony_ci    XORI_B2_128_SB(src0, src1);
2678cabdff1aSopenharmony_ci
2679cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
2680cabdff1aSopenharmony_ci    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2681cabdff1aSopenharmony_ci
2682cabdff1aSopenharmony_ci    ILVRL_H2_SW(dst0, in0, dst0_r, dst0_l);
2683cabdff1aSopenharmony_ci    dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, (v8i16) weight_vec);
2684cabdff1aSopenharmony_ci    dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, (v8i16) weight_vec);
2685cabdff1aSopenharmony_ci    SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
2686cabdff1aSopenharmony_ci    out0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
2687cabdff1aSopenharmony_ci    CLIP_SH_0_255(out0);
2688cabdff1aSopenharmony_ci    out0 = (v8i16) __msa_pckev_b((v16i8) out0, (v16i8) out0);
2689cabdff1aSopenharmony_ci    ST_W2(out0, 0, 1, dst, dst_stride);
2690cabdff1aSopenharmony_ci}
2691cabdff1aSopenharmony_ci
2692cabdff1aSopenharmony_cistatic void hevc_hz_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
2693cabdff1aSopenharmony_ci                                     int32_t src_stride,
2694cabdff1aSopenharmony_ci                                     int16_t *src1_ptr,
2695cabdff1aSopenharmony_ci                                     int32_t src2_stride,
2696cabdff1aSopenharmony_ci                                     uint8_t *dst,
2697cabdff1aSopenharmony_ci                                     int32_t dst_stride,
2698cabdff1aSopenharmony_ci                                     const int8_t *filter,
2699cabdff1aSopenharmony_ci                                     int32_t weight0,
2700cabdff1aSopenharmony_ci                                     int32_t weight1,
2701cabdff1aSopenharmony_ci                                     int32_t offset0,
2702cabdff1aSopenharmony_ci                                     int32_t offset1,
2703cabdff1aSopenharmony_ci                                     int32_t rnd_val)
2704cabdff1aSopenharmony_ci{
2705cabdff1aSopenharmony_ci    int32_t offset, weight, constant;
2706cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
2707cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3;
2708cabdff1aSopenharmony_ci    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2709cabdff1aSopenharmony_ci    v16i8 mask1;
2710cabdff1aSopenharmony_ci    v8i16 dst0, dst1;
2711cabdff1aSopenharmony_ci    v16i8 vec0, vec1;
2712cabdff1aSopenharmony_ci    v8i16 in0, in1, in2, in3;
2713cabdff1aSopenharmony_ci    v8i16 filter_vec;
2714cabdff1aSopenharmony_ci    v4i32 weight_vec, offset_vec, rnd_vec;
2715cabdff1aSopenharmony_ci
2716cabdff1aSopenharmony_ci    src0_ptr -= 1;
2717cabdff1aSopenharmony_ci
2718cabdff1aSopenharmony_ci    /* rearranging filter */
2719cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
2720cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2721cabdff1aSopenharmony_ci
2722cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
2723cabdff1aSopenharmony_ci
2724cabdff1aSopenharmony_ci    offset = (offset0 + offset1) << rnd_val;
2725cabdff1aSopenharmony_ci    weight0 = weight0 & 0x0000FFFF;
2726cabdff1aSopenharmony_ci    weight = weight0 | (weight1 << 16);
2727cabdff1aSopenharmony_ci    constant = 128 * weight1;
2728cabdff1aSopenharmony_ci    constant <<= 6;
2729cabdff1aSopenharmony_ci    offset += constant;
2730cabdff1aSopenharmony_ci
2731cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
2732cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
2733cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val + 1);
2734cabdff1aSopenharmony_ci
2735cabdff1aSopenharmony_ci    LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
2736cabdff1aSopenharmony_ci    XORI_B4_128_SB(src0, src1, src2, src3);
2737cabdff1aSopenharmony_ci    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2738cabdff1aSopenharmony_ci    ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
2739cabdff1aSopenharmony_ci
2740cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
2741cabdff1aSopenharmony_ci    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2742cabdff1aSopenharmony_ci    VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
2743cabdff1aSopenharmony_ci    dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2744cabdff1aSopenharmony_ci    HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
2745cabdff1aSopenharmony_ci                       weight_vec, rnd_vec, offset_vec,
2746cabdff1aSopenharmony_ci                       dst0, dst1);
2747cabdff1aSopenharmony_ci
2748cabdff1aSopenharmony_ci    dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
2749cabdff1aSopenharmony_ci    ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
2750cabdff1aSopenharmony_ci}
2751cabdff1aSopenharmony_ci
2752cabdff1aSopenharmony_cistatic void hevc_hz_biwgt_4t_4x8multiple_msa(uint8_t *src0_ptr,
2753cabdff1aSopenharmony_ci                                             int32_t src_stride,
2754cabdff1aSopenharmony_ci                                             int16_t *src1_ptr,
2755cabdff1aSopenharmony_ci                                             int32_t src2_stride,
2756cabdff1aSopenharmony_ci                                             uint8_t *dst,
2757cabdff1aSopenharmony_ci                                             int32_t dst_stride,
2758cabdff1aSopenharmony_ci                                             const int8_t *filter,
2759cabdff1aSopenharmony_ci                                             int32_t height,
2760cabdff1aSopenharmony_ci                                             int32_t weight0,
2761cabdff1aSopenharmony_ci                                             int32_t weight1,
2762cabdff1aSopenharmony_ci                                             int32_t offset0,
2763cabdff1aSopenharmony_ci                                             int32_t offset1,
2764cabdff1aSopenharmony_ci                                             int32_t rnd_val)
2765cabdff1aSopenharmony_ci{
2766cabdff1aSopenharmony_ci    uint32_t loop_cnt;
2767cabdff1aSopenharmony_ci    int32_t weight, offset, constant;
2768cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
2769cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2770cabdff1aSopenharmony_ci    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2771cabdff1aSopenharmony_ci    v16i8 mask1;
2772cabdff1aSopenharmony_ci    v16i8 vec0, vec1;
2773cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3;
2774cabdff1aSopenharmony_ci    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
2775cabdff1aSopenharmony_ci    v8i16 filter_vec;
2776cabdff1aSopenharmony_ci    v4i32 weight_vec, offset_vec, rnd_vec;
2777cabdff1aSopenharmony_ci
2778cabdff1aSopenharmony_ci    src0_ptr -= 1;
2779cabdff1aSopenharmony_ci
2780cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
2781cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2782cabdff1aSopenharmony_ci
2783cabdff1aSopenharmony_ci    offset = (offset0 + offset1) << rnd_val;
2784cabdff1aSopenharmony_ci    weight0 = weight0 & 0x0000FFFF;
2785cabdff1aSopenharmony_ci    weight = weight0 | (weight1 << 16);
2786cabdff1aSopenharmony_ci    constant = 128 * weight1;
2787cabdff1aSopenharmony_ci    constant <<= 6;
2788cabdff1aSopenharmony_ci    offset += constant;
2789cabdff1aSopenharmony_ci
2790cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
2791cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
2792cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val + 1);
2793cabdff1aSopenharmony_ci
2794cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
2795cabdff1aSopenharmony_ci
2796cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 3); loop_cnt--;) {
2797cabdff1aSopenharmony_ci        LD_SB8(src0_ptr, src_stride,
2798cabdff1aSopenharmony_ci               src0, src1, src2, src3, src4, src5, src6, src7);
2799cabdff1aSopenharmony_ci        src0_ptr += (8 * src_stride);
2800cabdff1aSopenharmony_ci        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2801cabdff1aSopenharmony_ci        src1_ptr += (4 * src2_stride);
2802cabdff1aSopenharmony_ci        LD_SH4(src1_ptr, src2_stride, in4, in5, in6, in7);
2803cabdff1aSopenharmony_ci        src1_ptr += (4 * src2_stride);
2804cabdff1aSopenharmony_ci        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
2805cabdff1aSopenharmony_ci        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
2806cabdff1aSopenharmony_ci        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2807cabdff1aSopenharmony_ci
2808cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
2809cabdff1aSopenharmony_ci        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2810cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
2811cabdff1aSopenharmony_ci        dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2812cabdff1aSopenharmony_ci        VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec0, vec1);
2813cabdff1aSopenharmony_ci        dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2814cabdff1aSopenharmony_ci        VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec0, vec1);
2815cabdff1aSopenharmony_ci        dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2816cabdff1aSopenharmony_ci        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
2817cabdff1aSopenharmony_ci                           in0, in1, in2, in3,
2818cabdff1aSopenharmony_ci                           weight_vec, rnd_vec, offset_vec,
2819cabdff1aSopenharmony_ci                           dst0, dst1, dst2, dst3);
2820cabdff1aSopenharmony_ci
2821cabdff1aSopenharmony_ci        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
2822cabdff1aSopenharmony_ci        ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
2823cabdff1aSopenharmony_ci        dst += (8 * dst_stride);
2824cabdff1aSopenharmony_ci    }
2825cabdff1aSopenharmony_ci}
2826cabdff1aSopenharmony_ci
/*
 * Horizontal 4-tap bi-weighted prediction for 4-pixel-wide blocks.
 *
 * Pure dispatcher: heights 2 and 4 have dedicated kernels, and every height
 * that is a multiple of 8 is handled by the 8-rows-per-pass kernel.  Any
 * other height falls through and nothing is written to dst.
 */
static void hevc_hz_biwgt_4t_4w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    switch (height) {
    case 2:
        hevc_hz_biwgt_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                 dst, dst_stride, filter,
                                 weight0, weight1, offset0, offset1, rnd_val);
        break;
    case 4:
        hevc_hz_biwgt_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                 dst, dst_stride, filter,
                                 weight0, weight1, offset0, offset1, rnd_val);
        break;
    default:
        /* Only whole groups of 8 rows are supported by the loop kernel. */
        if ((height % 8) == 0) {
            hevc_hz_biwgt_4t_4x8multiple_msa(src0_ptr, src_stride,
                                             src1_ptr, src2_stride,
                                             dst, dst_stride, filter, height,
                                             weight0, weight1,
                                             offset0, offset1, rnd_val);
        }
        break;
    }
}
2857cabdff1aSopenharmony_ci
2858cabdff1aSopenharmony_cistatic void hevc_hz_biwgt_4t_6w_msa(uint8_t *src0_ptr,
2859cabdff1aSopenharmony_ci                                    int32_t src_stride,
2860cabdff1aSopenharmony_ci                                    int16_t *src1_ptr,
2861cabdff1aSopenharmony_ci                                    int32_t src2_stride,
2862cabdff1aSopenharmony_ci                                    uint8_t *dst,
2863cabdff1aSopenharmony_ci                                    int32_t dst_stride,
2864cabdff1aSopenharmony_ci                                    const int8_t *filter,
2865cabdff1aSopenharmony_ci                                    int32_t height,
2866cabdff1aSopenharmony_ci                                    int32_t weight0,
2867cabdff1aSopenharmony_ci                                    int32_t weight1,
2868cabdff1aSopenharmony_ci                                    int32_t offset0,
2869cabdff1aSopenharmony_ci                                    int32_t offset1,
2870cabdff1aSopenharmony_ci                                    int32_t rnd_val)
2871cabdff1aSopenharmony_ci{
2872cabdff1aSopenharmony_ci    uint32_t loop_cnt;
2873cabdff1aSopenharmony_ci    int32_t offset, weight, constant;
2874cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
2875cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3;
2876cabdff1aSopenharmony_ci    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2877cabdff1aSopenharmony_ci    v16i8 mask1;
2878cabdff1aSopenharmony_ci    v16i8 vec0, vec1;
2879cabdff1aSopenharmony_ci    v8i16 in0, in1, in2, in3;
2880cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3;
2881cabdff1aSopenharmony_ci    v8i16 filter_vec;
2882cabdff1aSopenharmony_ci    v4i32 weight_vec, offset_vec, rnd_vec;
2883cabdff1aSopenharmony_ci
2884cabdff1aSopenharmony_ci    src0_ptr -= 1;
2885cabdff1aSopenharmony_ci
2886cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
2887cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2888cabdff1aSopenharmony_ci
2889cabdff1aSopenharmony_ci    offset = (offset0 + offset1) << rnd_val;
2890cabdff1aSopenharmony_ci    weight0 = weight0 & 0x0000FFFF;
2891cabdff1aSopenharmony_ci    weight = weight0 | (weight1 << 16);
2892cabdff1aSopenharmony_ci    constant = 128 * weight1;
2893cabdff1aSopenharmony_ci    constant <<= 6;
2894cabdff1aSopenharmony_ci    offset += constant;
2895cabdff1aSopenharmony_ci
2896cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
2897cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
2898cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val + 1);
2899cabdff1aSopenharmony_ci
2900cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
2901cabdff1aSopenharmony_ci
2902cabdff1aSopenharmony_ci    for (loop_cnt = 2; loop_cnt--;) {
2903cabdff1aSopenharmony_ci        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
2904cabdff1aSopenharmony_ci        src0_ptr += (4 * src_stride);
2905cabdff1aSopenharmony_ci        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2906cabdff1aSopenharmony_ci        src1_ptr += (4 * src2_stride);
2907cabdff1aSopenharmony_ci        XORI_B4_128_SB(src0, src1, src2, src3);
2908cabdff1aSopenharmony_ci
2909cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2910cabdff1aSopenharmony_ci        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2911cabdff1aSopenharmony_ci        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2912cabdff1aSopenharmony_ci        dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2913cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2914cabdff1aSopenharmony_ci        dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2915cabdff1aSopenharmony_ci        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2916cabdff1aSopenharmony_ci        dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2917cabdff1aSopenharmony_ci
2918cabdff1aSopenharmony_ci        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
2919cabdff1aSopenharmony_ci                           in0, in1, in2, in3,
2920cabdff1aSopenharmony_ci                           weight_vec, rnd_vec, offset_vec,
2921cabdff1aSopenharmony_ci                           dst0, dst1, dst2, dst3);
2922cabdff1aSopenharmony_ci
2923cabdff1aSopenharmony_ci        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
2924cabdff1aSopenharmony_ci        ST_W2(dst0, 0, 2, dst, dst_stride);
2925cabdff1aSopenharmony_ci        ST_H2(dst0, 2, 6, dst + 4, dst_stride);
2926cabdff1aSopenharmony_ci        ST_W2(dst1, 0, 2, dst + 2 * dst_stride, dst_stride);
2927cabdff1aSopenharmony_ci        ST_H2(dst1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2928cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
2929cabdff1aSopenharmony_ci    }
2930cabdff1aSopenharmony_ci}
2931cabdff1aSopenharmony_ci
2932cabdff1aSopenharmony_cistatic void hevc_hz_biwgt_4t_8x2_msa(uint8_t *src0_ptr,
2933cabdff1aSopenharmony_ci                                     int32_t src_stride,
2934cabdff1aSopenharmony_ci                                     int16_t *src1_ptr,
2935cabdff1aSopenharmony_ci                                     int32_t src2_stride,
2936cabdff1aSopenharmony_ci                                     uint8_t *dst,
2937cabdff1aSopenharmony_ci                                     int32_t dst_stride,
2938cabdff1aSopenharmony_ci                                     const int8_t *filter,
2939cabdff1aSopenharmony_ci                                     int32_t weight0,
2940cabdff1aSopenharmony_ci                                     int32_t weight1,
2941cabdff1aSopenharmony_ci                                     int32_t offset0,
2942cabdff1aSopenharmony_ci                                     int32_t offset1,
2943cabdff1aSopenharmony_ci                                     int32_t rnd_val)
2944cabdff1aSopenharmony_ci{
2945cabdff1aSopenharmony_ci    int32_t offset, weight, constant;
2946cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
2947cabdff1aSopenharmony_ci    v16i8 src0, src1;
2948cabdff1aSopenharmony_ci    v8i16 in0, in1;
2949cabdff1aSopenharmony_ci    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2950cabdff1aSopenharmony_ci    v16i8 mask1, vec0, vec1;
2951cabdff1aSopenharmony_ci    v8i16 dst0, dst1;
2952cabdff1aSopenharmony_ci    v8i16 filter_vec;
2953cabdff1aSopenharmony_ci    v4i32 weight_vec, offset_vec, rnd_vec;
2954cabdff1aSopenharmony_ci
2955cabdff1aSopenharmony_ci    src0_ptr -= 1;
2956cabdff1aSopenharmony_ci
2957cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
2958cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2959cabdff1aSopenharmony_ci
2960cabdff1aSopenharmony_ci    offset = (offset0 + offset1) << rnd_val;
2961cabdff1aSopenharmony_ci    weight0 = weight0 & 0x0000FFFF;
2962cabdff1aSopenharmony_ci    weight = weight0 | (weight1 << 16);
2963cabdff1aSopenharmony_ci    constant = 128 * weight1;
2964cabdff1aSopenharmony_ci    constant <<= 6;
2965cabdff1aSopenharmony_ci    offset += constant;
2966cabdff1aSopenharmony_ci
2967cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
2968cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
2969cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val + 1);
2970cabdff1aSopenharmony_ci
2971cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
2972cabdff1aSopenharmony_ci
2973cabdff1aSopenharmony_ci    LD_SB2(src0_ptr, src_stride, src0, src1);
2974cabdff1aSopenharmony_ci    LD_SH2(src1_ptr, src2_stride, in0, in1);
2975cabdff1aSopenharmony_ci    XORI_B2_128_SB(src0, src1);
2976cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2977cabdff1aSopenharmony_ci    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2978cabdff1aSopenharmony_ci    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2979cabdff1aSopenharmony_ci    dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2980cabdff1aSopenharmony_ci    HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
2981cabdff1aSopenharmony_ci                       weight_vec, rnd_vec, offset_vec,
2982cabdff1aSopenharmony_ci                       dst0, dst1);
2983cabdff1aSopenharmony_ci
2984cabdff1aSopenharmony_ci    dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
2985cabdff1aSopenharmony_ci    ST_D2(dst0, 0, 1, dst, dst_stride);
2986cabdff1aSopenharmony_ci}
2987cabdff1aSopenharmony_ci
2988cabdff1aSopenharmony_cistatic void hevc_hz_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
2989cabdff1aSopenharmony_ci                                     int32_t src_stride,
2990cabdff1aSopenharmony_ci                                     int16_t *src1_ptr,
2991cabdff1aSopenharmony_ci                                     int32_t src2_stride,
2992cabdff1aSopenharmony_ci                                     uint8_t *dst,
2993cabdff1aSopenharmony_ci                                     int32_t dst_stride,
2994cabdff1aSopenharmony_ci                                     const int8_t *filter,
2995cabdff1aSopenharmony_ci                                     int32_t weight0,
2996cabdff1aSopenharmony_ci                                     int32_t weight1,
2997cabdff1aSopenharmony_ci                                     int32_t offset0,
2998cabdff1aSopenharmony_ci                                     int32_t offset1,
2999cabdff1aSopenharmony_ci                                     int32_t rnd_val)
3000cabdff1aSopenharmony_ci{
3001cabdff1aSopenharmony_ci    int32_t weight, offset, constant;
3002cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
3003cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5;
3004cabdff1aSopenharmony_ci    v8i16 in0, in1, in2, in3, in4, in5;
3005cabdff1aSopenharmony_ci    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
3006cabdff1aSopenharmony_ci    v16i8 mask1;
3007cabdff1aSopenharmony_ci    v16i8 vec0, vec1;
3008cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3009cabdff1aSopenharmony_ci    v8i16 filter_vec;
3010cabdff1aSopenharmony_ci    v4i32 weight_vec, offset_vec, rnd_vec;
3011cabdff1aSopenharmony_ci
3012cabdff1aSopenharmony_ci    src0_ptr -= 1;
3013cabdff1aSopenharmony_ci
3014cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
3015cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3016cabdff1aSopenharmony_ci
3017cabdff1aSopenharmony_ci    offset = (offset0 + offset1) << rnd_val;
3018cabdff1aSopenharmony_ci    weight0 = weight0 & 0x0000FFFF;
3019cabdff1aSopenharmony_ci    weight = weight0 | (weight1 << 16);
3020cabdff1aSopenharmony_ci    constant = 128 * weight1;
3021cabdff1aSopenharmony_ci    constant <<= 6;
3022cabdff1aSopenharmony_ci    offset += constant;
3023cabdff1aSopenharmony_ci
3024cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
3025cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
3026cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val + 1);
3027cabdff1aSopenharmony_ci
3028cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
3029cabdff1aSopenharmony_ci
3030cabdff1aSopenharmony_ci    LD_SB6(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5);
3031cabdff1aSopenharmony_ci
3032cabdff1aSopenharmony_ci    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3033cabdff1aSopenharmony_ci    src1_ptr += (4 * src2_stride);
3034cabdff1aSopenharmony_ci    LD_SH2(src1_ptr, src2_stride, in4, in5);
3035cabdff1aSopenharmony_ci    XORI_B6_128_SB(src0, src1, src2, src3, src4, src5);
3036cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3037cabdff1aSopenharmony_ci    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3038cabdff1aSopenharmony_ci    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3039cabdff1aSopenharmony_ci    dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3040cabdff1aSopenharmony_ci    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3041cabdff1aSopenharmony_ci    dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3042cabdff1aSopenharmony_ci    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3043cabdff1aSopenharmony_ci    dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3044cabdff1aSopenharmony_ci    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3045cabdff1aSopenharmony_ci    dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3046cabdff1aSopenharmony_ci    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3047cabdff1aSopenharmony_ci    dst5 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3048cabdff1aSopenharmony_ci    HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
3049cabdff1aSopenharmony_ci                       in0, in1, in2, in3,
3050cabdff1aSopenharmony_ci                       weight_vec, rnd_vec, offset_vec,
3051cabdff1aSopenharmony_ci                       dst0, dst1, dst2, dst3);
3052cabdff1aSopenharmony_ci    HEVC_BIW_RND_CLIP2(dst4, dst5, in4, in5,
3053cabdff1aSopenharmony_ci                       weight_vec, rnd_vec, offset_vec,
3054cabdff1aSopenharmony_ci                       dst4, dst5);
3055cabdff1aSopenharmony_ci
3056cabdff1aSopenharmony_ci    PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
3057cabdff1aSopenharmony_ci    dst3 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
3058cabdff1aSopenharmony_ci    ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
3059cabdff1aSopenharmony_ci    ST_D2(dst3, 0, 1, dst + 4 * dst_stride, dst_stride);
3060cabdff1aSopenharmony_ci}
3061cabdff1aSopenharmony_ci
3062cabdff1aSopenharmony_cistatic void hevc_hz_biwgt_4t_8x4multiple_msa(uint8_t *src0_ptr,
3063cabdff1aSopenharmony_ci                                             int32_t src_stride,
3064cabdff1aSopenharmony_ci                                             int16_t *src1_ptr,
3065cabdff1aSopenharmony_ci                                             int32_t src2_stride,
3066cabdff1aSopenharmony_ci                                             uint8_t *dst,
3067cabdff1aSopenharmony_ci                                             int32_t dst_stride,
3068cabdff1aSopenharmony_ci                                             const int8_t *filter,
3069cabdff1aSopenharmony_ci                                             int32_t height,
3070cabdff1aSopenharmony_ci                                             int32_t weight0,
3071cabdff1aSopenharmony_ci                                             int32_t weight1,
3072cabdff1aSopenharmony_ci                                             int32_t offset0,
3073cabdff1aSopenharmony_ci                                             int32_t offset1,
3074cabdff1aSopenharmony_ci                                             int32_t rnd_val)
3075cabdff1aSopenharmony_ci{
3076cabdff1aSopenharmony_ci    uint32_t loop_cnt;
3077cabdff1aSopenharmony_ci    int32_t offset, weight, constant;
3078cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
3079cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3;
3080cabdff1aSopenharmony_ci    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3081cabdff1aSopenharmony_ci    v16i8 mask1;
3082cabdff1aSopenharmony_ci    v16i8 vec0, vec1;
3083cabdff1aSopenharmony_ci    v8i16 in0, in1, in2, in3;
3084cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3;
3085cabdff1aSopenharmony_ci    v8i16 filter_vec;
3086cabdff1aSopenharmony_ci    v4i32 weight_vec, offset_vec, rnd_vec;
3087cabdff1aSopenharmony_ci
3088cabdff1aSopenharmony_ci    src0_ptr -= 1;
3089cabdff1aSopenharmony_ci
3090cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
3091cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3092cabdff1aSopenharmony_ci
3093cabdff1aSopenharmony_ci    offset = (offset0 + offset1) << rnd_val;
3094cabdff1aSopenharmony_ci    weight0 = weight0 & 0x0000FFFF;
3095cabdff1aSopenharmony_ci    weight = weight0 | (weight1 << 16);
3096cabdff1aSopenharmony_ci    constant = 128 * weight1;
3097cabdff1aSopenharmony_ci    constant <<= 6;
3098cabdff1aSopenharmony_ci    offset += constant;
3099cabdff1aSopenharmony_ci
3100cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
3101cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
3102cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val + 1);
3103cabdff1aSopenharmony_ci
3104cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
3105cabdff1aSopenharmony_ci
3106cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
3107cabdff1aSopenharmony_ci        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
3108cabdff1aSopenharmony_ci        src0_ptr += (4 * src_stride);
3109cabdff1aSopenharmony_ci        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3110cabdff1aSopenharmony_ci        src1_ptr += (4 * src2_stride);
3111cabdff1aSopenharmony_ci        XORI_B4_128_SB(src0, src1, src2, src3);
3112cabdff1aSopenharmony_ci
3113cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3114cabdff1aSopenharmony_ci        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3115cabdff1aSopenharmony_ci        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3116cabdff1aSopenharmony_ci        dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3117cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3118cabdff1aSopenharmony_ci        dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3119cabdff1aSopenharmony_ci        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3120cabdff1aSopenharmony_ci        dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3121cabdff1aSopenharmony_ci        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
3122cabdff1aSopenharmony_ci                           in0, in1, in2, in3,
3123cabdff1aSopenharmony_ci                           weight_vec, rnd_vec, offset_vec,
3124cabdff1aSopenharmony_ci                           dst0, dst1, dst2, dst3);
3125cabdff1aSopenharmony_ci
3126cabdff1aSopenharmony_ci        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
3127cabdff1aSopenharmony_ci        ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
3128cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
3129cabdff1aSopenharmony_ci    }
3130cabdff1aSopenharmony_ci}
3131cabdff1aSopenharmony_ci
/*
 * Horizontal 4-tap bi-weighted prediction for 8-pixel-wide blocks.
 *
 * Pure dispatcher: heights 2 and 6 have dedicated kernels, and every height
 * that is a multiple of 4 is handled by the 4-rows-per-pass kernel.  Any
 * other height falls through and nothing is written to dst.
 */
static void hevc_hz_biwgt_4t_8w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    switch (height) {
    case 2:
        hevc_hz_biwgt_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                 dst, dst_stride, filter,
                                 weight0, weight1, offset0, offset1, rnd_val);
        break;
    case 6:
        hevc_hz_biwgt_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                 dst, dst_stride, filter,
                                 weight0, weight1, offset0, offset1, rnd_val);
        break;
    default:
        /* Only whole groups of 4 rows are supported by the loop kernel. */
        if ((height % 4) == 0) {
            hevc_hz_biwgt_4t_8x4multiple_msa(src0_ptr, src_stride,
                                             src1_ptr, src2_stride,
                                             dst, dst_stride, filter, height,
                                             weight0, weight1,
                                             offset0, offset1, rnd_val);
        }
        break;
    }
}
3162cabdff1aSopenharmony_ci
3163cabdff1aSopenharmony_cistatic void hevc_hz_biwgt_4t_12w_msa(uint8_t *src0_ptr,
3164cabdff1aSopenharmony_ci                                     int32_t src_stride,
3165cabdff1aSopenharmony_ci                                     int16_t *src1_ptr,
3166cabdff1aSopenharmony_ci                                     int32_t src2_stride,
3167cabdff1aSopenharmony_ci                                     uint8_t *dst,
3168cabdff1aSopenharmony_ci                                     int32_t dst_stride,
3169cabdff1aSopenharmony_ci                                     const int8_t *filter,
3170cabdff1aSopenharmony_ci                                     int32_t height,
3171cabdff1aSopenharmony_ci                                     int32_t weight0,
3172cabdff1aSopenharmony_ci                                     int32_t weight1,
3173cabdff1aSopenharmony_ci                                     int32_t offset0,
3174cabdff1aSopenharmony_ci                                     int32_t offset1,
3175cabdff1aSopenharmony_ci                                     int32_t rnd_val)
3176cabdff1aSopenharmony_ci{
3177cabdff1aSopenharmony_ci    uint32_t loop_cnt;
3178cabdff1aSopenharmony_ci    int32_t offset, weight, constant;
3179cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
3180cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3;
3181cabdff1aSopenharmony_ci    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3182cabdff1aSopenharmony_ci    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
3183cabdff1aSopenharmony_ci    v16i8 mask2 = {
3184cabdff1aSopenharmony_ci        8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
3185cabdff1aSopenharmony_ci    };
3186cabdff1aSopenharmony_ci    v16i8 mask1, mask3;
3187cabdff1aSopenharmony_ci    v16i8 vec0, vec1;
3188cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3189cabdff1aSopenharmony_ci    v8i16 filter_vec;
3190cabdff1aSopenharmony_ci    v4i32 weight_vec, offset_vec, rnd_vec;
3191cabdff1aSopenharmony_ci
3192cabdff1aSopenharmony_ci    src0_ptr -= 1;
3193cabdff1aSopenharmony_ci
3194cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
3195cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3196cabdff1aSopenharmony_ci
3197cabdff1aSopenharmony_ci    offset = (offset0 + offset1) << rnd_val;
3198cabdff1aSopenharmony_ci    weight0 = weight0 & 0x0000FFFF;
3199cabdff1aSopenharmony_ci    weight = weight0 | (weight1 << 16);
3200cabdff1aSopenharmony_ci    constant = 128 * weight1;
3201cabdff1aSopenharmony_ci    constant <<= 6;
3202cabdff1aSopenharmony_ci    offset += constant;
3203cabdff1aSopenharmony_ci
3204cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
3205cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
3206cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val + 1);
3207cabdff1aSopenharmony_ci
3208cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
3209cabdff1aSopenharmony_ci    mask3 = mask2 + 2;
3210cabdff1aSopenharmony_ci
3211cabdff1aSopenharmony_ci    for (loop_cnt = 4; loop_cnt--;) {
3212cabdff1aSopenharmony_ci        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
3213cabdff1aSopenharmony_ci        src0_ptr += (4 * src_stride);
3214cabdff1aSopenharmony_ci        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3215cabdff1aSopenharmony_ci        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
3216cabdff1aSopenharmony_ci        src1_ptr += (4 * src2_stride);
3217cabdff1aSopenharmony_ci        ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
3218cabdff1aSopenharmony_ci        XORI_B4_128_SB(src0, src1, src2, src3);
3219cabdff1aSopenharmony_ci
3220cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3221cabdff1aSopenharmony_ci        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3222cabdff1aSopenharmony_ci        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3223cabdff1aSopenharmony_ci        dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3224cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3225cabdff1aSopenharmony_ci        dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3226cabdff1aSopenharmony_ci        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3227cabdff1aSopenharmony_ci        dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3228cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
3229cabdff1aSopenharmony_ci        dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3230cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
3231cabdff1aSopenharmony_ci        dst5 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3232cabdff1aSopenharmony_ci
3233cabdff1aSopenharmony_ci        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
3234cabdff1aSopenharmony_ci                           in0, in1, in2, in3,
3235cabdff1aSopenharmony_ci                           weight_vec, rnd_vec, offset_vec,
3236cabdff1aSopenharmony_ci                           dst0, dst1, dst2, dst3);
3237cabdff1aSopenharmony_ci        HEVC_BIW_RND_CLIP2(dst4, dst5, in4, in5,
3238cabdff1aSopenharmony_ci                           weight_vec, rnd_vec, offset_vec,
3239cabdff1aSopenharmony_ci                           dst4, dst5);
3240cabdff1aSopenharmony_ci
3241cabdff1aSopenharmony_ci        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
3242cabdff1aSopenharmony_ci        dst3 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
3243cabdff1aSopenharmony_ci        ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
3244cabdff1aSopenharmony_ci        ST_W4(dst3, 0, 1, 2, 3, dst + 8, dst_stride);
3245cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
3246cabdff1aSopenharmony_ci    }
3247cabdff1aSopenharmony_ci}
3248cabdff1aSopenharmony_ci
3249cabdff1aSopenharmony_cistatic void hevc_hz_biwgt_4t_16w_msa(uint8_t *src0_ptr,
3250cabdff1aSopenharmony_ci                                     int32_t src_stride,
3251cabdff1aSopenharmony_ci                                     int16_t *src1_ptr,
3252cabdff1aSopenharmony_ci                                     int32_t src2_stride,
3253cabdff1aSopenharmony_ci                                     uint8_t *dst,
3254cabdff1aSopenharmony_ci                                     int32_t dst_stride,
3255cabdff1aSopenharmony_ci                                     const int8_t *filter,
3256cabdff1aSopenharmony_ci                                     int32_t height,
3257cabdff1aSopenharmony_ci                                     int32_t weight0,
3258cabdff1aSopenharmony_ci                                     int32_t weight1,
3259cabdff1aSopenharmony_ci                                     int32_t offset0,
3260cabdff1aSopenharmony_ci                                     int32_t offset1,
3261cabdff1aSopenharmony_ci                                     int32_t rnd_val)
3262cabdff1aSopenharmony_ci{
3263cabdff1aSopenharmony_ci    uint32_t loop_cnt;
3264cabdff1aSopenharmony_ci    int32_t offset, weight, constant;
3265cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
3266cabdff1aSopenharmony_ci    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3267cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
3268cabdff1aSopenharmony_ci    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
3269cabdff1aSopenharmony_ci    v16i8 mask1;
3270cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3271cabdff1aSopenharmony_ci    v16i8 vec0, vec1;
3272cabdff1aSopenharmony_ci    v8i16 filter_vec;
3273cabdff1aSopenharmony_ci    v4i32 weight_vec, offset_vec, rnd_vec;
3274cabdff1aSopenharmony_ci
3275cabdff1aSopenharmony_ci    src0_ptr -= 1;
3276cabdff1aSopenharmony_ci
3277cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
3278cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3279cabdff1aSopenharmony_ci
3280cabdff1aSopenharmony_ci    offset = (offset0 + offset1) << rnd_val;
3281cabdff1aSopenharmony_ci    weight0 = weight0 & 0x0000FFFF;
3282cabdff1aSopenharmony_ci    weight = weight0 | (weight1 << 16);
3283cabdff1aSopenharmony_ci    constant = 128 * weight1;
3284cabdff1aSopenharmony_ci    constant <<= 6;
3285cabdff1aSopenharmony_ci    offset += constant;
3286cabdff1aSopenharmony_ci
3287cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
3288cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
3289cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val + 1);
3290cabdff1aSopenharmony_ci
3291cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
3292cabdff1aSopenharmony_ci
3293cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
3294cabdff1aSopenharmony_ci        LD_SB4(src0_ptr, src_stride, src0, src2, src4, src6);
3295cabdff1aSopenharmony_ci        LD_SB4(src0_ptr + 8, src_stride, src1, src3, src5, src7);
3296cabdff1aSopenharmony_ci        src0_ptr += (4 * src_stride);
3297cabdff1aSopenharmony_ci        LD_SH4(src1_ptr, src2_stride, in0, in2, in4, in6);
3298cabdff1aSopenharmony_ci        LD_SH4(src1_ptr + 8, src2_stride, in1, in3, in5, in7);
3299cabdff1aSopenharmony_ci        src1_ptr += (4 * src2_stride);
3300cabdff1aSopenharmony_ci        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
3301cabdff1aSopenharmony_ci
3302cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3303cabdff1aSopenharmony_ci        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3304cabdff1aSopenharmony_ci        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3305cabdff1aSopenharmony_ci        dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3306cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3307cabdff1aSopenharmony_ci        dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3308cabdff1aSopenharmony_ci        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3309cabdff1aSopenharmony_ci        dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3310cabdff1aSopenharmony_ci        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3311cabdff1aSopenharmony_ci        dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3312cabdff1aSopenharmony_ci        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3313cabdff1aSopenharmony_ci        dst5 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3314cabdff1aSopenharmony_ci        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
3315cabdff1aSopenharmony_ci        dst6 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3316cabdff1aSopenharmony_ci        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
3317cabdff1aSopenharmony_ci        dst7 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3318cabdff1aSopenharmony_ci        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
3319cabdff1aSopenharmony_ci                           in0, in1, in2, in3,
3320cabdff1aSopenharmony_ci                           weight_vec, rnd_vec, offset_vec,
3321cabdff1aSopenharmony_ci                           dst0, dst1, dst2, dst3);
3322cabdff1aSopenharmony_ci
3323cabdff1aSopenharmony_ci        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
3324cabdff1aSopenharmony_ci        ST_SH2(dst0, dst1, dst, dst_stride);
3325cabdff1aSopenharmony_ci        dst += (2 * dst_stride);
3326cabdff1aSopenharmony_ci
3327cabdff1aSopenharmony_ci        HEVC_BIW_RND_CLIP4(dst4, dst5, dst6, dst7,
3328cabdff1aSopenharmony_ci                           in4, in5, in6, in7,
3329cabdff1aSopenharmony_ci                           weight_vec, rnd_vec, offset_vec,
3330cabdff1aSopenharmony_ci                           dst0, dst1, dst2, dst3);
3331cabdff1aSopenharmony_ci
3332cabdff1aSopenharmony_ci        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
3333cabdff1aSopenharmony_ci        ST_SH2(dst0, dst1, dst, dst_stride);
3334cabdff1aSopenharmony_ci        dst += (2 * dst_stride);
3335cabdff1aSopenharmony_ci    }
3336cabdff1aSopenharmony_ci}
3337cabdff1aSopenharmony_ci
3338cabdff1aSopenharmony_cistatic void hevc_hz_biwgt_4t_24w_msa(uint8_t *src0_ptr,
3339cabdff1aSopenharmony_ci                                     int32_t src_stride,
3340cabdff1aSopenharmony_ci                                     int16_t *src1_ptr,
3341cabdff1aSopenharmony_ci                                     int32_t src2_stride,
3342cabdff1aSopenharmony_ci                                     uint8_t *dst,
3343cabdff1aSopenharmony_ci                                     int32_t dst_stride,
3344cabdff1aSopenharmony_ci                                     const int8_t *filter,
3345cabdff1aSopenharmony_ci                                     int32_t height,
3346cabdff1aSopenharmony_ci                                     int32_t weight0,
3347cabdff1aSopenharmony_ci                                     int32_t weight1,
3348cabdff1aSopenharmony_ci                                     int32_t offset0,
3349cabdff1aSopenharmony_ci                                     int32_t offset1,
3350cabdff1aSopenharmony_ci                                     int32_t rnd_val)
3351cabdff1aSopenharmony_ci{
3352cabdff1aSopenharmony_ci    uint32_t loop_cnt;
3353cabdff1aSopenharmony_ci    int32_t offset, weight, constant;
3354cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3;
3355cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
3356cabdff1aSopenharmony_ci    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
3357cabdff1aSopenharmony_ci    v16i8 mask1, mask2, mask3;
3358cabdff1aSopenharmony_ci    v16i8 vec0, vec1;
3359cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3;
3360cabdff1aSopenharmony_ci    v8i16 in0, in1, in2, in3, in4, in5;
3361cabdff1aSopenharmony_ci    v8i16 filter_vec;
3362cabdff1aSopenharmony_ci    v4i32 weight_vec, offset_vec, rnd_vec;
3363cabdff1aSopenharmony_ci
3364cabdff1aSopenharmony_ci    src0_ptr -= 1;
3365cabdff1aSopenharmony_ci
3366cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
3367cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3368cabdff1aSopenharmony_ci
3369cabdff1aSopenharmony_ci    offset = (offset0 + offset1) << rnd_val;
3370cabdff1aSopenharmony_ci    weight0 = weight0 & 0x0000FFFF;
3371cabdff1aSopenharmony_ci    weight = weight0 | (weight1 << 16);
3372cabdff1aSopenharmony_ci    constant = 128 * weight1;
3373cabdff1aSopenharmony_ci    constant <<= 6;
3374cabdff1aSopenharmony_ci    offset += constant;
3375cabdff1aSopenharmony_ci
3376cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
3377cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
3378cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val + 1);
3379cabdff1aSopenharmony_ci
3380cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
3381cabdff1aSopenharmony_ci    mask2 = mask0 + 8;
3382cabdff1aSopenharmony_ci    mask3 = mask0 + 10;
3383cabdff1aSopenharmony_ci
3384cabdff1aSopenharmony_ci    for (loop_cnt = 16; loop_cnt--;) {
3385cabdff1aSopenharmony_ci        LD_SB2(src0_ptr, src_stride, src0, src2);
3386cabdff1aSopenharmony_ci        LD_SB2(src0_ptr + 16, src_stride, src1, src3);
3387cabdff1aSopenharmony_ci        src0_ptr += (2 * src_stride);
3388cabdff1aSopenharmony_ci        LD_SH2(src1_ptr, src2_stride, in0, in2);
3389cabdff1aSopenharmony_ci        LD_SH2(src1_ptr + 8, src2_stride, in1, in3);
3390cabdff1aSopenharmony_ci        LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
3391cabdff1aSopenharmony_ci        src1_ptr += (2 * src2_stride);
3392cabdff1aSopenharmony_ci        XORI_B4_128_SB(src0, src1, src2, src3);
3393cabdff1aSopenharmony_ci
3394cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3395cabdff1aSopenharmony_ci        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3396cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
3397cabdff1aSopenharmony_ci        dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3398cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3399cabdff1aSopenharmony_ci        dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3400cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
3401cabdff1aSopenharmony_ci        dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3402cabdff1aSopenharmony_ci        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
3403cabdff1aSopenharmony_ci                           in0, in1, in2, in3,
3404cabdff1aSopenharmony_ci                           weight_vec, rnd_vec, offset_vec,
3405cabdff1aSopenharmony_ci                           dst0, dst1, dst2, dst3);
3406cabdff1aSopenharmony_ci
3407cabdff1aSopenharmony_ci        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
3408cabdff1aSopenharmony_ci        ST_SH2(dst0, dst1, dst, dst_stride);
3409cabdff1aSopenharmony_ci
3410cabdff1aSopenharmony_ci        /* 8 width */
3411cabdff1aSopenharmony_ci        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3412cabdff1aSopenharmony_ci        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3413cabdff1aSopenharmony_ci        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3414cabdff1aSopenharmony_ci        dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3415cabdff1aSopenharmony_ci        HEVC_BIW_RND_CLIP2(dst0, dst1, in4, in5,
3416cabdff1aSopenharmony_ci                           weight_vec, rnd_vec, offset_vec,
3417cabdff1aSopenharmony_ci                           dst0, dst1);
3418cabdff1aSopenharmony_ci
3419cabdff1aSopenharmony_ci        dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
3420cabdff1aSopenharmony_ci        ST_D2(dst0, 0, 1, (dst + 16), dst_stride);
3421cabdff1aSopenharmony_ci        dst += (2 * dst_stride);
3422cabdff1aSopenharmony_ci    }
3423cabdff1aSopenharmony_ci}
3424cabdff1aSopenharmony_ci
3425cabdff1aSopenharmony_cistatic void hevc_hz_biwgt_4t_32w_msa(uint8_t *src0_ptr,
3426cabdff1aSopenharmony_ci                                     int32_t src_stride,
3427cabdff1aSopenharmony_ci                                     int16_t *src1_ptr,
3428cabdff1aSopenharmony_ci                                     int32_t src2_stride,
3429cabdff1aSopenharmony_ci                                     uint8_t *dst,
3430cabdff1aSopenharmony_ci                                     int32_t dst_stride,
3431cabdff1aSopenharmony_ci                                     const int8_t *filter,
3432cabdff1aSopenharmony_ci                                     int32_t height,
3433cabdff1aSopenharmony_ci                                     int32_t weight0,
3434cabdff1aSopenharmony_ci                                     int32_t weight1,
3435cabdff1aSopenharmony_ci                                     int32_t offset0,
3436cabdff1aSopenharmony_ci                                     int32_t offset1,
3437cabdff1aSopenharmony_ci                                     int32_t rnd_val)
3438cabdff1aSopenharmony_ci{
3439cabdff1aSopenharmony_ci    uint32_t loop_cnt;
3440cabdff1aSopenharmony_ci    int32_t offset, weight, constant;
3441cabdff1aSopenharmony_ci    v16i8 src0, src1, src2;
3442cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
3443cabdff1aSopenharmony_ci    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
3444cabdff1aSopenharmony_ci    v16i8 mask1, mask2, mask3;
3445cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3;
3446cabdff1aSopenharmony_ci    v16i8 vec0, vec1;
3447cabdff1aSopenharmony_ci    v8i16 in0, in1, in2, in3;
3448cabdff1aSopenharmony_ci    v8i16 filter_vec;
3449cabdff1aSopenharmony_ci    v4i32 weight_vec, offset_vec, rnd_vec;
3450cabdff1aSopenharmony_ci
3451cabdff1aSopenharmony_ci    src0_ptr -= 1;
3452cabdff1aSopenharmony_ci
3453cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
3454cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3455cabdff1aSopenharmony_ci
3456cabdff1aSopenharmony_ci    offset = (offset0 + offset1) << rnd_val;
3457cabdff1aSopenharmony_ci    weight0 = weight0 & 0x0000FFFF;
3458cabdff1aSopenharmony_ci    weight = weight0 | (weight1 << 16);
3459cabdff1aSopenharmony_ci    constant = 128 * weight1;
3460cabdff1aSopenharmony_ci    constant <<= 6;
3461cabdff1aSopenharmony_ci    offset += constant;
3462cabdff1aSopenharmony_ci
3463cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
3464cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
3465cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val + 1);
3466cabdff1aSopenharmony_ci
3467cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
3468cabdff1aSopenharmony_ci    mask2 = mask0 + 8;
3469cabdff1aSopenharmony_ci    mask3 = mask0 + 10;
3470cabdff1aSopenharmony_ci
3471cabdff1aSopenharmony_ci    for (loop_cnt = height; loop_cnt--;) {
3472cabdff1aSopenharmony_ci        LD_SB2(src0_ptr, 16, src0, src1);
3473cabdff1aSopenharmony_ci        src2 = LD_SB(src0_ptr + 24);
3474cabdff1aSopenharmony_ci        src0_ptr += src_stride;
3475cabdff1aSopenharmony_ci        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
3476cabdff1aSopenharmony_ci        src1_ptr += src2_stride;
3477cabdff1aSopenharmony_ci        XORI_B3_128_SB(src0, src1, src2);
3478cabdff1aSopenharmony_ci
3479cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3480cabdff1aSopenharmony_ci        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3481cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
3482cabdff1aSopenharmony_ci        dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3483cabdff1aSopenharmony_ci        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3484cabdff1aSopenharmony_ci        dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3485cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3486cabdff1aSopenharmony_ci        dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3487cabdff1aSopenharmony_ci        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
3488cabdff1aSopenharmony_ci                           in0, in1, in2, in3,
3489cabdff1aSopenharmony_ci                           weight_vec, rnd_vec, offset_vec,
3490cabdff1aSopenharmony_ci                           dst0, dst1, dst2, dst3);
3491cabdff1aSopenharmony_ci
3492cabdff1aSopenharmony_ci        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
3493cabdff1aSopenharmony_ci        ST_SH2(dst0, dst1, dst, 16);
3494cabdff1aSopenharmony_ci        dst += dst_stride;
3495cabdff1aSopenharmony_ci    }
3496cabdff1aSopenharmony_ci}
3497cabdff1aSopenharmony_ci
3498cabdff1aSopenharmony_cistatic void hevc_vt_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
3499cabdff1aSopenharmony_ci                                     int32_t src_stride,
3500cabdff1aSopenharmony_ci                                     int16_t *src1_ptr,
3501cabdff1aSopenharmony_ci                                     int32_t src2_stride,
3502cabdff1aSopenharmony_ci                                     uint8_t *dst,
3503cabdff1aSopenharmony_ci                                     int32_t dst_stride,
3504cabdff1aSopenharmony_ci                                     const int8_t *filter,
3505cabdff1aSopenharmony_ci                                     int32_t weight0,
3506cabdff1aSopenharmony_ci                                     int32_t weight1,
3507cabdff1aSopenharmony_ci                                     int32_t offset0,
3508cabdff1aSopenharmony_ci                                     int32_t offset1,
3509cabdff1aSopenharmony_ci                                     int32_t rnd_val)
3510cabdff1aSopenharmony_ci{
3511cabdff1aSopenharmony_ci    int32_t weight, offset, constant;
3512cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4;
3513cabdff1aSopenharmony_ci    v8i16 in0, in1, dst10;
3514cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
3515cabdff1aSopenharmony_ci    v4i32 dst10_r, dst10_l;
3516cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
3517cabdff1aSopenharmony_ci    v8i16 filter_vec, out;
3518cabdff1aSopenharmony_ci    v4i32 weight_vec, offset_vec, rnd_vec;
3519cabdff1aSopenharmony_ci
3520cabdff1aSopenharmony_ci    src0_ptr -= src_stride;
3521cabdff1aSopenharmony_ci
3522cabdff1aSopenharmony_ci    offset = (offset0 + offset1) << rnd_val;
3523cabdff1aSopenharmony_ci    weight0 = weight0 & 0x0000FFFF;
3524cabdff1aSopenharmony_ci    weight = weight0 | (weight1 << 16);
3525cabdff1aSopenharmony_ci    constant = 128 * weight1;
3526cabdff1aSopenharmony_ci    constant <<= 6;
3527cabdff1aSopenharmony_ci    offset += constant;
3528cabdff1aSopenharmony_ci
3529cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
3530cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
3531cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val + 1);
3532cabdff1aSopenharmony_ci
3533cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
3534cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3535cabdff1aSopenharmony_ci
3536cabdff1aSopenharmony_ci    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3537cabdff1aSopenharmony_ci    src0_ptr += (3 * src_stride);
3538cabdff1aSopenharmony_ci    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3539cabdff1aSopenharmony_ci    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
3540cabdff1aSopenharmony_ci    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3541cabdff1aSopenharmony_ci    LD_SB2(src0_ptr, src_stride, src3, src4);
3542cabdff1aSopenharmony_ci    src0_ptr += (2 * src_stride);
3543cabdff1aSopenharmony_ci    LD_SH2(src1_ptr, src2_stride, in0, in1);
3544cabdff1aSopenharmony_ci    src1_ptr += (2 * src2_stride);
3545cabdff1aSopenharmony_ci
3546cabdff1aSopenharmony_ci    in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
3547cabdff1aSopenharmony_ci    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3548cabdff1aSopenharmony_ci    src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
3549cabdff1aSopenharmony_ci    src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
3550cabdff1aSopenharmony_ci
3551cabdff1aSopenharmony_ci    dst10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
3552cabdff1aSopenharmony_ci
3553cabdff1aSopenharmony_ci    ILVRL_H2_SW(dst10, in0, dst10_r, dst10_l);
3554cabdff1aSopenharmony_ci    dst10_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst10_r, (v8i16) weight_vec);
3555cabdff1aSopenharmony_ci    dst10_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst10_l, (v8i16) weight_vec);
3556cabdff1aSopenharmony_ci    SRAR_W2_SW(dst10_r, dst10_l, rnd_vec);
3557cabdff1aSopenharmony_ci    out = __msa_pckev_h((v8i16) dst10_l, (v8i16) dst10_r);
3558cabdff1aSopenharmony_ci    CLIP_SH_0_255(out);
3559cabdff1aSopenharmony_ci    out = (v8i16) __msa_pckev_b((v16i8) out, (v16i8) out);
3560cabdff1aSopenharmony_ci    ST_W2(out, 0, 1, dst, dst_stride);
3561cabdff1aSopenharmony_ci}
3562cabdff1aSopenharmony_ci
3563cabdff1aSopenharmony_cistatic void hevc_vt_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
3564cabdff1aSopenharmony_ci                                     int32_t src_stride,
3565cabdff1aSopenharmony_ci                                     int16_t *src1_ptr,
3566cabdff1aSopenharmony_ci                                     int32_t src2_stride,
3567cabdff1aSopenharmony_ci                                     uint8_t *dst,
3568cabdff1aSopenharmony_ci                                     int32_t dst_stride,
3569cabdff1aSopenharmony_ci                                     const int8_t *filter,
3570cabdff1aSopenharmony_ci                                     int32_t weight0,
3571cabdff1aSopenharmony_ci                                     int32_t weight1,
3572cabdff1aSopenharmony_ci                                     int32_t offset0,
3573cabdff1aSopenharmony_ci                                     int32_t offset1,
3574cabdff1aSopenharmony_ci                                     int32_t rnd_val)
3575cabdff1aSopenharmony_ci{
3576cabdff1aSopenharmony_ci    int32_t weight, offset, constant;
3577cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6;
3578cabdff1aSopenharmony_ci    v8i16 in0, in1, in2, in3;
3579cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
3580cabdff1aSopenharmony_ci    v16i8 src2110, src4332, src6554;
3581cabdff1aSopenharmony_ci    v8i16 dst10, dst32;
3582cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
3583cabdff1aSopenharmony_ci    v8i16 filter_vec;
3584cabdff1aSopenharmony_ci    v4i32 weight_vec, offset_vec, rnd_vec;
3585cabdff1aSopenharmony_ci
3586cabdff1aSopenharmony_ci    src0_ptr -= src_stride;
3587cabdff1aSopenharmony_ci
3588cabdff1aSopenharmony_ci    offset = (offset0 + offset1) << rnd_val;
3589cabdff1aSopenharmony_ci    weight0 = weight0 & 0x0000FFFF;
3590cabdff1aSopenharmony_ci    weight = weight0 | (weight1 << 16);
3591cabdff1aSopenharmony_ci    constant = 128 * weight1;
3592cabdff1aSopenharmony_ci    constant <<= 6;
3593cabdff1aSopenharmony_ci    offset += constant;
3594cabdff1aSopenharmony_ci
3595cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
3596cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
3597cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val + 1);
3598cabdff1aSopenharmony_ci
3599cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
3600cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3601cabdff1aSopenharmony_ci
3602cabdff1aSopenharmony_ci    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3603cabdff1aSopenharmony_ci    src0_ptr += (3 * src_stride);
3604cabdff1aSopenharmony_ci    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3605cabdff1aSopenharmony_ci    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
3606cabdff1aSopenharmony_ci    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3607cabdff1aSopenharmony_ci
3608cabdff1aSopenharmony_ci    LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
3609cabdff1aSopenharmony_ci    src0_ptr += (4 * src_stride);
3610cabdff1aSopenharmony_ci    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3611cabdff1aSopenharmony_ci    src1_ptr += (4 * src2_stride);
3612cabdff1aSopenharmony_ci    ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
3613cabdff1aSopenharmony_ci    ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3614cabdff1aSopenharmony_ci               src32_r, src43_r, src54_r, src65_r);
3615cabdff1aSopenharmony_ci    ILVR_D2_SB(src43_r, src32_r, src65_r, src54_r, src4332, src6554);
3616cabdff1aSopenharmony_ci    XORI_B2_128_SB(src4332, src6554);
3617cabdff1aSopenharmony_ci
3618cabdff1aSopenharmony_ci    dst10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
3619cabdff1aSopenharmony_ci    dst32 = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1);
3620cabdff1aSopenharmony_ci
3621cabdff1aSopenharmony_ci    HEVC_BIW_RND_CLIP2(dst10, dst32, in0, in1,
3622cabdff1aSopenharmony_ci                       weight_vec, rnd_vec, offset_vec,
3623cabdff1aSopenharmony_ci                       dst10, dst32);
3624cabdff1aSopenharmony_ci
3625cabdff1aSopenharmony_ci    dst10 = (v8i16) __msa_pckev_b((v16i8) dst32, (v16i8) dst10);
3626cabdff1aSopenharmony_ci    ST_W4(dst10, 0, 1, 2, 3, dst, dst_stride);
3627cabdff1aSopenharmony_ci    dst += (4 * dst_stride);
3628cabdff1aSopenharmony_ci}
3629cabdff1aSopenharmony_ci
3630cabdff1aSopenharmony_cistatic void hevc_vt_biwgt_4t_4x8multiple_msa(uint8_t *src0_ptr,
3631cabdff1aSopenharmony_ci                                             int32_t src_stride,
3632cabdff1aSopenharmony_ci                                             int16_t *src1_ptr,
3633cabdff1aSopenharmony_ci                                             int32_t src2_stride,
3634cabdff1aSopenharmony_ci                                             uint8_t *dst,
3635cabdff1aSopenharmony_ci                                             int32_t dst_stride,
3636cabdff1aSopenharmony_ci                                             const int8_t *filter,
3637cabdff1aSopenharmony_ci                                             int32_t height,
3638cabdff1aSopenharmony_ci                                             int32_t weight0,
3639cabdff1aSopenharmony_ci                                             int32_t weight1,
3640cabdff1aSopenharmony_ci                                             int32_t offset0,
3641cabdff1aSopenharmony_ci                                             int32_t offset1,
3642cabdff1aSopenharmony_ci                                             int32_t rnd_val)
3643cabdff1aSopenharmony_ci{
3644cabdff1aSopenharmony_ci    uint32_t loop_cnt;
3645cabdff1aSopenharmony_ci    int32_t weight, offset, constant;
3646cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
3647cabdff1aSopenharmony_ci    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3648cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
3649cabdff1aSopenharmony_ci    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
3650cabdff1aSopenharmony_ci    v16i8 src2110, src4332, src6554, src8776;
3651cabdff1aSopenharmony_ci    v8i16 dst10, dst32, dst54, dst76;
3652cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
3653cabdff1aSopenharmony_ci    v8i16 filter_vec;
3654cabdff1aSopenharmony_ci    v4i32 weight_vec, offset_vec, rnd_vec;
3655cabdff1aSopenharmony_ci
3656cabdff1aSopenharmony_ci    src0_ptr -= src_stride;
3657cabdff1aSopenharmony_ci
3658cabdff1aSopenharmony_ci    offset = (offset0 + offset1) << rnd_val;
3659cabdff1aSopenharmony_ci    weight0 = weight0 & 0x0000FFFF;
3660cabdff1aSopenharmony_ci    weight = weight0 | (weight1 << 16);
3661cabdff1aSopenharmony_ci    constant = 128 * weight1;
3662cabdff1aSopenharmony_ci    constant <<= 6;
3663cabdff1aSopenharmony_ci    offset += constant;
3664cabdff1aSopenharmony_ci
3665cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
3666cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
3667cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val + 1);
3668cabdff1aSopenharmony_ci
3669cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
3670cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3671cabdff1aSopenharmony_ci
3672cabdff1aSopenharmony_ci    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3673cabdff1aSopenharmony_ci    src0_ptr += (3 * src_stride);
3674cabdff1aSopenharmony_ci    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3675cabdff1aSopenharmony_ci    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
3676cabdff1aSopenharmony_ci    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3677cabdff1aSopenharmony_ci
3678cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 3); loop_cnt--;) {
3679cabdff1aSopenharmony_ci        LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
3680cabdff1aSopenharmony_ci        src0_ptr += (6 * src_stride);
3681cabdff1aSopenharmony_ci        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
3682cabdff1aSopenharmony_ci        src1_ptr += (8 * src2_stride);
3683cabdff1aSopenharmony_ci
3684cabdff1aSopenharmony_ci        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
3685cabdff1aSopenharmony_ci        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
3686cabdff1aSopenharmony_ci
3687cabdff1aSopenharmony_ci        ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3688cabdff1aSopenharmony_ci                   src32_r, src43_r, src54_r, src65_r);
3689cabdff1aSopenharmony_ci        ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3690cabdff1aSopenharmony_ci        ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
3691cabdff1aSopenharmony_ci                   src4332, src6554, src8776);
3692cabdff1aSopenharmony_ci        XORI_B3_128_SB(src4332, src6554, src8776);
3693cabdff1aSopenharmony_ci
3694cabdff1aSopenharmony_ci        dst10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
3695cabdff1aSopenharmony_ci        dst32 = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1);
3696cabdff1aSopenharmony_ci        dst54 = HEVC_FILT_4TAP_SH(src6554, src8776, filt0, filt1);
3697cabdff1aSopenharmony_ci
3698cabdff1aSopenharmony_ci        LD_SB2(src0_ptr, src_stride, src9, src2);
3699cabdff1aSopenharmony_ci        src0_ptr += (2 * src_stride);
3700cabdff1aSopenharmony_ci        ILVR_B2_SB(src9, src8, src2, src9, src98_r, src109_r);
3701cabdff1aSopenharmony_ci        src2110 = (v16i8) __msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r);
3702cabdff1aSopenharmony_ci        src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3703cabdff1aSopenharmony_ci
3704cabdff1aSopenharmony_ci        dst76 = HEVC_FILT_4TAP_SH(src8776, src2110, filt0, filt1);
3705cabdff1aSopenharmony_ci        HEVC_BIW_RND_CLIP4(dst10, dst32, dst54, dst76,
3706cabdff1aSopenharmony_ci                           in0, in1, in2, in3,
3707cabdff1aSopenharmony_ci                           weight_vec, rnd_vec, offset_vec,
3708cabdff1aSopenharmony_ci                           dst10, dst32, dst54, dst76);
3709cabdff1aSopenharmony_ci
3710cabdff1aSopenharmony_ci        PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst32);
3711cabdff1aSopenharmony_ci        ST_W8(dst10, dst32, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
3712cabdff1aSopenharmony_ci        dst += (8 * dst_stride);
3713cabdff1aSopenharmony_ci    }
3714cabdff1aSopenharmony_ci}
3715cabdff1aSopenharmony_ci
/**
 * Dispatch vertical 4-tap bi-weighted prediction for 4-pixel-wide blocks
 * to the routine matching the block height.
 *
 * Heights 2 and 4 have dedicated routines; any other height that is a
 * multiple of 8 goes to the 8-rows-per-pass routine. All remaining heights
 * are ignored and nothing is written.
 *
 * All arguments are forwarded unchanged; height is only passed on to the
 * multiple-of-8 routine.
 */
static void hevc_vt_biwgt_4t_4w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    switch (height) {
    case 2:
        hevc_vt_biwgt_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                 dst, dst_stride, filter,
                                 weight0, weight1, offset0, offset1, rnd_val);
        break;
    case 4:
        hevc_vt_biwgt_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                 dst, dst_stride, filter,
                                 weight0, weight1, offset0, offset1, rnd_val);
        break;
    default:
        /* only whole groups of eight rows are supported here */
        if ((height % 8) != 0)
            break;
        hevc_vt_biwgt_4t_4x8multiple_msa(src0_ptr, src_stride,
                                         src1_ptr, src2_stride,
                                         dst, dst_stride, filter, height,
                                         weight0, weight1, offset0, offset1,
                                         rnd_val);
        break;
    }
}
3746cabdff1aSopenharmony_ci
/*
 * Vertical 4-tap, bi-weighted prediction for a 6-column block.
 *
 * src0_ptr, src_stride  : 8-bit source rows; the pointer is moved back by one
 *                         row before the first load.
 * src1_ptr, src2_stride : 16-bit input rows, four consumed per loop
 *                         iteration; the stride is counted in int16_t
 *                         elements (it is added to an int16_t pointer).
 * dst, dst_stride       : 8-bit output rows.
 * filter                : tap table; halfword elements 0 and 1 of the loaded
 *                         vector are splatted into filt0 and filt1.
 * height                : row count; the loop runs (height >> 2) times and
 *                         emits four rows per pass, so a remainder of
 *                         height % 4 rows is never written.
 * weight0, weight1, offset0, offset1, rnd_val : weighting parameters, packed
 *                         into vectors before the loop.
 */
3747cabdff1aSopenharmony_cistatic void hevc_vt_biwgt_4t_6w_msa(uint8_t *src0_ptr,
3748cabdff1aSopenharmony_ci                                    int32_t src_stride,
3749cabdff1aSopenharmony_ci                                    int16_t *src1_ptr,
3750cabdff1aSopenharmony_ci                                    int32_t src2_stride,
3751cabdff1aSopenharmony_ci                                    uint8_t *dst,
3752cabdff1aSopenharmony_ci                                    int32_t dst_stride,
3753cabdff1aSopenharmony_ci                                    const int8_t *filter,
3754cabdff1aSopenharmony_ci                                    int32_t height,
3755cabdff1aSopenharmony_ci                                    int32_t weight0,
3756cabdff1aSopenharmony_ci                                    int32_t weight1,
3757cabdff1aSopenharmony_ci                                    int32_t offset0,
3758cabdff1aSopenharmony_ci                                    int32_t offset1,
3759cabdff1aSopenharmony_ci                                    int32_t rnd_val)
3760cabdff1aSopenharmony_ci{
3761cabdff1aSopenharmony_ci    uint32_t loop_cnt;
3762cabdff1aSopenharmony_ci    int32_t offset, weight, constant;
3763cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4;
3764cabdff1aSopenharmony_ci    v8i16 in0, in1, in2, in3;
3765cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src21_r, src43_r;
3766cabdff1aSopenharmony_ci    v8i16 tmp0, tmp1, tmp2, tmp3;
3767cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
3768cabdff1aSopenharmony_ci    v8i16 filter_vec;
3769cabdff1aSopenharmony_ci    v4i32 weight_vec, offset_vec, rnd_vec;
3770cabdff1aSopenharmony_ci
    /* Start one row above the block so the first output row has a row
     * before it available to the filter. */
3771cabdff1aSopenharmony_ci    src0_ptr -= src_stride;
3772cabdff1aSopenharmony_ci
    /* Sum of both offsets, scaled up by rnd_val bits. */
3773cabdff1aSopenharmony_ci    offset = (offset0 + offset1) << rnd_val;
    /* Pack the two weights into one 32-bit word: weight0 in the low
     * 16 bits, weight1 in the high 16 bits. */
3774cabdff1aSopenharmony_ci    weight0 = weight0 & 0x0000FFFF;
3775cabdff1aSopenharmony_ci    weight = weight0 | (weight1 << 16);
    /* Fold 128 * weight1 * 64 into the offset.
     * NOTE(review): presumably this undoes the 128 bias that the
     * XORI_*_128 macros apply to the source bytes, with the taps summing
     * to 64 - confirm against the macro and filter-table definitions. */
3776cabdff1aSopenharmony_ci    constant = 128 * weight1;
3777cabdff1aSopenharmony_ci    constant <<= 6;
3778cabdff1aSopenharmony_ci    offset += constant;
3779cabdff1aSopenharmony_ci
    /* Broadcast the scalars; note the rounding vector holds rnd_val + 1. */
3780cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
3781cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
3782cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val + 1);
3783cabdff1aSopenharmony_ci
3784cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
3785cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3786cabdff1aSopenharmony_ci
    /* Prime the window with the first three source rows and pair up
     * adjacent rows (1 with 0, 2 with 1). */
3787cabdff1aSopenharmony_ci    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3788cabdff1aSopenharmony_ci    src0_ptr += (3 * src_stride);
3789cabdff1aSopenharmony_ci    XORI_B3_128_SB(src0, src1, src2);
3790cabdff1aSopenharmony_ci    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3791cabdff1aSopenharmony_ci
3792cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
        /* First half: two new source rows give output rows 0 and 1. */
3793cabdff1aSopenharmony_ci        LD_SB2(src0_ptr, src_stride, src3, src4);
3794cabdff1aSopenharmony_ci        src0_ptr += (2 * src_stride);
3795cabdff1aSopenharmony_ci        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3796cabdff1aSopenharmony_ci        src1_ptr += (4 * src2_stride);
3797cabdff1aSopenharmony_ci        XORI_B2_128_SB(src3, src4);
3798cabdff1aSopenharmony_ci        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3799cabdff1aSopenharmony_ci
3800cabdff1aSopenharmony_ci        tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3801cabdff1aSopenharmony_ci        tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
3802cabdff1aSopenharmony_ci
        /* Second half: src1/src2 and src10_r/src21_r are reused for the
         * next two rows. They give output rows 2 and 3 here and become
         * the leading pairs of the next iteration. */
3803cabdff1aSopenharmony_ci        LD_SB2(src0_ptr, src_stride, src1, src2);
3804cabdff1aSopenharmony_ci        src0_ptr += (2 * src_stride);
3805cabdff1aSopenharmony_ci        XORI_B2_128_SB(src1, src2);
3806cabdff1aSopenharmony_ci        ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
3807cabdff1aSopenharmony_ci
3808cabdff1aSopenharmony_ci        tmp2 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
3809cabdff1aSopenharmony_ci        tmp3 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
        /* Combine the four filtered rows with in0..in3 using the packed
         * weights, rounding and offset; results overwrite tmp0..tmp3. */
3810cabdff1aSopenharmony_ci        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
3811cabdff1aSopenharmony_ci                           in0, in1, in2, in3,
3812cabdff1aSopenharmony_ci                           weight_vec, rnd_vec, offset_vec,
3813cabdff1aSopenharmony_ci                           tmp0, tmp1, tmp2, tmp3);
3814cabdff1aSopenharmony_ci
        /* Pack to bytes, two rows per vector, then store each row as a
         * word at dst plus a halfword at dst + 4.
         * NOTE(review): that is 4 + 2 = 6 bytes per row assuming the usual
         * element-index meaning of the ST_W2/ST_H2 arguments - confirm. */
3815cabdff1aSopenharmony_ci        PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
3816cabdff1aSopenharmony_ci        ST_W2(tmp0, 0, 2, dst, dst_stride);
3817cabdff1aSopenharmony_ci        ST_H2(tmp0, 2, 6, dst + 4, dst_stride);
3818cabdff1aSopenharmony_ci        ST_W2(tmp1, 0, 2, dst + 2 * dst_stride, dst_stride);
3819cabdff1aSopenharmony_ci        ST_H2(tmp1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
3820cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
3821cabdff1aSopenharmony_ci    }
3822cabdff1aSopenharmony_ci}
3823cabdff1aSopenharmony_ci
/*
 * Vertical 4-tap, bi-weighted prediction for a fixed 8x2 block.
 *
 * Loads five 8-bit source rows starting one row above src0_ptr and two
 * 16-bit rows from src1_ptr, and stores two output rows at dst and
 * dst + dst_stride. There is no height parameter and no loop.
 *
 * filter                : tap table; halfword elements 0 and 1 of the loaded
 *                         vector are splatted into filt0 and filt1.
 * weight0, weight1, offset0, offset1, rnd_val : weighting parameters, packed
 *                         into vectors below.
 */
3824cabdff1aSopenharmony_cistatic void hevc_vt_biwgt_4t_8x2_msa(uint8_t *src0_ptr,
3825cabdff1aSopenharmony_ci                                     int32_t src_stride,
3826cabdff1aSopenharmony_ci                                     int16_t *src1_ptr,
3827cabdff1aSopenharmony_ci                                     int32_t src2_stride,
3828cabdff1aSopenharmony_ci                                     uint8_t *dst,
3829cabdff1aSopenharmony_ci                                     int32_t dst_stride,
3830cabdff1aSopenharmony_ci                                     const int8_t *filter,
3831cabdff1aSopenharmony_ci                                     int32_t weight0,
3832cabdff1aSopenharmony_ci                                     int32_t weight1,
3833cabdff1aSopenharmony_ci                                     int32_t offset0,
3834cabdff1aSopenharmony_ci                                     int32_t offset1,
3835cabdff1aSopenharmony_ci                                     int32_t rnd_val)
3836cabdff1aSopenharmony_ci{
3837cabdff1aSopenharmony_ci    int32_t offset, weight, constant;
3838cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4;
3839cabdff1aSopenharmony_ci    v8i16 in0, in1, tmp0, tmp1;
3840cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src21_r, src43_r;
3841cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
3842cabdff1aSopenharmony_ci    v8i16 filter_vec;
3843cabdff1aSopenharmony_ci    v4i32 weight_vec, offset_vec, rnd_vec;
3844cabdff1aSopenharmony_ci
    /* Start one row above the block. */
3845cabdff1aSopenharmony_ci    src0_ptr -= src_stride;
3846cabdff1aSopenharmony_ci
    /* Sum of both offsets, scaled up by rnd_val bits. */
3847cabdff1aSopenharmony_ci    offset = (offset0 + offset1) << rnd_val;
    /* weight0 goes in the low 16 bits, weight1 in the high 16 bits. */
3848cabdff1aSopenharmony_ci    weight0 = weight0 & 0x0000FFFF;
3849cabdff1aSopenharmony_ci    weight = weight0 | (weight1 << 16);
    /* Fold 128 * weight1 * 64 into the offset.
     * NOTE(review): presumably compensates for the 128 bias applied to
     * the source bytes by XORI_*_128 - confirm against the macros. */
3850cabdff1aSopenharmony_ci    constant = 128 * weight1;
3851cabdff1aSopenharmony_ci    constant <<= 6;
3852cabdff1aSopenharmony_ci    offset += constant;
3853cabdff1aSopenharmony_ci
    /* Broadcast the scalars; the rounding vector holds rnd_val + 1. */
3854cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
3855cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
3856cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val + 1);
3857cabdff1aSopenharmony_ci
3858cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
3859cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3860cabdff1aSopenharmony_ci
    /* First three source rows, paired 1 with 0 and 2 with 1. */
3861cabdff1aSopenharmony_ci    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3862cabdff1aSopenharmony_ci    src0_ptr += (3 * src_stride);
3863cabdff1aSopenharmony_ci    XORI_B3_128_SB(src0, src1, src2);
3864cabdff1aSopenharmony_ci    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3865cabdff1aSopenharmony_ci
    /* Remaining two source rows plus the two 16-bit input rows. */
3866cabdff1aSopenharmony_ci    LD_SB2(src0_ptr, src_stride, src3, src4);
3867cabdff1aSopenharmony_ci    LD_SH2(src1_ptr, src2_stride, in0, in1);
3868cabdff1aSopenharmony_ci    XORI_B2_128_SB(src3, src4);
3869cabdff1aSopenharmony_ci    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3870cabdff1aSopenharmony_ci
    /* Filter both output rows, then weight them against in0/in1;
     * results overwrite tmp0/tmp1. */
3871cabdff1aSopenharmony_ci    tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3872cabdff1aSopenharmony_ci    tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
3873cabdff1aSopenharmony_ci    HEVC_BIW_RND_CLIP2(tmp0, tmp1, in0, in1,
3874cabdff1aSopenharmony_ci                       weight_vec, rnd_vec, offset_vec,
3875cabdff1aSopenharmony_ci                       tmp0, tmp1);
3876cabdff1aSopenharmony_ci
    /* Pack both rows into one byte vector and store one doubleword
     * element per output row. */
3877cabdff1aSopenharmony_ci    tmp0 = (v8i16) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
3878cabdff1aSopenharmony_ci    ST_D2(tmp0, 0, 1, dst, dst_stride);
3879cabdff1aSopenharmony_ci}
3880cabdff1aSopenharmony_ci
/*
 * Vertical 4-tap, bi-weighted prediction for a fixed 8x6 block.
 *
 * Loads nine 8-bit source rows starting one row above src0_ptr and six
 * 16-bit rows from src1_ptr, and stores six output rows starting at dst.
 * Everything is done in a single straight-line pass (no loop, no height
 * parameter).
 *
 * filter                : tap table; halfword elements 0 and 1 of the loaded
 *                         vector are splatted into filt0 and filt1.
 * weight0, weight1, offset0, offset1, rnd_val : weighting parameters, packed
 *                         into vectors below.
 */
3881cabdff1aSopenharmony_cistatic void hevc_vt_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
3882cabdff1aSopenharmony_ci                                     int32_t src_stride,
3883cabdff1aSopenharmony_ci                                     int16_t *src1_ptr,
3884cabdff1aSopenharmony_ci                                     int32_t src2_stride,
3885cabdff1aSopenharmony_ci                                     uint8_t *dst,
3886cabdff1aSopenharmony_ci                                     int32_t dst_stride,
3887cabdff1aSopenharmony_ci                                     const int8_t *filter,
3888cabdff1aSopenharmony_ci                                     int32_t weight0,
3889cabdff1aSopenharmony_ci                                     int32_t weight1,
3890cabdff1aSopenharmony_ci                                     int32_t offset0,
3891cabdff1aSopenharmony_ci                                     int32_t offset1,
3892cabdff1aSopenharmony_ci                                     int32_t rnd_val)
3893cabdff1aSopenharmony_ci{
3894cabdff1aSopenharmony_ci    int32_t offset, weight, constant;
3895cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3896cabdff1aSopenharmony_ci    v8i16 in0, in1, in2, in3, in4, in5;
3897cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src54_r, src76_r;
3898cabdff1aSopenharmony_ci    v16i8 src21_r, src43_r, src65_r, src87_r;
3899cabdff1aSopenharmony_ci    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
3900cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
3901cabdff1aSopenharmony_ci    v8i16 filter_vec;
3902cabdff1aSopenharmony_ci    v4i32 weight_vec, offset_vec, rnd_vec;
3903cabdff1aSopenharmony_ci
    /* Start one row above the block. */
3904cabdff1aSopenharmony_ci    src0_ptr -= src_stride;
3905cabdff1aSopenharmony_ci
    /* Sum of both offsets, scaled up by rnd_val bits. */
3906cabdff1aSopenharmony_ci    offset = (offset0 + offset1) << rnd_val;
    /* weight0 goes in the low 16 bits, weight1 in the high 16 bits. */
3907cabdff1aSopenharmony_ci    weight0 = weight0 & 0x0000FFFF;
3908cabdff1aSopenharmony_ci    weight = weight0 | (weight1 << 16);
    /* Fold 128 * weight1 * 64 into the offset.
     * NOTE(review): presumably compensates for the 128 bias applied to
     * the source bytes by XORI_*_128 - confirm against the macros. */
3909cabdff1aSopenharmony_ci    constant = 128 * weight1;
3910cabdff1aSopenharmony_ci    constant <<= 6;
3911cabdff1aSopenharmony_ci    offset += constant;
3912cabdff1aSopenharmony_ci
    /* Broadcast the scalars; the rounding vector holds rnd_val + 1. */
3913cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
3914cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
3915cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val + 1);
3916cabdff1aSopenharmony_ci
3917cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
3918cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3919cabdff1aSopenharmony_ci
    /* First three source rows, paired 1 with 0 and 2 with 1. */
3920cabdff1aSopenharmony_ci    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3921cabdff1aSopenharmony_ci    src0_ptr += (3 * src_stride);
3922cabdff1aSopenharmony_ci    XORI_B3_128_SB(src0, src1, src2);
3923cabdff1aSopenharmony_ci    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3924cabdff1aSopenharmony_ci
    /* Remaining six source rows and all six 16-bit input rows; build
     * every adjacent-row pair from 3/2 up to 8/7. */
3925cabdff1aSopenharmony_ci    LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
3926cabdff1aSopenharmony_ci    LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
3927cabdff1aSopenharmony_ci    XORI_B6_128_SB(src3, src4, src5, src6, src7, src8);
3928cabdff1aSopenharmony_ci    ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3929cabdff1aSopenharmony_ci               src32_r, src43_r, src54_r, src65_r);
3930cabdff1aSopenharmony_ci    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3931cabdff1aSopenharmony_ci
    /* One filter call per output row; each uses two row pairs that are
     * two rows apart. */
3932cabdff1aSopenharmony_ci    tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3933cabdff1aSopenharmony_ci    tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
3934cabdff1aSopenharmony_ci    tmp2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
3935cabdff1aSopenharmony_ci    tmp3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
3936cabdff1aSopenharmony_ci    tmp4 = HEVC_FILT_4TAP_SH(src54_r, src76_r, filt0, filt1);
3937cabdff1aSopenharmony_ci    tmp5 = HEVC_FILT_4TAP_SH(src65_r, src87_r, filt0, filt1);
    /* Weighting is split 4 + 2 because the macros handle four or two
     * vectors at a time; results overwrite tmp0..tmp5. */
3938cabdff1aSopenharmony_ci    HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
3939cabdff1aSopenharmony_ci                       in0, in1, in2, in3,
3940cabdff1aSopenharmony_ci                       weight_vec, rnd_vec, offset_vec,
3941cabdff1aSopenharmony_ci                       tmp0, tmp1, tmp2, tmp3);
3942cabdff1aSopenharmony_ci    HEVC_BIW_RND_CLIP2(tmp4, tmp5, in4, in5,
3943cabdff1aSopenharmony_ci                       weight_vec, rnd_vec, offset_vec,
3944cabdff1aSopenharmony_ci                       tmp4, tmp5);
3945cabdff1aSopenharmony_ci
    /* Pack to bytes (two rows per vector); rows 0-3 are stored from
     * tmp0/tmp1 and rows 4-5 from tmp3 at dst + 4 * dst_stride. */
3946cabdff1aSopenharmony_ci    PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
3947cabdff1aSopenharmony_ci    tmp3 = (v8i16) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
3948cabdff1aSopenharmony_ci    ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
3949cabdff1aSopenharmony_ci    ST_D2(tmp3, 0, 1, dst + 4 * dst_stride, dst_stride);
3950cabdff1aSopenharmony_ci}
3951cabdff1aSopenharmony_ci
/*
 * Vertical 4-tap, bi-weighted prediction for an 8-column block, four rows
 * per loop iteration.
 *
 * src0_ptr, src_stride  : 8-bit source rows; the pointer is moved back by one
 *                         row before the first load.
 * src1_ptr, src2_stride : 16-bit input rows, four consumed per iteration;
 *                         the stride is counted in int16_t elements.
 * dst, dst_stride       : 8-bit output rows, four written per iteration.
 * filter                : tap table; halfword elements 0 and 1 of the loaded
 *                         vector are splatted into filt0 and filt1.
 * height                : row count; the loop runs (height >> 2) times, so
 *                         a remainder of height % 4 rows is never written.
 * weight0, weight1, offset0, offset1, rnd_val : weighting parameters, packed
 *                         into vectors before the loop.
 */
3952cabdff1aSopenharmony_cistatic void hevc_vt_biwgt_4t_8x4multiple_msa(uint8_t *src0_ptr,
3953cabdff1aSopenharmony_ci                                             int32_t src_stride,
3954cabdff1aSopenharmony_ci                                             int16_t *src1_ptr,
3955cabdff1aSopenharmony_ci                                             int32_t src2_stride,
3956cabdff1aSopenharmony_ci                                             uint8_t *dst,
3957cabdff1aSopenharmony_ci                                             int32_t dst_stride,
3958cabdff1aSopenharmony_ci                                             const int8_t *filter,
3959cabdff1aSopenharmony_ci                                             int32_t height,
3960cabdff1aSopenharmony_ci                                             int32_t weight0,
3961cabdff1aSopenharmony_ci                                             int32_t weight1,
3962cabdff1aSopenharmony_ci                                             int32_t offset0,
3963cabdff1aSopenharmony_ci                                             int32_t offset1,
3964cabdff1aSopenharmony_ci                                             int32_t rnd_val)
3965cabdff1aSopenharmony_ci{
3966cabdff1aSopenharmony_ci    uint32_t loop_cnt;
3967cabdff1aSopenharmony_ci    int32_t offset, weight, constant;
3968cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4;
3969cabdff1aSopenharmony_ci    v8i16 in0, in1, in2, in3;
3970cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src21_r, src43_r;
3971cabdff1aSopenharmony_ci    v8i16 tmp0, tmp1, tmp2, tmp3;
3972cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
3973cabdff1aSopenharmony_ci    v8i16 filter_vec;
3974cabdff1aSopenharmony_ci    v4i32 weight_vec, offset_vec, rnd_vec;
3975cabdff1aSopenharmony_ci
    /* Start one row above the block. */
3976cabdff1aSopenharmony_ci    src0_ptr -= src_stride;
3977cabdff1aSopenharmony_ci
    /* Sum of both offsets, scaled up by rnd_val bits. */
3978cabdff1aSopenharmony_ci    offset = (offset0 + offset1) << rnd_val;
    /* weight0 goes in the low 16 bits, weight1 in the high 16 bits. */
3979cabdff1aSopenharmony_ci    weight0 = weight0 & 0x0000FFFF;
3980cabdff1aSopenharmony_ci    weight = weight0 | (weight1 << 16);
    /* Fold 128 * weight1 * 64 into the offset.
     * NOTE(review): presumably compensates for the 128 bias applied to
     * the source bytes by XORI_*_128 - confirm against the macros. */
3981cabdff1aSopenharmony_ci    constant = 128 * weight1;
3982cabdff1aSopenharmony_ci    constant <<= 6;
3983cabdff1aSopenharmony_ci    offset += constant;
3984cabdff1aSopenharmony_ci
    /* Broadcast the scalars; the rounding vector holds rnd_val + 1. */
3985cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
3986cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
3987cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val + 1);
3988cabdff1aSopenharmony_ci
3989cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
3990cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3991cabdff1aSopenharmony_ci
    /* Prime the window: three source rows, paired 1 with 0 and 2 with 1. */
3992cabdff1aSopenharmony_ci    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3993cabdff1aSopenharmony_ci    src0_ptr += (3 * src_stride);
3994cabdff1aSopenharmony_ci    XORI_B3_128_SB(src0, src1, src2);
3995cabdff1aSopenharmony_ci    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3996cabdff1aSopenharmony_ci
3997cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
        /* First half: two new source rows give output rows 0 and 1. */
3998cabdff1aSopenharmony_ci        LD_SB2(src0_ptr, src_stride, src3, src4);
3999cabdff1aSopenharmony_ci        src0_ptr += (2 * src_stride);
4000cabdff1aSopenharmony_ci        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
4001cabdff1aSopenharmony_ci        src1_ptr += (4 * src2_stride);
4002cabdff1aSopenharmony_ci        XORI_B2_128_SB(src3, src4);
4003cabdff1aSopenharmony_ci        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4004cabdff1aSopenharmony_ci
4005cabdff1aSopenharmony_ci        tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
4006cabdff1aSopenharmony_ci        tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
4007cabdff1aSopenharmony_ci
        /* Second half: src1/src2 and src10_r/src21_r are reused for the
         * next two rows. They give output rows 2 and 3 here and become
         * the leading pairs of the next iteration. */
4008cabdff1aSopenharmony_ci        LD_SB2(src0_ptr, src_stride, src1, src2);
4009cabdff1aSopenharmony_ci        src0_ptr += (2 * src_stride);
4010cabdff1aSopenharmony_ci        XORI_B2_128_SB(src1, src2);
4011cabdff1aSopenharmony_ci        ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
4012cabdff1aSopenharmony_ci
4013cabdff1aSopenharmony_ci        tmp2 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
4014cabdff1aSopenharmony_ci        tmp3 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
        /* Weight the four filtered rows against in0..in3; results
         * overwrite tmp0..tmp3. */
4015cabdff1aSopenharmony_ci        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
4016cabdff1aSopenharmony_ci                           in0, in1, in2, in3,
4017cabdff1aSopenharmony_ci                           weight_vec, rnd_vec, offset_vec,
4018cabdff1aSopenharmony_ci                           tmp0, tmp1, tmp2, tmp3);
4019cabdff1aSopenharmony_ci
        /* Pack to bytes (two rows per vector) and store one doubleword
         * element per output row. */
4020cabdff1aSopenharmony_ci        PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
4021cabdff1aSopenharmony_ci        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
4022cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
4023cabdff1aSopenharmony_ci    }
4024cabdff1aSopenharmony_ci}
4025cabdff1aSopenharmony_ci
/*
 * Height dispatcher for the 8-column vertical 4-tap bi-weighted kernels.
 *
 * height == 2 -> hevc_vt_biwgt_4t_8x2_msa
 * height == 6 -> hevc_vt_biwgt_4t_8x6_msa
 * otherwise   -> hevc_vt_biwgt_4t_8x4multiple_msa, which processes
 *                (height >> 2) groups of four rows.
 *
 * All other arguments are forwarded unchanged; height itself is only passed
 * on to the 4-row-multiple variant.
 */
4026cabdff1aSopenharmony_cistatic void hevc_vt_biwgt_4t_8w_msa(uint8_t *src0_ptr,
4027cabdff1aSopenharmony_ci                                    int32_t src_stride,
4028cabdff1aSopenharmony_ci                                    int16_t *src1_ptr,
4029cabdff1aSopenharmony_ci                                    int32_t src2_stride,
4030cabdff1aSopenharmony_ci                                    uint8_t *dst,
4031cabdff1aSopenharmony_ci                                    int32_t dst_stride,
4032cabdff1aSopenharmony_ci                                    const int8_t *filter,
4033cabdff1aSopenharmony_ci                                    int32_t height,
4034cabdff1aSopenharmony_ci                                    int32_t weight0,
4035cabdff1aSopenharmony_ci                                    int32_t weight1,
4036cabdff1aSopenharmony_ci                                    int32_t offset0,
4037cabdff1aSopenharmony_ci                                    int32_t offset1,
4038cabdff1aSopenharmony_ci                                    int32_t rnd_val)
4039cabdff1aSopenharmony_ci{
4040cabdff1aSopenharmony_ci    if (2 == height) {
4041cabdff1aSopenharmony_ci        hevc_vt_biwgt_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4042cabdff1aSopenharmony_ci                                 dst, dst_stride, filter,
4043cabdff1aSopenharmony_ci                                 weight0, weight1, offset0, offset1, rnd_val);
4044cabdff1aSopenharmony_ci    } else if (6 == height) {
4045cabdff1aSopenharmony_ci        hevc_vt_biwgt_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4046cabdff1aSopenharmony_ci                                 dst, dst_stride, filter,
4047cabdff1aSopenharmony_ci                                 weight0, weight1, offset0, offset1, rnd_val);
4048cabdff1aSopenharmony_ci    } else {
        /* Every remaining height lands here, including ones that are not
         * a multiple of four. */
4049cabdff1aSopenharmony_ci        hevc_vt_biwgt_4t_8x4multiple_msa(src0_ptr, src_stride,
4050cabdff1aSopenharmony_ci                                         src1_ptr, src2_stride,
4051cabdff1aSopenharmony_ci                                         dst, dst_stride, filter, height,
4052cabdff1aSopenharmony_ci                                         weight0, weight1, offset0, offset1,
4053cabdff1aSopenharmony_ci                                         rnd_val);
4054cabdff1aSopenharmony_ci    }
4055cabdff1aSopenharmony_ci}
4056cabdff1aSopenharmony_ci
/*
 * Vertical 4-tap, bi-weighted prediction for a 12-column block, four rows
 * per loop iteration.
 *
 * Columns 0-7 are handled through the right-interleaved row pairs (*_r).
 * Columns 8-11 come from the left-interleaved pairs (*_l): the low
 * doublewords of two such pairs are merged into a single vector (src2110,
 * src4332) so that one filter call covers two output rows.
 *
 * src0_ptr, src_stride  : 8-bit source rows; the pointer is moved back by one
 *                         row before the first load.
 * src1_ptr, src2_stride : 16-bit input rows, four consumed per iteration,
 *                         read at src1_ptr and at src1_ptr + 8; the stride
 *                         is counted in int16_t elements.
 * dst, dst_stride       : 8-bit output rows, written at dst and at dst + 8.
 * filter                : tap table; halfword elements 0 and 1 of the loaded
 *                         vector are splatted into filt0 and filt1.
 * height                : row count; the loop runs (height >> 2) times, so
 *                         a remainder of height % 4 rows is never written.
 * weight0, weight1, offset0, offset1, rnd_val : weighting parameters, packed
 *                         into vectors before the loop.
 */
4057cabdff1aSopenharmony_cistatic void hevc_vt_biwgt_4t_12w_msa(uint8_t *src0_ptr,
4058cabdff1aSopenharmony_ci                                     int32_t src_stride,
4059cabdff1aSopenharmony_ci                                     int16_t *src1_ptr,
4060cabdff1aSopenharmony_ci                                     int32_t src2_stride,
4061cabdff1aSopenharmony_ci                                     uint8_t *dst,
4062cabdff1aSopenharmony_ci                                     int32_t dst_stride,
4063cabdff1aSopenharmony_ci                                     const int8_t *filter,
4064cabdff1aSopenharmony_ci                                     int32_t height,
4065cabdff1aSopenharmony_ci                                     int32_t weight0,
4066cabdff1aSopenharmony_ci                                     int32_t weight1,
4067cabdff1aSopenharmony_ci                                     int32_t offset0,
4068cabdff1aSopenharmony_ci                                     int32_t offset1,
4069cabdff1aSopenharmony_ci                                     int32_t rnd_val)
4070cabdff1aSopenharmony_ci{
4071cabdff1aSopenharmony_ci    uint32_t loop_cnt;
4072cabdff1aSopenharmony_ci    int32_t offset, weight, constant;
4073cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5;
4074cabdff1aSopenharmony_ci    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
4075cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src21_r, src43_r;
4076cabdff1aSopenharmony_ci    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
4077cabdff1aSopenharmony_ci    v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
4078cabdff1aSopenharmony_ci    v16i8 src2110, src4332;
4079cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
4080cabdff1aSopenharmony_ci    v8i16 filter_vec;
4081cabdff1aSopenharmony_ci    v4i32 weight_vec, offset_vec, rnd_vec;
4082cabdff1aSopenharmony_ci
    /* Start one row above the block. */
4083cabdff1aSopenharmony_ci    src0_ptr -= (1 * src_stride);
4084cabdff1aSopenharmony_ci
    /* Sum of both offsets, scaled up by rnd_val bits. */
4085cabdff1aSopenharmony_ci    offset = (offset0 + offset1) << rnd_val;
    /* weight0 goes in the low 16 bits, weight1 in the high 16 bits. */
4086cabdff1aSopenharmony_ci    weight0 = weight0 & 0x0000FFFF;
4087cabdff1aSopenharmony_ci    weight = weight0 | (weight1 << 16);
    /* Fold 128 * weight1 * 64 into the offset.
     * NOTE(review): presumably compensates for the 128 bias applied to
     * the source bytes by XORI_*_128 - confirm against the macros. */
4088cabdff1aSopenharmony_ci    constant = 128 * weight1;
4089cabdff1aSopenharmony_ci    constant <<= 6;
4090cabdff1aSopenharmony_ci    offset += constant;
4091cabdff1aSopenharmony_ci
    /* Broadcast the scalars; the rounding vector holds rnd_val + 1. */
4092cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
4093cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
4094cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val + 1);
4095cabdff1aSopenharmony_ci
4096cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
4097cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4098cabdff1aSopenharmony_ci
    /* Prime the window: three source rows, paired 1 with 0 and 2 with 1,
     * in both right and left halves. The two left-half pairs are merged
     * into src2110. */
4099cabdff1aSopenharmony_ci    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4100cabdff1aSopenharmony_ci    src0_ptr += (3 * src_stride);
4101cabdff1aSopenharmony_ci    XORI_B3_128_SB(src0, src1, src2);
4102cabdff1aSopenharmony_ci    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
4103cabdff1aSopenharmony_ci    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
4104cabdff1aSopenharmony_ci    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
4105cabdff1aSopenharmony_ci
4106cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
4107cabdff1aSopenharmony_ci        LD_SB2(src0_ptr, src_stride, src3, src4);
4108cabdff1aSopenharmony_ci        src0_ptr += (2 * src_stride);
        /* in0..in3 hold the first 8 columns of four rows; in4..in7 start
         * at column 8. The doubleword interleave below squeezes those four
         * vectors into two (in4, in5), two rows each, to line up with
         * tmp4/tmp5. */
4109cabdff1aSopenharmony_ci        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
4110cabdff1aSopenharmony_ci        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
4111cabdff1aSopenharmony_ci        src1_ptr += (4 * src2_stride);
4112cabdff1aSopenharmony_ci        ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
4113cabdff1aSopenharmony_ci        XORI_B2_128_SB(src3, src4);
4114cabdff1aSopenharmony_ci
4115cabdff1aSopenharmony_ci        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4116cabdff1aSopenharmony_ci        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
4117cabdff1aSopenharmony_ci        src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
4118cabdff1aSopenharmony_ci
        /* Output rows 0 and 1: tmp0/tmp1 for columns 0-7, tmp4 for
         * columns 8-11 of both rows. */
4119cabdff1aSopenharmony_ci        tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
4120cabdff1aSopenharmony_ci        tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
4121cabdff1aSopenharmony_ci        tmp4 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
4122cabdff1aSopenharmony_ci
        /* Next two source rows. src10_r, src21_r and src2110 are
         * overwritten with the newest pairs, and src2 keeps the last row,
         * so all of them carry over into the next iteration. */
4123cabdff1aSopenharmony_ci        LD_SB2(src0_ptr, src_stride, src5, src2);
4124cabdff1aSopenharmony_ci        src0_ptr += (2 * src_stride);
4125cabdff1aSopenharmony_ci        XORI_B2_128_SB(src5, src2);
4126cabdff1aSopenharmony_ci        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
4127cabdff1aSopenharmony_ci        ILVL_B2_SB(src5, src4, src2, src5, src54_l, src65_l);
4128cabdff1aSopenharmony_ci        src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
4129cabdff1aSopenharmony_ci
        /* Output rows 2 and 3. */
4130cabdff1aSopenharmony_ci        tmp2 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
4131cabdff1aSopenharmony_ci        tmp3 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
4132cabdff1aSopenharmony_ci        tmp5 = HEVC_FILT_4TAP_SH(src4332, src2110, filt0, filt1);
        /* Weighting: four vectors for columns 0-7, two for columns 8-11;
         * results overwrite tmp0..tmp5. */
4133cabdff1aSopenharmony_ci        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
4134cabdff1aSopenharmony_ci                           in0, in1, in2, in3,
4135cabdff1aSopenharmony_ci                           weight_vec, rnd_vec, offset_vec,
4136cabdff1aSopenharmony_ci                           tmp0, tmp1, tmp2, tmp3);
4137cabdff1aSopenharmony_ci        HEVC_BIW_RND_CLIP2(tmp4, tmp5, in4, in5,
4138cabdff1aSopenharmony_ci                           weight_vec, rnd_vec, offset_vec,
4139cabdff1aSopenharmony_ci                           tmp4, tmp5);
4140cabdff1aSopenharmony_ci
        /* Pack to bytes; store one doubleword per row at dst and one word
         * per row at dst + 8. */
4141cabdff1aSopenharmony_ci        PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
4142cabdff1aSopenharmony_ci        tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
4143cabdff1aSopenharmony_ci        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
4144cabdff1aSopenharmony_ci        ST_W4(tmp2, 0, 1, 2, 3, dst + 8, dst_stride);
4145cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
4146cabdff1aSopenharmony_ci    }
4147cabdff1aSopenharmony_ci}
4148cabdff1aSopenharmony_ci
/*
 * Vertical 4-tap, bi-weighted prediction for a 16-column block.
 *
 * Each loop iteration produces four output rows in two identical halves of
 * two rows each. Within a half, the right-interleaved pairs (*_r) yield
 * columns 0-7 and the left-interleaved pairs (*_l) yield columns 8-15.
 *
 * src0_ptr, src_stride  : 8-bit source rows; the pointer is moved back by one
 *                         row before the first load.
 * src1_ptr, src2_stride : 16-bit input rows, two consumed per half, read at
 *                         src1_ptr and at src1_ptr + 8; the stride is
 *                         counted in int16_t elements.
 * dst, dst_stride       : 8-bit output rows, one full vector stored per row.
 * filter                : tap table; halfword elements 0 and 1 of the loaded
 *                         vector are splatted into filt0 and filt1.
 * height                : row count; the loop runs (height >> 2) times, so
 *                         a remainder of height % 4 rows is never written.
 * weight0, weight1, offset0, offset1, rnd_val : weighting parameters, packed
 *                         into vectors before the loop.
 */
4149cabdff1aSopenharmony_cistatic void hevc_vt_biwgt_4t_16w_msa(uint8_t *src0_ptr,
4150cabdff1aSopenharmony_ci                                     int32_t src_stride,
4151cabdff1aSopenharmony_ci                                     int16_t *src1_ptr,
4152cabdff1aSopenharmony_ci                                     int32_t src2_stride,
4153cabdff1aSopenharmony_ci                                     uint8_t *dst,
4154cabdff1aSopenharmony_ci                                     int32_t dst_stride,
4155cabdff1aSopenharmony_ci                                     const int8_t *filter,
4156cabdff1aSopenharmony_ci                                     int32_t height,
4157cabdff1aSopenharmony_ci                                     int32_t weight0,
4158cabdff1aSopenharmony_ci                                     int32_t weight1,
4159cabdff1aSopenharmony_ci                                     int32_t offset0,
4160cabdff1aSopenharmony_ci                                     int32_t offset1,
4161cabdff1aSopenharmony_ci                                     int32_t rnd_val)
4162cabdff1aSopenharmony_ci{
4163cabdff1aSopenharmony_ci    uint32_t loop_cnt;
4164cabdff1aSopenharmony_ci    int32_t offset, weight, constant;
4165cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5;
4166cabdff1aSopenharmony_ci    v8i16 in0, in1, in2, in3;
4167cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src21_r, src43_r;
4168cabdff1aSopenharmony_ci    v16i8 src10_l, src32_l, src21_l, src43_l;
4169cabdff1aSopenharmony_ci    v8i16 tmp0, tmp1, tmp2, tmp3;
4170cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
4171cabdff1aSopenharmony_ci    v8i16 filter_vec;
4172cabdff1aSopenharmony_ci    v4i32 weight_vec, offset_vec, rnd_vec;
4173cabdff1aSopenharmony_ci
    /* Start one row above the block. */
4174cabdff1aSopenharmony_ci    src0_ptr -= src_stride;
4175cabdff1aSopenharmony_ci
    /* Sum of both offsets, scaled up by rnd_val bits. */
4176cabdff1aSopenharmony_ci    offset = (offset0 + offset1) << rnd_val;
    /* weight0 goes in the low 16 bits, weight1 in the high 16 bits. */
4177cabdff1aSopenharmony_ci    weight0 = weight0 & 0x0000FFFF;
4178cabdff1aSopenharmony_ci    weight = weight0 | (weight1 << 16);
    /* Fold 128 * weight1 * 64 into the offset.
     * NOTE(review): presumably compensates for the 128 bias applied to
     * the source bytes by XORI_*_128 - confirm against the macros. */
4179cabdff1aSopenharmony_ci    constant = 128 * weight1;
4180cabdff1aSopenharmony_ci    constant <<= 6;
4181cabdff1aSopenharmony_ci    offset += constant;
4182cabdff1aSopenharmony_ci
    /* Broadcast the scalars; the rounding vector holds rnd_val + 1. */
4183cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
4184cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
4185cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val + 1);
4186cabdff1aSopenharmony_ci
4187cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
4188cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4189cabdff1aSopenharmony_ci
    /* Prime the window: three source rows, paired 1 with 0 and 2 with 1,
     * in both right and left halves. */
4190cabdff1aSopenharmony_ci    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4191cabdff1aSopenharmony_ci    src0_ptr += (3 * src_stride);
4192cabdff1aSopenharmony_ci    XORI_B3_128_SB(src0, src1, src2);
4193cabdff1aSopenharmony_ci    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
4194cabdff1aSopenharmony_ci    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
4195cabdff1aSopenharmony_ci
4196cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
        /* First half: output rows 0 and 1. in0/in1 are columns 0-7 and
         * in2/in3 are columns 8-15 of the two 16-bit input rows. */
4197cabdff1aSopenharmony_ci        LD_SB2(src0_ptr, src_stride, src3, src4);
4198cabdff1aSopenharmony_ci        src0_ptr += (2 * src_stride);
4199cabdff1aSopenharmony_ci        LD_SH2(src1_ptr, src2_stride, in0, in1);
4200cabdff1aSopenharmony_ci        LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4201cabdff1aSopenharmony_ci        src1_ptr += (2 * src2_stride);
4202cabdff1aSopenharmony_ci        XORI_B2_128_SB(src3, src4);
4203cabdff1aSopenharmony_ci        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4204cabdff1aSopenharmony_ci        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
4205cabdff1aSopenharmony_ci
4206cabdff1aSopenharmony_ci        tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
4207cabdff1aSopenharmony_ci        tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
4208cabdff1aSopenharmony_ci        tmp2 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
4209cabdff1aSopenharmony_ci        tmp3 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
4210cabdff1aSopenharmony_ci
4211cabdff1aSopenharmony_ci        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
4212cabdff1aSopenharmony_ci                           in0, in1, in2, in3,
4213cabdff1aSopenharmony_ci                           weight_vec, rnd_vec, offset_vec,
4214cabdff1aSopenharmony_ci                           tmp0, tmp1, tmp2, tmp3);
        /* Join the right and left halves of each row (tmp0 with tmp2,
         * tmp1 with tmp3) into one byte vector per row and store it. */
4215cabdff1aSopenharmony_ci        PCKEV_B2_SH(tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
4216cabdff1aSopenharmony_ci        ST_SH2(tmp0, tmp1, dst, dst_stride);
4217cabdff1aSopenharmony_ci        dst += (2 * dst_stride);
        /* Second half: output rows 2 and 3. The *10* and *21* pair
         * registers are refreshed with the newest rows and src2 keeps the
         * last row, so they carry over into the next iteration. */
4218cabdff1aSopenharmony_ci        LD_SB2(src0_ptr, src_stride, src5, src2);
4219cabdff1aSopenharmony_ci        src0_ptr += (2 * src_stride);
4220cabdff1aSopenharmony_ci
4221cabdff1aSopenharmony_ci        LD_SH2(src1_ptr, src2_stride, in0, in1);
4222cabdff1aSopenharmony_ci        LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4223cabdff1aSopenharmony_ci        src1_ptr += (2 * src2_stride);
4224cabdff1aSopenharmony_ci        XORI_B2_128_SB(src5, src2);
4225cabdff1aSopenharmony_ci        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
4226cabdff1aSopenharmony_ci        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
4227cabdff1aSopenharmony_ci
4228cabdff1aSopenharmony_ci        tmp0 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
4229cabdff1aSopenharmony_ci        tmp1 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
4230cabdff1aSopenharmony_ci        tmp2 = HEVC_FILT_4TAP_SH(src32_l, src10_l, filt0, filt1);
4231cabdff1aSopenharmony_ci        tmp3 = HEVC_FILT_4TAP_SH(src43_l, src21_l, filt0, filt1);
4232cabdff1aSopenharmony_ci        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
4233cabdff1aSopenharmony_ci                           in0, in1, in2, in3,
4234cabdff1aSopenharmony_ci                           weight_vec, rnd_vec, offset_vec,
4235cabdff1aSopenharmony_ci                           tmp0, tmp1, tmp2, tmp3);
4236cabdff1aSopenharmony_ci
4237cabdff1aSopenharmony_ci        PCKEV_B2_SH(tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
4238cabdff1aSopenharmony_ci        ST_SH2(tmp0, tmp1, dst, dst_stride);
4239cabdff1aSopenharmony_ci        dst += (2 * dst_stride);
4240cabdff1aSopenharmony_ci    }
4241cabdff1aSopenharmony_ci}
4242cabdff1aSopenharmony_ci
4243cabdff1aSopenharmony_cistatic void hevc_vt_biwgt_4t_24w_msa(uint8_t *src0_ptr,
4244cabdff1aSopenharmony_ci                                     int32_t src_stride,
4245cabdff1aSopenharmony_ci                                     int16_t *src1_ptr,
4246cabdff1aSopenharmony_ci                                     int32_t src2_stride,
4247cabdff1aSopenharmony_ci                                     uint8_t *dst,
4248cabdff1aSopenharmony_ci                                     int32_t dst_stride,
4249cabdff1aSopenharmony_ci                                     const int8_t *filter,
4250cabdff1aSopenharmony_ci                                     int32_t height,
4251cabdff1aSopenharmony_ci                                     int32_t weight0,
4252cabdff1aSopenharmony_ci                                     int32_t weight1,
4253cabdff1aSopenharmony_ci                                     int32_t offset0,
4254cabdff1aSopenharmony_ci                                     int32_t offset1,
4255cabdff1aSopenharmony_ci                                     int32_t rnd_val)
4256cabdff1aSopenharmony_ci{
4257cabdff1aSopenharmony_ci    uint32_t loop_cnt;
4258cabdff1aSopenharmony_ci    int32_t offset, weight, constant;
4259cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5;
4260cabdff1aSopenharmony_ci    v16i8 src6, src7, src8, src9, src10, src11;
4261cabdff1aSopenharmony_ci    v8i16 in0, in1, in2, in3, in4, in5;
4262cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src76_r, src98_r;
4263cabdff1aSopenharmony_ci    v16i8 src10_l, src32_l, src21_l, src43_l;
4264cabdff1aSopenharmony_ci    v16i8 src21_r, src43_r, src87_r, src109_r;
4265cabdff1aSopenharmony_ci    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
4266cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
4267cabdff1aSopenharmony_ci    v8i16 filter_vec;
4268cabdff1aSopenharmony_ci    v4i32 weight_vec, offset_vec, rnd_vec;
4269cabdff1aSopenharmony_ci
4270cabdff1aSopenharmony_ci    src0_ptr -= src_stride;
4271cabdff1aSopenharmony_ci
4272cabdff1aSopenharmony_ci    offset = (offset0 + offset1) << rnd_val;
4273cabdff1aSopenharmony_ci    weight0 = weight0 & 0x0000FFFF;
4274cabdff1aSopenharmony_ci    weight = weight0 | (weight1 << 16);
4275cabdff1aSopenharmony_ci    constant = 128 * weight1;
4276cabdff1aSopenharmony_ci    constant <<= 6;
4277cabdff1aSopenharmony_ci    offset += constant;
4278cabdff1aSopenharmony_ci
4279cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
4280cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
4281cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val + 1);
4282cabdff1aSopenharmony_ci
4283cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
4284cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4285cabdff1aSopenharmony_ci
4286cabdff1aSopenharmony_ci    /* 16width */
4287cabdff1aSopenharmony_ci    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4288cabdff1aSopenharmony_ci    XORI_B3_128_SB(src0, src1, src2);
4289cabdff1aSopenharmony_ci    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
4290cabdff1aSopenharmony_ci    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
4291cabdff1aSopenharmony_ci    /* 8width */
4292cabdff1aSopenharmony_ci    LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
4293cabdff1aSopenharmony_ci    src0_ptr += (3 * src_stride);
4294cabdff1aSopenharmony_ci    XORI_B3_128_SB(src6, src7, src8);
4295cabdff1aSopenharmony_ci    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
4296cabdff1aSopenharmony_ci
4297cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
4298cabdff1aSopenharmony_ci        /* 16width */
4299cabdff1aSopenharmony_ci        LD_SB2(src0_ptr, src_stride, src3, src4);
4300cabdff1aSopenharmony_ci        LD_SH2(src1_ptr, src2_stride, in0, in1);
4301cabdff1aSopenharmony_ci        LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4302cabdff1aSopenharmony_ci        XORI_B2_128_SB(src3, src4);
4303cabdff1aSopenharmony_ci        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4304cabdff1aSopenharmony_ci        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
4305cabdff1aSopenharmony_ci
4306cabdff1aSopenharmony_ci        /* 8width */
4307cabdff1aSopenharmony_ci        LD_SB2(src0_ptr + 16, src_stride, src9, src10);
4308cabdff1aSopenharmony_ci        src0_ptr += (2 * src_stride);
4309cabdff1aSopenharmony_ci        LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
4310cabdff1aSopenharmony_ci        src1_ptr += (2 * src2_stride);
4311cabdff1aSopenharmony_ci        XORI_B2_128_SB(src9, src10);
4312cabdff1aSopenharmony_ci        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
4313cabdff1aSopenharmony_ci        /* 16width */
4314cabdff1aSopenharmony_ci        tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
4315cabdff1aSopenharmony_ci        tmp4 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
4316cabdff1aSopenharmony_ci        tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
4317cabdff1aSopenharmony_ci        tmp5 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
4318cabdff1aSopenharmony_ci        /* 8width */
4319cabdff1aSopenharmony_ci        tmp2 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
4320cabdff1aSopenharmony_ci        tmp3 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
4321cabdff1aSopenharmony_ci        /* 16width */
4322cabdff1aSopenharmony_ci        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
4323cabdff1aSopenharmony_ci                           in0, in1, in2, in3,
4324cabdff1aSopenharmony_ci                           weight_vec, rnd_vec, offset_vec,
4325cabdff1aSopenharmony_ci                           tmp0, tmp1, tmp4, tmp5);
4326cabdff1aSopenharmony_ci        /* 8width */
4327cabdff1aSopenharmony_ci        HEVC_BIW_RND_CLIP2(tmp2, tmp3, in4, in5,
4328cabdff1aSopenharmony_ci                           weight_vec, rnd_vec, offset_vec,
4329cabdff1aSopenharmony_ci                           tmp2, tmp3);
4330cabdff1aSopenharmony_ci        /* 16width */
4331cabdff1aSopenharmony_ci        PCKEV_B2_SH(tmp4, tmp0, tmp5, tmp1, tmp0, tmp1);
4332cabdff1aSopenharmony_ci        /* 8width */
4333cabdff1aSopenharmony_ci        tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp3, (v16i8) tmp2);
4334cabdff1aSopenharmony_ci        ST_SH2(tmp0, tmp1, dst, dst_stride);
4335cabdff1aSopenharmony_ci        ST_D2(tmp2, 0, 1, dst + 16, dst_stride);
4336cabdff1aSopenharmony_ci        dst += (2 * dst_stride);
4337cabdff1aSopenharmony_ci
4338cabdff1aSopenharmony_ci        /* 16width */
4339cabdff1aSopenharmony_ci        LD_SB2(src0_ptr, src_stride, src5, src2);
4340cabdff1aSopenharmony_ci        LD_SH2(src1_ptr, src2_stride, in0, in1);
4341cabdff1aSopenharmony_ci        LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4342cabdff1aSopenharmony_ci        XORI_B2_128_SB(src5, src2);
4343cabdff1aSopenharmony_ci        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
4344cabdff1aSopenharmony_ci        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
4345cabdff1aSopenharmony_ci        /* 8width */
4346cabdff1aSopenharmony_ci        LD_SB2(src0_ptr + 16, src_stride, src11, src8);
4347cabdff1aSopenharmony_ci        src0_ptr += (2 * src_stride);
4348cabdff1aSopenharmony_ci        LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
4349cabdff1aSopenharmony_ci        src1_ptr += (2 * src2_stride);
4350cabdff1aSopenharmony_ci        XORI_B2_128_SB(src11, src8);
4351cabdff1aSopenharmony_ci        ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
4352cabdff1aSopenharmony_ci        /* 16width */
4353cabdff1aSopenharmony_ci        tmp0 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
4354cabdff1aSopenharmony_ci        tmp4 = HEVC_FILT_4TAP_SH(src32_l, src10_l, filt0, filt1);
4355cabdff1aSopenharmony_ci        tmp1 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
4356cabdff1aSopenharmony_ci        tmp5 = HEVC_FILT_4TAP_SH(src43_l, src21_l, filt0, filt1);
4357cabdff1aSopenharmony_ci        /* 8width */
4358cabdff1aSopenharmony_ci        tmp2 = HEVC_FILT_4TAP_SH(src98_r, src76_r, filt0, filt1);
4359cabdff1aSopenharmony_ci        tmp3 = HEVC_FILT_4TAP_SH(src109_r, src87_r, filt0, filt1);
4360cabdff1aSopenharmony_ci        /* 16width */
4361cabdff1aSopenharmony_ci        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
4362cabdff1aSopenharmony_ci                           in0, in1, in2, in3,
4363cabdff1aSopenharmony_ci                           weight_vec, rnd_vec, offset_vec,
4364cabdff1aSopenharmony_ci                           tmp0, tmp1, tmp4, tmp5);
4365cabdff1aSopenharmony_ci        /* 8width */
4366cabdff1aSopenharmony_ci        HEVC_BIW_RND_CLIP2(tmp2, tmp3, in4, in5,
4367cabdff1aSopenharmony_ci                           weight_vec, rnd_vec, offset_vec,
4368cabdff1aSopenharmony_ci                           tmp2, tmp3);
4369cabdff1aSopenharmony_ci        /* 16width */
4370cabdff1aSopenharmony_ci        PCKEV_B2_SH(tmp4, tmp0, tmp5, tmp1, tmp0, tmp1);
4371cabdff1aSopenharmony_ci
4372cabdff1aSopenharmony_ci        /* 8width */
4373cabdff1aSopenharmony_ci        tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp3, (v16i8) tmp2);
4374cabdff1aSopenharmony_ci        ST_SH2(tmp0, tmp1, dst, dst_stride);
4375cabdff1aSopenharmony_ci        ST_D2(tmp2, 0, 1, dst + 16, dst_stride);
4376cabdff1aSopenharmony_ci        dst += (2 * dst_stride);
4377cabdff1aSopenharmony_ci    }
4378cabdff1aSopenharmony_ci}
4379cabdff1aSopenharmony_ci
4380cabdff1aSopenharmony_cistatic void hevc_vt_biwgt_4t_32w_msa(uint8_t *src0_ptr,
4381cabdff1aSopenharmony_ci                                     int32_t src_stride,
4382cabdff1aSopenharmony_ci                                     int16_t *src1_ptr,
4383cabdff1aSopenharmony_ci                                     int32_t src2_stride,
4384cabdff1aSopenharmony_ci                                     uint8_t *dst,
4385cabdff1aSopenharmony_ci                                     int32_t dst_stride,
4386cabdff1aSopenharmony_ci                                     const int8_t *filter,
4387cabdff1aSopenharmony_ci                                     int32_t height,
4388cabdff1aSopenharmony_ci                                     int32_t weight0,
4389cabdff1aSopenharmony_ci                                     int32_t weight1,
4390cabdff1aSopenharmony_ci                                     int32_t offset0,
4391cabdff1aSopenharmony_ci                                     int32_t offset1,
4392cabdff1aSopenharmony_ci                                     int32_t rnd_val)
4393cabdff1aSopenharmony_ci{
4394cabdff1aSopenharmony_ci    uint32_t loop_cnt;
4395cabdff1aSopenharmony_ci    uint8_t *dst_tmp = dst + 16;
4396cabdff1aSopenharmony_ci    int32_t offset, weight, constant;
4397cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
4398cabdff1aSopenharmony_ci    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
4399cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src76_r, src98_r;
4400cabdff1aSopenharmony_ci    v16i8 src21_r, src43_r, src87_r, src109_r;
4401cabdff1aSopenharmony_ci    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
4402cabdff1aSopenharmony_ci    v16i8 src10_l, src32_l, src76_l, src98_l;
4403cabdff1aSopenharmony_ci    v16i8 src21_l, src43_l, src87_l, src109_l;
4404cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
4405cabdff1aSopenharmony_ci    v8i16 filter_vec;
4406cabdff1aSopenharmony_ci    v4i32 weight_vec, offset_vec, rnd_vec;
4407cabdff1aSopenharmony_ci
4408cabdff1aSopenharmony_ci    src0_ptr -= src_stride;
4409cabdff1aSopenharmony_ci
4410cabdff1aSopenharmony_ci    offset = (offset0 + offset1) << rnd_val;
4411cabdff1aSopenharmony_ci    weight0 = weight0 & 0x0000FFFF;
4412cabdff1aSopenharmony_ci    weight = weight0 | (weight1 << 16);
4413cabdff1aSopenharmony_ci    constant = 128 * weight1;
4414cabdff1aSopenharmony_ci    constant <<= 6;
4415cabdff1aSopenharmony_ci    offset += constant;
4416cabdff1aSopenharmony_ci
4417cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
4418cabdff1aSopenharmony_ci    weight_vec = __msa_fill_w(weight);
4419cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val + 1);
4420cabdff1aSopenharmony_ci
4421cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
4422cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4423cabdff1aSopenharmony_ci
4424cabdff1aSopenharmony_ci    /* 16width */
4425cabdff1aSopenharmony_ci    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4426cabdff1aSopenharmony_ci    XORI_B3_128_SB(src0, src1, src2);
4427cabdff1aSopenharmony_ci    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
4428cabdff1aSopenharmony_ci    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
4429cabdff1aSopenharmony_ci    /* next 16width */
4430cabdff1aSopenharmony_ci    LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
4431cabdff1aSopenharmony_ci    src0_ptr += (3 * src_stride);
4432cabdff1aSopenharmony_ci    XORI_B3_128_SB(src6, src7, src8);
4433cabdff1aSopenharmony_ci    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
4434cabdff1aSopenharmony_ci    ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
4435cabdff1aSopenharmony_ci
4436cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 1); loop_cnt--;) {
4437cabdff1aSopenharmony_ci        /* 16width */
4438cabdff1aSopenharmony_ci        LD_SB2(src0_ptr, src_stride, src3, src4);
4439cabdff1aSopenharmony_ci        LD_SH2(src1_ptr, src2_stride, in0, in1);
4440cabdff1aSopenharmony_ci        LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4441cabdff1aSopenharmony_ci        XORI_B2_128_SB(src3, src4);
4442cabdff1aSopenharmony_ci        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4443cabdff1aSopenharmony_ci        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
4444cabdff1aSopenharmony_ci
4445cabdff1aSopenharmony_ci        /* 16width */
4446cabdff1aSopenharmony_ci        tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
4447cabdff1aSopenharmony_ci        tmp4 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
4448cabdff1aSopenharmony_ci        tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
4449cabdff1aSopenharmony_ci        tmp5 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
4450cabdff1aSopenharmony_ci        /* 16width */
4451cabdff1aSopenharmony_ci        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
4452cabdff1aSopenharmony_ci                           in0, in1, in2, in3,
4453cabdff1aSopenharmony_ci                           weight_vec, rnd_vec, offset_vec,
4454cabdff1aSopenharmony_ci                           tmp0, tmp1, tmp4, tmp5);
4455cabdff1aSopenharmony_ci        /* 16width */
4456cabdff1aSopenharmony_ci        PCKEV_B2_SH(tmp4, tmp0, tmp5, tmp1, tmp0, tmp1);
4457cabdff1aSopenharmony_ci        ST_SH2(tmp0, tmp1, dst, dst_stride);
4458cabdff1aSopenharmony_ci        dst += (2 * dst_stride);
4459cabdff1aSopenharmony_ci
4460cabdff1aSopenharmony_ci        src10_r = src32_r;
4461cabdff1aSopenharmony_ci        src21_r = src43_r;
4462cabdff1aSopenharmony_ci        src10_l = src32_l;
4463cabdff1aSopenharmony_ci        src21_l = src43_l;
4464cabdff1aSopenharmony_ci        src2 = src4;
4465cabdff1aSopenharmony_ci
4466cabdff1aSopenharmony_ci        /* next 16width */
4467cabdff1aSopenharmony_ci        LD_SB2(src0_ptr + 16, src_stride, src9, src10);
4468cabdff1aSopenharmony_ci        src0_ptr += (2 * src_stride);
4469cabdff1aSopenharmony_ci        LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
4470cabdff1aSopenharmony_ci        LD_SH2(src1_ptr + 24, src2_stride, in6, in7);
4471cabdff1aSopenharmony_ci        src1_ptr += (2 * src2_stride);
4472cabdff1aSopenharmony_ci        XORI_B2_128_SB(src9, src10);
4473cabdff1aSopenharmony_ci        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
4474cabdff1aSopenharmony_ci        ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
4475cabdff1aSopenharmony_ci        /* next 16width */
4476cabdff1aSopenharmony_ci        tmp2 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
4477cabdff1aSopenharmony_ci        tmp6 = HEVC_FILT_4TAP_SH(src76_l, src98_l, filt0, filt1);
4478cabdff1aSopenharmony_ci        tmp3 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
4479cabdff1aSopenharmony_ci        tmp7 = HEVC_FILT_4TAP_SH(src87_l, src109_l, filt0, filt1);
4480cabdff1aSopenharmony_ci        /* next 16width */
4481cabdff1aSopenharmony_ci        HEVC_BIW_RND_CLIP4(tmp2, tmp3, tmp6, tmp7,
4482cabdff1aSopenharmony_ci                           in4, in5, in6, in7,
4483cabdff1aSopenharmony_ci                           weight_vec, rnd_vec, offset_vec,
4484cabdff1aSopenharmony_ci                           tmp2, tmp3, tmp6, tmp7);
4485cabdff1aSopenharmony_ci
4486cabdff1aSopenharmony_ci        /* next 16width */
4487cabdff1aSopenharmony_ci        PCKEV_B2_SH(tmp6, tmp2, tmp7, tmp3, tmp2, tmp3);
4488cabdff1aSopenharmony_ci        ST_SH2(tmp2, tmp3, dst_tmp, dst_stride);
4489cabdff1aSopenharmony_ci        dst_tmp += (2 * dst_stride);
4490cabdff1aSopenharmony_ci
4491cabdff1aSopenharmony_ci        src76_r = src98_r;
4492cabdff1aSopenharmony_ci        src87_r = src109_r;
4493cabdff1aSopenharmony_ci        src76_l = src98_l;
4494cabdff1aSopenharmony_ci        src87_l = src109_l;
4495cabdff1aSopenharmony_ci        src8 = src10;
4496cabdff1aSopenharmony_ci    }
4497cabdff1aSopenharmony_ci}
4498cabdff1aSopenharmony_ci
4499cabdff1aSopenharmony_cistatic void hevc_hv_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
4500cabdff1aSopenharmony_ci                                     int32_t src_stride,
4501cabdff1aSopenharmony_ci                                     int16_t *src1_ptr,
4502cabdff1aSopenharmony_ci                                     int32_t src2_stride,
4503cabdff1aSopenharmony_ci                                     uint8_t *dst,
4504cabdff1aSopenharmony_ci                                     int32_t dst_stride,
4505cabdff1aSopenharmony_ci                                     const int8_t *filter_x,
4506cabdff1aSopenharmony_ci                                     const int8_t *filter_y,
4507cabdff1aSopenharmony_ci                                     int32_t weight0,
4508cabdff1aSopenharmony_ci                                     int32_t weight1,
4509cabdff1aSopenharmony_ci                                     int32_t offset0,
4510cabdff1aSopenharmony_ci                                     int32_t offset1,
4511cabdff1aSopenharmony_ci                                     int32_t rnd_val)
4512cabdff1aSopenharmony_ci{
4513cabdff1aSopenharmony_ci    uint64_t tp0, tp1;
4514cabdff1aSopenharmony_ci    int32_t offset, weight;
4515cabdff1aSopenharmony_ci    v8i16 in0 = { 0 };
4516cabdff1aSopenharmony_ci    v16u8 out;
4517cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4;
4518cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
4519cabdff1aSopenharmony_ci    v8i16 filt_h0, filt_h1;
4520cabdff1aSopenharmony_ci    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
4521cabdff1aSopenharmony_ci    v16i8 mask1;
4522cabdff1aSopenharmony_ci    v8i16 filter_vec, tmp, weight_vec;
4523cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
4524cabdff1aSopenharmony_ci    v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43, tmp0, tmp1;
4525cabdff1aSopenharmony_ci    v4i32 dst0, dst1, offset_vec, rnd_vec, const_vec;
4526cabdff1aSopenharmony_ci
4527cabdff1aSopenharmony_ci    src0_ptr -= (src_stride + 1);
4528cabdff1aSopenharmony_ci
4529cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_x);
4530cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4531cabdff1aSopenharmony_ci
4532cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_y);
4533cabdff1aSopenharmony_ci    UNPCK_R_SB_SH(filter_vec, filter_vec);
4534cabdff1aSopenharmony_ci
4535cabdff1aSopenharmony_ci    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4536cabdff1aSopenharmony_ci
4537cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
4538cabdff1aSopenharmony_ci
4539cabdff1aSopenharmony_ci    offset = (offset0 + offset1) << rnd_val;
4540cabdff1aSopenharmony_ci    weight0 = weight0 & 0x0000FFFF;
4541cabdff1aSopenharmony_ci    weight = weight0 | (weight1 << 16);
4542cabdff1aSopenharmony_ci
4543cabdff1aSopenharmony_ci    const_vec = __msa_fill_w((128 * weight1));
4544cabdff1aSopenharmony_ci    const_vec <<= 6;
4545cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
4546cabdff1aSopenharmony_ci    weight_vec = (v8i16) __msa_fill_w(weight);
4547cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val + 1);
4548cabdff1aSopenharmony_ci    offset_vec += const_vec;
4549cabdff1aSopenharmony_ci
4550cabdff1aSopenharmony_ci    LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
4551cabdff1aSopenharmony_ci    XORI_B5_128_SB(src0, src1, src2, src3, src4);
4552cabdff1aSopenharmony_ci
4553cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1);
4554cabdff1aSopenharmony_ci    VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3);
4555cabdff1aSopenharmony_ci    VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);
4556cabdff1aSopenharmony_ci
4557cabdff1aSopenharmony_ci    dst20 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4558cabdff1aSopenharmony_ci    dst31 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4559cabdff1aSopenharmony_ci    dst42 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4560cabdff1aSopenharmony_ci
4561cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst31, dst20, dst10, dst32);
4562cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst42, dst31, dst21, dst43);
4563cabdff1aSopenharmony_ci
4564cabdff1aSopenharmony_ci    dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
4565cabdff1aSopenharmony_ci    dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
4566cabdff1aSopenharmony_ci    dst0 >>= 6;
4567cabdff1aSopenharmony_ci    dst1 >>= 6;
4568cabdff1aSopenharmony_ci    dst0 = (v4i32) __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
4569cabdff1aSopenharmony_ci
4570cabdff1aSopenharmony_ci    LD2(src1_ptr, src2_stride, tp0, tp1);
4571cabdff1aSopenharmony_ci    INSERT_D2_SH(tp0, tp1, in0);
4572cabdff1aSopenharmony_ci
4573cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
4574cabdff1aSopenharmony_ci    dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
4575cabdff1aSopenharmony_ci    dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
4576cabdff1aSopenharmony_ci    SRAR_W2_SW(dst0, dst1, rnd_vec);
4577cabdff1aSopenharmony_ci    tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
4578cabdff1aSopenharmony_ci    CLIP_SH_0_255(tmp);
4579cabdff1aSopenharmony_ci    out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
4580cabdff1aSopenharmony_ci    ST_W2(out, 0, 1, dst, dst_stride);
4581cabdff1aSopenharmony_ci}
4582cabdff1aSopenharmony_ci
4583cabdff1aSopenharmony_cistatic void hevc_hv_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
4584cabdff1aSopenharmony_ci                                     int32_t src_stride,
4585cabdff1aSopenharmony_ci                                     int16_t *src1_ptr,
4586cabdff1aSopenharmony_ci                                     int32_t src2_stride,
4587cabdff1aSopenharmony_ci                                     uint8_t *dst,
4588cabdff1aSopenharmony_ci                                     int32_t dst_stride,
4589cabdff1aSopenharmony_ci                                     const int8_t *filter_x,
4590cabdff1aSopenharmony_ci                                     const int8_t *filter_y,
4591cabdff1aSopenharmony_ci                                     int32_t weight0,
4592cabdff1aSopenharmony_ci                                     int32_t weight1,
4593cabdff1aSopenharmony_ci                                     int32_t offset0,
4594cabdff1aSopenharmony_ci                                     int32_t offset1,
4595cabdff1aSopenharmony_ci                                     int32_t rnd_val)
4596cabdff1aSopenharmony_ci{
4597cabdff1aSopenharmony_ci    uint64_t tp0, tp1;
4598cabdff1aSopenharmony_ci    int32_t offset, weight;
4599cabdff1aSopenharmony_ci    v16u8 out;
4600cabdff1aSopenharmony_ci    v8i16 in0 = { 0 }, in1 = { 0 };
4601cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6;
4602cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
4603cabdff1aSopenharmony_ci    v8i16 filt_h0, filt_h1;
4604cabdff1aSopenharmony_ci    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
4605cabdff1aSopenharmony_ci    v16i8 mask1;
4606cabdff1aSopenharmony_ci    v8i16 filter_vec, weight_vec;
4607cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4608cabdff1aSopenharmony_ci    v8i16 tmp0, tmp1, tmp2, tmp3;
4609cabdff1aSopenharmony_ci    v8i16 dst30, dst41, dst52, dst63;
4610cabdff1aSopenharmony_ci    v8i16 dst10, dst32, dst54, dst21, dst43, dst65;
4611cabdff1aSopenharmony_ci    v4i32 offset_vec, rnd_vec, const_vec;
4612cabdff1aSopenharmony_ci    v4i32 dst0, dst1, dst2, dst3;
4613cabdff1aSopenharmony_ci
4614cabdff1aSopenharmony_ci    src0_ptr -= (src_stride + 1);
4615cabdff1aSopenharmony_ci
4616cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_x);
4617cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4618cabdff1aSopenharmony_ci
4619cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_y);
4620cabdff1aSopenharmony_ci    UNPCK_R_SB_SH(filter_vec, filter_vec);
4621cabdff1aSopenharmony_ci
4622cabdff1aSopenharmony_ci    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4623cabdff1aSopenharmony_ci
4624cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
4625cabdff1aSopenharmony_ci
4626cabdff1aSopenharmony_ci    offset = (offset0 + offset1) << rnd_val;
4627cabdff1aSopenharmony_ci    weight0 = weight0 & 0x0000FFFF;
4628cabdff1aSopenharmony_ci    weight = weight0 | (weight1 << 16);
4629cabdff1aSopenharmony_ci
4630cabdff1aSopenharmony_ci    const_vec = __msa_fill_w((128 * weight1));
4631cabdff1aSopenharmony_ci    const_vec <<= 6;
4632cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
4633cabdff1aSopenharmony_ci    weight_vec = (v8i16) __msa_fill_w(weight);
4634cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val + 1);
4635cabdff1aSopenharmony_ci    offset_vec += const_vec;
4636cabdff1aSopenharmony_ci
4637cabdff1aSopenharmony_ci    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
4638cabdff1aSopenharmony_ci    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
4639cabdff1aSopenharmony_ci
4640cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1);
4641cabdff1aSopenharmony_ci    VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3);
4642cabdff1aSopenharmony_ci    VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5);
4643cabdff1aSopenharmony_ci    VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);
4644cabdff1aSopenharmony_ci
4645cabdff1aSopenharmony_ci    dst30 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4646cabdff1aSopenharmony_ci    dst41 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4647cabdff1aSopenharmony_ci    dst52 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4648cabdff1aSopenharmony_ci    dst63 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4649cabdff1aSopenharmony_ci
4650cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst41, dst30, dst10, dst43);
4651cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst52, dst41, dst21, dst54);
4652cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst63, dst52, dst32, dst65);
4653cabdff1aSopenharmony_ci    dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
4654cabdff1aSopenharmony_ci    dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
4655cabdff1aSopenharmony_ci    dst2 = HEVC_FILT_4TAP(dst32, dst54, filt_h0, filt_h1);
4656cabdff1aSopenharmony_ci    dst3 = HEVC_FILT_4TAP(dst43, dst65, filt_h0, filt_h1);
4657cabdff1aSopenharmony_ci    SRA_4V(dst0, dst1, dst2, dst3, 6);
4658cabdff1aSopenharmony_ci    PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp1, tmp3);
4659cabdff1aSopenharmony_ci
4660cabdff1aSopenharmony_ci    LD2(src1_ptr, src2_stride, tp0, tp1);
4661cabdff1aSopenharmony_ci    INSERT_D2_SH(tp0, tp1, in0);
4662cabdff1aSopenharmony_ci    src1_ptr += (2 * src2_stride);
4663cabdff1aSopenharmony_ci    LD2(src1_ptr, src2_stride, tp0, tp1);
4664cabdff1aSopenharmony_ci    INSERT_D2_SH(tp0, tp1, in1);
4665cabdff1aSopenharmony_ci
4666cabdff1aSopenharmony_ci    ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
4667cabdff1aSopenharmony_ci    ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
4668cabdff1aSopenharmony_ci
4669cabdff1aSopenharmony_ci    dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
4670cabdff1aSopenharmony_ci    dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
4671cabdff1aSopenharmony_ci    dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
4672cabdff1aSopenharmony_ci    dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
4673cabdff1aSopenharmony_ci    SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
4674cabdff1aSopenharmony_ci    PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
4675cabdff1aSopenharmony_ci    CLIP_SH2_0_255(tmp0, tmp1);
4676cabdff1aSopenharmony_ci    out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
4677cabdff1aSopenharmony_ci    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
4678cabdff1aSopenharmony_ci}
4679cabdff1aSopenharmony_ci
/*
 * Bi-weighted 4-tap horizontal + vertical filter for 4-pixel-wide blocks,
 * producing eight output rows per loop iteration.
 *
 * The src0_ptr samples are filtered horizontally, then vertically, and the
 * result is combined with the 16-bit samples read from src1_ptr through a
 * weighted sum, rounded, clipped and stored as 8-bit pixels.
 *
 * The helper macros used below are defined outside this function; where a
 * comment describes what a macro does, it follows the macro's name.
 *
 * src0_ptr     8-bit input; stepped back one row and one byte before use
 * src_stride   row pitch of src0_ptr
 * src1_ptr     16-bit input; advanced eight rows per iteration
 * src2_stride  row pitch of src1_ptr, in int16_t elements
 * dst          8-bit output; advanced eight rows per iteration
 * dst_stride   row pitch of dst
 * filter_x     horizontal filter coefficients
 * filter_y     vertical filter coefficients
 * height       row count; only (height >> 3) groups of eight rows are
 *              produced, a remainder is not processed
 * weight0      packed into the low 16 bits of every 32-bit weight word
 * weight1      packed into the high 16 bits; also scales const_vec
 * offset0      summed with offset1, then shifted left by rnd_val
 * offset1      see offset0
 * rnd_val      rounding control; the final shift amount is rnd_val + 1
 */
4680cabdff1aSopenharmony_cistatic void hevc_hv_biwgt_4t_4multx8mult_msa(uint8_t *src0_ptr,
4681cabdff1aSopenharmony_ci                                             int32_t src_stride,
4682cabdff1aSopenharmony_ci                                             int16_t *src1_ptr,
4683cabdff1aSopenharmony_ci                                             int32_t src2_stride,
4684cabdff1aSopenharmony_ci                                             uint8_t *dst,
4685cabdff1aSopenharmony_ci                                             int32_t dst_stride,
4686cabdff1aSopenharmony_ci                                             const int8_t *filter_x,
4687cabdff1aSopenharmony_ci                                             const int8_t *filter_y,
4688cabdff1aSopenharmony_ci                                             int32_t height,
4689cabdff1aSopenharmony_ci                                             int32_t weight0,
4690cabdff1aSopenharmony_ci                                             int32_t weight1,
4691cabdff1aSopenharmony_ci                                             int32_t offset0,
4692cabdff1aSopenharmony_ci                                             int32_t offset1,
4693cabdff1aSopenharmony_ci                                             int32_t rnd_val)
4694cabdff1aSopenharmony_ci{
4695cabdff1aSopenharmony_ci    uint32_t loop_cnt;
4696cabdff1aSopenharmony_ci    uint64_t tp0, tp1;
4697cabdff1aSopenharmony_ci    int32_t offset, weight;
4698cabdff1aSopenharmony_ci    v16u8 out0, out1;
4699cabdff1aSopenharmony_ci    v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
4700cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4701cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
4702cabdff1aSopenharmony_ci    v8i16 filt_h0, filt_h1;
4703cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    /* Second 16-byte row of the shuffle table (byte offset 16).
     * NOTE(review): its indices 16..20 presumably select from the second
     * shuffle operand, so one shuffle packs windows from two rows -- confirm
     * against VSHF_B2_SB. */
4704cabdff1aSopenharmony_ci    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
4705cabdff1aSopenharmony_ci    v16i8 mask1;
4706cabdff1aSopenharmony_ci    v8i16 filter_vec, weight_vec;
4707cabdff1aSopenharmony_ci    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
4708cabdff1aSopenharmony_ci    v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
4709cabdff1aSopenharmony_ci    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
4710cabdff1aSopenharmony_ci    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
4711cabdff1aSopenharmony_ci    v8i16 dst98_r, dst109_r;
4712cabdff1aSopenharmony_ci    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
4713cabdff1aSopenharmony_ci    v4i32 offset_vec, rnd_vec, const_vec;
4714cabdff1aSopenharmony_ci
    /* Start one row above and one byte to the left of the block. */
4715cabdff1aSopenharmony_ci    src0_ptr -= (src_stride + 1);
4716cabdff1aSopenharmony_ci
    /* Horizontal coefficients: two splatted half-words of the loaded vector. */
4717cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_x);
4718cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4719cabdff1aSopenharmony_ci
    /* Vertical coefficients: unpacked first (presumably sign-extended from
     * 8 to 16 bit), then splatted as two 32-bit words. */
4720cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_y);
4721cabdff1aSopenharmony_ci    UNPCK_R_SB_SH(filter_vec, filter_vec);
4722cabdff1aSopenharmony_ci
4723cabdff1aSopenharmony_ci    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4724cabdff1aSopenharmony_ci
    /* Same shuffle pattern with every index moved two bytes further on. */
4725cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
4726cabdff1aSopenharmony_ci
    /* Pack both weights into one 32-bit word: weight0 in bits 0..15,
     * weight1 in bits 16..31. */
4727cabdff1aSopenharmony_ci    offset = (offset0 + offset1) << rnd_val;
4728cabdff1aSopenharmony_ci    weight0 = weight0 & 0x0000FFFF;
4729cabdff1aSopenharmony_ci    weight = weight0 | (weight1 << 16);
4730cabdff1aSopenharmony_ci
    /* const_vec = (128 * weight1) << 6 is folded into offset_vec.
     * NOTE(review): looks like compensation for the 128 bias applied to the
     * src0_ptr samples by the XORI_*_128 macros -- confirm. */
4731cabdff1aSopenharmony_ci    const_vec = __msa_fill_w((128 * weight1));
4732cabdff1aSopenharmony_ci    const_vec <<= 6;
4733cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
4734cabdff1aSopenharmony_ci    weight_vec = (v8i16) __msa_fill_w(weight);
4735cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val + 1);
4736cabdff1aSopenharmony_ci    offset_vec += const_vec;
4737cabdff1aSopenharmony_ci
    /* Prologue: the first three rows seed the vertical filter history. */
4738cabdff1aSopenharmony_ci    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4739cabdff1aSopenharmony_ci    src0_ptr += (3 * src_stride);
4740cabdff1aSopenharmony_ci    XORI_B3_128_SB(src0, src1, src2);
4741cabdff1aSopenharmony_ci
    /* Horizontal pass on the row pairs (0,1) and (1,2). */
4742cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
4743cabdff1aSopenharmony_ci    VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3);
4744cabdff1aSopenharmony_ci    dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4745cabdff1aSopenharmony_ci    dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4746cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
    /* Duplicate the upper 64 bits of dst21 into both halves of dst22. */
4747cabdff1aSopenharmony_ci    dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
4748cabdff1aSopenharmony_ci
4749cabdff1aSopenharmony_ci    for (loop_cnt = height >> 3; loop_cnt--;) {
        /* Fetch eight new source rows. */
4750cabdff1aSopenharmony_ci        LD_SB8(src0_ptr, src_stride,
4751cabdff1aSopenharmony_ci               src3, src4, src5, src6, src7, src8, src9, src10);
4752cabdff1aSopenharmony_ci        src0_ptr += (8 * src_stride);
4753cabdff1aSopenharmony_ci        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
        /* Horizontal pass; rows are paired four apart:
         * (3,7), (4,8), (5,9), (6,10). */
4754cabdff1aSopenharmony_ci        VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
4755cabdff1aSopenharmony_ci        VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
4756cabdff1aSopenharmony_ci        VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
4757cabdff1aSopenharmony_ci        VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);
4758cabdff1aSopenharmony_ci
4759cabdff1aSopenharmony_ci        dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4760cabdff1aSopenharmony_ci        dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4761cabdff1aSopenharmony_ci        dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4762cabdff1aSopenharmony_ci        dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4763cabdff1aSopenharmony_ci
        /* Interleave vertically adjacent rows for the vertical pass. dst22
         * carries the previous row in, then is reloaded from dst73 so that
         * dst76_r can be formed. */
4764cabdff1aSopenharmony_ci        dst32_r = __msa_ilvr_h(dst73, dst22);
4765cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
4766cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
4767cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
4768cabdff1aSopenharmony_ci        dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
4769cabdff1aSopenharmony_ci        dst76_r = __msa_ilvr_h(dst22, dst106);
4770cabdff1aSopenharmony_ci
        /* Eight rows from src1_ptr: each LD2 reads two 64-bit values one
         * src2_stride apart, and each in0..in3 vector holds two rows. */
4771cabdff1aSopenharmony_ci        LD2(src1_ptr, src2_stride, tp0, tp1);
4772cabdff1aSopenharmony_ci        src1_ptr += 2 * src2_stride;
4773cabdff1aSopenharmony_ci        INSERT_D2_SH(tp0, tp1, in0);
4774cabdff1aSopenharmony_ci        LD2(src1_ptr, src2_stride, tp0, tp1);
4775cabdff1aSopenharmony_ci        src1_ptr += 2 * src2_stride;
4776cabdff1aSopenharmony_ci        INSERT_D2_SH(tp0, tp1, in1);
4777cabdff1aSopenharmony_ci
4778cabdff1aSopenharmony_ci        LD2(src1_ptr, src2_stride, tp0, tp1);
4779cabdff1aSopenharmony_ci        src1_ptr += 2 * src2_stride;
4780cabdff1aSopenharmony_ci        INSERT_D2_SH(tp0, tp1, in2);
4781cabdff1aSopenharmony_ci        LD2(src1_ptr, src2_stride, tp0, tp1);
4782cabdff1aSopenharmony_ci        src1_ptr += 2 * src2_stride;
4783cabdff1aSopenharmony_ci        INSERT_D2_SH(tp0, tp1, in3);
4784cabdff1aSopenharmony_ci
        /* Vertical pass for the eight output rows, then a shift right by 6. */
4785cabdff1aSopenharmony_ci        dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4786cabdff1aSopenharmony_ci        dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4787cabdff1aSopenharmony_ci        dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4788cabdff1aSopenharmony_ci        dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4789cabdff1aSopenharmony_ci        dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
4790cabdff1aSopenharmony_ci        dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
4791cabdff1aSopenharmony_ci        dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
4792cabdff1aSopenharmony_ci        dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
4793cabdff1aSopenharmony_ci        SRA_4V(dst0, dst1, dst2, dst3, 6);
4794cabdff1aSopenharmony_ci        SRA_4V(dst4, dst5, dst6, dst7, 6);
4795cabdff1aSopenharmony_ci        PCKEV_H4_SW(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst1,
4796cabdff1aSopenharmony_ci                    dst2, dst3);
        /* Weighted sum: interleave the filtered samples with the src1_ptr
         * samples and dot-product-accumulate each pair against the packed
         * weights on top of offset_vec.
         * NOTE(review): which of weight0/weight1 meets which sample depends
         * on the lane order produced by ILVRL_H2_SH -- confirm. */
4797cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
4798cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
4799cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
4800cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
4801cabdff1aSopenharmony_ci        dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
4802cabdff1aSopenharmony_ci        dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
4803cabdff1aSopenharmony_ci        dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
4804cabdff1aSopenharmony_ci        dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
4805cabdff1aSopenharmony_ci        dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
4806cabdff1aSopenharmony_ci        dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
4807cabdff1aSopenharmony_ci        dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
4808cabdff1aSopenharmony_ci        dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
        /* Shift by rnd_val + 1, narrow to 16 bit, clip, narrow to 8 bit and
         * store eight rows. */
4809cabdff1aSopenharmony_ci        SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
4810cabdff1aSopenharmony_ci        SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
4811cabdff1aSopenharmony_ci        PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
4812cabdff1aSopenharmony_ci                    tmp2, tmp3);
4813cabdff1aSopenharmony_ci        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
4814cabdff1aSopenharmony_ci        PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
4815cabdff1aSopenharmony_ci        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
4816cabdff1aSopenharmony_ci        dst += (8 * dst_stride);
4817cabdff1aSopenharmony_ci
        /* Carry the last rows over as history for the next iteration. */
4818cabdff1aSopenharmony_ci        dst10_r = dst98_r;
4819cabdff1aSopenharmony_ci        dst21_r = dst109_r;
4820cabdff1aSopenharmony_ci        dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
4821cabdff1aSopenharmony_ci    }
4822cabdff1aSopenharmony_ci}
4823cabdff1aSopenharmony_ci
/*
 * Height dispatcher for the 4-pixel-wide bi-weighted 4-tap hv filter.
 *
 * height == 2      -> hevc_hv_biwgt_4t_4x2_msa
 * height == 4      -> hevc_hv_biwgt_4t_4x4_msa
 * height % 8 == 0  -> hevc_hv_biwgt_4t_4multx8mult_msa (also receives height)
 *
 * All other arguments are forwarded unchanged. A height that matches none of
 * the cases above falls through every branch and nothing is written to dst.
 */
4824cabdff1aSopenharmony_cistatic void hevc_hv_biwgt_4t_4w_msa(uint8_t *src0_ptr,
4825cabdff1aSopenharmony_ci                                    int32_t src_stride,
4826cabdff1aSopenharmony_ci                                    int16_t *src1_ptr,
4827cabdff1aSopenharmony_ci                                    int32_t src2_stride,
4828cabdff1aSopenharmony_ci                                    uint8_t *dst,
4829cabdff1aSopenharmony_ci                                    int32_t dst_stride,
4830cabdff1aSopenharmony_ci                                    const int8_t *filter_x,
4831cabdff1aSopenharmony_ci                                    const int8_t *filter_y,
4832cabdff1aSopenharmony_ci                                    int32_t height,
4833cabdff1aSopenharmony_ci                                    int32_t weight0,
4834cabdff1aSopenharmony_ci                                    int32_t weight1,
4835cabdff1aSopenharmony_ci                                    int32_t offset0,
4836cabdff1aSopenharmony_ci                                    int32_t offset1,
4837cabdff1aSopenharmony_ci                                    int32_t rnd_val)
4838cabdff1aSopenharmony_ci{
4839cabdff1aSopenharmony_ci    if (2 == height) {
4840cabdff1aSopenharmony_ci        hevc_hv_biwgt_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4841cabdff1aSopenharmony_ci                                 dst, dst_stride, filter_x, filter_y,
4842cabdff1aSopenharmony_ci                                 weight0, weight1, offset0, offset1, rnd_val);
4843cabdff1aSopenharmony_ci    } else if (4 == height) {
4844cabdff1aSopenharmony_ci        hevc_hv_biwgt_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4845cabdff1aSopenharmony_ci                                 dst, dst_stride, filter_x, filter_y,
4846cabdff1aSopenharmony_ci                                 weight0, weight1, offset0, offset1, rnd_val);
4847cabdff1aSopenharmony_ci    } else if (0 == (height % 8)) {
4848cabdff1aSopenharmony_ci        hevc_hv_biwgt_4t_4multx8mult_msa(src0_ptr, src_stride,
4849cabdff1aSopenharmony_ci                                         src1_ptr, src2_stride,
4850cabdff1aSopenharmony_ci                                         dst, dst_stride, filter_x, filter_y,
4851cabdff1aSopenharmony_ci                                         height, weight0, weight1,
4852cabdff1aSopenharmony_ci                                         offset0, offset1, rnd_val);
4853cabdff1aSopenharmony_ci    }
    /* No final else: any other height is silently left unprocessed. */
4854cabdff1aSopenharmony_ci}
4855cabdff1aSopenharmony_ci
/*
 * Bi-weighted 4-tap horizontal + vertical filter for 6-pixel-wide blocks.
 *
 * Straight-line code without a row loop: 3 + 8 rows are loaded from src0_ptr
 * and eight output rows are stored. The height argument is never read here.
 *
 * Output columns 0..3 are stored as one 32-bit word per row (ST_W8 at dst),
 * columns 4..5 as one 16-bit half-word per row (ST_H8 at dst + 4).
 *
 * The helper macros used below are defined outside this function; where a
 * comment describes what a macro does, it follows the macro's name.
 *
 * src0_ptr     8-bit input; stepped back one row and one byte before use
 * src_stride   row pitch of src0_ptr
 * src1_ptr     16-bit input; 64-bit loads at column 0 and 32-bit loads at
 *              column 4 for each of eight rows
 * src2_stride  row pitch of src1_ptr, in int16_t elements
 * dst          8-bit output
 * dst_stride   row pitch of dst
 * filter_x     horizontal filter coefficients
 * filter_y     vertical filter coefficients
 * height       unused in this function
 * weight0      packed into the low 16 bits of every 32-bit weight word
 * weight1      packed into the high 16 bits; also scales const_vec
 * offset0      summed with offset1, then shifted left by rnd_val
 * offset1      see offset0
 * rnd_val      rounding control; the final shift amount is rnd_val + 1
 */
4856cabdff1aSopenharmony_cistatic void hevc_hv_biwgt_4t_6w_msa(uint8_t *src0_ptr,
4857cabdff1aSopenharmony_ci                                    int32_t src_stride,
4858cabdff1aSopenharmony_ci                                    int16_t *src1_ptr,
4859cabdff1aSopenharmony_ci                                    int32_t src2_stride,
4860cabdff1aSopenharmony_ci                                    uint8_t *dst,
4861cabdff1aSopenharmony_ci                                    int32_t dst_stride,
4862cabdff1aSopenharmony_ci                                    const int8_t *filter_x,
4863cabdff1aSopenharmony_ci                                    const int8_t *filter_y,
4864cabdff1aSopenharmony_ci                                    int32_t height,
4865cabdff1aSopenharmony_ci                                    int32_t weight0,
4866cabdff1aSopenharmony_ci                                    int32_t weight1,
4867cabdff1aSopenharmony_ci                                    int32_t offset0,
4868cabdff1aSopenharmony_ci                                    int32_t offset1,
4869cabdff1aSopenharmony_ci                                    int32_t rnd_val)
4870cabdff1aSopenharmony_ci{
4871cabdff1aSopenharmony_ci    uint32_t tpw0, tpw1, tpw2, tpw3;
4872cabdff1aSopenharmony_ci    uint64_t tp0, tp1;
4873cabdff1aSopenharmony_ci    int32_t offset, weight;
4874cabdff1aSopenharmony_ci    v16u8 out0, out1, out2;
4875cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4876cabdff1aSopenharmony_ci    v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
4877cabdff1aSopenharmony_ci    v8i16 in4 = { 0 }, in5 = { 0 };
4878cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
4879cabdff1aSopenharmony_ci    v8i16 filt_h0, filt_h1, filter_vec;
4880cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    /* First 16-byte row of the shuffle table; every row is shuffled with
     * itself below. */
4881cabdff1aSopenharmony_ci    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
4882cabdff1aSopenharmony_ci    v16i8 mask1;
4883cabdff1aSopenharmony_ci    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
4884cabdff1aSopenharmony_ci    v8i16 dsth10, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, weight_vec;
4885cabdff1aSopenharmony_ci    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst21_r, dst43_r;
4886cabdff1aSopenharmony_ci    v8i16 dst65_r, dst87_r, dst109_r, dst10_l, dst32_l, dst54_l, dst76_l;
4887cabdff1aSopenharmony_ci    v8i16 dst98_l, dst21_l, dst43_l, dst65_l, dst87_l, dst109_l;
4888cabdff1aSopenharmony_ci    v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
4889cabdff1aSopenharmony_ci    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4890cabdff1aSopenharmony_ci    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
4891cabdff1aSopenharmony_ci    v4i32 dst4_r, dst5_r, dst6_r, dst7_r;
4892cabdff1aSopenharmony_ci    v4i32 offset_vec, rnd_vec, const_vec;
4893cabdff1aSopenharmony_ci
    /* Start one row above and one byte to the left of the block. */
4894cabdff1aSopenharmony_ci    src0_ptr -= (src_stride + 1);
4895cabdff1aSopenharmony_ci
4896cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_x);
4897cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4898cabdff1aSopenharmony_ci
4899cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_y);
4900cabdff1aSopenharmony_ci    UNPCK_R_SB_SH(filter_vec, filter_vec);
4901cabdff1aSopenharmony_ci
4902cabdff1aSopenharmony_ci    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4903cabdff1aSopenharmony_ci
4904cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
4905cabdff1aSopenharmony_ci
    /* Pack both weights into one 32-bit word: weight0 in bits 0..15,
     * weight1 in bits 16..31. */
4906cabdff1aSopenharmony_ci    offset = (offset0 + offset1) << rnd_val;
4907cabdff1aSopenharmony_ci    weight0 = weight0 & 0x0000FFFF;
4908cabdff1aSopenharmony_ci    weight = weight0 | (weight1 << 16);
4909cabdff1aSopenharmony_ci
    /* const_vec = (128 * weight1) << 6 is folded into offset_vec.
     * NOTE(review): looks like compensation for the 128 bias applied to the
     * src0_ptr samples by the XORI_*_128 macros -- confirm. */
4910cabdff1aSopenharmony_ci    const_vec = __msa_fill_w((128 * weight1));
4911cabdff1aSopenharmony_ci    const_vec <<= 6;
4912cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
4913cabdff1aSopenharmony_ci    weight_vec = (v8i16) __msa_fill_w(weight);
4914cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val + 1);
4915cabdff1aSopenharmony_ci    offset_vec += const_vec;
4916cabdff1aSopenharmony_ci
    /* Rows 0..2: horizontal pass, one result vector per row. */
4917cabdff1aSopenharmony_ci    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4918cabdff1aSopenharmony_ci    src0_ptr += (3 * src_stride);
4919cabdff1aSopenharmony_ci    XORI_B3_128_SB(src0, src1, src2);
4920cabdff1aSopenharmony_ci
4921cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4922cabdff1aSopenharmony_ci    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4923cabdff1aSopenharmony_ci    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4924cabdff1aSopenharmony_ci    dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4925cabdff1aSopenharmony_ci    dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4926cabdff1aSopenharmony_ci    dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4927cabdff1aSopenharmony_ci
4928cabdff1aSopenharmony_ci    ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
4929cabdff1aSopenharmony_ci    ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
4930cabdff1aSopenharmony_ci
    /* Rows 3..10: horizontal pass in two batches of four rows. */
4931cabdff1aSopenharmony_ci    LD_SB8(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8, src9,
4932cabdff1aSopenharmony_ci           src10);
4933cabdff1aSopenharmony_ci    XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
4934cabdff1aSopenharmony_ci
4935cabdff1aSopenharmony_ci    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4936cabdff1aSopenharmony_ci    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
4937cabdff1aSopenharmony_ci    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
4938cabdff1aSopenharmony_ci    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
4939cabdff1aSopenharmony_ci
4940cabdff1aSopenharmony_ci    dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4941cabdff1aSopenharmony_ci    dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4942cabdff1aSopenharmony_ci    dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4943cabdff1aSopenharmony_ci    dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4944cabdff1aSopenharmony_ci
4945cabdff1aSopenharmony_ci    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
4946cabdff1aSopenharmony_ci    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
4947cabdff1aSopenharmony_ci    VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
4948cabdff1aSopenharmony_ci    VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);
4949cabdff1aSopenharmony_ci
4950cabdff1aSopenharmony_ci    dsth7 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4951cabdff1aSopenharmony_ci    dsth8 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4952cabdff1aSopenharmony_ci    dsth9 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4953cabdff1aSopenharmony_ci    dsth10 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4954cabdff1aSopenharmony_ci
    /* Interleave each pair of vertically adjacent rows (_r / _l halves). */
4955cabdff1aSopenharmony_ci    ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
4956cabdff1aSopenharmony_ci    ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
4957cabdff1aSopenharmony_ci    ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
4958cabdff1aSopenharmony_ci    ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
4959cabdff1aSopenharmony_ci    ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
4960cabdff1aSopenharmony_ci    ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
4961cabdff1aSopenharmony_ci    ILVRL_H2_SH(dsth9, dsth8, dst98_r, dst98_l);
4962cabdff1aSopenharmony_ci    ILVRL_H2_SH(dsth10, dsth9, dst109_r, dst109_l);
    /* Merge the _l halves of two consecutive row pairs into one vector.
     * NOTE(review): presumably so that one HEVC_FILT_4TAP call yields the
     * extra columns of two output rows at once -- confirm against PCKEV_D. */
4963cabdff1aSopenharmony_ci    PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
4964cabdff1aSopenharmony_ci    PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
4965cabdff1aSopenharmony_ci    dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);
4966cabdff1aSopenharmony_ci
    /* Vertical pass: eight _r results (one per output row) and four merged
     * _l results, followed by a shift right by 6. */
4967cabdff1aSopenharmony_ci    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4968cabdff1aSopenharmony_ci    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4969cabdff1aSopenharmony_ci    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4970cabdff1aSopenharmony_ci    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4971cabdff1aSopenharmony_ci    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
4972cabdff1aSopenharmony_ci    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
4973cabdff1aSopenharmony_ci    dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
4974cabdff1aSopenharmony_ci    dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
4975cabdff1aSopenharmony_ci    dst0_l = HEVC_FILT_4TAP(dst1021_l, dst3243_l, filt_h0, filt_h1);
4976cabdff1aSopenharmony_ci    dst1_l = HEVC_FILT_4TAP(dst3243_l, dst5465_l, filt_h0, filt_h1);
4977cabdff1aSopenharmony_ci    dst2_l = HEVC_FILT_4TAP(dst5465_l, dst7687_l, filt_h0, filt_h1);
4978cabdff1aSopenharmony_ci    dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
4979cabdff1aSopenharmony_ci    SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
4980cabdff1aSopenharmony_ci    SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
4981cabdff1aSopenharmony_ci    SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
4982cabdff1aSopenharmony_ci    PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0, dst1);
4983cabdff1aSopenharmony_ci    PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst2, dst3);
4984cabdff1aSopenharmony_ci
    /* Columns 0..3 of the src1_ptr samples: eight rows fetched as 64-bit
     * values, two rows per in0..in3 vector. src1_ptr itself is not advanced
     * here. */
4985cabdff1aSopenharmony_ci    LD2(src1_ptr, src2_stride, tp0, tp1);
4986cabdff1aSopenharmony_ci    INSERT_D2_SH(tp0, tp1, in0);
4987cabdff1aSopenharmony_ci    LD2(src1_ptr + 2 * src2_stride, src2_stride, tp0, tp1);
4988cabdff1aSopenharmony_ci    INSERT_D2_SH(tp0, tp1, in1);
4989cabdff1aSopenharmony_ci
4990cabdff1aSopenharmony_ci    LD2(src1_ptr + 4 * src2_stride, src2_stride, tp0, tp1);
4991cabdff1aSopenharmony_ci    INSERT_D2_SH(tp0, tp1, in2);
4992cabdff1aSopenharmony_ci    LD2(src1_ptr + 6 * src2_stride, src2_stride, tp0, tp1);
4993cabdff1aSopenharmony_ci    INSERT_D2_SH(tp0, tp1, in3);
4994cabdff1aSopenharmony_ci
    /* Weighted sum for columns 0..3: interleave filtered and src1_ptr
     * samples, dot-product-accumulate against the packed weights on top of
     * offset_vec, shift by rnd_val + 1, narrow, clip and store.
     * NOTE(review): which of weight0/weight1 meets which sample depends on
     * the lane order produced by ILVRL_H2_SH -- confirm. */
4995cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
4996cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
4997cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
4998cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
4999cabdff1aSopenharmony_ci    dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5000cabdff1aSopenharmony_ci    dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5001cabdff1aSopenharmony_ci    dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5002cabdff1aSopenharmony_ci    dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5003cabdff1aSopenharmony_ci    dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
5004cabdff1aSopenharmony_ci    dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
5005cabdff1aSopenharmony_ci    dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
5006cabdff1aSopenharmony_ci    dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
5007cabdff1aSopenharmony_ci    SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5008cabdff1aSopenharmony_ci    SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
5009cabdff1aSopenharmony_ci    PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
5010cabdff1aSopenharmony_ci                tmp2, tmp3);
5011cabdff1aSopenharmony_ci    CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
5012cabdff1aSopenharmony_ci    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
5013cabdff1aSopenharmony_ci    ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
5014cabdff1aSopenharmony_ci
    /* Remaining columns: dst0..dst3 are reused from here on, after the
     * store above has consumed their previous contents. */
5015cabdff1aSopenharmony_ci    PCKEV_H2_SW(dst1_l, dst0_l, dst3_l, dst2_l, dst4, dst5);
5016cabdff1aSopenharmony_ci
    /* Columns 4..5 of the src1_ptr samples: one 32-bit word per row, rows
     * 0..3 into in4, then rows 4..7 into in5. */
5017cabdff1aSopenharmony_ci    LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3);
5018cabdff1aSopenharmony_ci    src1_ptr += (4 * src2_stride);
5019cabdff1aSopenharmony_ci    INSERT_W4_SH(tpw0, tpw1, tpw2, tpw3, in4);
5020cabdff1aSopenharmony_ci    LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3);
5021cabdff1aSopenharmony_ci    INSERT_W4_SH(tpw0, tpw1, tpw2, tpw3, in5);
5022cabdff1aSopenharmony_ci
5023cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst4, in4, tmp0, tmp1);
5024cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst5, in5, tmp2, tmp3);
5025cabdff1aSopenharmony_ci
5026cabdff1aSopenharmony_ci    dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5027cabdff1aSopenharmony_ci    dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5028cabdff1aSopenharmony_ci    dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5029cabdff1aSopenharmony_ci    dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5030cabdff1aSopenharmony_ci    SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5031cabdff1aSopenharmony_ci    PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp4, tmp5);
5032cabdff1aSopenharmony_ci
5033cabdff1aSopenharmony_ci    CLIP_SH2_0_255(tmp4, tmp5);
5034cabdff1aSopenharmony_ci    out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
    /* Eight half-word stores, one per row, starting at byte column 4. */
5035cabdff1aSopenharmony_ci    ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride);
5036cabdff1aSopenharmony_ci}
5037cabdff1aSopenharmony_ci
/*
 * Bi-weighted 4-tap horizontal + vertical filter for one 8x2 block.
 *
 * Five rows are loaded from src0_ptr and two rows from src1_ptr; two output
 * rows are stored, each as one 64-bit value (ST_D2).
 *
 * The helper macros used below are defined outside this function; where a
 * comment describes what a macro does, it follows the macro's name.
 *
 * src0_ptr     8-bit input; stepped back one row and one byte before use
 * src_stride   row pitch of src0_ptr
 * src1_ptr     16-bit input; two rows read with LD_SH2
 * src2_stride  row pitch of src1_ptr as passed to LD_SH2
 * dst          8-bit output
 * dst_stride   row pitch of dst
 * filter_x     horizontal filter coefficients
 * filter_y     vertical filter coefficients
 * weight0      packed into the low 16 bits of every 32-bit weight word
 * weight1      packed into the high 16 bits; also scales const_vec
 * offset0      summed with offset1, then shifted left by rnd_val
 * offset1      see offset0
 * rnd_val      rounding control; the final shift amount is rnd_val + 1
 */
5038cabdff1aSopenharmony_cistatic void hevc_hv_biwgt_4t_8x2_msa(uint8_t *src0_ptr,
5039cabdff1aSopenharmony_ci                                     int32_t src_stride,
5040cabdff1aSopenharmony_ci                                     int16_t *src1_ptr,
5041cabdff1aSopenharmony_ci                                     int32_t src2_stride,
5042cabdff1aSopenharmony_ci                                     uint8_t *dst,
5043cabdff1aSopenharmony_ci                                     int32_t dst_stride,
5044cabdff1aSopenharmony_ci                                     const int8_t *filter_x,
5045cabdff1aSopenharmony_ci                                     const int8_t *filter_y,
5046cabdff1aSopenharmony_ci                                     int32_t weight0,
5047cabdff1aSopenharmony_ci                                     int32_t weight1,
5048cabdff1aSopenharmony_ci                                     int32_t offset0,
5049cabdff1aSopenharmony_ci                                     int32_t offset1,
5050cabdff1aSopenharmony_ci                                     int32_t rnd_val)
5051cabdff1aSopenharmony_ci{
5052cabdff1aSopenharmony_ci    int32_t weight, offset;
5053cabdff1aSopenharmony_ci    v16u8 out;
5054cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4;
5055cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
5056cabdff1aSopenharmony_ci    v8i16 filt_h0, filt_h1;
    /* First 16-byte row of the shuffle table; every row is shuffled with
     * itself below. */
5057cabdff1aSopenharmony_ci    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
5058cabdff1aSopenharmony_ci    v16i8 mask1;
5059cabdff1aSopenharmony_ci    v8i16 filter_vec, weight_vec;
5060cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
5061cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, dst4;
5062cabdff1aSopenharmony_ci    v8i16 in0, in1;
5063cabdff1aSopenharmony_ci    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
5064cabdff1aSopenharmony_ci    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
5065cabdff1aSopenharmony_ci    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
5066cabdff1aSopenharmony_ci    v8i16 tmp0, tmp1, tmp2, tmp3;
5067cabdff1aSopenharmony_ci    v4i32 offset_vec, rnd_vec, const_vec;
5068cabdff1aSopenharmony_ci
    /* Start one row above and one byte to the left of the block. */
5069cabdff1aSopenharmony_ci    src0_ptr -= (src_stride + 1);
5070cabdff1aSopenharmony_ci
5071cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_x);
5072cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
5073cabdff1aSopenharmony_ci
5074cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_y);
5075cabdff1aSopenharmony_ci    UNPCK_R_SB_SH(filter_vec, filter_vec);
5076cabdff1aSopenharmony_ci
5077cabdff1aSopenharmony_ci    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
5078cabdff1aSopenharmony_ci
5079cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
5080cabdff1aSopenharmony_ci
    /* Pack both weights into one 32-bit word: weight0 in bits 0..15,
     * weight1 in bits 16..31. */
5081cabdff1aSopenharmony_ci    offset = (offset0 + offset1) << rnd_val;
5082cabdff1aSopenharmony_ci    weight0 = weight0 & 0x0000FFFF;
5083cabdff1aSopenharmony_ci    weight = weight0 | (weight1 << 16);
5084cabdff1aSopenharmony_ci
    /* const_vec = (128 * weight1) << 6 is folded into offset_vec.
     * NOTE(review): looks like compensation for the 128 bias applied to the
     * src0_ptr samples by XORI_B5_128_SB -- confirm. */
5085cabdff1aSopenharmony_ci    const_vec = __msa_fill_w((128 * weight1));
5086cabdff1aSopenharmony_ci    const_vec <<= 6;
5087cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
5088cabdff1aSopenharmony_ci    weight_vec = (v8i16) __msa_fill_w(weight);
5089cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val + 1);
5090cabdff1aSopenharmony_ci    offset_vec += const_vec;
5091cabdff1aSopenharmony_ci
    /* Five source rows feed two output rows of the 4-tap vertical filter. */
5092cabdff1aSopenharmony_ci    LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
5093cabdff1aSopenharmony_ci    XORI_B5_128_SB(src0, src1, src2, src3, src4);
5094cabdff1aSopenharmony_ci
5095cabdff1aSopenharmony_ci    LD_SH2(src1_ptr, src2_stride, in0, in1);
5096cabdff1aSopenharmony_ci
    /* Horizontal pass, one result vector per source row. */
5097cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
5098cabdff1aSopenharmony_ci    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
5099cabdff1aSopenharmony_ci    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5100cabdff1aSopenharmony_ci    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
5101cabdff1aSopenharmony_ci    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
5102cabdff1aSopenharmony_ci
5103cabdff1aSopenharmony_ci    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5104cabdff1aSopenharmony_ci    dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5105cabdff1aSopenharmony_ci    dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5106cabdff1aSopenharmony_ci    dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
5107cabdff1aSopenharmony_ci    dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
5108cabdff1aSopenharmony_ci
    /* Vertical pass on interleaved row pairs, then a shift right by 6 and a
     * pack back to 16 bit. */
5109cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
5110cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
5111cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
5112cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
5113cabdff1aSopenharmony_ci    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
5114cabdff1aSopenharmony_ci    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
5115cabdff1aSopenharmony_ci    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
5116cabdff1aSopenharmony_ci    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
5117cabdff1aSopenharmony_ci    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
5118cabdff1aSopenharmony_ci    PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp1, tmp3);
5119cabdff1aSopenharmony_ci
    /* tmp1 and tmp3 are both an input and the second output here.
     * NOTE(review): this relies on ILVRL_H2_SH producing its first output
     * before overwriting the second -- confirm against the macro. */
5120cabdff1aSopenharmony_ci    ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
5121cabdff1aSopenharmony_ci    ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
5122cabdff1aSopenharmony_ci
    /* Weighted sum on top of offset_vec, shift by rnd_val + 1, narrow, clip
     * and store two rows. */
5123cabdff1aSopenharmony_ci    dst0_r = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5124cabdff1aSopenharmony_ci    dst0_l = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5125cabdff1aSopenharmony_ci    dst1_r = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5126cabdff1aSopenharmony_ci    dst1_l = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5127cabdff1aSopenharmony_ci    SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
5128cabdff1aSopenharmony_ci    PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
5129cabdff1aSopenharmony_ci    CLIP_SH2_0_255(tmp0, tmp1);
5130cabdff1aSopenharmony_ci    out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
5131cabdff1aSopenharmony_ci    ST_D2(out, 0, 1, dst, dst_stride);
5132cabdff1aSopenharmony_ci}
5133cabdff1aSopenharmony_ci
5134cabdff1aSopenharmony_cistatic void hevc_hv_biwgt_4t_8multx4_msa(uint8_t *src0_ptr,
5135cabdff1aSopenharmony_ci                                         int32_t src_stride,
5136cabdff1aSopenharmony_ci                                         int16_t *src1_ptr,
5137cabdff1aSopenharmony_ci                                         int32_t src2_stride,
5138cabdff1aSopenharmony_ci                                         uint8_t *dst,
5139cabdff1aSopenharmony_ci                                         int32_t dst_stride,
5140cabdff1aSopenharmony_ci                                         const int8_t *filter_x,
5141cabdff1aSopenharmony_ci                                         const int8_t *filter_y,
5142cabdff1aSopenharmony_ci                                         int32_t weight0,
5143cabdff1aSopenharmony_ci                                         int32_t weight1,
5144cabdff1aSopenharmony_ci                                         int32_t offset0,
5145cabdff1aSopenharmony_ci                                         int32_t offset1,
5146cabdff1aSopenharmony_ci                                         int32_t rnd_val,
5147cabdff1aSopenharmony_ci                                         int32_t width8mult)
5148cabdff1aSopenharmony_ci{
5149cabdff1aSopenharmony_ci    int32_t weight, offset;
5150cabdff1aSopenharmony_ci    uint32_t cnt;
5151cabdff1aSopenharmony_ci    v16u8 out0, out1;
5152cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
5153cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
5154cabdff1aSopenharmony_ci    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, weight_vec;
5155cabdff1aSopenharmony_ci    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
5156cabdff1aSopenharmony_ci    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, in0, in1, in2, in3;
5157cabdff1aSopenharmony_ci    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
5158cabdff1aSopenharmony_ci    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
5159cabdff1aSopenharmony_ci    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
5160cabdff1aSopenharmony_ci    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
5161cabdff1aSopenharmony_ci    v4i32 offset_vec, rnd_vec, const_vec;
5162cabdff1aSopenharmony_ci
5163cabdff1aSopenharmony_ci    src0_ptr -= (src_stride + 1);
5164cabdff1aSopenharmony_ci
5165cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_x);
5166cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
5167cabdff1aSopenharmony_ci
5168cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_y);
5169cabdff1aSopenharmony_ci    UNPCK_R_SB_SH(filter_vec, filter_vec);
5170cabdff1aSopenharmony_ci
5171cabdff1aSopenharmony_ci    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
5172cabdff1aSopenharmony_ci
5173cabdff1aSopenharmony_ci    mask0 = LD_SB(ff_hevc_mask_arr);
5174cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
5175cabdff1aSopenharmony_ci
5176cabdff1aSopenharmony_ci    offset = (offset0 + offset1) << rnd_val;
5177cabdff1aSopenharmony_ci    weight0 = weight0 & 0x0000FFFF;
5178cabdff1aSopenharmony_ci    weight = weight0 | (weight1 << 16);
5179cabdff1aSopenharmony_ci
5180cabdff1aSopenharmony_ci    const_vec = __msa_fill_w((128 * weight1));
5181cabdff1aSopenharmony_ci    const_vec <<= 6;
5182cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
5183cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val + 1);
5184cabdff1aSopenharmony_ci    offset_vec += const_vec;
5185cabdff1aSopenharmony_ci    weight_vec = (v8i16) __msa_fill_w(weight);
5186cabdff1aSopenharmony_ci
5187cabdff1aSopenharmony_ci    for (cnt = width8mult; cnt--;) {
5188cabdff1aSopenharmony_ci        LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
5189cabdff1aSopenharmony_ci        src0_ptr += 8;
5190cabdff1aSopenharmony_ci        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
5191cabdff1aSopenharmony_ci
5192cabdff1aSopenharmony_ci        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
5193cabdff1aSopenharmony_ci        src1_ptr += 8;
5194cabdff1aSopenharmony_ci
5195cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
5196cabdff1aSopenharmony_ci        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
5197cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5198cabdff1aSopenharmony_ci
5199cabdff1aSopenharmony_ci        dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5200cabdff1aSopenharmony_ci        dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5201cabdff1aSopenharmony_ci        dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5202cabdff1aSopenharmony_ci
5203cabdff1aSopenharmony_ci        ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
5204cabdff1aSopenharmony_ci        ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
5205cabdff1aSopenharmony_ci
5206cabdff1aSopenharmony_ci        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
5207cabdff1aSopenharmony_ci        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
5208cabdff1aSopenharmony_ci        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
5209cabdff1aSopenharmony_ci        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
5210cabdff1aSopenharmony_ci
5211cabdff1aSopenharmony_ci        dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5212cabdff1aSopenharmony_ci        dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5213cabdff1aSopenharmony_ci        dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5214cabdff1aSopenharmony_ci        dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
5215cabdff1aSopenharmony_ci
5216cabdff1aSopenharmony_ci        ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
5217cabdff1aSopenharmony_ci        ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
5218cabdff1aSopenharmony_ci        ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
5219cabdff1aSopenharmony_ci        ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
5220cabdff1aSopenharmony_ci
5221cabdff1aSopenharmony_ci        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
5222cabdff1aSopenharmony_ci        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
5223cabdff1aSopenharmony_ci        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
5224cabdff1aSopenharmony_ci        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
5225cabdff1aSopenharmony_ci        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
5226cabdff1aSopenharmony_ci        dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
5227cabdff1aSopenharmony_ci        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
5228cabdff1aSopenharmony_ci        dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
5229cabdff1aSopenharmony_ci
5230cabdff1aSopenharmony_ci        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
5231cabdff1aSopenharmony_ci        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
5232cabdff1aSopenharmony_ci        PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
5233cabdff1aSopenharmony_ci                    dst3_r, dst0, dst1, dst2, dst3);
5234cabdff1aSopenharmony_ci
5235cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
5236cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
5237cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
5238cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
5239cabdff1aSopenharmony_ci        dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5240cabdff1aSopenharmony_ci        dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5241cabdff1aSopenharmony_ci        dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5242cabdff1aSopenharmony_ci        dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5243cabdff1aSopenharmony_ci        dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
5244cabdff1aSopenharmony_ci        dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
5245cabdff1aSopenharmony_ci        dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
5246cabdff1aSopenharmony_ci        dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
5247cabdff1aSopenharmony_ci        SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5248cabdff1aSopenharmony_ci        SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
5249cabdff1aSopenharmony_ci        PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
5250cabdff1aSopenharmony_ci                    tmp0, tmp1, tmp2, tmp3);
5251cabdff1aSopenharmony_ci        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
5252cabdff1aSopenharmony_ci        PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
5253cabdff1aSopenharmony_ci        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
5254cabdff1aSopenharmony_ci        dst += 8;
5255cabdff1aSopenharmony_ci    }
5256cabdff1aSopenharmony_ci}
5257cabdff1aSopenharmony_ci
5258cabdff1aSopenharmony_cistatic void hevc_hv_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
5259cabdff1aSopenharmony_ci                                     int32_t src_stride,
5260cabdff1aSopenharmony_ci                                     int16_t *src1_ptr,
5261cabdff1aSopenharmony_ci                                     int32_t src2_stride,
5262cabdff1aSopenharmony_ci                                     uint8_t *dst,
5263cabdff1aSopenharmony_ci                                     int32_t dst_stride,
5264cabdff1aSopenharmony_ci                                     const int8_t *filter_x,
5265cabdff1aSopenharmony_ci                                     const int8_t *filter_y,
5266cabdff1aSopenharmony_ci                                     int32_t weight0,
5267cabdff1aSopenharmony_ci                                     int32_t weight1,
5268cabdff1aSopenharmony_ci                                     int32_t offset0,
5269cabdff1aSopenharmony_ci                                     int32_t offset1,
5270cabdff1aSopenharmony_ci                                     int32_t rnd_val)
5271cabdff1aSopenharmony_ci{
5272cabdff1aSopenharmony_ci    uint32_t offset, weight;
5273cabdff1aSopenharmony_ci    v16u8 out0, out1, out2;
5274cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
5275cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
5276cabdff1aSopenharmony_ci    v8i16 filt_h0, filt_h1;
5277cabdff1aSopenharmony_ci    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
5278cabdff1aSopenharmony_ci    v16i8 mask1;
5279cabdff1aSopenharmony_ci    v8i16 filter_vec, weight_vec;
5280cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
5281cabdff1aSopenharmony_ci    v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
5282cabdff1aSopenharmony_ci    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8;
5283cabdff1aSopenharmony_ci    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
5284cabdff1aSopenharmony_ci    v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
5285cabdff1aSopenharmony_ci    v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
5286cabdff1aSopenharmony_ci    v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
5287cabdff1aSopenharmony_ci    v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
5288cabdff1aSopenharmony_ci    v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
5289cabdff1aSopenharmony_ci    v8i16 in0, in1, in2, in3, in4, in5;
5290cabdff1aSopenharmony_ci    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
5291cabdff1aSopenharmony_ci    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
5292cabdff1aSopenharmony_ci    v4i32 offset_vec, rnd_vec, const_vec;
5293cabdff1aSopenharmony_ci
5294cabdff1aSopenharmony_ci    src0_ptr -= (src_stride + 1);
5295cabdff1aSopenharmony_ci
5296cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_x);
5297cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
5298cabdff1aSopenharmony_ci
5299cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_y);
5300cabdff1aSopenharmony_ci    UNPCK_R_SB_SH(filter_vec, filter_vec);
5301cabdff1aSopenharmony_ci
5302cabdff1aSopenharmony_ci    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
5303cabdff1aSopenharmony_ci
5304cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
5305cabdff1aSopenharmony_ci
5306cabdff1aSopenharmony_ci    offset = (offset0 + offset1) << rnd_val;
5307cabdff1aSopenharmony_ci    weight0 = weight0 & 0x0000FFFF;
5308cabdff1aSopenharmony_ci    weight = weight0 | (weight1 << 16);
5309cabdff1aSopenharmony_ci
5310cabdff1aSopenharmony_ci    const_vec = __msa_fill_w((128 * weight1));
5311cabdff1aSopenharmony_ci    const_vec <<= 6;
5312cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
5313cabdff1aSopenharmony_ci    weight_vec = (v8i16) __msa_fill_w(weight);
5314cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val + 1);
5315cabdff1aSopenharmony_ci    offset_vec += const_vec;
5316cabdff1aSopenharmony_ci
5317cabdff1aSopenharmony_ci    LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
5318cabdff1aSopenharmony_ci    src0_ptr += (5 * src_stride);
5319cabdff1aSopenharmony_ci    LD_SB4(src0_ptr, src_stride, src5, src6, src7, src8);
5320cabdff1aSopenharmony_ci
5321cabdff1aSopenharmony_ci    XORI_B5_128_SB(src0, src1, src2, src3, src4);
5322cabdff1aSopenharmony_ci    XORI_B4_128_SB(src5, src6, src7, src8);
5323cabdff1aSopenharmony_ci
5324cabdff1aSopenharmony_ci    LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
5325cabdff1aSopenharmony_ci
5326cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
5327cabdff1aSopenharmony_ci    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
5328cabdff1aSopenharmony_ci    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5329cabdff1aSopenharmony_ci    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
5330cabdff1aSopenharmony_ci    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
5331cabdff1aSopenharmony_ci    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
5332cabdff1aSopenharmony_ci    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
5333cabdff1aSopenharmony_ci    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
5334cabdff1aSopenharmony_ci    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);
5335cabdff1aSopenharmony_ci
5336cabdff1aSopenharmony_ci    dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5337cabdff1aSopenharmony_ci    dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5338cabdff1aSopenharmony_ci    dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5339cabdff1aSopenharmony_ci    dsth3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
5340cabdff1aSopenharmony_ci    dsth4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
5341cabdff1aSopenharmony_ci    dsth5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1);
5342cabdff1aSopenharmony_ci    dsth6 = HEVC_FILT_4TAP_SH(vec12, vec13, filt0, filt1);
5343cabdff1aSopenharmony_ci    dsth7 = HEVC_FILT_4TAP_SH(vec14, vec15, filt0, filt1);
5344cabdff1aSopenharmony_ci    dsth8 = HEVC_FILT_4TAP_SH(vec16, vec17, filt0, filt1);
5345cabdff1aSopenharmony_ci
5346cabdff1aSopenharmony_ci    ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
5347cabdff1aSopenharmony_ci    ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
5348cabdff1aSopenharmony_ci    ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
5349cabdff1aSopenharmony_ci    ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
5350cabdff1aSopenharmony_ci    ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
5351cabdff1aSopenharmony_ci    ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
5352cabdff1aSopenharmony_ci    ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
5353cabdff1aSopenharmony_ci    ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
5354cabdff1aSopenharmony_ci
5355cabdff1aSopenharmony_ci    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
5356cabdff1aSopenharmony_ci    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
5357cabdff1aSopenharmony_ci    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
5358cabdff1aSopenharmony_ci    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
5359cabdff1aSopenharmony_ci    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
5360cabdff1aSopenharmony_ci    dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
5361cabdff1aSopenharmony_ci    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
5362cabdff1aSopenharmony_ci    dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
5363cabdff1aSopenharmony_ci    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
5364cabdff1aSopenharmony_ci    dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
5365cabdff1aSopenharmony_ci    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
5366cabdff1aSopenharmony_ci    dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
5367cabdff1aSopenharmony_ci
5368cabdff1aSopenharmony_ci    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
5369cabdff1aSopenharmony_ci    SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
5370cabdff1aSopenharmony_ci    SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
5371cabdff1aSopenharmony_ci    PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l, dst3_r,
5372cabdff1aSopenharmony_ci                dst0, dst1, dst2, dst3);
5373cabdff1aSopenharmony_ci
5374cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
5375cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
5376cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
5377cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
5378cabdff1aSopenharmony_ci    dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5379cabdff1aSopenharmony_ci    dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5380cabdff1aSopenharmony_ci    dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5381cabdff1aSopenharmony_ci    dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5382cabdff1aSopenharmony_ci    dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
5383cabdff1aSopenharmony_ci    dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
5384cabdff1aSopenharmony_ci    dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
5385cabdff1aSopenharmony_ci    dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
5386cabdff1aSopenharmony_ci    SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5387cabdff1aSopenharmony_ci    SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
5388cabdff1aSopenharmony_ci    PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
5389cabdff1aSopenharmony_ci                tmp0, tmp1, tmp2, tmp3);
5390cabdff1aSopenharmony_ci    CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
5391cabdff1aSopenharmony_ci    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
5392cabdff1aSopenharmony_ci
5393cabdff1aSopenharmony_ci    PCKEV_H2_SW(dst4_l, dst4_r, dst5_l, dst5_r, dst0, dst1);
5394cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst0, in4, tmp0, tmp1);
5395cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst1, in5, tmp2, tmp3);
5396cabdff1aSopenharmony_ci    dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5397cabdff1aSopenharmony_ci    dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5398cabdff1aSopenharmony_ci    dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5399cabdff1aSopenharmony_ci    dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5400cabdff1aSopenharmony_ci    SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5401cabdff1aSopenharmony_ci    PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp4, tmp5);
5402cabdff1aSopenharmony_ci    CLIP_SH2_0_255(tmp4, tmp5);
5403cabdff1aSopenharmony_ci    out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
5404cabdff1aSopenharmony_ci    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
5405cabdff1aSopenharmony_ci    ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
5406cabdff1aSopenharmony_ci}
5407cabdff1aSopenharmony_ci
5408cabdff1aSopenharmony_cistatic void hevc_hv_biwgt_4t_8multx4mult_msa(uint8_t *src0_ptr,
5409cabdff1aSopenharmony_ci                                             int32_t src_stride,
5410cabdff1aSopenharmony_ci                                             int16_t *src1_ptr,
5411cabdff1aSopenharmony_ci                                             int32_t src2_stride,
5412cabdff1aSopenharmony_ci                                             uint8_t *dst,
5413cabdff1aSopenharmony_ci                                             int32_t dst_stride,
5414cabdff1aSopenharmony_ci                                             const int8_t *filter_x,
5415cabdff1aSopenharmony_ci                                             const int8_t *filter_y,
5416cabdff1aSopenharmony_ci                                             int32_t height,
5417cabdff1aSopenharmony_ci                                             int32_t weight0,
5418cabdff1aSopenharmony_ci                                             int32_t weight1,
5419cabdff1aSopenharmony_ci                                             int32_t offset0,
5420cabdff1aSopenharmony_ci                                             int32_t offset1,
5421cabdff1aSopenharmony_ci                                             int32_t rnd_val,
5422cabdff1aSopenharmony_ci                                             int32_t width)
5423cabdff1aSopenharmony_ci{
5424cabdff1aSopenharmony_ci    uint32_t loop_cnt;
5425cabdff1aSopenharmony_ci    uint32_t cnt;
5426cabdff1aSopenharmony_ci    int32_t offset, weight;
5427cabdff1aSopenharmony_ci    uint8_t *src0_ptr_tmp;
5428cabdff1aSopenharmony_ci    int16_t *src1_ptr_tmp;
5429cabdff1aSopenharmony_ci    uint8_t *dst_tmp;
5430cabdff1aSopenharmony_ci    v16u8 out0, out1;
5431cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6;
5432cabdff1aSopenharmony_ci    v8i16 in0, in1, in2, in3;
5433cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
5434cabdff1aSopenharmony_ci    v8i16 filt_h0, filt_h1;
5435cabdff1aSopenharmony_ci    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
5436cabdff1aSopenharmony_ci    v16i8 mask1;
5437cabdff1aSopenharmony_ci    v8i16 filter_vec;
5438cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
5439cabdff1aSopenharmony_ci    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
5440cabdff1aSopenharmony_ci    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
5441cabdff1aSopenharmony_ci    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
5442cabdff1aSopenharmony_ci    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
5443cabdff1aSopenharmony_ci    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
5444cabdff1aSopenharmony_ci    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l, weight_vec;
5445cabdff1aSopenharmony_ci    v4i32 offset_vec, rnd_vec, const_vec;
5446cabdff1aSopenharmony_ci
5447cabdff1aSopenharmony_ci    src0_ptr -= (src_stride + 1);
5448cabdff1aSopenharmony_ci
5449cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_x);
5450cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
5451cabdff1aSopenharmony_ci
5452cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_y);
5453cabdff1aSopenharmony_ci    UNPCK_R_SB_SH(filter_vec, filter_vec);
5454cabdff1aSopenharmony_ci
5455cabdff1aSopenharmony_ci    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
5456cabdff1aSopenharmony_ci
5457cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
5458cabdff1aSopenharmony_ci
5459cabdff1aSopenharmony_ci    offset = (offset0 + offset1) << rnd_val;
5460cabdff1aSopenharmony_ci    weight0 = weight0 & 0x0000FFFF;
5461cabdff1aSopenharmony_ci    weight = weight0 | (weight1 << 16);
5462cabdff1aSopenharmony_ci
5463cabdff1aSopenharmony_ci    const_vec = __msa_fill_w((128 * weight1));
5464cabdff1aSopenharmony_ci    const_vec <<= 6;
5465cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
5466cabdff1aSopenharmony_ci    weight_vec = (v8i16) __msa_fill_w(weight);
5467cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val + 1);
5468cabdff1aSopenharmony_ci    offset_vec += const_vec;
5469cabdff1aSopenharmony_ci
5470cabdff1aSopenharmony_ci    for (cnt = width >> 3; cnt--;) {
5471cabdff1aSopenharmony_ci        src0_ptr_tmp = src0_ptr;
5472cabdff1aSopenharmony_ci        src1_ptr_tmp = src1_ptr;
5473cabdff1aSopenharmony_ci        dst_tmp = dst;
5474cabdff1aSopenharmony_ci
5475cabdff1aSopenharmony_ci        LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2);
5476cabdff1aSopenharmony_ci        src0_ptr_tmp += (3 * src_stride);
5477cabdff1aSopenharmony_ci        XORI_B3_128_SB(src0, src1, src2);
5478cabdff1aSopenharmony_ci
5479cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
5480cabdff1aSopenharmony_ci        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
5481cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5482cabdff1aSopenharmony_ci        dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5483cabdff1aSopenharmony_ci        dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5484cabdff1aSopenharmony_ci        dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5485cabdff1aSopenharmony_ci
5486cabdff1aSopenharmony_ci        ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
5487cabdff1aSopenharmony_ci        ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
5488cabdff1aSopenharmony_ci
5489cabdff1aSopenharmony_ci        for (loop_cnt = height >> 2; loop_cnt--;) {
5490cabdff1aSopenharmony_ci            LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
5491cabdff1aSopenharmony_ci            src0_ptr_tmp += (4 * src_stride);
5492cabdff1aSopenharmony_ci            LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
5493cabdff1aSopenharmony_ci            src1_ptr_tmp += (4 * src2_stride);
5494cabdff1aSopenharmony_ci            XORI_B4_128_SB(src3, src4, src5, src6);
5495cabdff1aSopenharmony_ci
5496cabdff1aSopenharmony_ci            VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
5497cabdff1aSopenharmony_ci            VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
5498cabdff1aSopenharmony_ci            VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
5499cabdff1aSopenharmony_ci            VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
5500cabdff1aSopenharmony_ci
5501cabdff1aSopenharmony_ci            dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5502cabdff1aSopenharmony_ci            dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5503cabdff1aSopenharmony_ci            dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5504cabdff1aSopenharmony_ci            dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
5505cabdff1aSopenharmony_ci
5506cabdff1aSopenharmony_ci            ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
5507cabdff1aSopenharmony_ci            ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
5508cabdff1aSopenharmony_ci            ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
5509cabdff1aSopenharmony_ci            ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
5510cabdff1aSopenharmony_ci
5511cabdff1aSopenharmony_ci            dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
5512cabdff1aSopenharmony_ci            dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
5513cabdff1aSopenharmony_ci            dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
5514cabdff1aSopenharmony_ci            dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
5515cabdff1aSopenharmony_ci            dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
5516cabdff1aSopenharmony_ci            dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
5517cabdff1aSopenharmony_ci            dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
5518cabdff1aSopenharmony_ci            dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
5519cabdff1aSopenharmony_ci
5520cabdff1aSopenharmony_ci            SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
5521cabdff1aSopenharmony_ci            SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
5522cabdff1aSopenharmony_ci            PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
5523cabdff1aSopenharmony_ci                        dst3_r, dst0, dst1, dst2, dst3);
5524cabdff1aSopenharmony_ci            ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
5525cabdff1aSopenharmony_ci            ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
5526cabdff1aSopenharmony_ci            ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
5527cabdff1aSopenharmony_ci            ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
5528cabdff1aSopenharmony_ci            dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5529cabdff1aSopenharmony_ci            dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5530cabdff1aSopenharmony_ci            dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5531cabdff1aSopenharmony_ci            dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5532cabdff1aSopenharmony_ci            dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
5533cabdff1aSopenharmony_ci            dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
5534cabdff1aSopenharmony_ci            dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
5535cabdff1aSopenharmony_ci            dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
5536cabdff1aSopenharmony_ci            SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5537cabdff1aSopenharmony_ci            SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
5538cabdff1aSopenharmony_ci            PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
5539cabdff1aSopenharmony_ci                        tmp0, tmp1, tmp2, tmp3);
5540cabdff1aSopenharmony_ci            CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
5541cabdff1aSopenharmony_ci            PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
5542cabdff1aSopenharmony_ci            ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
5543cabdff1aSopenharmony_ci            dst_tmp += (4 * dst_stride);
5544cabdff1aSopenharmony_ci
5545cabdff1aSopenharmony_ci            dst10_r = dst54_r;
5546cabdff1aSopenharmony_ci            dst10_l = dst54_l;
5547cabdff1aSopenharmony_ci            dst21_r = dst65_r;
5548cabdff1aSopenharmony_ci            dst21_l = dst65_l;
5549cabdff1aSopenharmony_ci            dsth2 = dsth6;
5550cabdff1aSopenharmony_ci        }
5551cabdff1aSopenharmony_ci
5552cabdff1aSopenharmony_ci        src0_ptr += 8;
5553cabdff1aSopenharmony_ci        dst += 8;
5554cabdff1aSopenharmony_ci        src1_ptr += 8;
5555cabdff1aSopenharmony_ci    }
5556cabdff1aSopenharmony_ci}
5557cabdff1aSopenharmony_ci
/*
 * Bi-weighted 4-tap hv prediction for 8-column-wide blocks: picks the
 * kernel that matches the block height and forwards all arguments to it.
 *
 *   height == 2          -> hevc_hv_biwgt_4t_8x2_msa
 *   height == 4          -> hevc_hv_biwgt_4t_8multx4_msa (one 8-column strip)
 *   height == 6          -> hevc_hv_biwgt_4t_8x6_msa
 *   height % 4 == 0      -> hevc_hv_biwgt_4t_8multx4mult_msa (width 8)
 *
 * Any other height matches no branch, so nothing is written to dst.
 */
static void hevc_hv_biwgt_4t_8w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter_x,
                                    const int8_t *filter_y,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    if (2 == height) {
        hevc_hv_biwgt_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                 dst, dst_stride, filter_x, filter_y,
                                 weight0, weight1, offset0, offset1, rnd_val);
    } else if (4 == height) {
        /* trailing 1: a single 8-column strip */
        hevc_hv_biwgt_4t_8multx4_msa(src0_ptr, src_stride, src1_ptr,
                                     src2_stride, dst, dst_stride, filter_x,
                                     filter_y, weight0, weight1, offset0,
                                     offset1, rnd_val, 1);
    } else if (6 == height) {
        hevc_hv_biwgt_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                 dst, dst_stride, filter_x, filter_y,
                                 weight0, weight1, offset0, offset1, rnd_val);
    } else if (0 == (height % 4)) {
        /* trailing 8: block width in columns */
        hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride,
                                         src1_ptr, src2_stride,
                                         dst, dst_stride, filter_x, filter_y,
                                         height, weight0,
                                         weight1, offset0, offset1, rnd_val, 8);
    }
}
5594cabdff1aSopenharmony_ci
/*
 * Bi-weighted 4-tap horizontal + vertical prediction for a 12-column block.
 *
 * The block is written in two passes:
 *   - pass 1 works on the columns starting at the original pointers and runs
 *     four iterations, each consuming 4 new source rows and advancing the
 *     destination by 4 rows;
 *   - pass 2 starts after all three pointers have been moved 8 elements to
 *     the right and runs two iterations, each consuming 8 new source rows
 *     and advancing the destination by 8 rows.
 * Both passes therefore cover 16 rows. The loop counts are fixed; the
 * `height` argument is never read.
 *
 * src0_ptr    8-bit source; rewound by one row and one column before use
 * src_stride  stride of src0_ptr (in uint8_t elements)
 * src1_ptr    16-bit second source that is blended with the filtered result
 * src2_stride stride of src1_ptr (in int16_t elements)
 * dst         8-bit destination
 * dst_stride  stride of dst (in uint8_t elements)
 * filter_x    taps used for the horizontal stage (filt0 / filt1)
 * filter_y    taps used for the vertical stage (filt_h0 / filt_h1)
 * height      unused
 * weight0     placed in bits 0..15 of every 32-bit lane of weight_vec
 * weight1     placed in bits 16..31 of every 32-bit lane of weight_vec
 * offset0/1   summed, then shifted left by rnd_val
 * rnd_val     the final shift uses rnd_val + 1
 *
 * NOTE(review): presumably weight0 scales the src1_ptr samples and weight1
 * the filtered src0_ptr samples -- this depends on the lane order produced
 * by ILVRL_H2_SH, which is defined outside this file; confirm.
 */
5595cabdff1aSopenharmony_cistatic void hevc_hv_biwgt_4t_12w_msa(uint8_t *src0_ptr,
5596cabdff1aSopenharmony_ci                                     int32_t src_stride,
5597cabdff1aSopenharmony_ci                                     int16_t *src1_ptr,
5598cabdff1aSopenharmony_ci                                     int32_t src2_stride,
5599cabdff1aSopenharmony_ci                                     uint8_t *dst,
5600cabdff1aSopenharmony_ci                                     int32_t dst_stride,
5601cabdff1aSopenharmony_ci                                     const int8_t *filter_x,
5602cabdff1aSopenharmony_ci                                     const int8_t *filter_y,
5603cabdff1aSopenharmony_ci                                     int32_t height,
5604cabdff1aSopenharmony_ci                                     int32_t weight0,
5605cabdff1aSopenharmony_ci                                     int32_t weight1,
5606cabdff1aSopenharmony_ci                                     int32_t offset0,
5607cabdff1aSopenharmony_ci                                     int32_t offset1,
5608cabdff1aSopenharmony_ci                                     int32_t rnd_val)
5609cabdff1aSopenharmony_ci{
5610cabdff1aSopenharmony_ci    uint32_t loop_cnt;
5611cabdff1aSopenharmony_ci    uint64_t tp0, tp1;
5612cabdff1aSopenharmony_ci    int32_t offset, weight;
5613cabdff1aSopenharmony_ci    uint8_t *src0_ptr_tmp, *dst_tmp;
5614cabdff1aSopenharmony_ci    int16_t *src1_ptr_tmp;
5615cabdff1aSopenharmony_ci    v16u8 out0, out1;
5616cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
5617cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
5618cabdff1aSopenharmony_ci    v16i8 mask0, mask1, mask2, mask3;
5619cabdff1aSopenharmony_ci    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec;
5620cabdff1aSopenharmony_ci    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
5621cabdff1aSopenharmony_ci    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, weight_vec;
5622cabdff1aSopenharmony_ci    v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
5623cabdff1aSopenharmony_ci    v8i16 dst76_r, dst98_r, dst87_r, dst109_r;
5624cabdff1aSopenharmony_ci    v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
5625cabdff1aSopenharmony_ci    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
5626cabdff1aSopenharmony_ci    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
5627cabdff1aSopenharmony_ci    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
5628cabdff1aSopenharmony_ci    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
5629cabdff1aSopenharmony_ci    v4i32 offset_vec, rnd_vec, const_vec;
5630cabdff1aSopenharmony_ci
    /* Step back one row and one column so the first loads start before the
     * block origin. */
5631cabdff1aSopenharmony_ci    src0_ptr -= (src_stride + 1);
5632cabdff1aSopenharmony_ci
    /* Horizontal taps: halfword elements 0 and 1 of the loaded vector. */
5633cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_x);
5634cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
5635cabdff1aSopenharmony_ci
    /* Vertical taps: unpacked to halfwords first (the vertical stage below
     * operates on the 16-bit output of the horizontal stage). */
5636cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_y);
5637cabdff1aSopenharmony_ci    UNPCK_R_SB_SH(filter_vec, filter_vec);
5638cabdff1aSopenharmony_ci
5639cabdff1aSopenharmony_ci    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
5640cabdff1aSopenharmony_ci
    /* mask0 is the first 16-byte row of ff_hevc_mask_arr; mask1 is the same
     * pattern with every index raised by 2. */
5641cabdff1aSopenharmony_ci    mask0 = LD_SB(ff_hevc_mask_arr);
5642cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
5643cabdff1aSopenharmony_ci
    /* Pack both weights into one 32-bit word: weight0 in bits 0..15,
     * weight1 in bits 16..31. */
5644cabdff1aSopenharmony_ci    offset = (offset0 + offset1) << rnd_val;
5645cabdff1aSopenharmony_ci    weight0 = weight0 & 0x0000FFFF;
5646cabdff1aSopenharmony_ci    weight = weight0 | (weight1 << 16);
5647cabdff1aSopenharmony_ci
    /* After this group every word lane of offset_vec holds
     * ((offset0 + offset1) << rnd_val) + ((128 * weight1) << 6).
     * NOTE(review): the (128 * weight1) << 6 term presumably cancels the
     * bias introduced by the XORI_B*_128_SB calls on the source bytes --
     * confirm. */
5648cabdff1aSopenharmony_ci    const_vec = __msa_fill_w((128 * weight1));
5649cabdff1aSopenharmony_ci    const_vec <<= 6;
5650cabdff1aSopenharmony_ci    offset_vec = __msa_fill_w(offset);
5651cabdff1aSopenharmony_ci    rnd_vec = __msa_fill_w(rnd_val + 1);
5652cabdff1aSopenharmony_ci    offset_vec += const_vec;
5653cabdff1aSopenharmony_ci    weight_vec = (v8i16) __msa_fill_w(weight);
5654cabdff1aSopenharmony_ci
    /* ---- Pass 1: works on temporary copies so the original pointers are
     * still available for pass 2. ---- */
5655cabdff1aSopenharmony_ci    src0_ptr_tmp = src0_ptr;
5656cabdff1aSopenharmony_ci    dst_tmp = dst;
5657cabdff1aSopenharmony_ci    src1_ptr_tmp = src1_ptr;
5658cabdff1aSopenharmony_ci
    /* Prime the pipeline with the first three source rows. */
5659cabdff1aSopenharmony_ci    LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2);
5660cabdff1aSopenharmony_ci    src0_ptr_tmp += (3 * src_stride);
5661cabdff1aSopenharmony_ci
5662cabdff1aSopenharmony_ci    XORI_B3_128_SB(src0, src1, src2);
5663cabdff1aSopenharmony_ci
5664cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
5665cabdff1aSopenharmony_ci    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
5666cabdff1aSopenharmony_ci    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5667cabdff1aSopenharmony_ci
    /* Horizontal stage for rows 0..2. */
5668cabdff1aSopenharmony_ci    dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5669cabdff1aSopenharmony_ci    dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5670cabdff1aSopenharmony_ci    dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5671cabdff1aSopenharmony_ci
    /* Interleave neighbouring rows (0/1 and 1/2) as input to the vertical
     * stage. */
5672cabdff1aSopenharmony_ci    ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
5673cabdff1aSopenharmony_ci    ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
5674cabdff1aSopenharmony_ci
    /* Four iterations of four output rows each. */
5675cabdff1aSopenharmony_ci    for (loop_cnt = 4; loop_cnt--;) {
5676cabdff1aSopenharmony_ci        LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
5677cabdff1aSopenharmony_ci        src0_ptr_tmp += (4 * src_stride);
5678cabdff1aSopenharmony_ci        XORI_B4_128_SB(src3, src4, src5, src6);
5679cabdff1aSopenharmony_ci
        /* Four rows of the 16-bit second source. */
5680cabdff1aSopenharmony_ci        LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
5681cabdff1aSopenharmony_ci        src1_ptr_tmp += (4 * src2_stride);
5682cabdff1aSopenharmony_ci
5683cabdff1aSopenharmony_ci        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
5684cabdff1aSopenharmony_ci        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
5685cabdff1aSopenharmony_ci        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
5686cabdff1aSopenharmony_ci        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
5687cabdff1aSopenharmony_ci
        /* Horizontal stage for the four new rows. */
5688cabdff1aSopenharmony_ci        dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5689cabdff1aSopenharmony_ci        dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5690cabdff1aSopenharmony_ci        dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5691cabdff1aSopenharmony_ci        dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
5692cabdff1aSopenharmony_ci
5693cabdff1aSopenharmony_ci        ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
5694cabdff1aSopenharmony_ci        ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
5695cabdff1aSopenharmony_ci        ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
5696cabdff1aSopenharmony_ci        ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
5697cabdff1aSopenharmony_ci
        /* Vertical stage: each output row combines two interleaved row
         * pairs; _r and _l are the two halves of the same row. */
5698cabdff1aSopenharmony_ci        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
5699cabdff1aSopenharmony_ci        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
5700cabdff1aSopenharmony_ci        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
5701cabdff1aSopenharmony_ci        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
5702cabdff1aSopenharmony_ci        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
5703cabdff1aSopenharmony_ci        dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
5704cabdff1aSopenharmony_ci        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
5705cabdff1aSopenharmony_ci        dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
5706cabdff1aSopenharmony_ci
        /* Scale the vertical results down by 6 bits and narrow them back to
         * halfwords. */
5707cabdff1aSopenharmony_ci        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
5708cabdff1aSopenharmony_ci        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
5709cabdff1aSopenharmony_ci        PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
5710cabdff1aSopenharmony_ci                    dst3_r, dst0, dst1, dst2, dst3);
        /* Pair each filtered sample with the matching src1_ptr sample so one
         * dot-product-accumulate applies both weights and adds offset_vec. */
5711cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
5712cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
5713cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
5714cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
5715cabdff1aSopenharmony_ci        dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5716cabdff1aSopenharmony_ci        dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5717cabdff1aSopenharmony_ci        dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5718cabdff1aSopenharmony_ci        dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5719cabdff1aSopenharmony_ci        dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
5720cabdff1aSopenharmony_ci        dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
5721cabdff1aSopenharmony_ci        dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
5722cabdff1aSopenharmony_ci        dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
        /* Shift by rnd_val + 1 (rnd_vec), narrow, clip to 0..255, pack to
         * bytes and store four rows. */
5723cabdff1aSopenharmony_ci        SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5724cabdff1aSopenharmony_ci        SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
5725cabdff1aSopenharmony_ci        PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
5726cabdff1aSopenharmony_ci                    tmp0, tmp1, tmp2, tmp3);
5727cabdff1aSopenharmony_ci        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
5728cabdff1aSopenharmony_ci        PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
5729cabdff1aSopenharmony_ci        ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
5730cabdff1aSopenharmony_ci        dst_tmp += (4 * dst_stride);
5731cabdff1aSopenharmony_ci
        /* Carry the last two row pairs and the last filtered row into the
         * next iteration. */
5732cabdff1aSopenharmony_ci        dst10_r = dst54_r;
5733cabdff1aSopenharmony_ci        dst10_l = dst54_l;
5734cabdff1aSopenharmony_ci        dst21_r = dst65_r;
5735cabdff1aSopenharmony_ci        dst21_l = dst65_l;
5736cabdff1aSopenharmony_ci        dsth2 = dsth6;
5737cabdff1aSopenharmony_ci    }
5738cabdff1aSopenharmony_ci
    /* ---- Pass 2: move all three pointers 8 elements to the right for the
     * remaining columns. ---- */
5739cabdff1aSopenharmony_ci    src0_ptr += 8;
5740cabdff1aSopenharmony_ci    dst += 8;
5741cabdff1aSopenharmony_ci    src1_ptr += 8;
5742cabdff1aSopenharmony_ci
    /* mask2 is the second 16-byte row of ff_hevc_mask_arr. Its upper eight
     * indices are 16..20, which presumably select from the second shuffle
     * operand so that two source rows share one vector -- confirm against
     * VSHF_B2_SB. */
5743cabdff1aSopenharmony_ci    mask2 = LD_SB(ff_hevc_mask_arr + 16);
5744cabdff1aSopenharmony_ci    mask3 = mask2 + 2;
5745cabdff1aSopenharmony_ci
5746cabdff1aSopenharmony_ci    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
5747cabdff1aSopenharmony_ci    src0_ptr += (3 * src_stride);
5748cabdff1aSopenharmony_ci    XORI_B3_128_SB(src0, src1, src2);
5749cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
5750cabdff1aSopenharmony_ci    VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3);
5751cabdff1aSopenharmony_ci
    /* Horizontal stage on row pairs (0,1) and (1,2). */
5752cabdff1aSopenharmony_ci    dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5753cabdff1aSopenharmony_ci    dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5754cabdff1aSopenharmony_ci
5755cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
    /* Replicate the upper 64 bits of dst21 across the vector. */
5756cabdff1aSopenharmony_ci    dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
5757cabdff1aSopenharmony_ci
    /* Two iterations of eight output rows each. */
5758cabdff1aSopenharmony_ci    for (loop_cnt = 2; loop_cnt--;) {
5759cabdff1aSopenharmony_ci        LD_SB8(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8, src9,
5760cabdff1aSopenharmony_ci               src10);
5761cabdff1aSopenharmony_ci        src0_ptr += (8 * src_stride);
5762cabdff1aSopenharmony_ci        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
        /* Rows k and k + 4 are shuffled together into one vector. */
5763cabdff1aSopenharmony_ci        VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
5764cabdff1aSopenharmony_ci        VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
5765cabdff1aSopenharmony_ci        VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
5766cabdff1aSopenharmony_ci        VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);
5767cabdff1aSopenharmony_ci
5768cabdff1aSopenharmony_ci        dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5769cabdff1aSopenharmony_ci        dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5770cabdff1aSopenharmony_ci        dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5771cabdff1aSopenharmony_ci        dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
5772cabdff1aSopenharmony_ci
        /* Build the interleaved row pairs for the vertical stage. dst22 is
         * deliberately overwritten between its two uses below, so the order
         * of these statements matters. */
5773cabdff1aSopenharmony_ci        dst32_r = __msa_ilvr_h(dst73, dst22);
5774cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
5775cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
5776cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
5777cabdff1aSopenharmony_ci        dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
5778cabdff1aSopenharmony_ci        dst76_r = __msa_ilvr_h(dst22, dst106);
5779cabdff1aSopenharmony_ci
        /* Gather eight rows of the 16-bit second source, 64 bits per row,
         * two rows per vector. */
5780cabdff1aSopenharmony_ci        LD2(src1_ptr, src2_stride, tp0, tp1);
5781cabdff1aSopenharmony_ci        src1_ptr += 2 * src2_stride;
5782cabdff1aSopenharmony_ci        INSERT_D2_SH(tp0, tp1, in0);
5783cabdff1aSopenharmony_ci        LD2(src1_ptr, src2_stride, tp0, tp1);
5784cabdff1aSopenharmony_ci        src1_ptr += 2 * src2_stride;
5785cabdff1aSopenharmony_ci        INSERT_D2_SH(tp0, tp1, in1);
5786cabdff1aSopenharmony_ci
5787cabdff1aSopenharmony_ci        LD2(src1_ptr, src2_stride, tp0, tp1);
5788cabdff1aSopenharmony_ci        src1_ptr += 2 * src2_stride;
5789cabdff1aSopenharmony_ci        INSERT_D2_SH(tp0, tp1, in2);
5790cabdff1aSopenharmony_ci        LD2(src1_ptr, src2_stride, tp0, tp1);
5791cabdff1aSopenharmony_ci        src1_ptr += 2 * src2_stride;
5792cabdff1aSopenharmony_ci        INSERT_D2_SH(tp0, tp1, in3);
5793cabdff1aSopenharmony_ci
        /* Vertical stage for eight output rows. */
5794cabdff1aSopenharmony_ci        dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
5795cabdff1aSopenharmony_ci        dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
5796cabdff1aSopenharmony_ci        dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
5797cabdff1aSopenharmony_ci        dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
5798cabdff1aSopenharmony_ci        dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
5799cabdff1aSopenharmony_ci        dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
5800cabdff1aSopenharmony_ci        dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
5801cabdff1aSopenharmony_ci        dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
5802cabdff1aSopenharmony_ci
        /* Same tail as pass 1: scale down by 6 bits, narrow, apply weights
         * and offset, shift by rnd_val + 1, clip to 0..255, pack to bytes. */
5803cabdff1aSopenharmony_ci        SRA_4V(dst0, dst1, dst2, dst3, 6);
5804cabdff1aSopenharmony_ci        SRA_4V(dst4, dst5, dst6, dst7, 6);
5805cabdff1aSopenharmony_ci        PCKEV_H4_SW(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
5806cabdff1aSopenharmony_ci                    dst0, dst1, dst2, dst3);
5807cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
5808cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
5809cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
5810cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
5811cabdff1aSopenharmony_ci        dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5812cabdff1aSopenharmony_ci        dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5813cabdff1aSopenharmony_ci        dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5814cabdff1aSopenharmony_ci        dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5815cabdff1aSopenharmony_ci        dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
5816cabdff1aSopenharmony_ci        dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
5817cabdff1aSopenharmony_ci        dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
5818cabdff1aSopenharmony_ci        dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
5819cabdff1aSopenharmony_ci        SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5820cabdff1aSopenharmony_ci        SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
5821cabdff1aSopenharmony_ci        PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
5822cabdff1aSopenharmony_ci                    tmp0, tmp1, tmp2, tmp3);
5823cabdff1aSopenharmony_ci        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
5824cabdff1aSopenharmony_ci        PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
        /* Store eight rows, one 32-bit word per row. */
5825cabdff1aSopenharmony_ci        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
5826cabdff1aSopenharmony_ci        dst += (8 * dst_stride);
5827cabdff1aSopenharmony_ci
        /* Carry the trailing row pairs into the next iteration. */
5828cabdff1aSopenharmony_ci        dst10_r = dst98_r;
5829cabdff1aSopenharmony_ci        dst21_r = dst109_r;
5830cabdff1aSopenharmony_ci        dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
5831cabdff1aSopenharmony_ci    }
5832cabdff1aSopenharmony_ci}
5833cabdff1aSopenharmony_ci
/*
 * Bi-weighted 4-tap horizontal + vertical prediction, 16 columns wide.
 *
 * Pure dispatcher; every argument is forwarded unchanged:
 *   - height == 4  -> hevc_hv_biwgt_4t_8multx4_msa() with a trailing 2
 *                     (presumably the number of 8-column strips; the callee
 *                     takes no height argument);
 *   - otherwise    -> hevc_hv_biwgt_4t_8multx4mult_msa() with height and a
 *                     trailing 16 (presumably the width in pixels).
 */
static void hevc_hv_biwgt_4t_16w_msa(uint8_t *src0_ptr, int32_t src_stride,
                                     int16_t *src1_ptr, int32_t src2_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight0, int32_t weight1,
                                     int32_t offset0, int32_t offset1,
                                     int32_t rnd_val)
{
    if (height != 4) {
        hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride,
                                         src1_ptr, src2_stride,
                                         dst, dst_stride,
                                         filter_x, filter_y, height,
                                         weight0, weight1,
                                         offset0, offset1, rnd_val, 16);
        return;
    }

    hevc_hv_biwgt_4t_8multx4_msa(src0_ptr, src_stride,
                                 src1_ptr, src2_stride,
                                 dst, dst_stride,
                                 filter_x, filter_y,
                                 weight0, weight1,
                                 offset0, offset1, rnd_val, 2);
}
5861cabdff1aSopenharmony_ci
/*
 * Bi-weighted 4-tap horizontal + vertical prediction, 24 columns wide.
 *
 * Forwards every argument unchanged to hevc_hv_biwgt_4t_8multx4mult_msa()
 * and appends 24 as the final argument (presumably the width in pixels).
 */
static void hevc_hv_biwgt_4t_24w_msa(uint8_t *src0_ptr, int32_t src_stride,
                                     int16_t *src1_ptr, int32_t src2_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight0, int32_t weight1,
                                     int32_t offset0, int32_t offset1,
                                     int32_t rnd_val)
{
    hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride,
                                     src1_ptr, src2_stride,
                                     dst, dst_stride,
                                     filter_x, filter_y, height,
                                     weight0, weight1,
                                     offset0, offset1,
                                     rnd_val, 24);
}
5883cabdff1aSopenharmony_ci
/*
 * Bi-weighted 4-tap horizontal + vertical prediction, 32 columns wide.
 *
 * Forwards every argument unchanged to hevc_hv_biwgt_4t_8multx4mult_msa()
 * and appends 32 as the final argument (presumably the width in pixels).
 */
static void hevc_hv_biwgt_4t_32w_msa(uint8_t *src0_ptr, int32_t src_stride,
                                     int16_t *src1_ptr, int32_t src2_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight0, int32_t weight1,
                                     int32_t offset0, int32_t offset1,
                                     int32_t rnd_val)
{
    hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride,
                                     src1_ptr, src2_stride,
                                     dst, dst_stride,
                                     filter_x, filter_y, height,
                                     weight0, weight1,
                                     offset0, offset1,
                                     rnd_val, 32);
}
5905cabdff1aSopenharmony_ci
5906cabdff1aSopenharmony_ci#define BI_W_MC_COPY(WIDTH)                                                  \
5907cabdff1aSopenharmony_civoid ff_hevc_put_hevc_bi_w_pel_pixels##WIDTH##_8_msa(uint8_t *dst,           \
5908cabdff1aSopenharmony_ci                                                     ptrdiff_t dst_stride,   \
5909cabdff1aSopenharmony_ci                                                     uint8_t *src,           \
5910cabdff1aSopenharmony_ci                                                     ptrdiff_t src_stride,   \
5911cabdff1aSopenharmony_ci                                                     int16_t *src_16bit,     \
5912cabdff1aSopenharmony_ci                                                     int height,             \
5913cabdff1aSopenharmony_ci                                                     int denom,              \
5914cabdff1aSopenharmony_ci                                                     int weight0,            \
5915cabdff1aSopenharmony_ci                                                     int weight1,            \
5916cabdff1aSopenharmony_ci                                                     int offset0,            \
5917cabdff1aSopenharmony_ci                                                     int offset1,            \
5918cabdff1aSopenharmony_ci                                                     intptr_t mx,            \
5919cabdff1aSopenharmony_ci                                                     intptr_t my,            \
5920cabdff1aSopenharmony_ci                                                     int width)              \
5921cabdff1aSopenharmony_ci{                                                                            \
5922cabdff1aSopenharmony_ci    int shift = 14 + 1 - 8;                                                  \
5923cabdff1aSopenharmony_ci    int log2Wd = denom + shift - 1;                                          \
5924cabdff1aSopenharmony_ci                                                                             \
5925cabdff1aSopenharmony_ci    hevc_biwgt_copy_##WIDTH##w_msa(src, src_stride, src_16bit, MAX_PB_SIZE,  \
5926cabdff1aSopenharmony_ci                                   dst, dst_stride, height,                  \
5927cabdff1aSopenharmony_ci                                   weight0, weight1, offset0,                \
5928cabdff1aSopenharmony_ci                                   offset1, log2Wd);                         \
5929cabdff1aSopenharmony_ci}
5930cabdff1aSopenharmony_ci
5931cabdff1aSopenharmony_ciBI_W_MC_COPY(4);
5932cabdff1aSopenharmony_ciBI_W_MC_COPY(6);
5933cabdff1aSopenharmony_ciBI_W_MC_COPY(8);
5934cabdff1aSopenharmony_ciBI_W_MC_COPY(12);
5935cabdff1aSopenharmony_ciBI_W_MC_COPY(16);
5936cabdff1aSopenharmony_ciBI_W_MC_COPY(24);
5937cabdff1aSopenharmony_ciBI_W_MC_COPY(32);
5938cabdff1aSopenharmony_ciBI_W_MC_COPY(48);
5939cabdff1aSopenharmony_ciBI_W_MC_COPY(64);
5940cabdff1aSopenharmony_ci
5941cabdff1aSopenharmony_ci#undef BI_W_MC_COPY
5942cabdff1aSopenharmony_ci
5943cabdff1aSopenharmony_ci#define BI_W_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                         \
5944cabdff1aSopenharmony_civoid ff_hevc_put_hevc_bi_w_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,         \
5945cabdff1aSopenharmony_ci                                                        ptrdiff_t             \
5946cabdff1aSopenharmony_ci                                                        dst_stride,           \
5947cabdff1aSopenharmony_ci                                                        uint8_t *src,         \
5948cabdff1aSopenharmony_ci                                                        ptrdiff_t             \
5949cabdff1aSopenharmony_ci                                                        src_stride,           \
5950cabdff1aSopenharmony_ci                                                        int16_t *src_16bit,   \
5951cabdff1aSopenharmony_ci                                                        int height,           \
5952cabdff1aSopenharmony_ci                                                        int denom,            \
5953cabdff1aSopenharmony_ci                                                        int weight0,          \
5954cabdff1aSopenharmony_ci                                                        int weight1,          \
5955cabdff1aSopenharmony_ci                                                        int offset0,          \
5956cabdff1aSopenharmony_ci                                                        int offset1,          \
5957cabdff1aSopenharmony_ci                                                        intptr_t mx,          \
5958cabdff1aSopenharmony_ci                                                        intptr_t my,          \
5959cabdff1aSopenharmony_ci                                                        int width)            \
5960cabdff1aSopenharmony_ci{                                                                             \
5961cabdff1aSopenharmony_ci    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];             \
5962cabdff1aSopenharmony_ci    int log2Wd = denom + 14 - 8;                                              \
5963cabdff1aSopenharmony_ci                                                                              \
5964cabdff1aSopenharmony_ci    hevc_##DIR1##_biwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit,   \
5965cabdff1aSopenharmony_ci                                                MAX_PB_SIZE, dst, dst_stride, \
5966cabdff1aSopenharmony_ci                                                filter, height, weight0,      \
5967cabdff1aSopenharmony_ci                                                weight1, offset0, offset1,    \
5968cabdff1aSopenharmony_ci                                                log2Wd);                      \
5969cabdff1aSopenharmony_ci}
5970cabdff1aSopenharmony_ci
5971cabdff1aSopenharmony_ciBI_W_MC(qpel, h, 4, 8, hz, mx);
5972cabdff1aSopenharmony_ciBI_W_MC(qpel, h, 8, 8, hz, mx);
5973cabdff1aSopenharmony_ciBI_W_MC(qpel, h, 12, 8, hz, mx);
5974cabdff1aSopenharmony_ciBI_W_MC(qpel, h, 16, 8, hz, mx);
5975cabdff1aSopenharmony_ciBI_W_MC(qpel, h, 24, 8, hz, mx);
5976cabdff1aSopenharmony_ciBI_W_MC(qpel, h, 32, 8, hz, mx);
5977cabdff1aSopenharmony_ciBI_W_MC(qpel, h, 48, 8, hz, mx);
5978cabdff1aSopenharmony_ciBI_W_MC(qpel, h, 64, 8, hz, mx);
5979cabdff1aSopenharmony_ci
5980cabdff1aSopenharmony_ciBI_W_MC(qpel, v, 4, 8, vt, my);
5981cabdff1aSopenharmony_ciBI_W_MC(qpel, v, 8, 8, vt, my);
5982cabdff1aSopenharmony_ciBI_W_MC(qpel, v, 12, 8, vt, my);
5983cabdff1aSopenharmony_ciBI_W_MC(qpel, v, 16, 8, vt, my);
5984cabdff1aSopenharmony_ciBI_W_MC(qpel, v, 24, 8, vt, my);
5985cabdff1aSopenharmony_ciBI_W_MC(qpel, v, 32, 8, vt, my);
5986cabdff1aSopenharmony_ciBI_W_MC(qpel, v, 48, 8, vt, my);
5987cabdff1aSopenharmony_ciBI_W_MC(qpel, v, 64, 8, vt, my);
5988cabdff1aSopenharmony_ci
5989cabdff1aSopenharmony_ciBI_W_MC(epel, h, 4, 4, hz, mx);
5990cabdff1aSopenharmony_ciBI_W_MC(epel, h, 8, 4, hz, mx);
5991cabdff1aSopenharmony_ciBI_W_MC(epel, h, 6, 4, hz, mx);
5992cabdff1aSopenharmony_ciBI_W_MC(epel, h, 12, 4, hz, mx);
5993cabdff1aSopenharmony_ciBI_W_MC(epel, h, 16, 4, hz, mx);
5994cabdff1aSopenharmony_ciBI_W_MC(epel, h, 24, 4, hz, mx);
5995cabdff1aSopenharmony_ciBI_W_MC(epel, h, 32, 4, hz, mx);
5996cabdff1aSopenharmony_ci
5997cabdff1aSopenharmony_ciBI_W_MC(epel, v, 4, 4, vt, my);
5998cabdff1aSopenharmony_ciBI_W_MC(epel, v, 8, 4, vt, my);
5999cabdff1aSopenharmony_ciBI_W_MC(epel, v, 6, 4, vt, my);
6000cabdff1aSopenharmony_ciBI_W_MC(epel, v, 12, 4, vt, my);
6001cabdff1aSopenharmony_ciBI_W_MC(epel, v, 16, 4, vt, my);
6002cabdff1aSopenharmony_ciBI_W_MC(epel, v, 24, 4, vt, my);
6003cabdff1aSopenharmony_ciBI_W_MC(epel, v, 32, 4, vt, my);
6004cabdff1aSopenharmony_ci
6005cabdff1aSopenharmony_ci#undef BI_W_MC
6006cabdff1aSopenharmony_ci
6007cabdff1aSopenharmony_ci#define BI_W_MC_HV(PEL, WIDTH, TAP)                                         \
6008cabdff1aSopenharmony_civoid ff_hevc_put_hevc_bi_w_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst,          \
6009cabdff1aSopenharmony_ci                                                     ptrdiff_t dst_stride,  \
6010cabdff1aSopenharmony_ci                                                     uint8_t *src,          \
6011cabdff1aSopenharmony_ci                                                     ptrdiff_t src_stride,  \
6012cabdff1aSopenharmony_ci                                                     int16_t *src_16bit,    \
6013cabdff1aSopenharmony_ci                                                     int height,            \
6014cabdff1aSopenharmony_ci                                                     int denom,             \
6015cabdff1aSopenharmony_ci                                                     int weight0,           \
6016cabdff1aSopenharmony_ci                                                     int weight1,           \
6017cabdff1aSopenharmony_ci                                                     int offset0,           \
6018cabdff1aSopenharmony_ci                                                     int offset1,           \
6019cabdff1aSopenharmony_ci                                                     intptr_t mx,           \
6020cabdff1aSopenharmony_ci                                                     intptr_t my,           \
6021cabdff1aSopenharmony_ci                                                     int width)             \
6022cabdff1aSopenharmony_ci{                                                                           \
6023cabdff1aSopenharmony_ci    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];               \
6024cabdff1aSopenharmony_ci    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];               \
6025cabdff1aSopenharmony_ci    int log2Wd = denom + 14 - 8;                                            \
6026cabdff1aSopenharmony_ci                                                                            \
6027cabdff1aSopenharmony_ci    hevc_hv_biwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit,       \
6028cabdff1aSopenharmony_ci                                          MAX_PB_SIZE, dst, dst_stride,     \
6029cabdff1aSopenharmony_ci                                          filter_x, filter_y, height,       \
6030cabdff1aSopenharmony_ci                                          weight0, weight1, offset0,        \
6031cabdff1aSopenharmony_ci                                          offset1, log2Wd);                 \
6032cabdff1aSopenharmony_ci}
6033cabdff1aSopenharmony_ci
6034cabdff1aSopenharmony_ciBI_W_MC_HV(qpel, 4, 8);
6035cabdff1aSopenharmony_ciBI_W_MC_HV(qpel, 8, 8);
6036cabdff1aSopenharmony_ciBI_W_MC_HV(qpel, 12, 8);
6037cabdff1aSopenharmony_ciBI_W_MC_HV(qpel, 16, 8);
6038cabdff1aSopenharmony_ciBI_W_MC_HV(qpel, 24, 8);
6039cabdff1aSopenharmony_ciBI_W_MC_HV(qpel, 32, 8);
6040cabdff1aSopenharmony_ciBI_W_MC_HV(qpel, 48, 8);
6041cabdff1aSopenharmony_ciBI_W_MC_HV(qpel, 64, 8);
6042cabdff1aSopenharmony_ci
6043cabdff1aSopenharmony_ciBI_W_MC_HV(epel, 4, 4);
6044cabdff1aSopenharmony_ciBI_W_MC_HV(epel, 8, 4);
6045cabdff1aSopenharmony_ciBI_W_MC_HV(epel, 6, 4);
6046cabdff1aSopenharmony_ciBI_W_MC_HV(epel, 12, 4);
6047cabdff1aSopenharmony_ciBI_W_MC_HV(epel, 16, 4);
6048cabdff1aSopenharmony_ciBI_W_MC_HV(epel, 24, 4);
6049cabdff1aSopenharmony_ciBI_W_MC_HV(epel, 32, 4);
6050cabdff1aSopenharmony_ci
6051cabdff1aSopenharmony_ci#undef BI_W_MC_HV
6052