/*
 * Copyright (c) 2015 - 2017 Parag Salasakar (Parag.Salasakar@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
20cabdff1aSopenharmony_ci
21cabdff1aSopenharmony_ci#include "libavutil/mips/generic_macros_msa.h"
22cabdff1aSopenharmony_ci#include "h264dsp_mips.h"
23cabdff1aSopenharmony_ci
24cabdff1aSopenharmony_cistatic void avc_wgt_4x2_msa(uint8_t *data, ptrdiff_t stride,
25cabdff1aSopenharmony_ci                            int32_t log2_denom, int32_t src_weight,
26cabdff1aSopenharmony_ci                            int32_t offset_in)
27cabdff1aSopenharmony_ci{
28cabdff1aSopenharmony_ci    uint32_t tp0, tp1, offset_val;
29cabdff1aSopenharmony_ci    v16u8 zero = { 0 };
30cabdff1aSopenharmony_ci    v16u8 src0 = { 0 };
31cabdff1aSopenharmony_ci    v8i16 src0_r, tmp0, wgt, denom, offset;
32cabdff1aSopenharmony_ci
33cabdff1aSopenharmony_ci    offset_val = (unsigned) offset_in << log2_denom;
34cabdff1aSopenharmony_ci
35cabdff1aSopenharmony_ci    wgt = __msa_fill_h(src_weight);
36cabdff1aSopenharmony_ci    offset = __msa_fill_h(offset_val);
37cabdff1aSopenharmony_ci    denom = __msa_fill_h(log2_denom);
38cabdff1aSopenharmony_ci
39cabdff1aSopenharmony_ci    LW2(data, stride, tp0, tp1);
40cabdff1aSopenharmony_ci    INSERT_W2_UB(tp0, tp1, src0);
41cabdff1aSopenharmony_ci    src0_r = (v8i16) __msa_ilvr_b((v16i8) zero, (v16i8) src0);
42cabdff1aSopenharmony_ci    tmp0 = wgt * src0_r;
43cabdff1aSopenharmony_ci    tmp0 = __msa_adds_s_h(tmp0, offset);
44cabdff1aSopenharmony_ci    tmp0 = __msa_maxi_s_h(tmp0, 0);
45cabdff1aSopenharmony_ci    tmp0 = __msa_srlr_h(tmp0, denom);
46cabdff1aSopenharmony_ci    tmp0 = (v8i16) __msa_sat_u_h((v8u16) tmp0, 7);
47cabdff1aSopenharmony_ci    src0 = (v16u8) __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
48cabdff1aSopenharmony_ci    ST_W2(src0, 0, 1, data, stride);
49cabdff1aSopenharmony_ci}
50cabdff1aSopenharmony_ci
51cabdff1aSopenharmony_cistatic void avc_wgt_4x4_msa(uint8_t *data, ptrdiff_t stride,
52cabdff1aSopenharmony_ci                            int32_t log2_denom, int32_t src_weight,
53cabdff1aSopenharmony_ci                            int32_t offset_in)
54cabdff1aSopenharmony_ci{
55cabdff1aSopenharmony_ci    uint32_t tp0, tp1, tp2, tp3, offset_val;
56cabdff1aSopenharmony_ci    v16u8 src0 = { 0 };
57cabdff1aSopenharmony_ci    v8i16 src0_r, src1_r, tmp0, tmp1, wgt, denom, offset;
58cabdff1aSopenharmony_ci
59cabdff1aSopenharmony_ci    offset_val = (unsigned) offset_in << log2_denom;
60cabdff1aSopenharmony_ci
61cabdff1aSopenharmony_ci    wgt = __msa_fill_h(src_weight);
62cabdff1aSopenharmony_ci    offset = __msa_fill_h(offset_val);
63cabdff1aSopenharmony_ci    denom = __msa_fill_h(log2_denom);
64cabdff1aSopenharmony_ci
65cabdff1aSopenharmony_ci    LW4(data, stride, tp0, tp1, tp2, tp3);
66cabdff1aSopenharmony_ci    INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
67cabdff1aSopenharmony_ci    UNPCK_UB_SH(src0, src0_r, src1_r);
68cabdff1aSopenharmony_ci    MUL2(wgt, src0_r, wgt, src1_r, tmp0, tmp1);
69cabdff1aSopenharmony_ci    ADDS_SH2_SH(tmp0, offset, tmp1, offset, tmp0, tmp1);
70cabdff1aSopenharmony_ci    MAXI_SH2_SH(tmp0, tmp1, 0);
71cabdff1aSopenharmony_ci    tmp0 = __msa_srlr_h(tmp0, denom);
72cabdff1aSopenharmony_ci    tmp1 = __msa_srlr_h(tmp1, denom);
73cabdff1aSopenharmony_ci    SAT_UH2_SH(tmp0, tmp1, 7);
74cabdff1aSopenharmony_ci    src0 = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
75cabdff1aSopenharmony_ci    ST_W4(src0, 0, 1, 2, 3, data, stride);
76cabdff1aSopenharmony_ci}
77cabdff1aSopenharmony_ci
78cabdff1aSopenharmony_cistatic void avc_wgt_4x8_msa(uint8_t *data, ptrdiff_t stride,
79cabdff1aSopenharmony_ci                            int32_t log2_denom, int32_t src_weight,
80cabdff1aSopenharmony_ci                            int32_t offset_in)
81cabdff1aSopenharmony_ci{
82cabdff1aSopenharmony_ci    uint32_t tp0, tp1, tp2, tp3, offset_val;
83cabdff1aSopenharmony_ci    v16u8 src0 = { 0 }, src1 = { 0 };
84cabdff1aSopenharmony_ci    v8i16 src0_r, src1_r, src2_r, src3_r, tmp0, tmp1, tmp2, tmp3;
85cabdff1aSopenharmony_ci    v8i16 wgt, denom, offset;
86cabdff1aSopenharmony_ci
87cabdff1aSopenharmony_ci    offset_val = (unsigned) offset_in << log2_denom;
88cabdff1aSopenharmony_ci
89cabdff1aSopenharmony_ci    wgt = __msa_fill_h(src_weight);
90cabdff1aSopenharmony_ci    offset = __msa_fill_h(offset_val);
91cabdff1aSopenharmony_ci    denom = __msa_fill_h(log2_denom);
92cabdff1aSopenharmony_ci
93cabdff1aSopenharmony_ci    LW4(data, stride, tp0, tp1, tp2, tp3);
94cabdff1aSopenharmony_ci    INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
95cabdff1aSopenharmony_ci    LW4(data + 4 * stride, stride, tp0, tp1, tp2, tp3);
96cabdff1aSopenharmony_ci    INSERT_W4_UB(tp0, tp1, tp2, tp3, src1);
97cabdff1aSopenharmony_ci    UNPCK_UB_SH(src0, src0_r, src1_r);
98cabdff1aSopenharmony_ci    UNPCK_UB_SH(src1, src2_r, src3_r);
99cabdff1aSopenharmony_ci    MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1, tmp2,
100cabdff1aSopenharmony_ci         tmp3);
101cabdff1aSopenharmony_ci    ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0,
102cabdff1aSopenharmony_ci                tmp1, tmp2, tmp3);
103cabdff1aSopenharmony_ci    MAXI_SH4_SH(tmp0, tmp1, tmp2, tmp3, 0);
104cabdff1aSopenharmony_ci    SRLR_H4_SH(tmp0, tmp1, tmp2, tmp3, denom);
105cabdff1aSopenharmony_ci    SAT_UH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
106cabdff1aSopenharmony_ci    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
107cabdff1aSopenharmony_ci    ST_W8(src0, src1, 0, 1, 2, 3, 0, 1, 2, 3, data, stride);
108cabdff1aSopenharmony_ci}
109cabdff1aSopenharmony_ci
110cabdff1aSopenharmony_cistatic void avc_wgt_8x4_msa(uint8_t *data, ptrdiff_t stride,
111cabdff1aSopenharmony_ci                            int32_t log2_denom, int32_t src_weight,
112cabdff1aSopenharmony_ci                            int32_t offset_in)
113cabdff1aSopenharmony_ci{
114cabdff1aSopenharmony_ci    uint32_t offset_val;
115cabdff1aSopenharmony_ci    uint64_t tp0, tp1, tp2, tp3;
116cabdff1aSopenharmony_ci    v16u8 src0 = { 0 }, src1 = { 0 };
117cabdff1aSopenharmony_ci    v8i16 src0_r, src1_r, src2_r, src3_r, tmp0, tmp1, tmp2, tmp3;
118cabdff1aSopenharmony_ci    v8i16 wgt, denom, offset;
119cabdff1aSopenharmony_ci
120cabdff1aSopenharmony_ci    offset_val = (unsigned) offset_in << log2_denom;
121cabdff1aSopenharmony_ci
122cabdff1aSopenharmony_ci    wgt = __msa_fill_h(src_weight);
123cabdff1aSopenharmony_ci    offset = __msa_fill_h(offset_val);
124cabdff1aSopenharmony_ci    denom = __msa_fill_h(log2_denom);
125cabdff1aSopenharmony_ci
126cabdff1aSopenharmony_ci    LD4(data, stride, tp0, tp1, tp2, tp3);
127cabdff1aSopenharmony_ci    INSERT_D2_UB(tp0, tp1, src0);
128cabdff1aSopenharmony_ci    INSERT_D2_UB(tp2, tp3, src1);
129cabdff1aSopenharmony_ci    UNPCK_UB_SH(src0, src0_r, src1_r);
130cabdff1aSopenharmony_ci    UNPCK_UB_SH(src1, src2_r, src3_r);
131cabdff1aSopenharmony_ci    MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1, tmp2,
132cabdff1aSopenharmony_ci         tmp3);
133cabdff1aSopenharmony_ci    ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0,
134cabdff1aSopenharmony_ci                tmp1, tmp2, tmp3);
135cabdff1aSopenharmony_ci    MAXI_SH4_SH(tmp0, tmp1, tmp2, tmp3, 0);
136cabdff1aSopenharmony_ci    SRLR_H4_SH(tmp0, tmp1, tmp2, tmp3, denom);
137cabdff1aSopenharmony_ci    SAT_UH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
138cabdff1aSopenharmony_ci    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
139cabdff1aSopenharmony_ci    ST_D4(src0, src1, 0, 1, 0, 1, data, stride);
140cabdff1aSopenharmony_ci}
141cabdff1aSopenharmony_ci
142cabdff1aSopenharmony_cistatic void avc_wgt_8x8_msa(uint8_t *data, ptrdiff_t stride, int32_t log2_denom,
143cabdff1aSopenharmony_ci                            int32_t src_weight, int32_t offset_in)
144cabdff1aSopenharmony_ci{
145cabdff1aSopenharmony_ci    uint32_t offset_val;
146cabdff1aSopenharmony_ci    uint64_t tp0, tp1, tp2, tp3;
147cabdff1aSopenharmony_ci    v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
148cabdff1aSopenharmony_ci    v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
149cabdff1aSopenharmony_ci    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
150cabdff1aSopenharmony_ci    v8i16 wgt, denom, offset;
151cabdff1aSopenharmony_ci
152cabdff1aSopenharmony_ci    offset_val = (unsigned) offset_in << log2_denom;
153cabdff1aSopenharmony_ci
154cabdff1aSopenharmony_ci    wgt = __msa_fill_h(src_weight);
155cabdff1aSopenharmony_ci    offset = __msa_fill_h(offset_val);
156cabdff1aSopenharmony_ci    denom = __msa_fill_h(log2_denom);
157cabdff1aSopenharmony_ci
158cabdff1aSopenharmony_ci    LD4(data, stride, tp0, tp1, tp2, tp3);
159cabdff1aSopenharmony_ci    INSERT_D2_UB(tp0, tp1, src0);
160cabdff1aSopenharmony_ci    INSERT_D2_UB(tp2, tp3, src1);
161cabdff1aSopenharmony_ci    LD4(data + 4 * stride, stride, tp0, tp1, tp2, tp3);
162cabdff1aSopenharmony_ci    INSERT_D2_UB(tp0, tp1, src2);
163cabdff1aSopenharmony_ci    INSERT_D2_UB(tp2, tp3, src3);
164cabdff1aSopenharmony_ci    UNPCK_UB_SH(src0, src0_r, src1_r);
165cabdff1aSopenharmony_ci    UNPCK_UB_SH(src1, src2_r, src3_r);
166cabdff1aSopenharmony_ci    UNPCK_UB_SH(src2, src4_r, src5_r);
167cabdff1aSopenharmony_ci    UNPCK_UB_SH(src3, src6_r, src7_r);
168cabdff1aSopenharmony_ci    MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1, tmp2,
169cabdff1aSopenharmony_ci         tmp3);
170cabdff1aSopenharmony_ci    MUL4(wgt, src4_r, wgt, src5_r, wgt, src6_r, wgt, src7_r, tmp4, tmp5, tmp6,
171cabdff1aSopenharmony_ci         tmp7);
172cabdff1aSopenharmony_ci    ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0,
173cabdff1aSopenharmony_ci                tmp1, tmp2, tmp3);
174cabdff1aSopenharmony_ci    ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset, tmp4,
175cabdff1aSopenharmony_ci                tmp5, tmp6, tmp7);
176cabdff1aSopenharmony_ci    MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
177cabdff1aSopenharmony_ci    SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
178cabdff1aSopenharmony_ci    SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
179cabdff1aSopenharmony_ci    PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, src0, src1,
180cabdff1aSopenharmony_ci                src2, src3);
181cabdff1aSopenharmony_ci    ST_D8(src0, src1, src2, src3, 0, 1, 0, 1, 0, 1, 0, 1, data, stride);
182cabdff1aSopenharmony_ci}
183cabdff1aSopenharmony_ci
184cabdff1aSopenharmony_cistatic void avc_wgt_8x16_msa(uint8_t *data, ptrdiff_t stride,
185cabdff1aSopenharmony_ci                             int32_t log2_denom, int32_t src_weight,
186cabdff1aSopenharmony_ci                             int32_t offset_in)
187cabdff1aSopenharmony_ci{
188cabdff1aSopenharmony_ci    uint32_t offset_val, cnt;
189cabdff1aSopenharmony_ci    uint64_t tp0, tp1, tp2, tp3;
190cabdff1aSopenharmony_ci    v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
191cabdff1aSopenharmony_ci    v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
192cabdff1aSopenharmony_ci    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
193cabdff1aSopenharmony_ci    v8i16 wgt, denom, offset;
194cabdff1aSopenharmony_ci
195cabdff1aSopenharmony_ci    offset_val = (unsigned) offset_in << log2_denom;
196cabdff1aSopenharmony_ci
197cabdff1aSopenharmony_ci    wgt = __msa_fill_h(src_weight);
198cabdff1aSopenharmony_ci    offset = __msa_fill_h(offset_val);
199cabdff1aSopenharmony_ci    denom = __msa_fill_h(log2_denom);
200cabdff1aSopenharmony_ci
201cabdff1aSopenharmony_ci    for (cnt = 2; cnt--;) {
202cabdff1aSopenharmony_ci        LD4(data, stride, tp0, tp1, tp2, tp3);
203cabdff1aSopenharmony_ci        INSERT_D2_UB(tp0, tp1, src0);
204cabdff1aSopenharmony_ci        INSERT_D2_UB(tp2, tp3, src1);
205cabdff1aSopenharmony_ci        LD4(data + 4 * stride, stride, tp0, tp1, tp2, tp3);
206cabdff1aSopenharmony_ci        INSERT_D2_UB(tp0, tp1, src2);
207cabdff1aSopenharmony_ci        INSERT_D2_UB(tp2, tp3, src3);
208cabdff1aSopenharmony_ci        UNPCK_UB_SH(src0, src0_r, src1_r);
209cabdff1aSopenharmony_ci        UNPCK_UB_SH(src1, src2_r, src3_r);
210cabdff1aSopenharmony_ci        UNPCK_UB_SH(src2, src4_r, src5_r);
211cabdff1aSopenharmony_ci        UNPCK_UB_SH(src3, src6_r, src7_r);
212cabdff1aSopenharmony_ci        MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1,
213cabdff1aSopenharmony_ci             tmp2, tmp3);
214cabdff1aSopenharmony_ci        MUL4(wgt, src4_r, wgt, src5_r, wgt, src6_r, wgt, src7_r, tmp4, tmp5,
215cabdff1aSopenharmony_ci             tmp6, tmp7);
216cabdff1aSopenharmony_ci        ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset,
217cabdff1aSopenharmony_ci                    tmp0, tmp1, tmp2, tmp3);
218cabdff1aSopenharmony_ci        ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset,
219cabdff1aSopenharmony_ci                    tmp4, tmp5, tmp6, tmp7);
220cabdff1aSopenharmony_ci        MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
221cabdff1aSopenharmony_ci        SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
222cabdff1aSopenharmony_ci        SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
223cabdff1aSopenharmony_ci        PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, src0, src1,
224cabdff1aSopenharmony_ci                    src2, src3);
225cabdff1aSopenharmony_ci        ST_D8(src0, src1, src2, src3, 0, 1, 0, 1, 0, 1, 0, 1, data, stride);
226cabdff1aSopenharmony_ci        data += 8 * stride;
227cabdff1aSopenharmony_ci    }
228cabdff1aSopenharmony_ci}
229cabdff1aSopenharmony_ci
230cabdff1aSopenharmony_cistatic void avc_biwgt_4x2_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
231cabdff1aSopenharmony_ci                              int32_t log2_denom, int32_t src_weight,
232cabdff1aSopenharmony_ci                              int32_t dst_weight, int32_t offset_in)
233cabdff1aSopenharmony_ci{
234cabdff1aSopenharmony_ci    uint32_t tp0, tp1;
235cabdff1aSopenharmony_ci    v16i8 src_wgt, dst_wgt, wgt, vec0;
236cabdff1aSopenharmony_ci    v16u8 src0 = { 0 }, dst0 = { 0 };
237cabdff1aSopenharmony_ci    v8i16 tmp0, denom, offset, max255 = __msa_ldi_h(255);
238cabdff1aSopenharmony_ci
239cabdff1aSopenharmony_ci    offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
240cabdff1aSopenharmony_ci    offset_in += (128 * (src_weight + dst_weight));
241cabdff1aSopenharmony_ci
242cabdff1aSopenharmony_ci    src_wgt = __msa_fill_b(src_weight);
243cabdff1aSopenharmony_ci    dst_wgt = __msa_fill_b(dst_weight);
244cabdff1aSopenharmony_ci    offset = __msa_fill_h(offset_in);
245cabdff1aSopenharmony_ci    denom = __msa_fill_h(log2_denom + 1);
246cabdff1aSopenharmony_ci
247cabdff1aSopenharmony_ci    wgt = __msa_ilvev_b(dst_wgt, src_wgt);
248cabdff1aSopenharmony_ci
249cabdff1aSopenharmony_ci    LW2(src, stride, tp0, tp1);
250cabdff1aSopenharmony_ci    INSERT_W2_UB(tp0, tp1, src0);
251cabdff1aSopenharmony_ci    LW2(dst, stride, tp0, tp1);
252cabdff1aSopenharmony_ci    INSERT_W2_UB(tp0, tp1, dst0);
253cabdff1aSopenharmony_ci    XORI_B2_128_UB(src0, dst0);
254cabdff1aSopenharmony_ci    vec0 = (v16i8) __msa_ilvr_b((v16i8) dst0, (v16i8) src0);
255cabdff1aSopenharmony_ci    tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
256cabdff1aSopenharmony_ci    tmp0 >>= denom;
257cabdff1aSopenharmony_ci    tmp0 = __msa_maxi_s_h(tmp0, 0);
258cabdff1aSopenharmony_ci    tmp0 = __msa_min_s_h(max255, tmp0);
259cabdff1aSopenharmony_ci    dst0 = (v16u8) __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
260cabdff1aSopenharmony_ci    ST_W2(dst0, 0, 1, dst, stride);
261cabdff1aSopenharmony_ci}
262cabdff1aSopenharmony_ci
263cabdff1aSopenharmony_cistatic void avc_biwgt_4x4_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
264cabdff1aSopenharmony_ci                              int32_t log2_denom, int32_t src_weight,
265cabdff1aSopenharmony_ci                              int32_t dst_weight, int32_t offset_in)
266cabdff1aSopenharmony_ci{
267cabdff1aSopenharmony_ci    uint32_t tp0, tp1, tp2, tp3;
268cabdff1aSopenharmony_ci    v16i8 src_wgt, dst_wgt, wgt, vec0, vec1;
269cabdff1aSopenharmony_ci    v16u8 src0, dst0;
270cabdff1aSopenharmony_ci    v8i16 tmp0, tmp1, denom, offset;
271cabdff1aSopenharmony_ci
272cabdff1aSopenharmony_ci    offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
273cabdff1aSopenharmony_ci    offset_in += (128 * (src_weight + dst_weight));
274cabdff1aSopenharmony_ci
275cabdff1aSopenharmony_ci    src_wgt = __msa_fill_b(src_weight);
276cabdff1aSopenharmony_ci    dst_wgt = __msa_fill_b(dst_weight);
277cabdff1aSopenharmony_ci    offset = __msa_fill_h(offset_in);
278cabdff1aSopenharmony_ci    denom = __msa_fill_h(log2_denom + 1);
279cabdff1aSopenharmony_ci
280cabdff1aSopenharmony_ci    wgt = __msa_ilvev_b(dst_wgt, src_wgt);
281cabdff1aSopenharmony_ci
282cabdff1aSopenharmony_ci    LW4(src, stride, tp0, tp1, tp2, tp3);
283cabdff1aSopenharmony_ci    INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
284cabdff1aSopenharmony_ci    LW4(dst, stride, tp0, tp1, tp2, tp3);
285cabdff1aSopenharmony_ci    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
286cabdff1aSopenharmony_ci    XORI_B2_128_UB(src0, dst0);
287cabdff1aSopenharmony_ci    ILVRL_B2_SB(dst0, src0, vec0, vec1);
288cabdff1aSopenharmony_ci    tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
289cabdff1aSopenharmony_ci    tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
290cabdff1aSopenharmony_ci    tmp0 >>= denom;
291cabdff1aSopenharmony_ci    tmp1 >>= denom;
292cabdff1aSopenharmony_ci    CLIP_SH2_0_255(tmp0, tmp1);
293cabdff1aSopenharmony_ci    dst0 = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
294cabdff1aSopenharmony_ci    ST_W4(dst0, 0, 1, 2, 3, dst, stride);
295cabdff1aSopenharmony_ci}
296cabdff1aSopenharmony_ci
297cabdff1aSopenharmony_cistatic void avc_biwgt_4x8_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
298cabdff1aSopenharmony_ci                              int32_t log2_denom, int32_t src_weight,
299cabdff1aSopenharmony_ci                              int32_t dst_weight, int32_t offset_in)
300cabdff1aSopenharmony_ci{
301cabdff1aSopenharmony_ci    uint32_t tp0, tp1, tp2, tp3;
302cabdff1aSopenharmony_ci    v16i8 src_wgt, dst_wgt, wgt, vec0, vec1, vec2, vec3;
303cabdff1aSopenharmony_ci    v16u8 src0, src1, dst0, dst1;
304cabdff1aSopenharmony_ci    v8i16 tmp0, tmp1, tmp2, tmp3, denom, offset;
305cabdff1aSopenharmony_ci
306cabdff1aSopenharmony_ci    offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
307cabdff1aSopenharmony_ci    offset_in += (128 * (src_weight + dst_weight));
308cabdff1aSopenharmony_ci
309cabdff1aSopenharmony_ci    src_wgt = __msa_fill_b(src_weight);
310cabdff1aSopenharmony_ci    dst_wgt = __msa_fill_b(dst_weight);
311cabdff1aSopenharmony_ci    offset = __msa_fill_h(offset_in);
312cabdff1aSopenharmony_ci    denom = __msa_fill_h(log2_denom + 1);
313cabdff1aSopenharmony_ci    wgt = __msa_ilvev_b(dst_wgt, src_wgt);
314cabdff1aSopenharmony_ci
315cabdff1aSopenharmony_ci    LW4(src, stride, tp0, tp1, tp2, tp3);
316cabdff1aSopenharmony_ci    src += 4 * stride;
317cabdff1aSopenharmony_ci    INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
318cabdff1aSopenharmony_ci    LW4(src, stride, tp0, tp1, tp2, tp3);
319cabdff1aSopenharmony_ci    INSERT_W4_UB(tp0, tp1, tp2, tp3, src1);
320cabdff1aSopenharmony_ci    LW4(dst, stride, tp0, tp1, tp2, tp3);
321cabdff1aSopenharmony_ci    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
322cabdff1aSopenharmony_ci    LW4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
323cabdff1aSopenharmony_ci    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
324cabdff1aSopenharmony_ci    XORI_B4_128_UB(src0, src1, dst0, dst1);
325cabdff1aSopenharmony_ci    ILVRL_B2_SB(dst0, src0, vec0, vec1);
326cabdff1aSopenharmony_ci    ILVRL_B2_SB(dst1, src1, vec2, vec3);
327cabdff1aSopenharmony_ci    tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
328cabdff1aSopenharmony_ci    tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
329cabdff1aSopenharmony_ci    tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
330cabdff1aSopenharmony_ci    tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
331cabdff1aSopenharmony_ci    SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
332cabdff1aSopenharmony_ci    CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
333cabdff1aSopenharmony_ci    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
334cabdff1aSopenharmony_ci    ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
335cabdff1aSopenharmony_ci}
336cabdff1aSopenharmony_ci
337cabdff1aSopenharmony_cistatic void avc_biwgt_8x4_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
338cabdff1aSopenharmony_ci                              int32_t log2_denom, int32_t src_weight,
339cabdff1aSopenharmony_ci                              int32_t dst_weight, int32_t offset_in)
340cabdff1aSopenharmony_ci{
341cabdff1aSopenharmony_ci    uint64_t tp0, tp1, tp2, tp3;
342cabdff1aSopenharmony_ci    v16i8 src_wgt, dst_wgt, wgt, vec0, vec1, vec2, vec3;
343cabdff1aSopenharmony_ci    v16u8 src0, src1, dst0, dst1;
344cabdff1aSopenharmony_ci    v8i16 tmp0, tmp1, tmp2, tmp3, denom, offset;
345cabdff1aSopenharmony_ci
346cabdff1aSopenharmony_ci    offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
347cabdff1aSopenharmony_ci    offset_in += (128 * (src_weight + dst_weight));
348cabdff1aSopenharmony_ci
349cabdff1aSopenharmony_ci    src_wgt = __msa_fill_b(src_weight);
350cabdff1aSopenharmony_ci    dst_wgt = __msa_fill_b(dst_weight);
351cabdff1aSopenharmony_ci    offset = __msa_fill_h(offset_in);
352cabdff1aSopenharmony_ci    denom = __msa_fill_h(log2_denom + 1);
353cabdff1aSopenharmony_ci
354cabdff1aSopenharmony_ci    wgt = __msa_ilvev_b(dst_wgt, src_wgt);
355cabdff1aSopenharmony_ci
356cabdff1aSopenharmony_ci    LD4(src, stride, tp0, tp1, tp2, tp3);
357cabdff1aSopenharmony_ci    INSERT_D2_UB(tp0, tp1, src0);
358cabdff1aSopenharmony_ci    INSERT_D2_UB(tp2, tp3, src1);
359cabdff1aSopenharmony_ci    LD4(dst, stride, tp0, tp1, tp2, tp3);
360cabdff1aSopenharmony_ci    INSERT_D2_UB(tp0, tp1, dst0);
361cabdff1aSopenharmony_ci    INSERT_D2_UB(tp2, tp3, dst1);
362cabdff1aSopenharmony_ci    XORI_B4_128_UB(src0, src1, dst0, dst1);
363cabdff1aSopenharmony_ci    ILVRL_B2_SB(dst0, src0, vec0, vec1);
364cabdff1aSopenharmony_ci    ILVRL_B2_SB(dst1, src1, vec2, vec3);
365cabdff1aSopenharmony_ci    tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
366cabdff1aSopenharmony_ci    tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
367cabdff1aSopenharmony_ci    tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
368cabdff1aSopenharmony_ci    tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
369cabdff1aSopenharmony_ci    SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
370cabdff1aSopenharmony_ci    CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
371cabdff1aSopenharmony_ci    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
372cabdff1aSopenharmony_ci    ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
373cabdff1aSopenharmony_ci}
374cabdff1aSopenharmony_ci
375cabdff1aSopenharmony_cistatic void avc_biwgt_8x8_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
376cabdff1aSopenharmony_ci                              int32_t log2_denom, int32_t src_weight,
377cabdff1aSopenharmony_ci                              int32_t dst_weight, int32_t offset_in)
378cabdff1aSopenharmony_ci{
379cabdff1aSopenharmony_ci    uint64_t tp0, tp1, tp2, tp3;
380cabdff1aSopenharmony_ci    v16i8 src_wgt, dst_wgt, wgt, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
381cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
382cabdff1aSopenharmony_ci    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom, offset;
383cabdff1aSopenharmony_ci
384cabdff1aSopenharmony_ci    offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
385cabdff1aSopenharmony_ci    offset_in += (128 * (src_weight + dst_weight));
386cabdff1aSopenharmony_ci
387cabdff1aSopenharmony_ci    src_wgt = __msa_fill_b(src_weight);
388cabdff1aSopenharmony_ci    dst_wgt = __msa_fill_b(dst_weight);
389cabdff1aSopenharmony_ci    offset = __msa_fill_h(offset_in);
390cabdff1aSopenharmony_ci    denom = __msa_fill_h(log2_denom + 1);
391cabdff1aSopenharmony_ci    wgt = __msa_ilvev_b(dst_wgt, src_wgt);
392cabdff1aSopenharmony_ci
393cabdff1aSopenharmony_ci    LD4(src, stride, tp0, tp1, tp2, tp3);
394cabdff1aSopenharmony_ci    INSERT_D2_UB(tp0, tp1, src0);
395cabdff1aSopenharmony_ci    INSERT_D2_UB(tp2, tp3, src1);
396cabdff1aSopenharmony_ci    LD4(src + 4 * stride, stride, tp0, tp1, tp2, tp3);
397cabdff1aSopenharmony_ci    INSERT_D2_UB(tp0, tp1, src2);
398cabdff1aSopenharmony_ci    INSERT_D2_UB(tp2, tp3, src3);
399cabdff1aSopenharmony_ci    LD4(dst, stride, tp0, tp1, tp2, tp3);
400cabdff1aSopenharmony_ci    INSERT_D2_UB(tp0, tp1, dst0);
401cabdff1aSopenharmony_ci    INSERT_D2_UB(tp2, tp3, dst1);
402cabdff1aSopenharmony_ci    LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
403cabdff1aSopenharmony_ci    INSERT_D2_UB(tp0, tp1, dst2);
404cabdff1aSopenharmony_ci    INSERT_D2_UB(tp2, tp3, dst3);
405cabdff1aSopenharmony_ci    XORI_B8_128_UB(src0, src1, src2, src3, dst0, dst1, dst2, dst3);
406cabdff1aSopenharmony_ci    ILVRL_B2_SB(dst0, src0, vec0, vec1);
407cabdff1aSopenharmony_ci    ILVRL_B2_SB(dst1, src1, vec2, vec3);
408cabdff1aSopenharmony_ci    ILVRL_B2_SB(dst2, src2, vec4, vec5);
409cabdff1aSopenharmony_ci    ILVRL_B2_SB(dst3, src3, vec6, vec7);
410cabdff1aSopenharmony_ci    tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
411cabdff1aSopenharmony_ci    tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
412cabdff1aSopenharmony_ci    tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
413cabdff1aSopenharmony_ci    tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
414cabdff1aSopenharmony_ci    tmp4 = __msa_dpadd_s_h(offset, wgt, vec4);
415cabdff1aSopenharmony_ci    tmp5 = __msa_dpadd_s_h(offset, wgt, vec5);
416cabdff1aSopenharmony_ci    tmp6 = __msa_dpadd_s_h(offset, wgt, vec6);
417cabdff1aSopenharmony_ci    tmp7 = __msa_dpadd_s_h(offset, wgt, vec7);
418cabdff1aSopenharmony_ci    SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
419cabdff1aSopenharmony_ci    SRA_4V(tmp4, tmp5, tmp6, tmp7, denom);
420cabdff1aSopenharmony_ci    CLIP_SH8_0_255(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
421cabdff1aSopenharmony_ci    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
422cabdff1aSopenharmony_ci    PCKEV_B2_UB(tmp5, tmp4, tmp7, tmp6, dst2, dst3);
423cabdff1aSopenharmony_ci    ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
424cabdff1aSopenharmony_ci}
425cabdff1aSopenharmony_ci
426cabdff1aSopenharmony_cistatic void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
427cabdff1aSopenharmony_ci                               int32_t log2_denom, int32_t src_weight,
428cabdff1aSopenharmony_ci                               int32_t dst_weight, int32_t offset_in)
429cabdff1aSopenharmony_ci{
430cabdff1aSopenharmony_ci    uint8_t cnt;
431cabdff1aSopenharmony_ci    uint64_t tp0, tp1, tp2, tp3;
432cabdff1aSopenharmony_ci    v16i8 src_wgt, dst_wgt, wgt;
433cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3;
434cabdff1aSopenharmony_ci    v16u8 dst0, dst1, dst2, dst3;
435cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
436cabdff1aSopenharmony_ci    v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
437cabdff1aSopenharmony_ci    v8i16 denom, offset;
438cabdff1aSopenharmony_ci
439cabdff1aSopenharmony_ci    offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
440cabdff1aSopenharmony_ci    offset_in += (128 * (src_weight + dst_weight));
441cabdff1aSopenharmony_ci
442cabdff1aSopenharmony_ci    src_wgt = __msa_fill_b(src_weight);
443cabdff1aSopenharmony_ci    dst_wgt = __msa_fill_b(dst_weight);
444cabdff1aSopenharmony_ci    offset = __msa_fill_h(offset_in);
445cabdff1aSopenharmony_ci    denom = __msa_fill_h(log2_denom + 1);
446cabdff1aSopenharmony_ci    wgt = __msa_ilvev_b(dst_wgt, src_wgt);
447cabdff1aSopenharmony_ci
448cabdff1aSopenharmony_ci    for (cnt = 2; cnt--;) {
449cabdff1aSopenharmony_ci        LD4(src, stride, tp0, tp1, tp2, tp3);
450cabdff1aSopenharmony_ci        src += 4 * stride;
451cabdff1aSopenharmony_ci        INSERT_D2_UB(tp0, tp1, src0);
452cabdff1aSopenharmony_ci        INSERT_D2_UB(tp2, tp3, src1);
453cabdff1aSopenharmony_ci        LD4(src, stride, tp0, tp1, tp2, tp3);
454cabdff1aSopenharmony_ci        src += 4 * stride;
455cabdff1aSopenharmony_ci        INSERT_D2_UB(tp0, tp1, src2);
456cabdff1aSopenharmony_ci        INSERT_D2_UB(tp2, tp3, src3);
457cabdff1aSopenharmony_ci        LD4(dst, stride, tp0, tp1, tp2, tp3);
458cabdff1aSopenharmony_ci        INSERT_D2_UB(tp0, tp1, dst0);
459cabdff1aSopenharmony_ci        INSERT_D2_UB(tp2, tp3, dst1);
460cabdff1aSopenharmony_ci        LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
461cabdff1aSopenharmony_ci        INSERT_D2_UB(tp0, tp1, dst2);
462cabdff1aSopenharmony_ci        INSERT_D2_UB(tp2, tp3, dst3);
463cabdff1aSopenharmony_ci        XORI_B4_128_UB(src0, src1, src2, src3);
464cabdff1aSopenharmony_ci        XORI_B4_128_UB(dst0, dst1, dst2, dst3);
465cabdff1aSopenharmony_ci        ILVR_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3,
466cabdff1aSopenharmony_ci                   vec0, vec2, vec4, vec6);
467cabdff1aSopenharmony_ci        ILVL_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3,
468cabdff1aSopenharmony_ci                   vec1, vec3, vec5, vec7);
469cabdff1aSopenharmony_ci
470cabdff1aSopenharmony_ci        temp0 = __msa_dpadd_s_h(offset, wgt, vec0);
471cabdff1aSopenharmony_ci        temp1 = __msa_dpadd_s_h(offset, wgt, vec1);
472cabdff1aSopenharmony_ci        temp2 = __msa_dpadd_s_h(offset, wgt, vec2);
473cabdff1aSopenharmony_ci        temp3 = __msa_dpadd_s_h(offset, wgt, vec3);
474cabdff1aSopenharmony_ci        temp4 = __msa_dpadd_s_h(offset, wgt, vec4);
475cabdff1aSopenharmony_ci        temp5 = __msa_dpadd_s_h(offset, wgt, vec5);
476cabdff1aSopenharmony_ci        temp6 = __msa_dpadd_s_h(offset, wgt, vec6);
477cabdff1aSopenharmony_ci        temp7 = __msa_dpadd_s_h(offset, wgt, vec7);
478cabdff1aSopenharmony_ci
479cabdff1aSopenharmony_ci        SRA_4V(temp0, temp1, temp2, temp3, denom);
480cabdff1aSopenharmony_ci        SRA_4V(temp4, temp5, temp6, temp7, denom);
481cabdff1aSopenharmony_ci        CLIP_SH8_0_255(temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7);
482cabdff1aSopenharmony_ci        PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6,
483cabdff1aSopenharmony_ci                    dst0, dst1, dst2, dst3);
484cabdff1aSopenharmony_ci        ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
485cabdff1aSopenharmony_ci        dst += 8 * stride;
486cabdff1aSopenharmony_ci    }
487cabdff1aSopenharmony_ci}
488cabdff1aSopenharmony_ci
/*
 * Three-sample update for one side of a luma edge, used by the intra edge
 * filters in this file.  All operands are vectors of 16-bit lanes.
 *
 * Names follow the p side; for the q side the roles are mirrored.  With the
 * arguments the callers in this file pass (p3, p0, q0, p1, p2, q1):
 *
 *     sum = p0 + q0 + p1
 *     p0' = (2 * sum + p2 + q1 + 4) >> 3
 *     p1' = (sum + p2 + 2) >> 2
 *     p2' = (3 * p2 + 2 * p3 + sum + 4) >> 3
 *
 * The "+ 4" / "+ 2" terms come from __msa_srari_h, which rounds before
 * shifting.
 *
 * The third parameter is named q3_or_p3_org_in, but the callers in this file
 * hand it the sample nearest the edge on the opposite side (q0 when
 * filtering the p side, p0 when filtering the q side).
 *
 * Callers pass the same variable for p2_or_q2_org_in and p2_or_q2_out, so
 * that output is only written after the other two results are complete.
 */
#define AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_or_q3_org_in, p0_or_q0_org_in,          \
                                 q3_or_p3_org_in, p1_or_q1_org_in,          \
                                 p2_or_q2_org_in, q1_or_p1_org_in,          \
                                 p0_or_q0_out, p1_or_q1_out, p2_or_q2_out)  \
{                                                                           \
    v8i16 coef3 = __msa_ldi_h(3);                                           \
    v8i16 edge_sum = (p0_or_q0_org_in) + (q3_or_p3_org_in);                 \
                                                                            \
    edge_sum = edge_sum + (p1_or_q1_org_in);                                \
                                                                            \
    (p0_or_q0_out) = edge_sum << 1;                                         \
    (p0_or_q0_out) = (p0_or_q0_out) + (p2_or_q2_org_in);                    \
    (p0_or_q0_out) = (p0_or_q0_out) + (q1_or_p1_org_in);                    \
    (p0_or_q0_out) = __msa_srari_h((p0_or_q0_out), 3);                      \
                                                                            \
    (p1_or_q1_out) = edge_sum + (p2_or_q2_org_in);                          \
    (p1_or_q1_out) = __msa_srari_h((p1_or_q1_out), 2);                      \
                                                                            \
    /* Written last: callers pass the same variable as input and output. */ \
    (p2_or_q2_out) = coef3 * (p2_or_q2_org_in);                             \
    (p2_or_q2_out) = (p2_or_q2_out) + (p3_or_q3_org_in);                    \
    (p2_or_q2_out) = (p2_or_q2_out) + (p3_or_q3_org_in);                    \
    (p2_or_q2_out) = (p2_or_q2_out) + edge_sum;                             \
    (p2_or_q2_out) = __msa_srari_h((p2_or_q2_out), 3);                      \
}
514cabdff1aSopenharmony_ci
/*
 * Single-sample update for the pixel next to the edge:
 *
 *     p0' = (p0 + q1 + 2 * p1 + 2) >> 2
 *
 * (and the mirrored expression for q0).  The rounding term is supplied by
 * __msa_srari_h.  The intra edge filters in this file use this result for
 * lanes that do not take the three-sample path.
 */
#define AVC_LPF_P0_OR_Q0(p0_or_q0_org_in, q1_or_p1_org_in,   \
                         p1_or_q1_org_in, p0_or_q0_out)      \
{                                                            \
    (p0_or_q0_out) = (p0_or_q0_org_in) + (q1_or_p1_org_in);  \
    (p0_or_q0_out) = (p0_or_q0_out) + (p1_or_q1_org_in);     \
    (p0_or_q0_out) = (p0_or_q0_out) + (p1_or_q1_org_in);     \
    (p0_or_q0_out) = __msa_srari_h((p0_or_q0_out), 2);       \
}
524cabdff1aSopenharmony_ci
/*
 * Update for the second sample from the edge (p1, or q1 when mirrored):
 *
 *     avg = (p0 + q0 + 1) >> 1                    (__msa_aver_u_h)
 *     d   = (p2 + (avg - 2 * p1)) >> 1            (__msa_ave_s_h)
 *     p1' = p1 + clip(d, negate_tc_in, tc_in)
 *
 * negate_tc_in and tc_in are the lower and upper clipping bounds.
 *
 * Every argument is parenthesized on use so that expression arguments
 * (e.g. "a + b") are shifted, cast and added as a whole, matching the other
 * AVC_LPF_* helpers above.
 */
#define AVC_LPF_P1_OR_Q1(p0_or_q0_org_in, q0_or_p0_org_in,      \
                         p1_or_q1_org_in, p2_or_q2_org_in,      \
                         negate_tc_in, tc_in, p1_or_q1_out)     \
{                                                               \
    v8i16 clip3, temp;                                          \
                                                                \
    clip3 = (v8i16) __msa_aver_u_h((v8u16) (p0_or_q0_org_in),   \
                                   (v8u16) (q0_or_p0_org_in));  \
    temp = (p1_or_q1_org_in) << 1;                              \
    clip3 = clip3 - temp;                                       \
    clip3 = __msa_ave_s_h((p2_or_q2_org_in), clip3);            \
    CLIP_SH(clip3, (negate_tc_in), (tc_in));                    \
    (p1_or_q1_out) = (p1_or_q1_org_in) + clip3;                 \
}
539cabdff1aSopenharmony_ci
/*
 * Joint update of the two samples adjacent to the edge:
 *
 *     delta = (4 * (q0 - p0) + (p1 - q1) + 4) >> 3
 *     delta = clip(delta, negate_threshold_in, threshold_in)
 *     p0'   = clip_0_255(p0 + delta)
 *     q0'   = clip_0_255(q0 - delta)
 *
 * Note the argument order: the first parameter is the sample that delta is
 * subtracted from (q0), the second the one it is added to (p0).
 *
 * Input arguments are parenthesized on use so that expression arguments
 * are subtracted/added as a whole, matching the other AVC_LPF_* helpers.
 */
#define AVC_LPF_P0Q0(q0_or_p0_org_in, p0_or_q0_org_in,          \
                     p1_or_q1_org_in, q1_or_p1_org_in,          \
                     negate_threshold_in, threshold_in,         \
                     p0_or_q0_out, q0_or_p0_out)                \
{                                                               \
    v8i16 q0_sub_p0, p1_sub_q1, delta;                          \
                                                                \
    q0_sub_p0 = (q0_or_p0_org_in) - (p0_or_q0_org_in);          \
    p1_sub_q1 = (p1_or_q1_org_in) - (q1_or_p1_org_in);          \
    q0_sub_p0 <<= 2;                                            \
    p1_sub_q1 += 4;                                             \
    delta = q0_sub_p0 + p1_sub_q1;                              \
    delta >>= 3;                                                \
                                                                \
    CLIP_SH(delta, (negate_threshold_in), (threshold_in));      \
                                                                \
    (p0_or_q0_out) = (p0_or_q0_org_in) + delta;                 \
    (q0_or_p0_out) = (q0_or_p0_org_in) - delta;                 \
                                                                \
    CLIP_SH2_0_255(p0_or_q0_out, q0_or_p0_out);                 \
}
561cabdff1aSopenharmony_ci
/*
 * Filter four rows of a vertical chroma edge without storing anything.
 *
 * One 32-bit word (the bytes p1 p0 q0 q1) is read from src - 2 on each of
 * four consecutive rows, the rows are transposed into per-sample vectors,
 * and for every lane with
 *
 *     |p0 - q0| < alpha  &&  |p1 - p0| < beta  &&  |q1 - q0| < beta
 *
 * the edge samples are replaced by
 *
 *     delta = clip((4 * (q0 - p0) + (p1 - q1) + 4) >> 3, -tc_val, tc_val)
 *     p0'   = clip_0_255(p0 + delta)
 *     q0'   = clip_0_255(q0 - delta)
 *
 * Lanes failing the test keep their original p0/q0.  The result is left in
 * res as interleaved (p0', q0') byte pairs; writing it back is up to the
 * caller.
 */
#define AVC_LPF_H_CHROMA_422(src, stride, tc_val, alpha, beta, res)      \
{                                                                        \
    uint32_t word0, word1, word2, word3;                                 \
    /* Rows 0..3 until the transpose, then the p1/p0/q0/q1 columns. */   \
    v16u8 v_p1 = { 0 };                                                  \
    v16u8 v_p0 = { 0 };                                                  \
    v16u8 v_q0 = { 0 };                                                  \
    v16u8 v_q1 = { 0 };                                                  \
    v16u8 lt_alpha, lt_beta_p, lt_beta_q, filter_mask;                   \
    v16u8 p0_new, q0_new;                                                \
    v8i16 tc_vec, diff_q0_p0, diff_p1_q1, delta;                         \
    v8i16 p0_wide, q0_wide;                                              \
    v16i8 zero_vec = { 0 };                                              \
                                                                         \
    LW4((src - 2), stride, word0, word1, word2, word3);                  \
    v_p1 = (v16u8) __msa_insert_w((v4i32) v_p1, 0, word0);               \
    v_p0 = (v16u8) __msa_insert_w((v4i32) v_p0, 0, word1);               \
    v_q0 = (v16u8) __msa_insert_w((v4i32) v_q0, 0, word2);               \
    v_q1 = (v16u8) __msa_insert_w((v4i32) v_q1, 0, word3);               \
                                                                         \
    TRANSPOSE4x4_UB_UB(v_p1, v_p0, v_q0, v_q1, v_p1, v_p0, v_q0, v_q1);  \
                                                                         \
    /* Per-lane decision whether the edge is filtered at all. */         \
    lt_alpha = (__msa_asub_u_b(v_q0, v_p0) < alpha);                     \
    lt_beta_p = (__msa_asub_u_b(v_p0, v_p1) < beta);                     \
    lt_beta_q = (__msa_asub_u_b(v_q0, v_q1) < beta);                     \
    filter_mask = lt_alpha & lt_beta_p & lt_beta_q;                      \
                                                                         \
    /* Widen q0 - p0 and p1 - q1 to 16 bits. */                          \
    ILVR_B2_SH(v_q0, v_p0, v_p1, v_q1, diff_q0_p0, diff_p1_q1);          \
    HSUB_UB2_SH(diff_q0_p0, diff_p1_q1, diff_q0_p0, diff_p1_q1);         \
                                                                         \
    delta = __msa_srari_h((diff_q0_p0 << 2) + diff_p1_q1, 3);            \
    tc_vec = __msa_fill_h(tc_val);                                       \
    CLIP_SH(delta, -tc_vec, tc_vec);                                     \
                                                                         \
    ILVR_B2_SH(zero_vec, v_p0, zero_vec, v_q0, p0_wide, q0_wide);        \
    p0_wide += delta;                                                    \
    q0_wide -= delta;                                                    \
    CLIP_SH2_0_255(p0_wide, q0_wide);                                    \
    PCKEV_B2_UB(p0_wide, p0_wide, q0_wide, q0_wide, p0_new, q0_new);     \
                                                                         \
    /* Keep the original bytes in lanes that fail the threshold test. */ \
    p0_new = __msa_bmnz_v(v_p0, p0_new, filter_mask);                    \
    q0_new = __msa_bmnz_v(v_q0, q0_new, filter_mask);                    \
                                                                         \
    res = (v16u8) __msa_ilvr_b((v16i8) q0_new, (v16i8) p0_new);          \
}
618cabdff1aSopenharmony_ci
/*
 * Transpose the first four bytes of two rows (in0, in1) into four vectors.
 *
 * out0 receives the byte-interleave of the two rows, so its two lowest
 * bytes are (in0[0], in1[0]).  out1 is out0 slid down by two bytes with
 * zero fill, and out2/out3 are produced from out1/out2 by SLDI_B2_UB with
 * the same slide amount (presumably one further two-byte slide each; the
 * macro is defined in generic_macros_msa.h).
 *
 * out0/out1 may be the same variables as in0/in1: each input is read
 * before the corresponding output is written.
 */
#define TRANSPOSE2x4_B_UB(in0, in1, out0, out1, out2, out3)  \
{                                                            \
    v16i8 zvec = { 0 };                                      \
                                                             \
    out0 = (v16u8) __msa_ilvr_b((v16i8) in1, (v16i8) in0);   \
    out1 = (v16u8) __msa_sldi_b(zvec, (v16i8) out0, 2);      \
    SLDI_B2_UB(zvec, out1, zvec, out2, 2, out2, out3);       \
}
627cabdff1aSopenharmony_ci
/*
 * Two-row variant of AVC_LPF_H_CHROMA_422.
 *
 * One 32-bit word (the bytes p1 p0 q0 q1) is read from src - 2 on each of
 * two consecutive rows and transposed with TRANSPOSE2x4_B_UB.  For every
 * lane with
 *
 *     |p0 - q0| < alpha  &&  |p1 - p0| < beta  &&  |q1 - q0| < beta
 *
 * the edge samples are replaced by
 *
 *     delta = clip((4 * (q0 - p0) + (p1 - q1) + 4) >> 3, -tc_val, tc_val)
 *     p0'   = clip_0_255(p0 + delta)
 *     q0'   = clip_0_255(q0 - delta)
 *
 * Lanes failing the test keep their original p0/q0.  The result is left in
 * res as interleaved (p0', q0') byte pairs; nothing is stored here.
 */
#define AVC_LPF_H_2BYTE_CHROMA_422(src, stride, tc_val, alpha, beta, res)  \
{                                                                          \
    uint32_t word0, word1;                                                 \
    /* v_p1/v_p0 carry rows 0/1 until the transpose fills all four. */     \
    v16u8 v_p1 = { 0 };                                                    \
    v16u8 v_p0 = { 0 };                                                    \
    v16u8 v_q0 = { 0 };                                                    \
    v16u8 v_q1 = { 0 };                                                    \
    v16u8 lt_alpha, lt_beta_p, lt_beta_q, filter_mask;                     \
    v16u8 p0_new, q0_new;                                                  \
    v8i16 tc_vec, diff_q0_p0, diff_p1_q1, delta;                           \
    v8i16 p0_wide, q0_wide;                                                \
    v16i8 zero_vec = { 0 };                                                \
                                                                           \
    word0 = LW(src - 2);                                                   \
    word1 = LW(src - 2 + stride);                                          \
                                                                           \
    v_p1 = (v16u8) __msa_insert_w((v4i32) v_p1, 0, word0);                 \
    v_p0 = (v16u8) __msa_insert_w((v4i32) v_p0, 0, word1);                 \
                                                                           \
    TRANSPOSE2x4_B_UB(v_p1, v_p0, v_p1, v_p0, v_q0, v_q1);                 \
                                                                           \
    /* Per-lane decision whether the edge is filtered at all. */           \
    lt_alpha = (__msa_asub_u_b(v_q0, v_p0) < alpha);                       \
    lt_beta_p = (__msa_asub_u_b(v_p0, v_p1) < beta);                       \
    lt_beta_q = (__msa_asub_u_b(v_q0, v_q1) < beta);                       \
    filter_mask = lt_alpha & lt_beta_p & lt_beta_q;                        \
                                                                           \
    /* Widen q0 - p0 and p1 - q1 to 16 bits. */                            \
    ILVR_B2_SH(v_q0, v_p0, v_p1, v_q1, diff_q0_p0, diff_p1_q1);            \
    HSUB_UB2_SH(diff_q0_p0, diff_p1_q1, diff_q0_p0, diff_p1_q1);           \
                                                                           \
    delta = __msa_srari_h((diff_q0_p0 << 2) + diff_p1_q1, 3);              \
    tc_vec = __msa_fill_h(tc_val);                                         \
    CLIP_SH(delta, -tc_vec, tc_vec);                                       \
                                                                           \
    ILVR_B2_SH(zero_vec, v_p0, zero_vec, v_q0, p0_wide, q0_wide);          \
    p0_wide += delta;                                                      \
    q0_wide -= delta;                                                      \
    CLIP_SH2_0_255(p0_wide, q0_wide);                                      \
    PCKEV_B2_UB(p0_wide, p0_wide, q0_wide, q0_wide, p0_new, q0_new);       \
                                                                           \
    /* Keep the original bytes in lanes that fail the threshold test. */   \
    p0_new = __msa_bmnz_v(v_p0, p0_new, filter_mask);                      \
    q0_new = __msa_bmnz_v(v_q0, q0_new, filter_mask);                      \
                                                                           \
    res = (v16u8) __msa_ilvr_b((v16i8) q0_new, (v16i8) p0_new);            \
}
682cabdff1aSopenharmony_ci
683cabdff1aSopenharmony_cistatic void avc_loopfilter_luma_intra_edge_hor_msa(uint8_t *data,
684cabdff1aSopenharmony_ci                                                   uint8_t alpha_in,
685cabdff1aSopenharmony_ci                                                   uint8_t beta_in,
686cabdff1aSopenharmony_ci                                                   ptrdiff_t img_width)
687cabdff1aSopenharmony_ci{
688cabdff1aSopenharmony_ci    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
689cabdff1aSopenharmony_ci    v16u8 is_less_than, is_less_than_beta, is_less_than_alpha;
690cabdff1aSopenharmony_ci    v16u8 p1_org, p0_org, q0_org, q1_org;
691cabdff1aSopenharmony_ci
692cabdff1aSopenharmony_ci    LD_UB4(data - (img_width << 1), img_width, p1_org, p0_org, q0_org, q1_org);
693cabdff1aSopenharmony_ci
694cabdff1aSopenharmony_ci    p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
695cabdff1aSopenharmony_ci    p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
696cabdff1aSopenharmony_ci    q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
697cabdff1aSopenharmony_ci
698cabdff1aSopenharmony_ci    is_less_than_alpha = (p0_asub_q0 < alpha_in);
699cabdff1aSopenharmony_ci    is_less_than_beta = (p1_asub_p0 < beta_in);
700cabdff1aSopenharmony_ci    is_less_than = is_less_than_beta & is_less_than_alpha;
701cabdff1aSopenharmony_ci    is_less_than_beta = (q1_asub_q0 < beta_in);
702cabdff1aSopenharmony_ci    is_less_than = is_less_than_beta & is_less_than;
703cabdff1aSopenharmony_ci
704cabdff1aSopenharmony_ci    if (!__msa_test_bz_v(is_less_than)) {
705cabdff1aSopenharmony_ci        v16u8 p2_asub_p0, q2_asub_q0, p0, q0, negate_is_less_than_beta;
706cabdff1aSopenharmony_ci        v8i16 p0_r = { 0 };
707cabdff1aSopenharmony_ci        v8i16 q0_r = { 0 };
708cabdff1aSopenharmony_ci        v8i16 p0_l = { 0 };
709cabdff1aSopenharmony_ci        v8i16 q0_l = { 0 };
710cabdff1aSopenharmony_ci        v16i8 zero = { 0 };
711cabdff1aSopenharmony_ci        v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
712cabdff1aSopenharmony_ci        v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
713cabdff1aSopenharmony_ci        v16u8 q2_org = LD_UB(data + (2 * img_width));
714cabdff1aSopenharmony_ci        v16u8 p2_org = LD_UB(data - (3 * img_width));
715cabdff1aSopenharmony_ci        v16u8 tmp_flag = (v16u8)__msa_fill_b((alpha_in >> 2) + 2);
716cabdff1aSopenharmony_ci
717cabdff1aSopenharmony_ci        UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l);
718cabdff1aSopenharmony_ci        UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
719cabdff1aSopenharmony_ci        UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);
720cabdff1aSopenharmony_ci
721cabdff1aSopenharmony_ci        tmp_flag = (p0_asub_q0 < tmp_flag);
722cabdff1aSopenharmony_ci
723cabdff1aSopenharmony_ci        p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
724cabdff1aSopenharmony_ci        is_less_than_beta = (p2_asub_p0 < beta_in);
725cabdff1aSopenharmony_ci        is_less_than_beta = is_less_than_beta & tmp_flag;
726cabdff1aSopenharmony_ci        negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
727cabdff1aSopenharmony_ci        is_less_than_beta = is_less_than_beta & is_less_than;
728cabdff1aSopenharmony_ci        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
729cabdff1aSopenharmony_ci
730cabdff1aSopenharmony_ci        q1_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q1_org);
731cabdff1aSopenharmony_ci        q1_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q1_org);
732cabdff1aSopenharmony_ci
733cabdff1aSopenharmony_ci        /* combine and store */
734cabdff1aSopenharmony_ci        if (!__msa_test_bz_v(is_less_than_beta)) {
735cabdff1aSopenharmony_ci            v8i16 p3_org_l, p3_org_r;
736cabdff1aSopenharmony_ci            v16u8 p3_org = LD_UB(data - (img_width << 2));
737cabdff1aSopenharmony_ci            v16u8 p2, p1;
738cabdff1aSopenharmony_ci            v8i16 p2_r = { 0 };
739cabdff1aSopenharmony_ci            v8i16 p2_l = { 0 };
740cabdff1aSopenharmony_ci            v8i16 p1_r = { 0 };
741cabdff1aSopenharmony_ci            v8i16 p1_l = { 0 };
742cabdff1aSopenharmony_ci
743cabdff1aSopenharmony_ci            ILVR_B2_SH(zero, p3_org, zero, p2_org, p3_org_r, p2_r);
744cabdff1aSopenharmony_ci            AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_r, p0_org_r, q0_org_r, p1_org_r,
745cabdff1aSopenharmony_ci                                     p2_r, q1_org_r, p0_r, p1_r, p2_r);
746cabdff1aSopenharmony_ci
747cabdff1aSopenharmony_ci            ILVL_B2_SH(zero, p3_org, zero, p2_org, p3_org_l, p2_l);
748cabdff1aSopenharmony_ci            AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_l, p0_org_l, q0_org_l, p1_org_l,
749cabdff1aSopenharmony_ci                                     p2_l, q1_org_l, p0_l, p1_l, p2_l);
750cabdff1aSopenharmony_ci
751cabdff1aSopenharmony_ci            PCKEV_B3_UB(p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p0, p1, p2);
752cabdff1aSopenharmony_ci
753cabdff1aSopenharmony_ci            p0_org = __msa_bmnz_v(p0_org, p0, is_less_than_beta);
754cabdff1aSopenharmony_ci            p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
755cabdff1aSopenharmony_ci            p2_org = __msa_bmnz_v(p2_org, p2, is_less_than_beta);
756cabdff1aSopenharmony_ci
757cabdff1aSopenharmony_ci            ST_UB(p1_org, data - (2 * img_width));
758cabdff1aSopenharmony_ci            ST_UB(p2_org, data - (3 * img_width));
759cabdff1aSopenharmony_ci        }
760cabdff1aSopenharmony_ci
761cabdff1aSopenharmony_ci        AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_r);
762cabdff1aSopenharmony_ci        AVC_LPF_P0_OR_Q0(p0_org_l, q1_org_l, p1_org_l, p0_l);
763cabdff1aSopenharmony_ci
764cabdff1aSopenharmony_ci        /* combine */
765cabdff1aSopenharmony_ci        p0 = (v16u8) __msa_pckev_b((v16i8) p0_l, (v16i8) p0_r);
766cabdff1aSopenharmony_ci        p0_org = __msa_bmnz_v(p0_org, p0, negate_is_less_than_beta);
767cabdff1aSopenharmony_ci
768cabdff1aSopenharmony_ci        ST_UB(p0_org, data - img_width);
769cabdff1aSopenharmony_ci
770cabdff1aSopenharmony_ci        /* if (tmpFlag && (unsigned)ABS(q2-q0) < thresholds->beta_in) */
771cabdff1aSopenharmony_ci        q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
772cabdff1aSopenharmony_ci        is_less_than_beta = (q2_asub_q0 < beta_in);
773cabdff1aSopenharmony_ci        is_less_than_beta = is_less_than_beta & tmp_flag;
774cabdff1aSopenharmony_ci        negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
775cabdff1aSopenharmony_ci        is_less_than_beta = is_less_than_beta & is_less_than;
776cabdff1aSopenharmony_ci        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
777cabdff1aSopenharmony_ci
778cabdff1aSopenharmony_ci        /* combine and store */
779cabdff1aSopenharmony_ci        if (!__msa_test_bz_v(is_less_than_beta)) {
780cabdff1aSopenharmony_ci            v8i16 q3_org_r, q3_org_l;
781cabdff1aSopenharmony_ci            v16u8 q3_org = LD_UB(data + (3 * img_width));
782cabdff1aSopenharmony_ci            v16u8 q1, q2;
783cabdff1aSopenharmony_ci            v8i16 q2_r = { 0 };
784cabdff1aSopenharmony_ci            v8i16 q2_l = { 0 };
785cabdff1aSopenharmony_ci            v8i16 q1_r = { 0 };
786cabdff1aSopenharmony_ci            v8i16 q1_l = { 0 };
787cabdff1aSopenharmony_ci
788cabdff1aSopenharmony_ci            ILVR_B2_SH(zero, q3_org, zero, q2_org, q3_org_r, q2_r);
789cabdff1aSopenharmony_ci            AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_r, q0_org_r, p0_org_r, q1_org_r,
790cabdff1aSopenharmony_ci                                     q2_r, p1_org_r, q0_r, q1_r, q2_r);
791cabdff1aSopenharmony_ci
792cabdff1aSopenharmony_ci            ILVL_B2_SH(zero, q3_org, zero, q2_org, q3_org_l, q2_l);
793cabdff1aSopenharmony_ci            AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_l, q0_org_l, p0_org_l, q1_org_l,
794cabdff1aSopenharmony_ci                                     q2_l, p1_org_l, q0_l, q1_l, q2_l);
795cabdff1aSopenharmony_ci
796cabdff1aSopenharmony_ci            PCKEV_B3_UB(q0_l, q0_r, q1_l, q1_r, q2_l, q2_r, q0, q1, q2);
797cabdff1aSopenharmony_ci            q0_org = __msa_bmnz_v(q0_org, q0, is_less_than_beta);
798cabdff1aSopenharmony_ci            q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
799cabdff1aSopenharmony_ci            q2_org = __msa_bmnz_v(q2_org, q2, is_less_than_beta);
800cabdff1aSopenharmony_ci
801cabdff1aSopenharmony_ci            ST_UB(q1_org, data + img_width);
802cabdff1aSopenharmony_ci            ST_UB(q2_org, data + 2 * img_width);
803cabdff1aSopenharmony_ci        }
804cabdff1aSopenharmony_ci
805cabdff1aSopenharmony_ci        AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_r);
806cabdff1aSopenharmony_ci        AVC_LPF_P0_OR_Q0(q0_org_l, p1_org_l, q1_org_l, q0_l);
807cabdff1aSopenharmony_ci
808cabdff1aSopenharmony_ci        /* combine */
809cabdff1aSopenharmony_ci        q0 = (v16u8) __msa_pckev_b((v16i8) q0_l, (v16i8) q0_r);
810cabdff1aSopenharmony_ci        q0_org = __msa_bmnz_v(q0_org, q0, negate_is_less_than_beta);
811cabdff1aSopenharmony_ci
812cabdff1aSopenharmony_ci        ST_UB(q0_org, data);
813cabdff1aSopenharmony_ci    }
814cabdff1aSopenharmony_ci}
815cabdff1aSopenharmony_ci
816cabdff1aSopenharmony_cistatic void avc_loopfilter_luma_intra_edge_ver_msa(uint8_t *data,
817cabdff1aSopenharmony_ci                                                   uint8_t alpha_in,
818cabdff1aSopenharmony_ci                                                   uint8_t beta_in,
819cabdff1aSopenharmony_ci                                                   ptrdiff_t img_width)
820cabdff1aSopenharmony_ci{
821cabdff1aSopenharmony_ci    uint8_t *src = data - 4;
822cabdff1aSopenharmony_ci    v16u8 alpha, beta, p0_asub_q0;
823cabdff1aSopenharmony_ci    v16u8 is_less_than_alpha, is_less_than, is_less_than_beta;
824cabdff1aSopenharmony_ci    v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
825cabdff1aSopenharmony_ci    v16u8 p1_asub_p0, q1_asub_q0;
826cabdff1aSopenharmony_ci
827cabdff1aSopenharmony_ci
828cabdff1aSopenharmony_ci    {
829cabdff1aSopenharmony_ci        v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
830cabdff1aSopenharmony_ci        v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
831cabdff1aSopenharmony_ci
832cabdff1aSopenharmony_ci        LD_UB8(src, img_width, row0, row1, row2, row3, row4, row5, row6, row7);
833cabdff1aSopenharmony_ci        LD_UB8(src + (8 * img_width), img_width,
834cabdff1aSopenharmony_ci               row8, row9, row10, row11, row12, row13, row14, row15);
835cabdff1aSopenharmony_ci
836cabdff1aSopenharmony_ci        TRANSPOSE16x8_UB_UB(row0, row1, row2, row3,
837cabdff1aSopenharmony_ci                            row4, row5, row6, row7,
838cabdff1aSopenharmony_ci                            row8, row9, row10, row11,
839cabdff1aSopenharmony_ci                            row12, row13, row14, row15,
840cabdff1aSopenharmony_ci                            p3_org, p2_org, p1_org, p0_org,
841cabdff1aSopenharmony_ci                            q0_org, q1_org, q2_org, q3_org);
842cabdff1aSopenharmony_ci    }
843cabdff1aSopenharmony_ci
844cabdff1aSopenharmony_ci    p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
845cabdff1aSopenharmony_ci    p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
846cabdff1aSopenharmony_ci    q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
847cabdff1aSopenharmony_ci
848cabdff1aSopenharmony_ci    alpha = (v16u8) __msa_fill_b(alpha_in);
849cabdff1aSopenharmony_ci    beta = (v16u8) __msa_fill_b(beta_in);
850cabdff1aSopenharmony_ci
851cabdff1aSopenharmony_ci    is_less_than_alpha = (p0_asub_q0 < alpha);
852cabdff1aSopenharmony_ci    is_less_than_beta = (p1_asub_p0 < beta);
853cabdff1aSopenharmony_ci    is_less_than = is_less_than_beta & is_less_than_alpha;
854cabdff1aSopenharmony_ci    is_less_than_beta = (q1_asub_q0 < beta);
855cabdff1aSopenharmony_ci    is_less_than = is_less_than_beta & is_less_than;
856cabdff1aSopenharmony_ci
857cabdff1aSopenharmony_ci    if (!__msa_test_bz_v(is_less_than)) {
858cabdff1aSopenharmony_ci        v8i16 p0_r = { 0 };
859cabdff1aSopenharmony_ci        v8i16 q0_r = { 0 };
860cabdff1aSopenharmony_ci        v8i16 p0_l = { 0 };
861cabdff1aSopenharmony_ci        v8i16 q0_l = { 0 };
862cabdff1aSopenharmony_ci        v16i8 zero = { 0 };
863cabdff1aSopenharmony_ci        v16u8 tmp_flag, p0, q0, p2_asub_p0, q2_asub_q0;
864cabdff1aSopenharmony_ci        v16u8 negate_is_less_than_beta;
865cabdff1aSopenharmony_ci        v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
866cabdff1aSopenharmony_ci        v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
867cabdff1aSopenharmony_ci
868cabdff1aSopenharmony_ci        UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l);
869cabdff1aSopenharmony_ci        UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
870cabdff1aSopenharmony_ci        UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);
871cabdff1aSopenharmony_ci        UNPCK_UB_SH(q1_org, q1_org_r, q1_org_l);
872cabdff1aSopenharmony_ci
873cabdff1aSopenharmony_ci        tmp_flag = alpha >> 2;
874cabdff1aSopenharmony_ci        tmp_flag = tmp_flag + 2;
875cabdff1aSopenharmony_ci        tmp_flag = (p0_asub_q0 < tmp_flag);
876cabdff1aSopenharmony_ci
877cabdff1aSopenharmony_ci        p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
878cabdff1aSopenharmony_ci        is_less_than_beta = (p2_asub_p0 < beta);
879cabdff1aSopenharmony_ci        is_less_than_beta = tmp_flag & is_less_than_beta;
880cabdff1aSopenharmony_ci        negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
881cabdff1aSopenharmony_ci        is_less_than_beta = is_less_than_beta & is_less_than;
882cabdff1aSopenharmony_ci        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
883cabdff1aSopenharmony_ci
884cabdff1aSopenharmony_ci        if (!__msa_test_bz_v(is_less_than_beta)) {
885cabdff1aSopenharmony_ci            v16u8 p2, p1;
886cabdff1aSopenharmony_ci            v8i16 p3_org_r, p3_org_l;
887cabdff1aSopenharmony_ci            v8i16 p2_l = { 0 };
888cabdff1aSopenharmony_ci            v8i16 p2_r = { 0 };
889cabdff1aSopenharmony_ci            v8i16 p1_l = { 0 };
890cabdff1aSopenharmony_ci            v8i16 p1_r = { 0 };
891cabdff1aSopenharmony_ci
892cabdff1aSopenharmony_ci            ILVR_B2_SH(zero, p3_org, zero, p2_org, p3_org_r, p2_r);
893cabdff1aSopenharmony_ci            AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_r, p0_org_r, q0_org_r, p1_org_r,
894cabdff1aSopenharmony_ci                                     p2_r, q1_org_r, p0_r, p1_r, p2_r);
895cabdff1aSopenharmony_ci
896cabdff1aSopenharmony_ci            ILVL_B2_SH(zero, p3_org, zero, p2_org, p3_org_l, p2_l);
897cabdff1aSopenharmony_ci            AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_l, p0_org_l, q0_org_l, p1_org_l,
898cabdff1aSopenharmony_ci                                         p2_l, q1_org_l, p0_l, p1_l, p2_l);
899cabdff1aSopenharmony_ci
900cabdff1aSopenharmony_ci            PCKEV_B3_UB(p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p0, p1, p2);
901cabdff1aSopenharmony_ci            p0_org = __msa_bmnz_v(p0_org, p0, is_less_than_beta);
902cabdff1aSopenharmony_ci            p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
903cabdff1aSopenharmony_ci            p2_org = __msa_bmnz_v(p2_org, p2, is_less_than_beta);
904cabdff1aSopenharmony_ci        }
905cabdff1aSopenharmony_ci
906cabdff1aSopenharmony_ci        AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_r);
907cabdff1aSopenharmony_ci        AVC_LPF_P0_OR_Q0(p0_org_l, q1_org_l, p1_org_l, p0_l);
908cabdff1aSopenharmony_ci
909cabdff1aSopenharmony_ci        p0 = (v16u8) __msa_pckev_b((v16i8) p0_l, (v16i8) p0_r);
910cabdff1aSopenharmony_ci        p0_org = __msa_bmnz_v(p0_org, p0, negate_is_less_than_beta);
911cabdff1aSopenharmony_ci
912cabdff1aSopenharmony_ci        q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
913cabdff1aSopenharmony_ci        is_less_than_beta = (q2_asub_q0 < beta);
914cabdff1aSopenharmony_ci
915cabdff1aSopenharmony_ci        is_less_than_beta = is_less_than_beta & tmp_flag;
916cabdff1aSopenharmony_ci        negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
917cabdff1aSopenharmony_ci
918cabdff1aSopenharmony_ci        is_less_than_beta = is_less_than_beta & is_less_than;
919cabdff1aSopenharmony_ci        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
920cabdff1aSopenharmony_ci
921cabdff1aSopenharmony_ci        if (!__msa_test_bz_v(is_less_than_beta)) {
922cabdff1aSopenharmony_ci            v16u8 q1, q2;
923cabdff1aSopenharmony_ci            v8i16 q3_org_r, q3_org_l;
924cabdff1aSopenharmony_ci            v8i16 q1_l = { 0 };
925cabdff1aSopenharmony_ci            v8i16 q1_r = { 0 };
926cabdff1aSopenharmony_ci            v8i16 q2_l = { 0 };
927cabdff1aSopenharmony_ci            v8i16 q2_r = { 0 };
928cabdff1aSopenharmony_ci
929cabdff1aSopenharmony_ci            ILVR_B2_SH(zero, q3_org, zero, q2_org, q3_org_r, q2_r);
930cabdff1aSopenharmony_ci            AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_r, q0_org_r, p0_org_r, q1_org_r,
931cabdff1aSopenharmony_ci                                     q2_r, p1_org_r, q0_r, q1_r, q2_r);
932cabdff1aSopenharmony_ci
933cabdff1aSopenharmony_ci            ILVL_B2_SH(zero, q3_org, zero, q2_org, q3_org_l, q2_l);
934cabdff1aSopenharmony_ci            AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_l, q0_org_l, p0_org_l, q1_org_l,
935cabdff1aSopenharmony_ci                                     q2_l, p1_org_l, q0_l, q1_l, q2_l);
936cabdff1aSopenharmony_ci
937cabdff1aSopenharmony_ci            PCKEV_B3_UB(q0_l, q0_r, q1_l, q1_r, q2_l, q2_r, q0, q1, q2);
938cabdff1aSopenharmony_ci            q0_org = __msa_bmnz_v(q0_org, q0, is_less_than_beta);
939cabdff1aSopenharmony_ci            q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
940cabdff1aSopenharmony_ci            q2_org = __msa_bmnz_v(q2_org, q2, is_less_than_beta);
941cabdff1aSopenharmony_ci        }
942cabdff1aSopenharmony_ci
943cabdff1aSopenharmony_ci        AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_r);
944cabdff1aSopenharmony_ci        AVC_LPF_P0_OR_Q0(q0_org_l, p1_org_l, q1_org_l, q0_l);
945cabdff1aSopenharmony_ci
946cabdff1aSopenharmony_ci        q0 = (v16u8) __msa_pckev_b((v16i8) q0_l, (v16i8) q0_r);
947cabdff1aSopenharmony_ci        q0_org = __msa_bmnz_v(q0_org, q0, negate_is_less_than_beta);
948cabdff1aSopenharmony_ci
949cabdff1aSopenharmony_ci    {
950cabdff1aSopenharmony_ci        v8i16 tp0, tp1, tp2, tp3, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
951cabdff1aSopenharmony_ci
952cabdff1aSopenharmony_ci        ILVRL_B2_SH(p1_org, p2_org, tp0, tp2);
953cabdff1aSopenharmony_ci        ILVRL_B2_SH(q0_org, p0_org, tp1, tp3);
954cabdff1aSopenharmony_ci        ILVRL_B2_SH(q2_org, q1_org, tmp2, tmp5);
955cabdff1aSopenharmony_ci
956cabdff1aSopenharmony_ci        ILVRL_H2_SH(tp1, tp0, tmp3, tmp4);
957cabdff1aSopenharmony_ci        ILVRL_H2_SH(tp3, tp2, tmp6, tmp7);
958cabdff1aSopenharmony_ci
959cabdff1aSopenharmony_ci        src = data - 3;
960cabdff1aSopenharmony_ci        ST_W4(tmp3, 0, 1, 2, 3, src, img_width);
961cabdff1aSopenharmony_ci        ST_H4(tmp2, 0, 1, 2, 3, src + 4, img_width);
962cabdff1aSopenharmony_ci        src += 4 * img_width;
963cabdff1aSopenharmony_ci        ST_W4(tmp4, 0, 1, 2, 3, src, img_width);
964cabdff1aSopenharmony_ci        ST_H4(tmp2, 4, 5, 6, 7, src + 4, img_width);
965cabdff1aSopenharmony_ci        src += 4 * img_width;
966cabdff1aSopenharmony_ci
967cabdff1aSopenharmony_ci        ST_W4(tmp6, 0, 1, 2, 3, src, img_width);
968cabdff1aSopenharmony_ci        ST_H4(tmp5, 0, 1, 2, 3, src + 4, img_width);
969cabdff1aSopenharmony_ci        src += 4 * img_width;
970cabdff1aSopenharmony_ci        ST_W4(tmp7, 0, 1, 2, 3, src, img_width);
971cabdff1aSopenharmony_ci        ST_H4(tmp5, 4, 5, 6, 7, src + 4, img_width);
972cabdff1aSopenharmony_ci    }
973cabdff1aSopenharmony_ci    }
974cabdff1aSopenharmony_ci}
975cabdff1aSopenharmony_ci
976cabdff1aSopenharmony_cistatic void avc_h_loop_filter_luma_mbaff_intra_msa(uint8_t *src,
977cabdff1aSopenharmony_ci                                                   ptrdiff_t stride,
978cabdff1aSopenharmony_ci                                                   int32_t alpha_in,
979cabdff1aSopenharmony_ci                                                   int32_t beta_in)
980cabdff1aSopenharmony_ci{
981cabdff1aSopenharmony_ci    uint64_t load0, load1;
982cabdff1aSopenharmony_ci    uint32_t out0, out2;
983cabdff1aSopenharmony_ci    uint16_t out1, out3;
984cabdff1aSopenharmony_ci    v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
985cabdff1aSopenharmony_ci    v8u16 dst0_r, dst1_r, dst4_r, dst5_r;
986cabdff1aSopenharmony_ci    v8u16 dst2_x_r, dst2_y_r, dst3_x_r, dst3_y_r;
987cabdff1aSopenharmony_ci    v16u8 dst0, dst1, dst4, dst5, dst2_x, dst2_y, dst3_x, dst3_y;
988cabdff1aSopenharmony_ci    v8i16 tmp0, tmp1, tmp2, tmp3;
989cabdff1aSopenharmony_ci    v16u8 alpha, beta;
990cabdff1aSopenharmony_ci    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, p2_asub_p0, q2_asub_q0;
991cabdff1aSopenharmony_ci    v16u8 is_less_than, is_less_than_alpha, is_less_than_beta;
992cabdff1aSopenharmony_ci    v16u8 is_less_than_beta1, is_less_than_beta2;
993cabdff1aSopenharmony_ci    v16i8 src0 = { 0 };
994cabdff1aSopenharmony_ci    v16i8 src1 = { 0 };
995cabdff1aSopenharmony_ci    v16i8 src2 = { 0 };
996cabdff1aSopenharmony_ci    v16i8 src3 = { 0 };
997cabdff1aSopenharmony_ci    v16i8 src4 = { 0 };
998cabdff1aSopenharmony_ci    v16i8 src5 = { 0 };
999cabdff1aSopenharmony_ci    v16i8 src6 = { 0 };
1000cabdff1aSopenharmony_ci    v16i8 src7 = { 0 };
1001cabdff1aSopenharmony_ci    v16i8 zeros = { 0 };
1002cabdff1aSopenharmony_ci
1003cabdff1aSopenharmony_ci    load0 = LD(src - 4);
1004cabdff1aSopenharmony_ci    load1 = LD(src + stride - 4);
1005cabdff1aSopenharmony_ci    src0 = (v16i8) __msa_insert_d((v2i64) src0, 0, load0);
1006cabdff1aSopenharmony_ci    src1 = (v16i8) __msa_insert_d((v2i64) src1, 0, load1);
1007cabdff1aSopenharmony_ci
1008cabdff1aSopenharmony_ci    load0 = LD(src + (2 * stride) - 4);
1009cabdff1aSopenharmony_ci    load1 = LD(src + (3 * stride) - 4);
1010cabdff1aSopenharmony_ci    src2 = (v16i8) __msa_insert_d((v2i64) src2, 0, load0);
1011cabdff1aSopenharmony_ci    src3 = (v16i8) __msa_insert_d((v2i64) src3, 0, load1);
1012cabdff1aSopenharmony_ci
1013cabdff1aSopenharmony_ci    load0 = LD(src + (4 * stride) - 4);
1014cabdff1aSopenharmony_ci    load1 = LD(src + (5 * stride) - 4);
1015cabdff1aSopenharmony_ci    src4 = (v16i8) __msa_insert_d((v2i64) src4, 0, load0);
1016cabdff1aSopenharmony_ci    src5 = (v16i8) __msa_insert_d((v2i64) src5, 0, load1);
1017cabdff1aSopenharmony_ci
1018cabdff1aSopenharmony_ci    load0 = LD(src + (6 * stride) - 4);
1019cabdff1aSopenharmony_ci    load1 = LD(src + (7 * stride) - 4);
1020cabdff1aSopenharmony_ci    src6 = (v16i8) __msa_insert_d((v2i64) src6, 0, load0);
1021cabdff1aSopenharmony_ci    src7 = (v16i8) __msa_insert_d((v2i64) src7, 0, load1);
1022cabdff1aSopenharmony_ci
1023cabdff1aSopenharmony_ci    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src7, src6,
1024cabdff1aSopenharmony_ci               src0, src1, src2, src3);
1025cabdff1aSopenharmony_ci
1026cabdff1aSopenharmony_ci    ILVR_H2_SH(src1, src0, src3, src2, tmp0, tmp2);
1027cabdff1aSopenharmony_ci    ILVL_H2_SH(src1, src0, src3, src2, tmp1, tmp3);
1028cabdff1aSopenharmony_ci
1029cabdff1aSopenharmony_ci    ILVR_W2_SB(tmp2, tmp0, tmp3, tmp1, src6, src3);
1030cabdff1aSopenharmony_ci    ILVL_W2_SB(tmp2, tmp0, tmp3, tmp1, src1, src5);
1031cabdff1aSopenharmony_ci    SLDI_B4_SB(zeros, src6, zeros, src1, zeros, src3, zeros, src5,
1032cabdff1aSopenharmony_ci               8, src0, src2, src4, src7);
1033cabdff1aSopenharmony_ci
1034cabdff1aSopenharmony_ci    p0_asub_q0 = __msa_asub_u_b((v16u8) src2, (v16u8) src3);
1035cabdff1aSopenharmony_ci    p1_asub_p0 = __msa_asub_u_b((v16u8) src1, (v16u8) src2);
1036cabdff1aSopenharmony_ci    q1_asub_q0 = __msa_asub_u_b((v16u8) src4, (v16u8) src3);
1037cabdff1aSopenharmony_ci
1038cabdff1aSopenharmony_ci    alpha = (v16u8) __msa_fill_b(alpha_in);
1039cabdff1aSopenharmony_ci    beta = (v16u8) __msa_fill_b(beta_in);
1040cabdff1aSopenharmony_ci
1041cabdff1aSopenharmony_ci    is_less_than_alpha = (p0_asub_q0 < alpha);
1042cabdff1aSopenharmony_ci    is_less_than_beta = (p1_asub_p0 < beta);
1043cabdff1aSopenharmony_ci    is_less_than = is_less_than_alpha & is_less_than_beta;
1044cabdff1aSopenharmony_ci    is_less_than_beta = (q1_asub_q0 < beta);
1045cabdff1aSopenharmony_ci    is_less_than = is_less_than & is_less_than_beta;
1046cabdff1aSopenharmony_ci
1047cabdff1aSopenharmony_ci    alpha >>= 2;
1048cabdff1aSopenharmony_ci    alpha += 2;
1049cabdff1aSopenharmony_ci
1050cabdff1aSopenharmony_ci    is_less_than_alpha = (p0_asub_q0 < alpha);
1051cabdff1aSopenharmony_ci
1052cabdff1aSopenharmony_ci    p2_asub_p0 = __msa_asub_u_b((v16u8) src0, (v16u8) src2);
1053cabdff1aSopenharmony_ci    is_less_than_beta1 = (p2_asub_p0 < beta);
1054cabdff1aSopenharmony_ci    q2_asub_q0 = __msa_asub_u_b((v16u8) src5, (v16u8) src3);
1055cabdff1aSopenharmony_ci    is_less_than_beta2 = (q2_asub_q0 < beta);
1056cabdff1aSopenharmony_ci
1057cabdff1aSopenharmony_ci    ILVR_B4_UH(zeros, src0, zeros, src1, zeros, src2, zeros, src3,
1058cabdff1aSopenharmony_ci               src0_r, src1_r, src2_r, src3_r);
1059cabdff1aSopenharmony_ci    ILVR_B4_UH(zeros, src4, zeros, src5, zeros, src6, zeros, src7,
1060cabdff1aSopenharmony_ci               src4_r, src5_r, src6_r, src7_r);
1061cabdff1aSopenharmony_ci
1062cabdff1aSopenharmony_ci    dst2_x_r = src1_r + src2_r + src3_r;
1063cabdff1aSopenharmony_ci    dst2_x_r = src0_r + (2 * (dst2_x_r)) + src4_r;
1064cabdff1aSopenharmony_ci    dst2_x_r = (v8u16) __msa_srari_h((v8i16) dst2_x_r, 3);
1065cabdff1aSopenharmony_ci    dst1_r = src0_r + src1_r + src2_r + src3_r;
1066cabdff1aSopenharmony_ci    dst1_r = (v8u16) __msa_srari_h((v8i16) dst1_r, 2);
1067cabdff1aSopenharmony_ci
1068cabdff1aSopenharmony_ci    dst0_r = (2 * src6_r) + (3 * src0_r);
1069cabdff1aSopenharmony_ci    dst0_r += src1_r + src2_r + src3_r;
1070cabdff1aSopenharmony_ci    dst0_r = (v8u16) __msa_srari_h((v8i16) dst0_r, 3);
1071cabdff1aSopenharmony_ci    dst2_y_r = (2 * src1_r) + src2_r + src4_r;
1072cabdff1aSopenharmony_ci    dst2_y_r = (v8u16) __msa_srari_h((v8i16) dst2_y_r, 2);
1073cabdff1aSopenharmony_ci
1074cabdff1aSopenharmony_ci    PCKEV_B2_UB(dst2_x_r, dst2_x_r, dst2_y_r, dst2_y_r, dst2_x, dst2_y);
1075cabdff1aSopenharmony_ci    dst2_x = __msa_bmnz_v(dst2_y, dst2_x, is_less_than_beta1);
1076cabdff1aSopenharmony_ci
1077cabdff1aSopenharmony_ci    dst3_x_r = src2_r + src3_r + src4_r;
1078cabdff1aSopenharmony_ci    dst3_x_r = src1_r + (2 * dst3_x_r) + src5_r;
1079cabdff1aSopenharmony_ci    dst3_x_r = (v8u16) __msa_srari_h((v8i16) dst3_x_r, 3);
1080cabdff1aSopenharmony_ci    dst4_r = src2_r + src3_r + src4_r + src5_r;
1081cabdff1aSopenharmony_ci    dst4_r = (v8u16) __msa_srari_h((v8i16) dst4_r, 2);
1082cabdff1aSopenharmony_ci
1083cabdff1aSopenharmony_ci    dst5_r = (2 * src7_r) + (3 * src5_r);
1084cabdff1aSopenharmony_ci    dst5_r += src4_r + src3_r + src2_r;
1085cabdff1aSopenharmony_ci    dst5_r = (v8u16) __msa_srari_h((v8i16) dst5_r, 3);
1086cabdff1aSopenharmony_ci    dst3_y_r = (2 * src4_r) + src3_r + src1_r;
1087cabdff1aSopenharmony_ci    dst3_y_r = (v8u16) __msa_srari_h((v8i16) dst3_y_r, 2);
1088cabdff1aSopenharmony_ci
1089cabdff1aSopenharmony_ci    PCKEV_B2_UB(dst3_x_r, dst3_x_r, dst3_y_r, dst3_y_r, dst3_x, dst3_y);
1090cabdff1aSopenharmony_ci    dst3_x = __msa_bmnz_v(dst3_y, dst3_x, is_less_than_beta2);
1091cabdff1aSopenharmony_ci
1092cabdff1aSopenharmony_ci    dst2_y_r = (2 * src1_r) + src2_r + src4_r;
1093cabdff1aSopenharmony_ci    dst2_y_r = (v8u16) __msa_srari_h((v8i16) dst2_y_r, 2);
1094cabdff1aSopenharmony_ci    dst3_y_r = (2 * src4_r) + src3_r + src1_r;
1095cabdff1aSopenharmony_ci    dst3_y_r = (v8u16) __msa_srari_h((v8i16) dst3_y_r, 2);
1096cabdff1aSopenharmony_ci
1097cabdff1aSopenharmony_ci    PCKEV_B2_UB(dst2_y_r, dst2_y_r, dst3_y_r, dst3_y_r, dst2_y, dst3_y);
1098cabdff1aSopenharmony_ci
1099cabdff1aSopenharmony_ci    dst2_x = __msa_bmnz_v(dst2_y, dst2_x, is_less_than_alpha);
1100cabdff1aSopenharmony_ci    dst3_x = __msa_bmnz_v(dst3_y, dst3_x, is_less_than_alpha);
1101cabdff1aSopenharmony_ci    dst2_x = __msa_bmnz_v((v16u8) src2, dst2_x, is_less_than);
1102cabdff1aSopenharmony_ci    dst3_x = __msa_bmnz_v((v16u8) src3, dst3_x, is_less_than);
1103cabdff1aSopenharmony_ci
1104cabdff1aSopenharmony_ci    is_less_than = is_less_than_alpha & is_less_than;
1105cabdff1aSopenharmony_ci    dst1 = (v16u8) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst1_r);
1106cabdff1aSopenharmony_ci    is_less_than_beta1 = is_less_than_beta1 & is_less_than;
1107cabdff1aSopenharmony_ci    dst1 = __msa_bmnz_v((v16u8) src1, dst1, is_less_than_beta1);
1108cabdff1aSopenharmony_ci
1109cabdff1aSopenharmony_ci    dst0 = (v16u8) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r);
1110cabdff1aSopenharmony_ci    dst0 = __msa_bmnz_v((v16u8) src0, dst0, is_less_than_beta1);
1111cabdff1aSopenharmony_ci    dst4 = (v16u8) __msa_pckev_b((v16i8) dst4_r, (v16i8) dst4_r);
1112cabdff1aSopenharmony_ci    is_less_than_beta2 = is_less_than_beta2 & is_less_than;
1113cabdff1aSopenharmony_ci    dst4 = __msa_bmnz_v((v16u8) src4, dst4, is_less_than_beta2);
1114cabdff1aSopenharmony_ci    dst5 = (v16u8) __msa_pckev_b((v16i8) dst5_r, (v16i8) dst5_r);
1115cabdff1aSopenharmony_ci    dst5 = __msa_bmnz_v((v16u8) src5, dst5, is_less_than_beta2);
1116cabdff1aSopenharmony_ci
1117cabdff1aSopenharmony_ci    ILVR_B2_UB(dst1, dst0, dst3_x, dst2_x, dst0, dst1);
1118cabdff1aSopenharmony_ci    dst2_x = (v16u8) __msa_ilvr_b((v16i8) dst5, (v16i8) dst4);
1119cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst1, dst0, tmp0, tmp1);
1120cabdff1aSopenharmony_ci    ILVRL_H2_SH(zeros, dst2_x, tmp2, tmp3);
1121cabdff1aSopenharmony_ci
1122cabdff1aSopenharmony_ci    ILVR_W2_UB(tmp2, tmp0, tmp3, tmp1, dst0, dst4);
1123cabdff1aSopenharmony_ci    SLDI_B2_UB(zeros, dst0, zeros, dst4, 8, dst1, dst5);
1124cabdff1aSopenharmony_ci    dst2_x = (v16u8) __msa_ilvl_w((v4i32) tmp2, (v4i32) tmp0);
1125cabdff1aSopenharmony_ci    dst2_y = (v16u8) __msa_ilvl_w((v4i32) tmp3, (v4i32) tmp1);
1126cabdff1aSopenharmony_ci    SLDI_B2_UB(zeros, dst2_x, zeros, dst2_y, 8, dst3_x, dst3_y);
1127cabdff1aSopenharmony_ci
1128cabdff1aSopenharmony_ci    out0 = __msa_copy_u_w((v4i32) dst0, 0);
1129cabdff1aSopenharmony_ci    out1 = __msa_copy_u_h((v8i16) dst0, 2);
1130cabdff1aSopenharmony_ci    out2 = __msa_copy_u_w((v4i32) dst1, 0);
1131cabdff1aSopenharmony_ci    out3 = __msa_copy_u_h((v8i16) dst1, 2);
1132cabdff1aSopenharmony_ci
1133cabdff1aSopenharmony_ci    SW(out0, (src - 3));
1134cabdff1aSopenharmony_ci    SH(out1, (src + 1));
1135cabdff1aSopenharmony_ci    src += stride;
1136cabdff1aSopenharmony_ci    SW(out2, (src - 3));
1137cabdff1aSopenharmony_ci    SH(out3, (src + 1));
1138cabdff1aSopenharmony_ci    src += stride;
1139cabdff1aSopenharmony_ci
1140cabdff1aSopenharmony_ci    out0 = __msa_copy_u_w((v4i32) dst2_x, 0);
1141cabdff1aSopenharmony_ci    out1 = __msa_copy_u_h((v8i16) dst2_x, 2);
1142cabdff1aSopenharmony_ci    out2 = __msa_copy_u_w((v4i32) dst3_x, 0);
1143cabdff1aSopenharmony_ci    out3 = __msa_copy_u_h((v8i16) dst3_x, 2);
1144cabdff1aSopenharmony_ci
1145cabdff1aSopenharmony_ci    SW(out0, (src - 3));
1146cabdff1aSopenharmony_ci    SH(out1, (src + 1));
1147cabdff1aSopenharmony_ci    src += stride;
1148cabdff1aSopenharmony_ci    SW(out2, (src - 3));
1149cabdff1aSopenharmony_ci    SH(out3, (src + 1));
1150cabdff1aSopenharmony_ci    src += stride;
1151cabdff1aSopenharmony_ci
1152cabdff1aSopenharmony_ci    out0 = __msa_copy_u_w((v4i32) dst4, 0);
1153cabdff1aSopenharmony_ci    out1 = __msa_copy_u_h((v8i16) dst4, 2);
1154cabdff1aSopenharmony_ci    out2 = __msa_copy_u_w((v4i32) dst5, 0);
1155cabdff1aSopenharmony_ci    out3 = __msa_copy_u_h((v8i16) dst5, 2);
1156cabdff1aSopenharmony_ci
1157cabdff1aSopenharmony_ci    SW(out0, (src - 3));
1158cabdff1aSopenharmony_ci    SH(out1, (src + 1));
1159cabdff1aSopenharmony_ci    src += stride;
1160cabdff1aSopenharmony_ci    SW(out2, (src - 3));
1161cabdff1aSopenharmony_ci    SH(out3, (src + 1));
1162cabdff1aSopenharmony_ci    src += stride;
1163cabdff1aSopenharmony_ci
1164cabdff1aSopenharmony_ci    out0 = __msa_copy_u_w((v4i32) dst2_y, 0);
1165cabdff1aSopenharmony_ci    out1 = __msa_copy_u_h((v8i16) dst2_y, 2);
1166cabdff1aSopenharmony_ci    out2 = __msa_copy_u_w((v4i32) dst3_y, 0);
1167cabdff1aSopenharmony_ci    out3 = __msa_copy_u_h((v8i16) dst3_y, 2);
1168cabdff1aSopenharmony_ci
1169cabdff1aSopenharmony_ci    SW(out0, (src - 3));
1170cabdff1aSopenharmony_ci    SH(out1, (src + 1));
1171cabdff1aSopenharmony_ci    src += stride;
1172cabdff1aSopenharmony_ci    SW(out2, (src - 3));
1173cabdff1aSopenharmony_ci    SH(out3, (src + 1));
1174cabdff1aSopenharmony_ci}
1175cabdff1aSopenharmony_ci
1176cabdff1aSopenharmony_cistatic void avc_loopfilter_cb_or_cr_intra_edge_hor_msa(uint8_t *data_cb_or_cr,
1177cabdff1aSopenharmony_ci                                                       uint8_t alpha_in,
1178cabdff1aSopenharmony_ci                                                       uint8_t beta_in,
1179cabdff1aSopenharmony_ci                                                       ptrdiff_t img_width)
1180cabdff1aSopenharmony_ci{
1181cabdff1aSopenharmony_ci    v16u8 alpha, beta;
1182cabdff1aSopenharmony_ci    v16u8 is_less_than;
1183cabdff1aSopenharmony_ci    v8i16 p0_or_q0, q0_or_p0;
1184cabdff1aSopenharmony_ci    v16u8 p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org;
1185cabdff1aSopenharmony_ci    v16i8 zero = { 0 };
1186cabdff1aSopenharmony_ci    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
1187cabdff1aSopenharmony_ci    v16u8 is_less_than_alpha, is_less_than_beta;
1188cabdff1aSopenharmony_ci    v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
1189cabdff1aSopenharmony_ci
1190cabdff1aSopenharmony_ci    alpha = (v16u8) __msa_fill_b(alpha_in);
1191cabdff1aSopenharmony_ci    beta = (v16u8) __msa_fill_b(beta_in);
1192cabdff1aSopenharmony_ci
1193cabdff1aSopenharmony_ci    LD_UB4(data_cb_or_cr - (img_width << 1), img_width,
1194cabdff1aSopenharmony_ci           p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org);
1195cabdff1aSopenharmony_ci
1196cabdff1aSopenharmony_ci    p0_asub_q0 = __msa_asub_u_b(p0_or_q0_org, q0_or_p0_org);
1197cabdff1aSopenharmony_ci    p1_asub_p0 = __msa_asub_u_b(p1_or_q1_org, p0_or_q0_org);
1198cabdff1aSopenharmony_ci    q1_asub_q0 = __msa_asub_u_b(q1_or_p1_org, q0_or_p0_org);
1199cabdff1aSopenharmony_ci
1200cabdff1aSopenharmony_ci    is_less_than_alpha = (p0_asub_q0 < alpha);
1201cabdff1aSopenharmony_ci    is_less_than_beta = (p1_asub_p0 < beta);
1202cabdff1aSopenharmony_ci    is_less_than = is_less_than_beta & is_less_than_alpha;
1203cabdff1aSopenharmony_ci    is_less_than_beta = (q1_asub_q0 < beta);
1204cabdff1aSopenharmony_ci    is_less_than = is_less_than_beta & is_less_than;
1205cabdff1aSopenharmony_ci
1206cabdff1aSopenharmony_ci    is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);
1207cabdff1aSopenharmony_ci
1208cabdff1aSopenharmony_ci    if (!__msa_test_bz_v(is_less_than)) {
1209cabdff1aSopenharmony_ci        ILVR_B4_SH(zero, p1_or_q1_org, zero, p0_or_q0_org, zero, q0_or_p0_org,
1210cabdff1aSopenharmony_ci                   zero, q1_or_p1_org, p1_org_r, p0_org_r, q0_org_r, q1_org_r);
1211cabdff1aSopenharmony_ci        AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_or_q0);
1212cabdff1aSopenharmony_ci        AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_or_p0);
1213cabdff1aSopenharmony_ci        PCKEV_B2_SH(zero, p0_or_q0, zero, q0_or_p0, p0_or_q0, q0_or_p0);
1214cabdff1aSopenharmony_ci
1215cabdff1aSopenharmony_ci        p0_or_q0_org =
1216cabdff1aSopenharmony_ci            __msa_bmnz_v(p0_or_q0_org, (v16u8) p0_or_q0, is_less_than);
1217cabdff1aSopenharmony_ci        q0_or_p0_org =
1218cabdff1aSopenharmony_ci            __msa_bmnz_v(q0_or_p0_org, (v16u8) q0_or_p0, is_less_than);
1219cabdff1aSopenharmony_ci
1220cabdff1aSopenharmony_ci        ST_UB(q0_or_p0_org, data_cb_or_cr);
1221cabdff1aSopenharmony_ci        ST_UB(p0_or_q0_org, data_cb_or_cr - img_width);
1222cabdff1aSopenharmony_ci    }
1223cabdff1aSopenharmony_ci}
1224cabdff1aSopenharmony_ci
1225cabdff1aSopenharmony_cistatic void avc_loopfilter_cb_or_cr_intra_edge_ver_msa(uint8_t *data_cb_or_cr,
1226cabdff1aSopenharmony_ci                                                       uint8_t alpha_in,
1227cabdff1aSopenharmony_ci                                                       uint8_t beta_in,
1228cabdff1aSopenharmony_ci                                                       ptrdiff_t img_width)
1229cabdff1aSopenharmony_ci{
1230cabdff1aSopenharmony_ci    v8i16 tmp1;
1231cabdff1aSopenharmony_ci    v16u8 alpha, beta, is_less_than;
1232cabdff1aSopenharmony_ci    v8i16 p0_or_q0, q0_or_p0;
1233cabdff1aSopenharmony_ci    v16u8 p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org;
1234cabdff1aSopenharmony_ci    v16i8 zero = { 0 };
1235cabdff1aSopenharmony_ci    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
1236cabdff1aSopenharmony_ci    v16u8 is_less_than_alpha, is_less_than_beta;
1237cabdff1aSopenharmony_ci    v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
1238cabdff1aSopenharmony_ci
1239cabdff1aSopenharmony_ci    {
1240cabdff1aSopenharmony_ci        v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
1241cabdff1aSopenharmony_ci
1242cabdff1aSopenharmony_ci        LD_UB8((data_cb_or_cr - 2), img_width,
1243cabdff1aSopenharmony_ci               row0, row1, row2, row3, row4, row5, row6, row7);
1244cabdff1aSopenharmony_ci
1245cabdff1aSopenharmony_ci        TRANSPOSE8x4_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
1246cabdff1aSopenharmony_ci                           p1_or_q1_org, p0_or_q0_org,
1247cabdff1aSopenharmony_ci                           q0_or_p0_org, q1_or_p1_org);
1248cabdff1aSopenharmony_ci    }
1249cabdff1aSopenharmony_ci
1250cabdff1aSopenharmony_ci    alpha = (v16u8) __msa_fill_b(alpha_in);
1251cabdff1aSopenharmony_ci    beta = (v16u8) __msa_fill_b(beta_in);
1252cabdff1aSopenharmony_ci
1253cabdff1aSopenharmony_ci    p0_asub_q0 = __msa_asub_u_b(p0_or_q0_org, q0_or_p0_org);
1254cabdff1aSopenharmony_ci    p1_asub_p0 = __msa_asub_u_b(p1_or_q1_org, p0_or_q0_org);
1255cabdff1aSopenharmony_ci    q1_asub_q0 = __msa_asub_u_b(q1_or_p1_org, q0_or_p0_org);
1256cabdff1aSopenharmony_ci
1257cabdff1aSopenharmony_ci    is_less_than_alpha = (p0_asub_q0 < alpha);
1258cabdff1aSopenharmony_ci    is_less_than_beta = (p1_asub_p0 < beta);
1259cabdff1aSopenharmony_ci    is_less_than = is_less_than_beta & is_less_than_alpha;
1260cabdff1aSopenharmony_ci    is_less_than_beta = (q1_asub_q0 < beta);
1261cabdff1aSopenharmony_ci    is_less_than = is_less_than_beta & is_less_than;
1262cabdff1aSopenharmony_ci    is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);
1263cabdff1aSopenharmony_ci
1264cabdff1aSopenharmony_ci    if (!__msa_test_bz_v(is_less_than)) {
1265cabdff1aSopenharmony_ci        ILVR_B4_SH(zero, p1_or_q1_org, zero, p0_or_q0_org, zero, q0_or_p0_org,
1266cabdff1aSopenharmony_ci                   zero, q1_or_p1_org, p1_org_r, p0_org_r, q0_org_r, q1_org_r);
1267cabdff1aSopenharmony_ci
1268cabdff1aSopenharmony_ci        AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_or_q0);
1269cabdff1aSopenharmony_ci        AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_or_p0);
1270cabdff1aSopenharmony_ci
1271cabdff1aSopenharmony_ci        /* convert 16 bit output into 8 bit output */
1272cabdff1aSopenharmony_ci        PCKEV_B2_SH(zero, p0_or_q0, zero, q0_or_p0, p0_or_q0, q0_or_p0);
1273cabdff1aSopenharmony_ci
1274cabdff1aSopenharmony_ci        p0_or_q0_org =
1275cabdff1aSopenharmony_ci            __msa_bmnz_v(p0_or_q0_org, (v16u8) p0_or_q0, is_less_than);
1276cabdff1aSopenharmony_ci        q0_or_p0_org =
1277cabdff1aSopenharmony_ci            __msa_bmnz_v(q0_or_p0_org, (v16u8) q0_or_p0, is_less_than);
1278cabdff1aSopenharmony_ci        tmp1 = (v8i16) __msa_ilvr_b((v16i8) q0_or_p0_org, (v16i8) p0_or_q0_org);
1279cabdff1aSopenharmony_ci
1280cabdff1aSopenharmony_ci        data_cb_or_cr -= 1;
1281cabdff1aSopenharmony_ci        ST_H4(tmp1, 0, 1, 2, 3, data_cb_or_cr, img_width);
1282cabdff1aSopenharmony_ci        data_cb_or_cr += 4 * img_width;
1283cabdff1aSopenharmony_ci        ST_H4(tmp1, 4, 5, 6, 7, data_cb_or_cr, img_width);
1284cabdff1aSopenharmony_ci    }
1285cabdff1aSopenharmony_ci}
1286cabdff1aSopenharmony_ci
1287cabdff1aSopenharmony_cistatic void avc_loopfilter_luma_inter_edge_ver_msa(uint8_t* pPix, uint32_t iStride,
1288cabdff1aSopenharmony_ci                                                   uint8_t iAlpha, uint8_t iBeta,
1289cabdff1aSopenharmony_ci                                                   uint8_t* pTc)
1290cabdff1aSopenharmony_ci{
1291cabdff1aSopenharmony_ci    v16u8 p0, p1, p2, q0, q1, q2;
1292cabdff1aSopenharmony_ci    v16i8 iTc, negiTc, negTc, flags, f;
1293cabdff1aSopenharmony_ci    v8i16 p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, q0_l, q0_r, q1_l, q1_r, q2_l, q2_r;
1294cabdff1aSopenharmony_ci    v8i16 tc_l, tc_r, negTc_l, negTc_r;
1295cabdff1aSopenharmony_ci    v8i16 iTc_l, iTc_r, negiTc_l, negiTc_r;
1296cabdff1aSopenharmony_ci    // Use for temporary variable
1297cabdff1aSopenharmony_ci    v8i16 t0, t1, t2, t3;
1298cabdff1aSopenharmony_ci    v16u8 alpha, beta;
1299cabdff1aSopenharmony_ci    v16u8 bDetaP0Q0, bDetaP1P0, bDetaQ1Q0, bDetaP2P0, bDetaQ2Q0;
1300cabdff1aSopenharmony_ci    v16i8 const_1_b = __msa_ldi_b(1);
1301cabdff1aSopenharmony_ci    v8i16 const_1_h = __msa_ldi_h(1);
1302cabdff1aSopenharmony_ci    v8i16 const_4_h = __msa_ldi_h(4);
1303cabdff1aSopenharmony_ci    v8i16 const_not_255_h = __msa_ldi_h(~255);
1304cabdff1aSopenharmony_ci    v16i8 zero = { 0 };
1305cabdff1aSopenharmony_ci    v16i8 tc = { pTc[0  >> 2], pTc[1  >> 2], pTc[2  >> 2], pTc[3  >> 2],
1306cabdff1aSopenharmony_ci                 pTc[4  >> 2], pTc[5  >> 2], pTc[6  >> 2], pTc[7  >> 2],
1307cabdff1aSopenharmony_ci                 pTc[8  >> 2], pTc[9  >> 2], pTc[10 >> 2], pTc[11 >> 2],
1308cabdff1aSopenharmony_ci                 pTc[12 >> 2], pTc[13 >> 2], pTc[14 >> 2], pTc[15 >> 2] };
1309cabdff1aSopenharmony_ci    negTc = zero - tc;
1310cabdff1aSopenharmony_ci    iTc = tc;
1311cabdff1aSopenharmony_ci
1312cabdff1aSopenharmony_ci    // Load data from pPix
1313cabdff1aSopenharmony_ci    LD_SH8(pPix - 3, iStride, t0, t1, t2, t3, q1_l, q1_r, q2_l, q2_r);
1314cabdff1aSopenharmony_ci    LD_SH8(pPix + 8 * iStride - 3, iStride, p0_l, p0_r, p1_l, p1_r,
1315cabdff1aSopenharmony_ci           p2_l, p2_r, q0_l, q0_r);
1316cabdff1aSopenharmony_ci    TRANSPOSE16x8_UB_UB(t0, t1, t2, t3, q1_l, q1_r, q2_l, q2_r,
1317cabdff1aSopenharmony_ci                        p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, q0_l, q0_r,
1318cabdff1aSopenharmony_ci                        p2, p1, p0, q0, q1, q2, alpha, beta);
1319cabdff1aSopenharmony_ci
1320cabdff1aSopenharmony_ci    alpha = (v16u8)__msa_fill_b(iAlpha);
1321cabdff1aSopenharmony_ci    beta  = (v16u8)__msa_fill_b(iBeta);
1322cabdff1aSopenharmony_ci
1323cabdff1aSopenharmony_ci    bDetaP0Q0 = __msa_asub_u_b(p0, q0);
1324cabdff1aSopenharmony_ci    bDetaP1P0 = __msa_asub_u_b(p1, p0);
1325cabdff1aSopenharmony_ci    bDetaQ1Q0 = __msa_asub_u_b(q1, q0);
1326cabdff1aSopenharmony_ci    bDetaP2P0 = __msa_asub_u_b(p2, p0);
1327cabdff1aSopenharmony_ci    bDetaQ2Q0 = __msa_asub_u_b(q2, q0);
1328cabdff1aSopenharmony_ci    bDetaP0Q0 = (v16u8)__msa_clt_u_b(bDetaP0Q0, alpha);
1329cabdff1aSopenharmony_ci    bDetaP1P0 = (v16u8)__msa_clt_u_b(bDetaP1P0, beta);
1330cabdff1aSopenharmony_ci    bDetaQ1Q0 = (v16u8)__msa_clt_u_b(bDetaQ1Q0, beta);
1331cabdff1aSopenharmony_ci    bDetaP2P0 = (v16u8)__msa_clt_u_b(bDetaP2P0, beta);
1332cabdff1aSopenharmony_ci    bDetaQ2Q0 = (v16u8)__msa_clt_u_b(bDetaQ2Q0, beta);
1333cabdff1aSopenharmony_ci
1334cabdff1aSopenharmony_ci    // Unsigned extend p0, p1, p2, q0, q1, q2 from 8 bits to 16 bits
1335cabdff1aSopenharmony_ci    ILVRL_B2_SH(zero, p0, p0_r, p0_l);
1336cabdff1aSopenharmony_ci    ILVRL_B2_SH(zero, p1, p1_r, p1_l);
1337cabdff1aSopenharmony_ci    ILVRL_B2_SH(zero, p2, p2_r, p2_l);
1338cabdff1aSopenharmony_ci    ILVRL_B2_SH(zero, q0, q0_r, q0_l);
1339cabdff1aSopenharmony_ci    ILVRL_B2_SH(zero, q1, q1_r, q1_l);
1340cabdff1aSopenharmony_ci    ILVRL_B2_SH(zero, q2, q2_r, q2_l);
1341cabdff1aSopenharmony_ci    // Signed extend tc, negTc from 8 bits to 16 bits
1342cabdff1aSopenharmony_ci    flags = __msa_clt_s_b(tc, zero);
1343cabdff1aSopenharmony_ci    ILVRL_B2(v8i16, flags, tc, tc_r, tc_l);
1344cabdff1aSopenharmony_ci    flags = __msa_clt_s_b(negTc, zero);
1345cabdff1aSopenharmony_ci    ILVRL_B2(v8i16, flags, negTc, negTc_r, negTc_l);
1346cabdff1aSopenharmony_ci
1347cabdff1aSopenharmony_ci    f = (v16i8)bDetaP0Q0 & (v16i8)bDetaP1P0 & (v16i8)bDetaQ1Q0;
1348cabdff1aSopenharmony_ci    flags = f & (v16i8)bDetaP2P0;
1349cabdff1aSopenharmony_ci    flags = __msa_ceq_b(flags, zero);
1350cabdff1aSopenharmony_ci    iTc += ((~flags) & const_1_b);
1351cabdff1aSopenharmony_ci    flags = f & (v16i8)bDetaQ2Q0;
1352cabdff1aSopenharmony_ci    flags = __msa_ceq_b(flags, zero);
1353cabdff1aSopenharmony_ci    iTc += ((~flags) & const_1_b);
1354cabdff1aSopenharmony_ci    negiTc = zero - iTc;
1355cabdff1aSopenharmony_ci    // Signed extend iTc, negiTc from 8 bits to 16 bits
1356cabdff1aSopenharmony_ci    flags = __msa_clt_s_b(iTc, zero);
1357cabdff1aSopenharmony_ci    ILVRL_B2(v8i16, flags, iTc, iTc_r, iTc_l);
1358cabdff1aSopenharmony_ci    flags = __msa_clt_s_b(negiTc, zero);
1359cabdff1aSopenharmony_ci    ILVRL_B2(v8i16, flags, negiTc, negiTc_r, negiTc_l);
1360cabdff1aSopenharmony_ci
1361cabdff1aSopenharmony_ci    // Calculate the left part
1362cabdff1aSopenharmony_ci    // p1
1363cabdff1aSopenharmony_ci    t0 = (p2_l + ((p0_l + q0_l + const_1_h) >> 1) - (p1_l << 1)) >> 1;
1364cabdff1aSopenharmony_ci    t0 = __msa_max_s_h(negTc_l, t0);
1365cabdff1aSopenharmony_ci    t0 = __msa_min_s_h(tc_l, t0);
1366cabdff1aSopenharmony_ci    t1 = p1_l + t0;
1367cabdff1aSopenharmony_ci    // q1
1368cabdff1aSopenharmony_ci    t0 = (q2_l + ((p0_l + q0_l + const_1_h) >> 1) - (q1_l << 1)) >> 1;
1369cabdff1aSopenharmony_ci    t0 = __msa_max_s_h(negTc_l, t0);
1370cabdff1aSopenharmony_ci    t0 = __msa_min_s_h(tc_l, t0);
1371cabdff1aSopenharmony_ci    t2 = q1_l + t0;
1372cabdff1aSopenharmony_ci    // iDeta
1373cabdff1aSopenharmony_ci    t0 = (((q0_l - p0_l) << 2) + (p1_l - q1_l) + const_4_h) >> 3;
1374cabdff1aSopenharmony_ci    t0 = __msa_max_s_h(negiTc_l, t0);
1375cabdff1aSopenharmony_ci    t0 = __msa_min_s_h(iTc_l, t0);
1376cabdff1aSopenharmony_ci    p1_l = t1;
1377cabdff1aSopenharmony_ci    q1_l = t2;
1378cabdff1aSopenharmony_ci    // p0
1379cabdff1aSopenharmony_ci    t1 = p0_l + t0;
1380cabdff1aSopenharmony_ci    t2 = t1 & const_not_255_h;
1381cabdff1aSopenharmony_ci    t3 = __msa_cle_s_h((v8i16)zero, t1);
1382cabdff1aSopenharmony_ci    flags = (v16i8)__msa_ceq_h(t2, (v8i16)zero);
1383cabdff1aSopenharmony_ci    p0_l = (t1 & (v8i16)flags) + (t3 & (v8i16)(~flags));
1384cabdff1aSopenharmony_ci    // q0
1385cabdff1aSopenharmony_ci    t1 = q0_l - t0;
1386cabdff1aSopenharmony_ci    t2 = t1 & const_not_255_h;
1387cabdff1aSopenharmony_ci    t3 = __msa_cle_s_h((v8i16)zero, t1);
1388cabdff1aSopenharmony_ci    flags = (v16i8)__msa_ceq_h(t2, (v8i16)zero);
1389cabdff1aSopenharmony_ci    q0_l = (t1 & (v8i16)flags) + (t3 & (v8i16)(~flags));
1390cabdff1aSopenharmony_ci
1391cabdff1aSopenharmony_ci    // Calculate the right part
1392cabdff1aSopenharmony_ci    // p1
1393cabdff1aSopenharmony_ci    t0 = (p2_r + ((p0_r + q0_r + const_1_h) >> 1) - (p1_r << 1)) >> 1;
1394cabdff1aSopenharmony_ci    t0 = __msa_max_s_h(negTc_r, t0);
1395cabdff1aSopenharmony_ci    t0 = __msa_min_s_h(tc_r, t0);
1396cabdff1aSopenharmony_ci    t1 = p1_r + t0;
1397cabdff1aSopenharmony_ci    // q1
1398cabdff1aSopenharmony_ci    t0 = (q2_r + ((p0_r + q0_r + const_1_h) >> 1) - (q1_r << 1)) >> 1;
1399cabdff1aSopenharmony_ci    t0 = __msa_max_s_h(negTc_r, t0);
1400cabdff1aSopenharmony_ci    t0 = __msa_min_s_h(tc_r, t0);
1401cabdff1aSopenharmony_ci    t2 = q1_r + t0;
1402cabdff1aSopenharmony_ci    // iDeta
1403cabdff1aSopenharmony_ci    t0 = (((q0_r - p0_r) << 2) + (p1_r - q1_r) + const_4_h) >> 3;
1404cabdff1aSopenharmony_ci    t0 = __msa_max_s_h(negiTc_r, t0);
1405cabdff1aSopenharmony_ci    t0 = __msa_min_s_h(iTc_r, t0);
1406cabdff1aSopenharmony_ci    p1_r = t1;
1407cabdff1aSopenharmony_ci    q1_r = t2;
1408cabdff1aSopenharmony_ci    // p0
1409cabdff1aSopenharmony_ci    t1 = p0_r + t0;
1410cabdff1aSopenharmony_ci    t2 = t1 & const_not_255_h;
1411cabdff1aSopenharmony_ci    t3 = __msa_cle_s_h((v8i16)zero, t1);
1412cabdff1aSopenharmony_ci    flags = (v16i8)__msa_ceq_h(t2, (v8i16)zero);
1413cabdff1aSopenharmony_ci    p0_r = (t1 & (v8i16)flags) + (t3 & (v8i16)(~flags));
1414cabdff1aSopenharmony_ci    // q0
1415cabdff1aSopenharmony_ci    t1 = q0_r - t0;
1416cabdff1aSopenharmony_ci    t2 = t1 & const_not_255_h;
1417cabdff1aSopenharmony_ci    t3 = __msa_cle_s_h((v8i16)zero, t1);
1418cabdff1aSopenharmony_ci    flags = (v16i8)__msa_ceq_h(t2, (v8i16)zero);
1419cabdff1aSopenharmony_ci    q0_r = (t1 & (v8i16)flags) + (t3 & (v8i16)(~flags));
1420cabdff1aSopenharmony_ci
1421cabdff1aSopenharmony_ci    // Combined left and right
1422cabdff1aSopenharmony_ci    PCKEV_B4(v8i16, p1_l, p1_r, p0_l, p0_r, q0_l, q0_r, q1_l, q1_r,
1423cabdff1aSopenharmony_ci             t0, t1, t2, t3);
1424cabdff1aSopenharmony_ci    flags = (v16i8)__msa_cle_s_b(zero, tc);
1425cabdff1aSopenharmony_ci    flags &= f;
1426cabdff1aSopenharmony_ci    p0 = (v16u8)(((v16i8)t1 & flags) + (p0 & (~flags)));
1427cabdff1aSopenharmony_ci    q0 = (v16u8)(((v16i8)t2 & flags) + (q0 & (~flags)));
1428cabdff1aSopenharmony_ci    // Using t1, t2 as temporary flags
1429cabdff1aSopenharmony_ci    t1 = (v8i16)(flags & (~(__msa_ceq_b((v16i8)bDetaP2P0, zero))));
1430cabdff1aSopenharmony_ci    p1 = (v16u8)(t0 & t1) + (p1 & (v16u8)(~t1));
1431cabdff1aSopenharmony_ci    t2 = (v8i16)(flags & (~(__msa_ceq_b((v16i8)bDetaQ2Q0, zero))));
1432cabdff1aSopenharmony_ci    q1 = (v16u8)(t3 & t2) + (q1 & (v16u8)(~t2));
1433cabdff1aSopenharmony_ci
1434cabdff1aSopenharmony_ci    ILVRL_B2_SH(p0, p1, t0, t1);
1435cabdff1aSopenharmony_ci    ILVRL_B2_SH(q1, q0, t2, t3);
1436cabdff1aSopenharmony_ci    ILVRL_H2_UB(t2, t0, p1, p0);
1437cabdff1aSopenharmony_ci    ILVRL_H2_UB(t3, t1, q0, q1);
1438cabdff1aSopenharmony_ci    // Store data to pPix
1439cabdff1aSopenharmony_ci    ST_W8(p1, p0, 0, 1, 2, 3, 0, 1, 2, 3, pPix - 2, iStride);
1440cabdff1aSopenharmony_ci    ST_W8(q0, q1, 0, 1, 2, 3, 0, 1, 2, 3, pPix + 8 * iStride - 2, iStride);
1441cabdff1aSopenharmony_ci}
1442cabdff1aSopenharmony_ci
1443cabdff1aSopenharmony_cistatic void avc_loopfilter_luma_inter_edge_hor_msa(uint8_t *data,
1444cabdff1aSopenharmony_ci                                                   uint8_t bs0, uint8_t bs1,
1445cabdff1aSopenharmony_ci                                                   uint8_t bs2, uint8_t bs3,
1446cabdff1aSopenharmony_ci                                                   uint8_t tc0, uint8_t tc1,
1447cabdff1aSopenharmony_ci                                                   uint8_t tc2, uint8_t tc3,
1448cabdff1aSopenharmony_ci                                                   uint8_t alpha_in,
1449cabdff1aSopenharmony_ci                                                   uint8_t beta_in,
1450cabdff1aSopenharmony_ci                                                   ptrdiff_t image_width)
1451cabdff1aSopenharmony_ci{
1452cabdff1aSopenharmony_ci    v16u8 tmp_vec;
1453cabdff1aSopenharmony_ci    v16u8 bs = { 0 };
1454cabdff1aSopenharmony_ci
1455cabdff1aSopenharmony_ci    tmp_vec = (v16u8) __msa_fill_b(bs0);
1456cabdff1aSopenharmony_ci    bs = (v16u8) __msa_insve_w((v4i32) bs, 0, (v4i32) tmp_vec);
1457cabdff1aSopenharmony_ci    tmp_vec = (v16u8) __msa_fill_b(bs1);
1458cabdff1aSopenharmony_ci    bs = (v16u8) __msa_insve_w((v4i32) bs, 1, (v4i32) tmp_vec);
1459cabdff1aSopenharmony_ci    tmp_vec = (v16u8) __msa_fill_b(bs2);
1460cabdff1aSopenharmony_ci    bs = (v16u8) __msa_insve_w((v4i32) bs, 2, (v4i32) tmp_vec);
1461cabdff1aSopenharmony_ci    tmp_vec = (v16u8) __msa_fill_b(bs3);
1462cabdff1aSopenharmony_ci    bs = (v16u8) __msa_insve_w((v4i32) bs, 3, (v4i32) tmp_vec);
1463cabdff1aSopenharmony_ci
1464cabdff1aSopenharmony_ci    if (!__msa_test_bz_v(bs)) {
1465cabdff1aSopenharmony_ci        v16u8 alpha, beta, is_less_than, is_less_than_beta;
1466cabdff1aSopenharmony_ci        v16u8 p0, q0, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org;
1467cabdff1aSopenharmony_ci        v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
1468cabdff1aSopenharmony_ci        v16u8 is_less_than_alpha, is_bs_greater_than0;
1469cabdff1aSopenharmony_ci        v8i16 p0_r, q0_r, p0_l, q0_l;
1470cabdff1aSopenharmony_ci        v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
1471cabdff1aSopenharmony_ci        v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
1472cabdff1aSopenharmony_ci        v16i8 zero = { 0 };
1473cabdff1aSopenharmony_ci        v16i8 tc = { 0 };
1474cabdff1aSopenharmony_ci
1475cabdff1aSopenharmony_ci        tmp_vec = (v16u8) __msa_fill_b(tc0);
1476cabdff1aSopenharmony_ci        tc = (v16i8) __msa_insve_w((v4i32) tc, 0, (v4i32) tmp_vec);
1477cabdff1aSopenharmony_ci        tmp_vec = (v16u8) __msa_fill_b(tc1);
1478cabdff1aSopenharmony_ci        tc = (v16i8) __msa_insve_w((v4i32) tc, 1, (v4i32) tmp_vec);
1479cabdff1aSopenharmony_ci        tmp_vec = (v16u8) __msa_fill_b(tc2);
1480cabdff1aSopenharmony_ci        tc = (v16i8) __msa_insve_w((v4i32) tc, 2, (v4i32) tmp_vec);
1481cabdff1aSopenharmony_ci        tmp_vec = (v16u8) __msa_fill_b(tc3);
1482cabdff1aSopenharmony_ci        tc = (v16i8) __msa_insve_w((v4i32) tc, 3, (v4i32) tmp_vec);
1483cabdff1aSopenharmony_ci
1484cabdff1aSopenharmony_ci        alpha = (v16u8) __msa_fill_b(alpha_in);
1485cabdff1aSopenharmony_ci        beta = (v16u8) __msa_fill_b(beta_in);
1486cabdff1aSopenharmony_ci
1487cabdff1aSopenharmony_ci        LD_UB5(data - (3 * image_width), image_width,
1488cabdff1aSopenharmony_ci               p2_org, p1_org, p0_org, q0_org, q1_org);
1489cabdff1aSopenharmony_ci
1490cabdff1aSopenharmony_ci        is_bs_greater_than0 = ((v16u8) zero < bs);
1491cabdff1aSopenharmony_ci        p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
1492cabdff1aSopenharmony_ci        p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
1493cabdff1aSopenharmony_ci        q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
1494cabdff1aSopenharmony_ci
1495cabdff1aSopenharmony_ci        is_less_than_alpha = (p0_asub_q0 < alpha);
1496cabdff1aSopenharmony_ci        is_less_than_beta = (p1_asub_p0 < beta);
1497cabdff1aSopenharmony_ci        is_less_than = is_less_than_beta & is_less_than_alpha;
1498cabdff1aSopenharmony_ci        is_less_than_beta = (q1_asub_q0 < beta);
1499cabdff1aSopenharmony_ci        is_less_than = is_less_than_beta & is_less_than;
1500cabdff1aSopenharmony_ci        is_less_than = is_less_than & is_bs_greater_than0;
1501cabdff1aSopenharmony_ci
1502cabdff1aSopenharmony_ci        if (!__msa_test_bz_v(is_less_than)) {
1503cabdff1aSopenharmony_ci            v16i8 sign_negate_tc, negate_tc;
1504cabdff1aSopenharmony_ci            v8i16 negate_tc_r, i16_negatetc_l, tc_l, tc_r;
1505cabdff1aSopenharmony_ci            v16u8 p2_asub_p0, q2_asub_q0;
1506cabdff1aSopenharmony_ci
1507cabdff1aSopenharmony_ci            q2_org = LD_UB(data + (2 * image_width));
1508cabdff1aSopenharmony_ci            negate_tc = zero - tc;
1509cabdff1aSopenharmony_ci            sign_negate_tc = __msa_clti_s_b(negate_tc, 0);
1510cabdff1aSopenharmony_ci
1511cabdff1aSopenharmony_ci            ILVRL_B2_SH(sign_negate_tc, negate_tc, negate_tc_r, i16_negatetc_l);
1512cabdff1aSopenharmony_ci
1513cabdff1aSopenharmony_ci            UNPCK_UB_SH(tc, tc_r, tc_l);
1514cabdff1aSopenharmony_ci            UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l);
1515cabdff1aSopenharmony_ci            UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
1516cabdff1aSopenharmony_ci            UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);
1517cabdff1aSopenharmony_ci
1518cabdff1aSopenharmony_ci            p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
1519cabdff1aSopenharmony_ci            is_less_than_beta = (p2_asub_p0 < beta);
1520cabdff1aSopenharmony_ci            is_less_than_beta = is_less_than_beta & is_less_than;
1521cabdff1aSopenharmony_ci
1522cabdff1aSopenharmony_ci            if (!__msa_test_bz_v(is_less_than_beta)) {
1523cabdff1aSopenharmony_ci                v16u8 p1;
1524cabdff1aSopenharmony_ci                v8i16 p1_r = { 0 };
1525cabdff1aSopenharmony_ci                v8i16 p1_l = { 0 };
1526cabdff1aSopenharmony_ci                v8i16 p2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) p2_org);
1527cabdff1aSopenharmony_ci                v8i16 p2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) p2_org);
1528cabdff1aSopenharmony_ci
1529cabdff1aSopenharmony_ci                AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, p1_org_r, p2_org_r,
1530cabdff1aSopenharmony_ci                                 negate_tc_r, tc_r, p1_r);
1531cabdff1aSopenharmony_ci                AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, p1_org_l, p2_org_l,
1532cabdff1aSopenharmony_ci                                 i16_negatetc_l, tc_l, p1_l);
1533cabdff1aSopenharmony_ci
1534cabdff1aSopenharmony_ci                p1 = (v16u8) __msa_pckev_b((v16i8) p1_l, (v16i8) p1_r);
1535cabdff1aSopenharmony_ci                p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
1536cabdff1aSopenharmony_ci                ST_UB(p1_org, data - (2 * image_width));
1537cabdff1aSopenharmony_ci
1538cabdff1aSopenharmony_ci                is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
1539cabdff1aSopenharmony_ci                tc = tc + (v16i8) is_less_than_beta;
1540cabdff1aSopenharmony_ci            }
1541cabdff1aSopenharmony_ci
1542cabdff1aSopenharmony_ci            q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
1543cabdff1aSopenharmony_ci            is_less_than_beta = (q2_asub_q0 < beta);
1544cabdff1aSopenharmony_ci            is_less_than_beta = is_less_than_beta & is_less_than;
1545cabdff1aSopenharmony_ci
1546cabdff1aSopenharmony_ci            q1_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q1_org);
1547cabdff1aSopenharmony_ci            q1_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q1_org);
1548cabdff1aSopenharmony_ci
1549cabdff1aSopenharmony_ci            if (!__msa_test_bz_v(is_less_than_beta)) {
1550cabdff1aSopenharmony_ci                v16u8 q1;
1551cabdff1aSopenharmony_ci                v8i16 q1_r = { 0 };
1552cabdff1aSopenharmony_ci                v8i16 q1_l = { 0 };
1553cabdff1aSopenharmony_ci                v8i16 q2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q2_org);
1554cabdff1aSopenharmony_ci                v8i16 q2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q2_org);
1555cabdff1aSopenharmony_ci
1556cabdff1aSopenharmony_ci                AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, q1_org_r, q2_org_r,
1557cabdff1aSopenharmony_ci                                 negate_tc_r, tc_r, q1_r);
1558cabdff1aSopenharmony_ci                AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, q1_org_l, q2_org_l,
1559cabdff1aSopenharmony_ci                                 i16_negatetc_l, tc_l, q1_l);
1560cabdff1aSopenharmony_ci
1561cabdff1aSopenharmony_ci                q1 = (v16u8) __msa_pckev_b((v16i8) q1_l, (v16i8) q1_r);
1562cabdff1aSopenharmony_ci                q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
1563cabdff1aSopenharmony_ci                ST_UB(q1_org, data + image_width);
1564cabdff1aSopenharmony_ci
1565cabdff1aSopenharmony_ci                is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
1566cabdff1aSopenharmony_ci                tc = tc + (v16i8) is_less_than_beta;
1567cabdff1aSopenharmony_ci            }
1568cabdff1aSopenharmony_ci            {
1569cabdff1aSopenharmony_ci                v16i8 negate_thresh, sign_negate_thresh;
1570cabdff1aSopenharmony_ci                v8i16 threshold_r, threshold_l;
1571cabdff1aSopenharmony_ci                v8i16 negate_thresh_l, negate_thresh_r;
1572cabdff1aSopenharmony_ci
1573cabdff1aSopenharmony_ci                negate_thresh = zero - tc;
1574cabdff1aSopenharmony_ci                sign_negate_thresh = __msa_clti_s_b(negate_thresh, 0);
1575cabdff1aSopenharmony_ci
1576cabdff1aSopenharmony_ci                ILVR_B2_SH(zero, tc, sign_negate_thresh, negate_thresh,
1577cabdff1aSopenharmony_ci                           threshold_r, negate_thresh_r);
1578cabdff1aSopenharmony_ci                AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r,
1579cabdff1aSopenharmony_ci                             negate_thresh_r, threshold_r, p0_r, q0_r);
1580cabdff1aSopenharmony_ci
1581cabdff1aSopenharmony_ci                threshold_l = (v8i16) __msa_ilvl_b(zero, tc);
1582cabdff1aSopenharmony_ci                negate_thresh_l = (v8i16) __msa_ilvl_b(sign_negate_thresh,
1583cabdff1aSopenharmony_ci                                                       negate_thresh);
1584cabdff1aSopenharmony_ci                AVC_LPF_P0Q0(q0_org_l, p0_org_l, p1_org_l, q1_org_l,
1585cabdff1aSopenharmony_ci                             negate_thresh_l, threshold_l, p0_l, q0_l);
1586cabdff1aSopenharmony_ci            }
1587cabdff1aSopenharmony_ci
1588cabdff1aSopenharmony_ci            PCKEV_B2_UB(p0_l, p0_r, q0_l, q0_r, p0, q0);
1589cabdff1aSopenharmony_ci
1590cabdff1aSopenharmony_ci            p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
1591cabdff1aSopenharmony_ci            q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
1592cabdff1aSopenharmony_ci
1593cabdff1aSopenharmony_ci            ST_UB(p0_org, (data - image_width));
1594cabdff1aSopenharmony_ci            ST_UB(q0_org, data);
1595cabdff1aSopenharmony_ci        }
1596cabdff1aSopenharmony_ci    }
1597cabdff1aSopenharmony_ci}
1598cabdff1aSopenharmony_ci
1599cabdff1aSopenharmony_cistatic void avc_h_loop_filter_luma_mbaff_msa(uint8_t *in, ptrdiff_t stride,
1600cabdff1aSopenharmony_ci                                             int32_t alpha_in, int32_t beta_in,
1601cabdff1aSopenharmony_ci                                             int8_t *tc0)
1602cabdff1aSopenharmony_ci{
1603cabdff1aSopenharmony_ci    uint8_t *data = in;
1604cabdff1aSopenharmony_ci    uint32_t out0, out1, out2, out3;
1605cabdff1aSopenharmony_ci    uint64_t load;
1606cabdff1aSopenharmony_ci    uint32_t tc_val;
1607cabdff1aSopenharmony_ci    v16u8 alpha, beta;
1608cabdff1aSopenharmony_ci    v16i8 inp0 = { 0 };
1609cabdff1aSopenharmony_ci    v16i8 inp1 = { 0 };
1610cabdff1aSopenharmony_ci    v16i8 inp2 = { 0 };
1611cabdff1aSopenharmony_ci    v16i8 inp3 = { 0 };
1612cabdff1aSopenharmony_ci    v16i8 inp4 = { 0 };
1613cabdff1aSopenharmony_ci    v16i8 inp5 = { 0 };
1614cabdff1aSopenharmony_ci    v16i8 inp6 = { 0 };
1615cabdff1aSopenharmony_ci    v16i8 inp7 = { 0 };
1616cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3;
1617cabdff1aSopenharmony_ci    v8i16 src4, src5, src6, src7;
1618cabdff1aSopenharmony_ci    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, p2_asub_p0, q2_asub_q0;
1619cabdff1aSopenharmony_ci    v16u8 is_less_than, is_less_than_alpha, is_less_than_beta;
1620cabdff1aSopenharmony_ci    v16u8 is_less_than_beta1, is_less_than_beta2;
1621cabdff1aSopenharmony_ci    v8i16 tc, tc_orig_r, tc_plus1;
1622cabdff1aSopenharmony_ci    v16u8 is_tc_orig1, is_tc_orig2, tc_orig = { 0 };
1623cabdff1aSopenharmony_ci    v8i16 p0_ilvr_q0, p0_add_q0, q0_sub_p0, p1_sub_q1;
1624cabdff1aSopenharmony_ci    v8i16 src2_r, src3_r;
1625cabdff1aSopenharmony_ci    v8i16 p2_r, p1_r, q2_r, q1_r;
1626cabdff1aSopenharmony_ci    v16u8 p2, q2, p0, q0;
1627cabdff1aSopenharmony_ci    v4i32 dst0, dst1;
1628cabdff1aSopenharmony_ci    v16i8 zeros = { 0 };
1629cabdff1aSopenharmony_ci
1630cabdff1aSopenharmony_ci    alpha = (v16u8) __msa_fill_b(alpha_in);
1631cabdff1aSopenharmony_ci    beta = (v16u8) __msa_fill_b(beta_in);
1632cabdff1aSopenharmony_ci
1633cabdff1aSopenharmony_ci    if (tc0[0] < 0) {
1634cabdff1aSopenharmony_ci        data += (2 * stride);
1635cabdff1aSopenharmony_ci    } else {
1636cabdff1aSopenharmony_ci        load = LD(data - 3);
1637cabdff1aSopenharmony_ci        inp0 = (v16i8) __msa_insert_d((v2i64) inp0, 0, load);
1638cabdff1aSopenharmony_ci        load = LD(data - 3 + stride);
1639cabdff1aSopenharmony_ci        inp1 = (v16i8) __msa_insert_d((v2i64) inp1, 0, load);
1640cabdff1aSopenharmony_ci        data += (2 * stride);
1641cabdff1aSopenharmony_ci    }
1642cabdff1aSopenharmony_ci
1643cabdff1aSopenharmony_ci    if (tc0[1] < 0) {
1644cabdff1aSopenharmony_ci        data += (2 * stride);
1645cabdff1aSopenharmony_ci    } else {
1646cabdff1aSopenharmony_ci        load = LD(data - 3);
1647cabdff1aSopenharmony_ci        inp2 = (v16i8) __msa_insert_d((v2i64) inp2, 0, load);
1648cabdff1aSopenharmony_ci        load = LD(data - 3 + stride);
1649cabdff1aSopenharmony_ci        inp3 = (v16i8) __msa_insert_d((v2i64) inp3, 0, load);
1650cabdff1aSopenharmony_ci        data += (2 * stride);
1651cabdff1aSopenharmony_ci    }
1652cabdff1aSopenharmony_ci
1653cabdff1aSopenharmony_ci    if (tc0[2] < 0) {
1654cabdff1aSopenharmony_ci        data += (2 * stride);
1655cabdff1aSopenharmony_ci    } else {
1656cabdff1aSopenharmony_ci        load = LD(data - 3);
1657cabdff1aSopenharmony_ci        inp4 = (v16i8) __msa_insert_d((v2i64) inp4, 0, load);
1658cabdff1aSopenharmony_ci        load = LD(data - 3 + stride);
1659cabdff1aSopenharmony_ci        inp5 = (v16i8) __msa_insert_d((v2i64) inp5, 0, load);
1660cabdff1aSopenharmony_ci        data += (2 * stride);
1661cabdff1aSopenharmony_ci    }
1662cabdff1aSopenharmony_ci
1663cabdff1aSopenharmony_ci    if (tc0[3] < 0) {
1664cabdff1aSopenharmony_ci        data += (2 * stride);
1665cabdff1aSopenharmony_ci    } else {
1666cabdff1aSopenharmony_ci        load = LD(data - 3);
1667cabdff1aSopenharmony_ci        inp6 = (v16i8) __msa_insert_d((v2i64) inp6, 0, load);
1668cabdff1aSopenharmony_ci        load = LD(data - 3 + stride);
1669cabdff1aSopenharmony_ci        inp7 = (v16i8) __msa_insert_d((v2i64) inp7, 0, load);
1670cabdff1aSopenharmony_ci        data += (2 * stride);
1671cabdff1aSopenharmony_ci    }
1672cabdff1aSopenharmony_ci
1673cabdff1aSopenharmony_ci    ILVR_B4_SB(inp1, inp0, inp3, inp2, inp5, inp4, inp7, inp6,
1674cabdff1aSopenharmony_ci               src0, src1, src2, src3);
1675cabdff1aSopenharmony_ci
1676cabdff1aSopenharmony_ci    ILVR_H2_SH(src1, src0, src3, src2, src4, src6);
1677cabdff1aSopenharmony_ci    ILVL_H2_SH(src1, src0, src3, src2, src5, src7);
1678cabdff1aSopenharmony_ci
1679cabdff1aSopenharmony_ci    src0 = (v16i8) __msa_ilvr_w((v4i32) src6, (v4i32) src4);
1680cabdff1aSopenharmony_ci    src1 = __msa_sldi_b(zeros, (v16i8) src0, 8);
1681cabdff1aSopenharmony_ci    src2 = (v16i8) __msa_ilvl_w((v4i32) src6, (v4i32) src4);
1682cabdff1aSopenharmony_ci    src3 = __msa_sldi_b(zeros, (v16i8) src2, 8);
1683cabdff1aSopenharmony_ci    src4 = (v8i16) __msa_ilvr_w((v4i32) src7, (v4i32) src5);
1684cabdff1aSopenharmony_ci    src5 = (v8i16) __msa_sldi_b(zeros, (v16i8) src4, 8);
1685cabdff1aSopenharmony_ci
1686cabdff1aSopenharmony_ci    p0_asub_q0 = __msa_asub_u_b((v16u8) src2, (v16u8) src3);
1687cabdff1aSopenharmony_ci    p1_asub_p0 = __msa_asub_u_b((v16u8) src1, (v16u8) src2);
1688cabdff1aSopenharmony_ci    q1_asub_q0 = __msa_asub_u_b((v16u8) src4, (v16u8) src3);
1689cabdff1aSopenharmony_ci    p2_asub_p0 = __msa_asub_u_b((v16u8) src0, (v16u8) src2);
1690cabdff1aSopenharmony_ci    q2_asub_q0 = __msa_asub_u_b((v16u8) src5, (v16u8) src3);
1691cabdff1aSopenharmony_ci
1692cabdff1aSopenharmony_ci    is_less_than_alpha = (p0_asub_q0 < alpha);
1693cabdff1aSopenharmony_ci    is_less_than_beta = (p1_asub_p0 < beta);
1694cabdff1aSopenharmony_ci    is_less_than = is_less_than_alpha & is_less_than_beta;
1695cabdff1aSopenharmony_ci    is_less_than_beta = (q1_asub_q0 < beta);
1696cabdff1aSopenharmony_ci    is_less_than = is_less_than_beta & is_less_than;
1697cabdff1aSopenharmony_ci
1698cabdff1aSopenharmony_ci    is_less_than_beta1 = (p2_asub_p0 < beta);
1699cabdff1aSopenharmony_ci    is_less_than_beta2 = (q2_asub_q0 < beta);
1700cabdff1aSopenharmony_ci
1701cabdff1aSopenharmony_ci    p0_ilvr_q0 = (v8i16) __msa_ilvr_b((v16i8) src3, (v16i8) src2);
1702cabdff1aSopenharmony_ci    p0_add_q0 = (v8i16) __msa_hadd_u_h((v16u8) p0_ilvr_q0, (v16u8) p0_ilvr_q0);
1703cabdff1aSopenharmony_ci    p0_add_q0 = __msa_srari_h(p0_add_q0, 1);
1704cabdff1aSopenharmony_ci
1705cabdff1aSopenharmony_ci    ILVR_B2_SH(zeros, src0, zeros, src1, p2_r, p1_r);
1706cabdff1aSopenharmony_ci    p2_r += p0_add_q0;
1707cabdff1aSopenharmony_ci    p2_r >>= 1;
1708cabdff1aSopenharmony_ci    p2_r -= p1_r;
1709cabdff1aSopenharmony_ci    ILVR_B2_SH(zeros, src5, zeros, src4, q2_r, q1_r);
1710cabdff1aSopenharmony_ci    q2_r += p0_add_q0;
1711cabdff1aSopenharmony_ci    q2_r >>= 1;
1712cabdff1aSopenharmony_ci    q2_r -= q1_r;
1713cabdff1aSopenharmony_ci
1714cabdff1aSopenharmony_ci    tc_val = LW(tc0);
1715cabdff1aSopenharmony_ci    tc_orig = (v16u8) __msa_insert_w((v4i32) tc_orig, 0, tc_val);
1716cabdff1aSopenharmony_ci    tc_orig = (v16u8) __msa_ilvr_b((v16i8) tc_orig, (v16i8) tc_orig);
1717cabdff1aSopenharmony_ci    is_tc_orig1 = tc_orig;
1718cabdff1aSopenharmony_ci    is_tc_orig2 = tc_orig;
1719cabdff1aSopenharmony_ci    tc_orig_r = (v8i16) __msa_ilvr_b(zeros, (v16i8) tc_orig);
1720cabdff1aSopenharmony_ci    tc = tc_orig_r;
1721cabdff1aSopenharmony_ci
1722cabdff1aSopenharmony_ci    CLIP_SH(p2_r, -tc_orig_r, tc_orig_r);
1723cabdff1aSopenharmony_ci    CLIP_SH(q2_r, -tc_orig_r, tc_orig_r);
1724cabdff1aSopenharmony_ci
1725cabdff1aSopenharmony_ci    p2_r += p1_r;
1726cabdff1aSopenharmony_ci    q2_r += q1_r;
1727cabdff1aSopenharmony_ci
1728cabdff1aSopenharmony_ci    PCKEV_B2_UB(p2_r, p2_r, q2_r, q2_r, p2, q2);
1729cabdff1aSopenharmony_ci
1730cabdff1aSopenharmony_ci    is_tc_orig1 = (zeros < is_tc_orig1);
1731cabdff1aSopenharmony_ci    is_tc_orig2 = is_tc_orig1;
1732cabdff1aSopenharmony_ci    is_tc_orig1 = is_less_than_beta1 & is_tc_orig1;
1733cabdff1aSopenharmony_ci    is_tc_orig2 = is_less_than_beta2 & is_tc_orig2;
1734cabdff1aSopenharmony_ci    is_tc_orig1 = is_less_than & is_tc_orig1;
1735cabdff1aSopenharmony_ci    is_tc_orig2 = is_less_than & is_tc_orig2;
1736cabdff1aSopenharmony_ci
1737cabdff1aSopenharmony_ci    p2 = __msa_bmnz_v((v16u8) src1, p2, is_tc_orig1);
1738cabdff1aSopenharmony_ci    q2 = __msa_bmnz_v((v16u8) src4, q2, is_tc_orig2);
1739cabdff1aSopenharmony_ci
1740cabdff1aSopenharmony_ci    q0_sub_p0 = __msa_hsub_u_h((v16u8) p0_ilvr_q0, (v16u8) p0_ilvr_q0);
1741cabdff1aSopenharmony_ci    q0_sub_p0 <<= 2;
1742cabdff1aSopenharmony_ci    p1_sub_q1 = p1_r - q1_r;
1743cabdff1aSopenharmony_ci    q0_sub_p0 += p1_sub_q1;
1744cabdff1aSopenharmony_ci    q0_sub_p0 = __msa_srari_h(q0_sub_p0, 3);
1745cabdff1aSopenharmony_ci
1746cabdff1aSopenharmony_ci    tc_plus1 = tc + 1;
1747cabdff1aSopenharmony_ci    is_less_than_beta1 = (v16u8) __msa_ilvr_b((v16i8) is_less_than_beta1,
1748cabdff1aSopenharmony_ci                                              (v16i8) is_less_than_beta1);
1749cabdff1aSopenharmony_ci    tc = (v8i16) __msa_bmnz_v((v16u8) tc, (v16u8) tc_plus1, is_less_than_beta1);
1750cabdff1aSopenharmony_ci    tc_plus1 = tc + 1;
1751cabdff1aSopenharmony_ci    is_less_than_beta2 = (v16u8) __msa_ilvr_b((v16i8) is_less_than_beta2,
1752cabdff1aSopenharmony_ci                                              (v16i8) is_less_than_beta2);
1753cabdff1aSopenharmony_ci    tc = (v8i16) __msa_bmnz_v((v16u8) tc, (v16u8) tc_plus1, is_less_than_beta2);
1754cabdff1aSopenharmony_ci
1755cabdff1aSopenharmony_ci    CLIP_SH(q0_sub_p0, -tc, tc);
1756cabdff1aSopenharmony_ci
1757cabdff1aSopenharmony_ci    ILVR_B2_SH(zeros, src2, zeros, src3, src2_r, src3_r);
1758cabdff1aSopenharmony_ci    src2_r += q0_sub_p0;
1759cabdff1aSopenharmony_ci    src3_r -= q0_sub_p0;
1760cabdff1aSopenharmony_ci
1761cabdff1aSopenharmony_ci    CLIP_SH2_0_255(src2_r, src3_r);
1762cabdff1aSopenharmony_ci
1763cabdff1aSopenharmony_ci    PCKEV_B2_UB(src2_r, src2_r, src3_r, src3_r, p0, q0);
1764cabdff1aSopenharmony_ci
1765cabdff1aSopenharmony_ci    p0 = __msa_bmnz_v((v16u8) src2, p0, is_less_than);
1766cabdff1aSopenharmony_ci    q0 = __msa_bmnz_v((v16u8) src3, q0, is_less_than);
1767cabdff1aSopenharmony_ci
1768cabdff1aSopenharmony_ci    ILVR_B2_UB(p0, p2, q2, q0, p2, q2);
1769cabdff1aSopenharmony_ci
1770cabdff1aSopenharmony_ci    ILVRL_H2_SW(q2, p2, dst0, dst1);
1771cabdff1aSopenharmony_ci
1772cabdff1aSopenharmony_ci    data = in;
1773cabdff1aSopenharmony_ci
1774cabdff1aSopenharmony_ci    out0 = __msa_copy_u_w(dst0, 0);
1775cabdff1aSopenharmony_ci    out1 = __msa_copy_u_w(dst0, 1);
1776cabdff1aSopenharmony_ci    out2 = __msa_copy_u_w(dst0, 2);
1777cabdff1aSopenharmony_ci    out3 = __msa_copy_u_w(dst0, 3);
1778cabdff1aSopenharmony_ci
1779cabdff1aSopenharmony_ci    if (tc0[0] < 0) {
1780cabdff1aSopenharmony_ci        data += (2 * stride);
1781cabdff1aSopenharmony_ci    } else {
1782cabdff1aSopenharmony_ci        SW(out0, (data - 2));
1783cabdff1aSopenharmony_ci        data += stride;
1784cabdff1aSopenharmony_ci        SW(out1, (data - 2));
1785cabdff1aSopenharmony_ci        data += stride;
1786cabdff1aSopenharmony_ci    }
1787cabdff1aSopenharmony_ci
1788cabdff1aSopenharmony_ci    if (tc0[1] < 0) {
1789cabdff1aSopenharmony_ci        data += (2 * stride);
1790cabdff1aSopenharmony_ci    } else {
1791cabdff1aSopenharmony_ci        SW(out2, (data - 2));
1792cabdff1aSopenharmony_ci        data += stride;
1793cabdff1aSopenharmony_ci        SW(out3, (data - 2));
1794cabdff1aSopenharmony_ci        data += stride;
1795cabdff1aSopenharmony_ci    }
1796cabdff1aSopenharmony_ci
1797cabdff1aSopenharmony_ci    out0 = __msa_copy_u_w(dst1, 0);
1798cabdff1aSopenharmony_ci    out1 = __msa_copy_u_w(dst1, 1);
1799cabdff1aSopenharmony_ci    out2 = __msa_copy_u_w(dst1, 2);
1800cabdff1aSopenharmony_ci    out3 = __msa_copy_u_w(dst1, 3);
1801cabdff1aSopenharmony_ci
1802cabdff1aSopenharmony_ci    if (tc0[2] < 0) {
1803cabdff1aSopenharmony_ci        data += (2 * stride);
1804cabdff1aSopenharmony_ci    } else {
1805cabdff1aSopenharmony_ci        SW(out0, (data - 2));
1806cabdff1aSopenharmony_ci        data += stride;
1807cabdff1aSopenharmony_ci        SW(out1, (data - 2));
1808cabdff1aSopenharmony_ci        data += stride;
1809cabdff1aSopenharmony_ci    }
1810cabdff1aSopenharmony_ci
1811cabdff1aSopenharmony_ci    if (tc0[3] >= 0) {
1812cabdff1aSopenharmony_ci        SW(out2, (data - 2));
1813cabdff1aSopenharmony_ci        data += stride;
1814cabdff1aSopenharmony_ci        SW(out3, (data - 2));
1815cabdff1aSopenharmony_ci    }
1816cabdff1aSopenharmony_ci}
1817cabdff1aSopenharmony_ci
1818cabdff1aSopenharmony_cistatic void avc_loopfilter_cb_or_cr_inter_edge_hor_msa(uint8_t *data,
1819cabdff1aSopenharmony_ci                                                       uint8_t bs0, uint8_t bs1,
1820cabdff1aSopenharmony_ci                                                       uint8_t bs2, uint8_t bs3,
1821cabdff1aSopenharmony_ci                                                       uint8_t tc0, uint8_t tc1,
1822cabdff1aSopenharmony_ci                                                       uint8_t tc2, uint8_t tc3,
1823cabdff1aSopenharmony_ci                                                       uint8_t alpha_in,
1824cabdff1aSopenharmony_ci                                                       uint8_t beta_in,
1825cabdff1aSopenharmony_ci                                                       ptrdiff_t img_width)
1826cabdff1aSopenharmony_ci{
1827cabdff1aSopenharmony_ci    v16u8 alpha, beta;
1828cabdff1aSopenharmony_ci    v8i16 tmp_vec;
1829cabdff1aSopenharmony_ci    v8i16 bs = { 0 };
1830cabdff1aSopenharmony_ci    v8i16 tc = { 0 };
1831cabdff1aSopenharmony_ci    v16u8 p0, q0, p0_asub_q0, p1_asub_p0, q1_asub_q0;
1832cabdff1aSopenharmony_ci    v16u8 is_less_than;
1833cabdff1aSopenharmony_ci    v16u8 is_less_than_beta, is_less_than_alpha, is_bs_greater_than0;
1834cabdff1aSopenharmony_ci    v8i16 p0_r, q0_r;
1835cabdff1aSopenharmony_ci    v16u8 p1_org, p0_org, q0_org, q1_org;
1836cabdff1aSopenharmony_ci    v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
1837cabdff1aSopenharmony_ci    v16i8 negate_tc, sign_negate_tc;
1838cabdff1aSopenharmony_ci    v8i16 tc_r, negate_tc_r;
1839cabdff1aSopenharmony_ci    v16i8 zero = { 0 };
1840cabdff1aSopenharmony_ci
1841cabdff1aSopenharmony_ci    tmp_vec = (v8i16) __msa_fill_b(bs0);
1842cabdff1aSopenharmony_ci    bs = __msa_insve_h(bs, 0, tmp_vec);
1843cabdff1aSopenharmony_ci    tmp_vec = (v8i16) __msa_fill_b(bs1);
1844cabdff1aSopenharmony_ci    bs = __msa_insve_h(bs, 1, tmp_vec);
1845cabdff1aSopenharmony_ci    tmp_vec = (v8i16) __msa_fill_b(bs2);
1846cabdff1aSopenharmony_ci    bs = __msa_insve_h(bs, 2, tmp_vec);
1847cabdff1aSopenharmony_ci    tmp_vec = (v8i16) __msa_fill_b(bs3);
1848cabdff1aSopenharmony_ci    bs = __msa_insve_h(bs, 3, tmp_vec);
1849cabdff1aSopenharmony_ci
1850cabdff1aSopenharmony_ci    if (!__msa_test_bz_v((v16u8) bs)) {
1851cabdff1aSopenharmony_ci        tmp_vec = (v8i16) __msa_fill_b(tc0);
1852cabdff1aSopenharmony_ci        tc = __msa_insve_h(tc, 0, tmp_vec);
1853cabdff1aSopenharmony_ci        tmp_vec = (v8i16) __msa_fill_b(tc1);
1854cabdff1aSopenharmony_ci        tc = __msa_insve_h(tc, 1, tmp_vec);
1855cabdff1aSopenharmony_ci        tmp_vec = (v8i16) __msa_fill_b(tc2);
1856cabdff1aSopenharmony_ci        tc = __msa_insve_h(tc, 2, tmp_vec);
1857cabdff1aSopenharmony_ci        tmp_vec = (v8i16) __msa_fill_b(tc3);
1858cabdff1aSopenharmony_ci        tc = __msa_insve_h(tc, 3, tmp_vec);
1859cabdff1aSopenharmony_ci
1860cabdff1aSopenharmony_ci        is_bs_greater_than0 = (v16u8) (zero < (v16i8) bs);
1861cabdff1aSopenharmony_ci
1862cabdff1aSopenharmony_ci        alpha = (v16u8) __msa_fill_b(alpha_in);
1863cabdff1aSopenharmony_ci        beta = (v16u8) __msa_fill_b(beta_in);
1864cabdff1aSopenharmony_ci
1865cabdff1aSopenharmony_ci        LD_UB4(data - (img_width << 1), img_width,
1866cabdff1aSopenharmony_ci               p1_org, p0_org, q0_org, q1_org);
1867cabdff1aSopenharmony_ci
1868cabdff1aSopenharmony_ci        p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
1869cabdff1aSopenharmony_ci        p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
1870cabdff1aSopenharmony_ci        q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
1871cabdff1aSopenharmony_ci
1872cabdff1aSopenharmony_ci        is_less_than_alpha = (p0_asub_q0 < alpha);
1873cabdff1aSopenharmony_ci        is_less_than_beta = (p1_asub_p0 < beta);
1874cabdff1aSopenharmony_ci        is_less_than = is_less_than_beta & is_less_than_alpha;
1875cabdff1aSopenharmony_ci        is_less_than_beta = (q1_asub_q0 < beta);
1876cabdff1aSopenharmony_ci        is_less_than = is_less_than_beta & is_less_than;
1877cabdff1aSopenharmony_ci        is_less_than = is_less_than & is_bs_greater_than0;
1878cabdff1aSopenharmony_ci
1879cabdff1aSopenharmony_ci        is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);
1880cabdff1aSopenharmony_ci
1881cabdff1aSopenharmony_ci        if (!__msa_test_bz_v(is_less_than)) {
1882cabdff1aSopenharmony_ci            negate_tc = zero - (v16i8) tc;
1883cabdff1aSopenharmony_ci            sign_negate_tc = __msa_clti_s_b(negate_tc, 0);
1884cabdff1aSopenharmony_ci
1885cabdff1aSopenharmony_ci            ILVR_B2_SH(zero, tc, sign_negate_tc, negate_tc, tc_r, negate_tc_r);
1886cabdff1aSopenharmony_ci
1887cabdff1aSopenharmony_ci            ILVR_B4_SH(zero, p1_org, zero, p0_org, zero, q0_org, zero, q1_org,
1888cabdff1aSopenharmony_ci                       p1_org_r, p0_org_r, q0_org_r, q1_org_r);
1889cabdff1aSopenharmony_ci
1890cabdff1aSopenharmony_ci            AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r, negate_tc_r,
1891cabdff1aSopenharmony_ci                         tc_r, p0_r, q0_r);
1892cabdff1aSopenharmony_ci
1893cabdff1aSopenharmony_ci            PCKEV_B2_UB(zero, p0_r, zero, q0_r, p0, q0);
1894cabdff1aSopenharmony_ci
1895cabdff1aSopenharmony_ci            p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
1896cabdff1aSopenharmony_ci            q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
1897cabdff1aSopenharmony_ci
1898cabdff1aSopenharmony_ci            ST_UB(q0_org, data);
1899cabdff1aSopenharmony_ci            ST_UB(p0_org, (data - img_width));
1900cabdff1aSopenharmony_ci        }
1901cabdff1aSopenharmony_ci    }
1902cabdff1aSopenharmony_ci}
1903cabdff1aSopenharmony_ci
1904cabdff1aSopenharmony_cistatic void avc_loopfilter_cb_or_cr_inter_edge_ver_msa(uint8_t *data,
1905cabdff1aSopenharmony_ci                                                       uint8_t bs0, uint8_t bs1,
1906cabdff1aSopenharmony_ci                                                       uint8_t bs2, uint8_t bs3,
1907cabdff1aSopenharmony_ci                                                       uint8_t tc0, uint8_t tc1,
1908cabdff1aSopenharmony_ci                                                       uint8_t tc2, uint8_t tc3,
1909cabdff1aSopenharmony_ci                                                       uint8_t alpha_in,
1910cabdff1aSopenharmony_ci                                                       uint8_t beta_in,
1911cabdff1aSopenharmony_ci                                                       ptrdiff_t img_width)
1912cabdff1aSopenharmony_ci{
1913cabdff1aSopenharmony_ci    uint8_t *src;
1914cabdff1aSopenharmony_ci    v16u8 alpha, beta;
1915cabdff1aSopenharmony_ci    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
1916cabdff1aSopenharmony_ci    v16u8 is_less_than, is_less_than_beta, is_less_than_alpha;
1917cabdff1aSopenharmony_ci    v16u8 p0, q0;
1918cabdff1aSopenharmony_ci    v8i16 p0_r = { 0 };
1919cabdff1aSopenharmony_ci    v8i16 q0_r = { 0 };
1920cabdff1aSopenharmony_ci    v16u8 p1_org, p0_org, q0_org, q1_org;
1921cabdff1aSopenharmony_ci    v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
1922cabdff1aSopenharmony_ci    v16u8 is_bs_greater_than0;
1923cabdff1aSopenharmony_ci    v8i16 tc_r, negate_tc_r;
1924cabdff1aSopenharmony_ci    v16i8 negate_tc, sign_negate_tc;
1925cabdff1aSopenharmony_ci    v16i8 zero = { 0 };
1926cabdff1aSopenharmony_ci    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
1927cabdff1aSopenharmony_ci    v8i16 tmp1, tmp_vec, bs = { 0 };
1928cabdff1aSopenharmony_ci    v8i16 tc = { 0 };
1929cabdff1aSopenharmony_ci
1930cabdff1aSopenharmony_ci    tmp_vec = (v8i16) __msa_fill_b(bs0);
1931cabdff1aSopenharmony_ci    bs = __msa_insve_h(bs, 0, tmp_vec);
1932cabdff1aSopenharmony_ci    tmp_vec = (v8i16) __msa_fill_b(bs1);
1933cabdff1aSopenharmony_ci    bs = __msa_insve_h(bs, 1, tmp_vec);
1934cabdff1aSopenharmony_ci    tmp_vec = (v8i16) __msa_fill_b(bs2);
1935cabdff1aSopenharmony_ci    bs = __msa_insve_h(bs, 2, tmp_vec);
1936cabdff1aSopenharmony_ci    tmp_vec = (v8i16) __msa_fill_b(bs3);
1937cabdff1aSopenharmony_ci    bs = __msa_insve_h(bs, 3, tmp_vec);
1938cabdff1aSopenharmony_ci
1939cabdff1aSopenharmony_ci    if (!__msa_test_bz_v((v16u8) bs)) {
1940cabdff1aSopenharmony_ci        tmp_vec = (v8i16) __msa_fill_b(tc0);
1941cabdff1aSopenharmony_ci        tc = __msa_insve_h(tc, 0, tmp_vec);
1942cabdff1aSopenharmony_ci        tmp_vec = (v8i16) __msa_fill_b(tc1);
1943cabdff1aSopenharmony_ci        tc = __msa_insve_h(tc, 1, tmp_vec);
1944cabdff1aSopenharmony_ci        tmp_vec = (v8i16) __msa_fill_b(tc2);
1945cabdff1aSopenharmony_ci        tc = __msa_insve_h(tc, 2, tmp_vec);
1946cabdff1aSopenharmony_ci        tmp_vec = (v8i16) __msa_fill_b(tc3);
1947cabdff1aSopenharmony_ci        tc = __msa_insve_h(tc, 3, tmp_vec);
1948cabdff1aSopenharmony_ci
1949cabdff1aSopenharmony_ci        is_bs_greater_than0 = (v16u8) (zero < (v16i8) bs);
1950cabdff1aSopenharmony_ci
1951cabdff1aSopenharmony_ci        LD_UB8((data - 2), img_width,
1952cabdff1aSopenharmony_ci               row0, row1, row2, row3, row4, row5, row6, row7);
1953cabdff1aSopenharmony_ci
1954cabdff1aSopenharmony_ci        TRANSPOSE8x4_UB_UB(row0, row1, row2, row3,
1955cabdff1aSopenharmony_ci                           row4, row5, row6, row7,
1956cabdff1aSopenharmony_ci                           p1_org, p0_org, q0_org, q1_org);
1957cabdff1aSopenharmony_ci
1958cabdff1aSopenharmony_ci        p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
1959cabdff1aSopenharmony_ci        p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
1960cabdff1aSopenharmony_ci        q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
1961cabdff1aSopenharmony_ci
1962cabdff1aSopenharmony_ci        alpha = (v16u8) __msa_fill_b(alpha_in);
1963cabdff1aSopenharmony_ci        beta = (v16u8) __msa_fill_b(beta_in);
1964cabdff1aSopenharmony_ci
1965cabdff1aSopenharmony_ci        is_less_than_alpha = (p0_asub_q0 < alpha);
1966cabdff1aSopenharmony_ci        is_less_than_beta = (p1_asub_p0 < beta);
1967cabdff1aSopenharmony_ci        is_less_than = is_less_than_beta & is_less_than_alpha;
1968cabdff1aSopenharmony_ci        is_less_than_beta = (q1_asub_q0 < beta);
1969cabdff1aSopenharmony_ci        is_less_than = is_less_than_beta & is_less_than;
1970cabdff1aSopenharmony_ci        is_less_than = is_bs_greater_than0 & is_less_than;
1971cabdff1aSopenharmony_ci
1972cabdff1aSopenharmony_ci        is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);
1973cabdff1aSopenharmony_ci
1974cabdff1aSopenharmony_ci        if (!__msa_test_bz_v(is_less_than)) {
1975cabdff1aSopenharmony_ci            ILVR_B4_SH(zero, p1_org, zero, p0_org, zero, q0_org, zero, q1_org,
1976cabdff1aSopenharmony_ci                       p1_org_r, p0_org_r, q0_org_r, q1_org_r);
1977cabdff1aSopenharmony_ci
1978cabdff1aSopenharmony_ci            negate_tc = zero - (v16i8) tc;
1979cabdff1aSopenharmony_ci            sign_negate_tc = __msa_clti_s_b(negate_tc, 0);
1980cabdff1aSopenharmony_ci
1981cabdff1aSopenharmony_ci            ILVR_B2_SH(sign_negate_tc, negate_tc, zero, tc, negate_tc_r, tc_r);
1982cabdff1aSopenharmony_ci
1983cabdff1aSopenharmony_ci            AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r, negate_tc_r,
1984cabdff1aSopenharmony_ci                         tc_r, p0_r, q0_r);
1985cabdff1aSopenharmony_ci
1986cabdff1aSopenharmony_ci            PCKEV_B2_UB(zero, p0_r, zero, q0_r, p0, q0);
1987cabdff1aSopenharmony_ci
1988cabdff1aSopenharmony_ci            p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
1989cabdff1aSopenharmony_ci            q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
1990cabdff1aSopenharmony_ci            tmp1 = (v8i16) __msa_ilvr_b((v16i8) q0_org, (v16i8) p0_org);
1991cabdff1aSopenharmony_ci            src = data - 1;
1992cabdff1aSopenharmony_ci            ST_H4(tmp1, 0, 1, 2, 3, src, img_width);
1993cabdff1aSopenharmony_ci            src += 4 * img_width;
1994cabdff1aSopenharmony_ci            ST_H4(tmp1, 4, 5, 6, 7, src, img_width);
1995cabdff1aSopenharmony_ci        }
1996cabdff1aSopenharmony_ci    }
1997cabdff1aSopenharmony_ci}
1998cabdff1aSopenharmony_ci
1999cabdff1aSopenharmony_cistatic void avc_h_loop_filter_chroma422_msa(uint8_t *src, ptrdiff_t stride,
2000cabdff1aSopenharmony_ci                                            int32_t alpha_in, int32_t beta_in,
2001cabdff1aSopenharmony_ci                                            int8_t *tc0)
2002cabdff1aSopenharmony_ci{
2003cabdff1aSopenharmony_ci    int32_t col, tc_val;
2004cabdff1aSopenharmony_ci    v16u8 alpha, beta, res;
2005cabdff1aSopenharmony_ci
2006cabdff1aSopenharmony_ci    alpha = (v16u8) __msa_fill_b(alpha_in);
2007cabdff1aSopenharmony_ci    beta = (v16u8) __msa_fill_b(beta_in);
2008cabdff1aSopenharmony_ci
2009cabdff1aSopenharmony_ci    for (col = 0; col < 4; col++) {
2010cabdff1aSopenharmony_ci        tc_val = (tc0[col] - 1) + 1;
2011cabdff1aSopenharmony_ci
2012cabdff1aSopenharmony_ci        if (tc_val <= 0) {
2013cabdff1aSopenharmony_ci            src += (4 * stride);
2014cabdff1aSopenharmony_ci            continue;
2015cabdff1aSopenharmony_ci        }
2016cabdff1aSopenharmony_ci
2017cabdff1aSopenharmony_ci        AVC_LPF_H_CHROMA_422(src, stride, tc_val, alpha, beta, res);
2018cabdff1aSopenharmony_ci        ST_H4(res, 0, 1, 2, 3, (src - 1), stride);
2019cabdff1aSopenharmony_ci        src += (4 * stride);
2020cabdff1aSopenharmony_ci    }
2021cabdff1aSopenharmony_ci}
2022cabdff1aSopenharmony_ci
2023cabdff1aSopenharmony_cistatic void avc_h_loop_filter_chroma422_mbaff_msa(uint8_t *src,
2024cabdff1aSopenharmony_ci                                                  ptrdiff_t stride,
2025cabdff1aSopenharmony_ci                                                  int32_t alpha_in,
2026cabdff1aSopenharmony_ci                                                  int32_t beta_in,
2027cabdff1aSopenharmony_ci                                                  int8_t *tc0)
2028cabdff1aSopenharmony_ci{
2029cabdff1aSopenharmony_ci    int32_t col, tc_val;
2030cabdff1aSopenharmony_ci    int16_t out0, out1;
2031cabdff1aSopenharmony_ci    v16u8 alpha, beta, res;
2032cabdff1aSopenharmony_ci
2033cabdff1aSopenharmony_ci    alpha = (v16u8) __msa_fill_b(alpha_in);
2034cabdff1aSopenharmony_ci    beta = (v16u8) __msa_fill_b(beta_in);
2035cabdff1aSopenharmony_ci
2036cabdff1aSopenharmony_ci    for (col = 0; col < 4; col++) {
2037cabdff1aSopenharmony_ci        tc_val = (tc0[col] - 1) + 1;
2038cabdff1aSopenharmony_ci
2039cabdff1aSopenharmony_ci        if (tc_val <= 0) {
2040cabdff1aSopenharmony_ci            src += 4 * stride;
2041cabdff1aSopenharmony_ci            continue;
2042cabdff1aSopenharmony_ci        }
2043cabdff1aSopenharmony_ci
2044cabdff1aSopenharmony_ci        AVC_LPF_H_2BYTE_CHROMA_422(src, stride, tc_val, alpha, beta, res);
2045cabdff1aSopenharmony_ci
2046cabdff1aSopenharmony_ci        out0 = __msa_copy_s_h((v8i16) res, 0);
2047cabdff1aSopenharmony_ci        out1 = __msa_copy_s_h((v8i16) res, 1);
2048cabdff1aSopenharmony_ci
2049cabdff1aSopenharmony_ci        SH(out0, (src - 1));
2050cabdff1aSopenharmony_ci        src += stride;
2051cabdff1aSopenharmony_ci        SH(out1, (src - 1));
2052cabdff1aSopenharmony_ci        src += stride;
2053cabdff1aSopenharmony_ci    }
2054cabdff1aSopenharmony_ci}
2055cabdff1aSopenharmony_ci
/*
 * Public entry point: forwards all arguments, including the tc array
 * itself, to avc_loopfilter_luma_inter_edge_ver_msa().
 *
 * data       pointer handed through to the edge filter
 * img_width  stride handed through to the edge filter
 * alpha      alpha threshold
 * beta       beta threshold
 * tc         clipping values, passed on as a pointer
 */
void ff_h264_h_lpf_luma_inter_msa(uint8_t *data, ptrdiff_t img_width,
                                  int alpha, int beta, int8_t *tc)
{
    avc_loopfilter_luma_inter_edge_ver_msa(data, img_width, alpha, beta, tc);
}
2078cabdff1aSopenharmony_ci
/*
 * Public entry point wrapping avc_loopfilter_luma_inter_edge_hor_msa().
 *
 * For each of the four tc entries a strength flag is derived: 0 when the
 * entry is negative, 1 otherwise. Flags and tc values are then passed on
 * individually together with alpha, beta and the stride.
 */
void ff_h264_v_lpf_luma_inter_msa(uint8_t *data, ptrdiff_t img_width,
                                  int alpha, int beta, int8_t *tc)
{
    uint8_t bs[4];
    int i;

    for (i = 0; i < 4; i++)
        bs[i] = (tc[i] >= 0);

    avc_loopfilter_luma_inter_edge_hor_msa(data, bs[0], bs[1], bs[2], bs[3],
                                           tc[0], tc[1], tc[2], tc[3],
                                           alpha, beta, img_width);
}
2101cabdff1aSopenharmony_ci
/*
 * Public entry point wrapping avc_loopfilter_cb_or_cr_inter_edge_ver_msa().
 *
 * For each of the four tc entries a strength flag is derived: 0 when the
 * entry is negative, 1 otherwise. Flags and tc values are then passed on
 * individually together with alpha, beta and the stride.
 */
void ff_h264_h_lpf_chroma_inter_msa(uint8_t *data, ptrdiff_t img_width,
                                    int alpha, int beta, int8_t *tc)
{
    uint8_t bs[4];
    int i;

    for (i = 0; i < 4; i++)
        bs[i] = (tc[i] >= 0);

    avc_loopfilter_cb_or_cr_inter_edge_ver_msa(data, bs[0], bs[1], bs[2], bs[3],
                                               tc[0], tc[1], tc[2], tc[3],
                                               alpha, beta, img_width);
}
2123cabdff1aSopenharmony_ci
/*
 * Public entry point wrapping avc_loopfilter_cb_or_cr_inter_edge_hor_msa().
 *
 * For each of the four tc entries a strength flag is derived: 0 when the
 * entry is negative, 1 otherwise. Flags and tc values are then passed on
 * individually together with alpha, beta and the stride.
 */
void ff_h264_v_lpf_chroma_inter_msa(uint8_t *data, ptrdiff_t img_width,
                                    int alpha, int beta, int8_t *tc)
{
    uint8_t bs[4];
    int i;

    for (i = 0; i < 4; i++)
        bs[i] = (tc[i] >= 0);

    avc_loopfilter_cb_or_cr_inter_edge_hor_msa(data, bs[0], bs[1], bs[2], bs[3],
                                               tc[0], tc[1], tc[2], tc[3],
                                               alpha, beta, img_width);
}
2145cabdff1aSopenharmony_ci
/*
 * Public entry point: narrows alpha and beta to uint8_t and forwards to
 * avc_loopfilter_luma_intra_edge_ver_msa().
 */
void ff_h264_h_lpf_luma_intra_msa(uint8_t *data, ptrdiff_t img_width,
                                  int alpha, int beta)
{
    const uint8_t alpha_u8 = (uint8_t) alpha;
    const uint8_t beta_u8 = (uint8_t) beta;

    avc_loopfilter_luma_intra_edge_ver_msa(data, alpha_u8, beta_u8, img_width);
}
2153cabdff1aSopenharmony_ci
/*
 * Public entry point: narrows alpha and beta to uint8_t and forwards to
 * avc_loopfilter_luma_intra_edge_hor_msa().
 */
void ff_h264_v_lpf_luma_intra_msa(uint8_t *data, ptrdiff_t img_width,
                                  int alpha, int beta)
{
    const uint8_t alpha_u8 = (uint8_t) alpha;
    const uint8_t beta_u8 = (uint8_t) beta;

    avc_loopfilter_luma_intra_edge_hor_msa(data, alpha_u8, beta_u8, img_width);
}
2161cabdff1aSopenharmony_ci
/*
 * Public entry point: narrows alpha and beta to uint8_t and forwards to
 * avc_loopfilter_cb_or_cr_intra_edge_ver_msa().
 */
void ff_h264_h_lpf_chroma_intra_msa(uint8_t *data, ptrdiff_t img_width,
                                    int alpha, int beta)
{
    const uint8_t alpha_u8 = (uint8_t) alpha;
    const uint8_t beta_u8 = (uint8_t) beta;

    avc_loopfilter_cb_or_cr_intra_edge_ver_msa(data, alpha_u8, beta_u8,
                                               img_width);
}
2169cabdff1aSopenharmony_ci
/*
 * Public entry point: narrows alpha and beta to uint8_t and forwards to
 * avc_loopfilter_cb_or_cr_intra_edge_hor_msa().
 */
void ff_h264_v_lpf_chroma_intra_msa(uint8_t *data, ptrdiff_t img_width,
                                    int alpha, int beta)
{
    const uint8_t alpha_u8 = (uint8_t) alpha;
    const uint8_t beta_u8 = (uint8_t) beta;

    avc_loopfilter_cb_or_cr_intra_edge_hor_msa(data, alpha_u8, beta_u8,
                                               img_width);
}
2177cabdff1aSopenharmony_ci
/*
 * Public entry point: passes every argument unchanged to
 * avc_h_loop_filter_chroma422_msa().
 */
void ff_h264_h_loop_filter_chroma422_msa(uint8_t *src, ptrdiff_t ystride,
                                         int32_t alpha, int32_t beta,
                                         int8_t *tc0)
{
    avc_h_loop_filter_chroma422_msa(src, ystride, alpha, beta, tc0);
}
2185cabdff1aSopenharmony_ci
/*
 * Public entry point: passes every argument unchanged to
 * avc_h_loop_filter_chroma422_mbaff_msa().
 */
void ff_h264_h_loop_filter_chroma422_mbaff_msa(uint8_t *src, ptrdiff_t ystride,
                                               int32_t alpha, int32_t beta,
                                               int8_t *tc0)
{
    avc_h_loop_filter_chroma422_mbaff_msa(src, ystride, alpha, beta, tc0);
}
2194cabdff1aSopenharmony_ci
/*
 * Public entry point: passes every argument unchanged to
 * avc_h_loop_filter_luma_mbaff_msa().
 */
void ff_h264_h_loop_filter_luma_mbaff_msa(uint8_t *src, ptrdiff_t ystride,
                                          int32_t alpha, int32_t beta,
                                          int8_t *tc0)
{
    avc_h_loop_filter_luma_mbaff_msa(src, ystride, alpha, beta, tc0);
}
2203cabdff1aSopenharmony_ci
/*
 * Public entry point: passes every argument unchanged to
 * avc_h_loop_filter_luma_mbaff_intra_msa().
 */
void ff_h264_h_loop_filter_luma_mbaff_intra_msa(uint8_t *src, ptrdiff_t ystride,
                                                int32_t alpha, int32_t beta)
{
    avc_h_loop_filter_luma_mbaff_intra_msa(src, ystride, alpha, beta);
}
2211cabdff1aSopenharmony_ci
2212cabdff1aSopenharmony_civoid ff_weight_h264_pixels16_8_msa(uint8_t *src, ptrdiff_t stride,
2213cabdff1aSopenharmony_ci                                   int height, int log2_denom,
2214cabdff1aSopenharmony_ci                                   int weight_src, int offset_in)
2215cabdff1aSopenharmony_ci{
2216cabdff1aSopenharmony_ci    uint32_t offset_val;
2217cabdff1aSopenharmony_ci    v16i8 zero = { 0 };
2218cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
2219cabdff1aSopenharmony_ci    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2220cabdff1aSopenharmony_ci    v8i16 src0_l, src1_l, src2_l, src3_l, src0_r, src1_r, src2_r, src3_r;
2221cabdff1aSopenharmony_ci    v8i16 src4_l, src5_l, src6_l, src7_l, src4_r, src5_r, src6_r, src7_r;
2222cabdff1aSopenharmony_ci    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2223cabdff1aSopenharmony_ci    v8i16 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
2224cabdff1aSopenharmony_ci    v8i16 wgt, denom, offset;
2225cabdff1aSopenharmony_ci
2226cabdff1aSopenharmony_ci    offset_val = (unsigned) offset_in << log2_denom;
2227cabdff1aSopenharmony_ci
2228cabdff1aSopenharmony_ci    wgt = __msa_fill_h(weight_src);
2229cabdff1aSopenharmony_ci    offset = __msa_fill_h(offset_val);
2230cabdff1aSopenharmony_ci    denom = __msa_fill_h(log2_denom);
2231cabdff1aSopenharmony_ci
2232cabdff1aSopenharmony_ci    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
2233cabdff1aSopenharmony_ci    ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_r, src1_r,
2234cabdff1aSopenharmony_ci               src2_r, src3_r);
2235cabdff1aSopenharmony_ci    ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_l, src1_l,
2236cabdff1aSopenharmony_ci               src2_l, src3_l);
2237cabdff1aSopenharmony_ci    ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_r, src5_r,
2238cabdff1aSopenharmony_ci               src6_r, src7_r);
2239cabdff1aSopenharmony_ci    ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_l, src5_l,
2240cabdff1aSopenharmony_ci               src6_l, src7_l);
2241cabdff1aSopenharmony_ci    MUL4(wgt, src0_r, wgt, src0_l, wgt, src1_r, wgt, src1_l, tmp0, tmp1, tmp2,
2242cabdff1aSopenharmony_ci         tmp3);
2243cabdff1aSopenharmony_ci    MUL4(wgt, src2_r, wgt, src2_l, wgt, src3_r, wgt, src3_l, tmp4, tmp5, tmp6,
2244cabdff1aSopenharmony_ci         tmp7);
2245cabdff1aSopenharmony_ci    MUL4(wgt, src4_r, wgt, src4_l, wgt, src5_r, wgt, src5_l, tmp8, tmp9, tmp10,
2246cabdff1aSopenharmony_ci         tmp11);
2247cabdff1aSopenharmony_ci    MUL4(wgt, src6_r, wgt, src6_l, wgt, src7_r, wgt, src7_l, tmp12, tmp13,
2248cabdff1aSopenharmony_ci         tmp14, tmp15);
2249cabdff1aSopenharmony_ci    ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0,
2250cabdff1aSopenharmony_ci                tmp1, tmp2, tmp3);
2251cabdff1aSopenharmony_ci    ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset, tmp4,
2252cabdff1aSopenharmony_ci                tmp5, tmp6, tmp7);
2253cabdff1aSopenharmony_ci    ADDS_SH4_SH(tmp8, offset, tmp9, offset, tmp10, offset, tmp11, offset, tmp8,
2254cabdff1aSopenharmony_ci                tmp9, tmp10, tmp11);
2255cabdff1aSopenharmony_ci    ADDS_SH4_SH(tmp12, offset, tmp13, offset, tmp14, offset, tmp15, offset,
2256cabdff1aSopenharmony_ci                tmp12, tmp13, tmp14, tmp15);
2257cabdff1aSopenharmony_ci    MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
2258cabdff1aSopenharmony_ci    MAXI_SH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 0);
2259cabdff1aSopenharmony_ci    SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
2260cabdff1aSopenharmony_ci    SRLR_H8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, denom);
2261cabdff1aSopenharmony_ci    SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
2262cabdff1aSopenharmony_ci    SAT_UH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 7);
2263cabdff1aSopenharmony_ci    PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
2264cabdff1aSopenharmony_ci                dst2, dst3);
2265cabdff1aSopenharmony_ci    PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
2266cabdff1aSopenharmony_ci                dst5, dst6, dst7);
2267cabdff1aSopenharmony_ci    ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, src, stride);
2268cabdff1aSopenharmony_ci    src += 8 * stride;
2269cabdff1aSopenharmony_ci
2270cabdff1aSopenharmony_ci    if (16 == height) {
2271cabdff1aSopenharmony_ci        LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
2272cabdff1aSopenharmony_ci        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_r,
2273cabdff1aSopenharmony_ci                   src1_r, src2_r, src3_r);
2274cabdff1aSopenharmony_ci        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_l,
2275cabdff1aSopenharmony_ci                   src1_l, src2_l, src3_l);
2276cabdff1aSopenharmony_ci        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_r,
2277cabdff1aSopenharmony_ci                   src5_r, src6_r, src7_r);
2278cabdff1aSopenharmony_ci        ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_l,
2279cabdff1aSopenharmony_ci                   src5_l, src6_l, src7_l);
2280cabdff1aSopenharmony_ci        MUL4(wgt, src0_r, wgt, src0_l, wgt, src1_r, wgt, src1_l, tmp0, tmp1,
2281cabdff1aSopenharmony_ci             tmp2, tmp3);
2282cabdff1aSopenharmony_ci        MUL4(wgt, src2_r, wgt, src2_l, wgt, src3_r, wgt, src3_l, tmp4, tmp5,
2283cabdff1aSopenharmony_ci             tmp6, tmp7);
2284cabdff1aSopenharmony_ci        MUL4(wgt, src4_r, wgt, src4_l, wgt, src5_r, wgt, src5_l, tmp8, tmp9,
2285cabdff1aSopenharmony_ci             tmp10, tmp11);
2286cabdff1aSopenharmony_ci        MUL4(wgt, src6_r, wgt, src6_l, wgt, src7_r, wgt, src7_l, tmp12, tmp13,
2287cabdff1aSopenharmony_ci             tmp14, tmp15);
2288cabdff1aSopenharmony_ci        ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset,
2289cabdff1aSopenharmony_ci                    tmp0, tmp1, tmp2, tmp3);
2290cabdff1aSopenharmony_ci        ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset,
2291cabdff1aSopenharmony_ci                    tmp4, tmp5, tmp6, tmp7);
2292cabdff1aSopenharmony_ci        ADDS_SH4_SH(tmp8, offset, tmp9, offset, tmp10, offset, tmp11, offset,
2293cabdff1aSopenharmony_ci                    tmp8, tmp9, tmp10, tmp11);
2294cabdff1aSopenharmony_ci        ADDS_SH4_SH(tmp12, offset, tmp13, offset, tmp14, offset, tmp15, offset,
2295cabdff1aSopenharmony_ci                    tmp12, tmp13, tmp14, tmp15);
2296cabdff1aSopenharmony_ci        MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
2297cabdff1aSopenharmony_ci        MAXI_SH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 0);
2298cabdff1aSopenharmony_ci        SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
2299cabdff1aSopenharmony_ci        SRLR_H8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, denom);
2300cabdff1aSopenharmony_ci        SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
2301cabdff1aSopenharmony_ci        SAT_UH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 7);
2302cabdff1aSopenharmony_ci        PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
2303cabdff1aSopenharmony_ci                    dst2, dst3);
2304cabdff1aSopenharmony_ci        PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
2305cabdff1aSopenharmony_ci                    dst5, dst6, dst7);
2306cabdff1aSopenharmony_ci        ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, src, stride);
2307cabdff1aSopenharmony_ci    }
2308cabdff1aSopenharmony_ci}
2309cabdff1aSopenharmony_ci
/*
 * Weight an 8-pixel-wide block in place by dispatching on its height:
 * 4 rows -> avc_wgt_8x4_msa(), 8 rows -> avc_wgt_8x8_msa(), any other
 * height -> avc_wgt_8x16_msa().
 */
void ff_weight_h264_pixels8_8_msa(uint8_t *src, ptrdiff_t stride,
                                  int height, int log2_denom,
                                  int weight_src, int offset)
{
    switch (height) {
    case 4:
        avc_wgt_8x4_msa(src, stride, log2_denom, weight_src, offset);
        break;
    case 8:
        avc_wgt_8x8_msa(src, stride, log2_denom, weight_src, offset);
        break;
    default:
        avc_wgt_8x16_msa(src, stride, log2_denom, weight_src, offset);
        break;
    }
}
2322cabdff1aSopenharmony_ci
/*
 * Weight a 4-pixel-wide block in place by dispatching on its height:
 * 2 rows -> avc_wgt_4x2_msa(), 4 rows -> avc_wgt_4x4_msa(), any other
 * height -> avc_wgt_4x8_msa().
 */
void ff_weight_h264_pixels4_8_msa(uint8_t *src, ptrdiff_t stride,
                                  int height, int log2_denom,
                                  int weight_src, int offset)
{
    switch (height) {
    case 2:
        avc_wgt_4x2_msa(src, stride, log2_denom, weight_src, offset);
        break;
    case 4:
        avc_wgt_4x4_msa(src, stride, log2_denom, weight_src, offset);
        break;
    default:
        avc_wgt_4x8_msa(src, stride, log2_denom, weight_src, offset);
        break;
    }
}
2335cabdff1aSopenharmony_ci
2336cabdff1aSopenharmony_civoid ff_biweight_h264_pixels16_8_msa(uint8_t *dst, uint8_t *src,
2337cabdff1aSopenharmony_ci                                     ptrdiff_t stride, int height,
2338cabdff1aSopenharmony_ci                                     int log2_denom, int weight_dst,
2339cabdff1aSopenharmony_ci                                     int weight_src, int offset_in)
2340cabdff1aSopenharmony_ci{
2341cabdff1aSopenharmony_ci    v16i8 src_wgt, dst_wgt, wgt;
2342cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
2343cabdff1aSopenharmony_ci    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2344cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2345cabdff1aSopenharmony_ci    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
2346cabdff1aSopenharmony_ci    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2347cabdff1aSopenharmony_ci    v8i16 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
2348cabdff1aSopenharmony_ci    v8i16 denom, offset;
2349cabdff1aSopenharmony_ci
2350cabdff1aSopenharmony_ci    offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
2351cabdff1aSopenharmony_ci    offset_in += (128 * (weight_src + weight_dst));
2352cabdff1aSopenharmony_ci
2353cabdff1aSopenharmony_ci    src_wgt = __msa_fill_b(weight_src);
2354cabdff1aSopenharmony_ci    dst_wgt = __msa_fill_b(weight_dst);
2355cabdff1aSopenharmony_ci    offset = __msa_fill_h(offset_in);
2356cabdff1aSopenharmony_ci    denom = __msa_fill_h(log2_denom + 1);
2357cabdff1aSopenharmony_ci
2358cabdff1aSopenharmony_ci    wgt = __msa_ilvev_b(dst_wgt, src_wgt);
2359cabdff1aSopenharmony_ci
2360cabdff1aSopenharmony_ci    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
2361cabdff1aSopenharmony_ci    src += 8 * stride;
2362cabdff1aSopenharmony_ci    LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
2363cabdff1aSopenharmony_ci    XORI_B8_128_UB(src0, src1, src2, src3, src4, src5, src6, src7);
2364cabdff1aSopenharmony_ci    XORI_B8_128_UB(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
2365cabdff1aSopenharmony_ci    ILVR_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec0, vec2, vec4,
2366cabdff1aSopenharmony_ci               vec6);
2367cabdff1aSopenharmony_ci    ILVL_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec1, vec3, vec5,
2368cabdff1aSopenharmony_ci               vec7);
2369cabdff1aSopenharmony_ci    ILVR_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec8, vec10,
2370cabdff1aSopenharmony_ci               vec12, vec14);
2371cabdff1aSopenharmony_ci    ILVL_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec9, vec11,
2372cabdff1aSopenharmony_ci               vec13, vec15);
2373cabdff1aSopenharmony_ci    tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
2374cabdff1aSopenharmony_ci    tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
2375cabdff1aSopenharmony_ci    tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
2376cabdff1aSopenharmony_ci    tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
2377cabdff1aSopenharmony_ci    tmp4 = __msa_dpadd_s_h(offset, wgt, vec4);
2378cabdff1aSopenharmony_ci    tmp5 = __msa_dpadd_s_h(offset, wgt, vec5);
2379cabdff1aSopenharmony_ci    tmp6 = __msa_dpadd_s_h(offset, wgt, vec6);
2380cabdff1aSopenharmony_ci    tmp7 = __msa_dpadd_s_h(offset, wgt, vec7);
2381cabdff1aSopenharmony_ci    tmp8 = __msa_dpadd_s_h(offset, wgt, vec8);
2382cabdff1aSopenharmony_ci    tmp9 = __msa_dpadd_s_h(offset, wgt, vec9);
2383cabdff1aSopenharmony_ci    tmp10 = __msa_dpadd_s_h(offset, wgt, vec10);
2384cabdff1aSopenharmony_ci    tmp11 = __msa_dpadd_s_h(offset, wgt, vec11);
2385cabdff1aSopenharmony_ci    tmp12 = __msa_dpadd_s_h(offset, wgt, vec12);
2386cabdff1aSopenharmony_ci    tmp13 = __msa_dpadd_s_h(offset, wgt, vec13);
2387cabdff1aSopenharmony_ci    tmp14 = __msa_dpadd_s_h(offset, wgt, vec14);
2388cabdff1aSopenharmony_ci    tmp15 = __msa_dpadd_s_h(offset, wgt, vec15);
2389cabdff1aSopenharmony_ci    SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
2390cabdff1aSopenharmony_ci    SRA_4V(tmp4, tmp5, tmp6, tmp7, denom);
2391cabdff1aSopenharmony_ci    SRA_4V(tmp8, tmp9, tmp10, tmp11, denom);
2392cabdff1aSopenharmony_ci    SRA_4V(tmp12, tmp13, tmp14, tmp15, denom);
2393cabdff1aSopenharmony_ci    CLIP_SH8_0_255(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
2394cabdff1aSopenharmony_ci    CLIP_SH8_0_255(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15);
2395cabdff1aSopenharmony_ci    PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
2396cabdff1aSopenharmony_ci                dst2, dst3);
2397cabdff1aSopenharmony_ci    PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
2398cabdff1aSopenharmony_ci                dst5, dst6, dst7);
2399cabdff1aSopenharmony_ci    ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
2400cabdff1aSopenharmony_ci    dst += 8 * stride;
2401cabdff1aSopenharmony_ci
2402cabdff1aSopenharmony_ci    if (16 == height) {
2403cabdff1aSopenharmony_ci        LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
2404cabdff1aSopenharmony_ci        LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
2405cabdff1aSopenharmony_ci        XORI_B8_128_UB(src0, src1, src2, src3, src4, src5, src6, src7);
2406cabdff1aSopenharmony_ci        XORI_B8_128_UB(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
2407cabdff1aSopenharmony_ci        ILVR_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec0, vec2,
2408cabdff1aSopenharmony_ci                   vec4, vec6);
2409cabdff1aSopenharmony_ci        ILVL_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec1, vec3,
2410cabdff1aSopenharmony_ci                   vec5, vec7);
2411cabdff1aSopenharmony_ci        ILVR_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec8, vec10,
2412cabdff1aSopenharmony_ci                   vec12, vec14);
2413cabdff1aSopenharmony_ci        ILVL_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec9, vec11,
2414cabdff1aSopenharmony_ci                   vec13, vec15);
2415cabdff1aSopenharmony_ci        tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
2416cabdff1aSopenharmony_ci        tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
2417cabdff1aSopenharmony_ci        tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
2418cabdff1aSopenharmony_ci        tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
2419cabdff1aSopenharmony_ci        tmp4 = __msa_dpadd_s_h(offset, wgt, vec4);
2420cabdff1aSopenharmony_ci        tmp5 = __msa_dpadd_s_h(offset, wgt, vec5);
2421cabdff1aSopenharmony_ci        tmp6 = __msa_dpadd_s_h(offset, wgt, vec6);
2422cabdff1aSopenharmony_ci        tmp7 = __msa_dpadd_s_h(offset, wgt, vec7);
2423cabdff1aSopenharmony_ci        tmp8 = __msa_dpadd_s_h(offset, wgt, vec8);
2424cabdff1aSopenharmony_ci        tmp9 = __msa_dpadd_s_h(offset, wgt, vec9);
2425cabdff1aSopenharmony_ci        tmp10 = __msa_dpadd_s_h(offset, wgt, vec10);
2426cabdff1aSopenharmony_ci        tmp11 = __msa_dpadd_s_h(offset, wgt, vec11);
2427cabdff1aSopenharmony_ci        tmp12 = __msa_dpadd_s_h(offset, wgt, vec12);
2428cabdff1aSopenharmony_ci        tmp13 = __msa_dpadd_s_h(offset, wgt, vec13);
2429cabdff1aSopenharmony_ci        tmp14 = __msa_dpadd_s_h(offset, wgt, vec14);
2430cabdff1aSopenharmony_ci        tmp15 = __msa_dpadd_s_h(offset, wgt, vec15);
2431cabdff1aSopenharmony_ci        SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
2432cabdff1aSopenharmony_ci        SRA_4V(tmp4, tmp5, tmp6, tmp7, denom);
2433cabdff1aSopenharmony_ci        SRA_4V(tmp8, tmp9, tmp10, tmp11, denom);
2434cabdff1aSopenharmony_ci        SRA_4V(tmp12, tmp13, tmp14, tmp15, denom);
2435cabdff1aSopenharmony_ci        CLIP_SH8_0_255(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
2436cabdff1aSopenharmony_ci        CLIP_SH8_0_255(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15);
2437cabdff1aSopenharmony_ci        PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
2438cabdff1aSopenharmony_ci                    dst2, dst3);
2439cabdff1aSopenharmony_ci        PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
2440cabdff1aSopenharmony_ci                    dst5, dst6, dst7);
2441cabdff1aSopenharmony_ci        ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
2442cabdff1aSopenharmony_ci    }
2443cabdff1aSopenharmony_ci}
2444cabdff1aSopenharmony_ci
/**
 * Bi-weighted prediction entry point for 8-pixel-wide blocks (MSA).
 *
 * Selects a size-specific helper from height: 4 -> avc_biwgt_8x4_msa,
 * 8 -> avc_biwgt_8x8_msa, any other value -> avc_biwgt_8x16_msa.
 *
 * The helpers take their arguments as (src, dst, ..., weight_src,
 * weight_dst, ...), i.e. with both pairs swapped relative to this
 * function's own parameter order.
 */
void ff_biweight_h264_pixels8_8_msa(uint8_t *dst, uint8_t *src,
                                    ptrdiff_t stride, int height,
                                    int log2_denom, int weight_dst,
                                    int weight_src, int offset)
{
    switch (height) {
    case 4:
        avc_biwgt_8x4_msa(src, dst, stride, log2_denom,
                          weight_src, weight_dst, offset);
        break;
    case 8:
        avc_biwgt_8x8_msa(src, dst, stride, log2_denom,
                          weight_src, weight_dst, offset);
        break;
    default:
        /* Every height other than 4 and 8 takes the 16-row helper. */
        avc_biwgt_8x16_msa(src, dst, stride, log2_denom,
                           weight_src, weight_dst, offset);
        break;
    }
}
2461cabdff1aSopenharmony_ci
/**
 * Bi-weighted prediction entry point for 4-pixel-wide blocks (MSA).
 *
 * Selects a size-specific helper from height: 2 -> avc_biwgt_4x2_msa,
 * 4 -> avc_biwgt_4x4_msa, any other value -> avc_biwgt_4x8_msa.
 *
 * The helpers take their arguments as (src, dst, ..., weight_src,
 * weight_dst, ...), i.e. with both pairs swapped relative to this
 * function's own parameter order.
 */
void ff_biweight_h264_pixels4_8_msa(uint8_t *dst, uint8_t *src,
                                    ptrdiff_t stride, int height,
                                    int log2_denom, int weight_dst,
                                    int weight_src, int offset)
{
    switch (height) {
    case 2:
        avc_biwgt_4x2_msa(src, dst, stride, log2_denom,
                          weight_src, weight_dst, offset);
        break;
    case 4:
        avc_biwgt_4x4_msa(src, dst, stride, log2_denom,
                          weight_src, weight_dst, offset);
        break;
    default:
        /* Every height other than 2 and 4 takes the 8-row helper. */
        avc_biwgt_4x8_msa(src, dst, stride, log2_denom,
                          weight_src, weight_dst, offset);
        break;
    }
}
2478