1cabdff1aSopenharmony_ci/*
2cabdff1aSopenharmony_ci * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
3cabdff1aSopenharmony_ci *
4cabdff1aSopenharmony_ci * This file is part of FFmpeg.
5cabdff1aSopenharmony_ci *
6cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or
7cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public
8cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either
9cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version.
10cabdff1aSopenharmony_ci *
11cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful,
12cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of
13cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14cabdff1aSopenharmony_ci * Lesser General Public License for more details.
15cabdff1aSopenharmony_ci *
16cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public
17cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software
18cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19cabdff1aSopenharmony_ci */
20cabdff1aSopenharmony_ci
21cabdff1aSopenharmony_ci#include "libavutil/mips/generic_macros_msa.h"
22cabdff1aSopenharmony_ci#include "libavcodec/mips/hevcdsp_mips.h"
23cabdff1aSopenharmony_ci#include "libavcodec/mips/hevc_macros_msa.h"
24cabdff1aSopenharmony_ci
/*
 * Byte-index tables loaded as shuffle masks by the horizontal filters below.
 * Each 16-byte row lists overlapping index pairs (i, i + 1):
 *   row 0 (offset  0): pairs starting at 0..7, used by the 8-wide paths;
 *   row 1 (offset 16): pairs starting at 0..3 followed by pairs starting at
 *                      16..19, used by the 4-wide paths (indices >= 16
 *                      presumably select from the second shuffle operand);
 *   row 2 (offset 32): same layout as row 1, shifted by 8 bytes
 *                      (pairs starting at 8..11 and 24..27).
 * The table is aligned to 64 bytes.
 */
static const uint8_t ff_hevc_mask_arr[3 * 16] __attribute__((aligned(64))) = {
    /* row 0: 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4,
    4, 5, 5, 6, 6, 7, 7, 8,
    /* row 1: 4 width cases, low bytes of both operands */
    0, 1, 1, 2, 2, 3, 3, 4,
    16, 17, 17, 18, 18, 19, 19, 20,
    /* row 2: 4 width cases, bytes 8.. of both operands */
    8, 9, 9, 10, 10, 11, 11, 12,
    24, 25, 25, 26, 26, 27, 27, 28
};
33cabdff1aSopenharmony_ci
/*
 * 8-tap horizontal filter step for 4-wide data held in four vectors.
 *
 * For each of the four masks, (src0, src1) and (src2, src3) are shuffled
 * with that mask and the results are multiplied with the matching filter
 * vector: filt0 starts the sums (DOTP), filt1..filt3 accumulate (DPADD).
 * out0 receives the (src0, src1) result, out1 the (src2, src3) result.
 */
#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3,                  \
                                   mask0, mask1, mask2, mask3,              \
                                   filt0, filt1, filt2, filt3,              \
                                   out0, out1)                              \
{                                                                           \
    v16i8 shf01_m, shf23_m;                                                 \
                                                                            \
    /* taps covered by filt0 */                                             \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, shf01_m, shf23_m);     \
    DOTP_SB2_SH(shf01_m, shf23_m, filt0, filt0, out0, out1);                \
    /* taps covered by filt1 */                                             \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, shf01_m, shf23_m);     \
    DPADD_SB2_SH(shf01_m, shf23_m, filt1, filt1, out0, out1);               \
    /* taps covered by filt2 */                                             \
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, shf01_m, shf23_m);     \
    DPADD_SB2_SH(shf01_m, shf23_m, filt2, filt2, out0, out1);               \
    /* taps covered by filt3 */                                             \
    VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, shf01_m, shf23_m);     \
    DPADD_SB2_SH(shf01_m, shf23_m, filt3, filt3, out0, out1);               \
}
50cabdff1aSopenharmony_ci
/*
 * 8-tap horizontal filter step for 8-wide data, four rows at a time.
 *
 * Every srcN is shuffled with itself using each mask and multiplied with
 * the matching filter vector; outN collects the sum for srcN.  The products
 * are issued in the order filt0 (DOTP, starts the sum), filt2, filt1, filt3
 * (DPADD, accumulate).
 */
#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                    \
                                   mask0, mask1, mask2, mask3,                \
                                   filt0, filt1, filt2, filt3,                \
                                   out0, out1, out2, out3)                    \
{                                                                             \
    v16i8 shf0_m, shf1_m, shf2_m, shf3_m;                                     \
                                                                              \
    /* mask0 / filt0: initialise the four sums */                             \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, shf0_m, shf1_m);         \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, shf2_m, shf3_m);         \
    DOTP_SB4_SH(shf0_m, shf1_m, shf2_m, shf3_m, filt0, filt0, filt0, filt0,   \
                out0, out1, out2, out3);                                      \
    /* mask2 / filt2 */                                                       \
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, shf0_m, shf1_m);         \
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, shf2_m, shf3_m);         \
    DPADD_SB4_SH(shf0_m, shf1_m, shf2_m, shf3_m, filt2, filt2, filt2, filt2,  \
                 out0, out1, out2, out3);                                     \
    /* mask1 / filt1 */                                                       \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, shf0_m, shf1_m);         \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, shf2_m, shf3_m);         \
    DPADD_SB4_SH(shf0_m, shf1_m, shf2_m, shf3_m, filt1, filt1, filt1, filt1,  \
                 out0, out1, out2, out3);                                     \
    /* mask3 / filt3 */                                                       \
    VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, shf0_m, shf1_m);         \
    VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, shf2_m, shf3_m);         \
    DPADD_SB4_SH(shf0_m, shf1_m, shf2_m, shf3_m, filt3, filt3, filt3, filt3,  \
                 out0, out1, out2, out3);                                     \
}
75cabdff1aSopenharmony_ci
/*
 * 4-tap horizontal filter step for 4-wide data held in four vectors.
 *
 * (src0, src1) and (src2, src3) are shuffled with mask0 and multiplied with
 * filt0 to start out0 / out1 (DOTP); the mask1 shuffles multiplied with
 * filt1 are then accumulated on top (DPADD).
 */
#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3,               \
                                   mask0, mask1, filt0, filt1,           \
                                   out0, out1)                           \
{                                                                        \
    v16i8 shf01_m, shf23_m;                                              \
                                                                         \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, shf01_m, shf23_m);  \
    DOTP_SB2_SH(shf01_m, shf23_m, filt0, filt0, out0, out1);             \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, shf01_m, shf23_m);  \
    DPADD_SB2_SH(shf01_m, shf23_m, filt1, filt1, out0, out1);            \
}
87cabdff1aSopenharmony_ci
/*
 * 4-tap horizontal filter step for 8-wide data, four rows at a time.
 *
 * Every srcN is shuffled with itself: the mask0 shuffle times filt0 starts
 * outN (DOTP) and the mask1 shuffle times filt1 is accumulated (DPADD).
 */
#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                    \
                                   mask0, mask1, filt0, filt1,                \
                                   out0, out1, out2, out3)                    \
{                                                                             \
    v16i8 shf0_m, shf1_m, shf2_m, shf3_m;                                     \
                                                                              \
    /* first tap pair: initialise the sums */                                 \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, shf0_m, shf1_m);         \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, shf2_m, shf3_m);         \
    DOTP_SB4_SH(shf0_m, shf1_m, shf2_m, shf3_m, filt0, filt0, filt0, filt0,   \
                out0, out1, out2, out3);                                      \
    /* second tap pair: accumulate */                                         \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, shf0_m, shf1_m);         \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, shf2_m, shf3_m);         \
    DPADD_SB4_SH(shf0_m, shf1_m, shf2_m, shf3_m, filt1, filt1, filt1, filt1,  \
                 out0, out1, out2, out3);                                     \
}
103cabdff1aSopenharmony_ci
/**
 * Copy a block that is 8 bytes wide, moving each row as one uint64_t.
 *
 * Handled heights: 2, 6, any multiple of 8 and any multiple of 4.  For any
 * other height nothing is copied.
 *
 * @param src         first source row
 * @param src_stride  distance in bytes between source rows
 * @param dst         first destination row
 * @param dst_stride  distance in bytes between destination rows
 * @param height      number of rows to copy
 */
static void copy_width8_msa(uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride,
                            int32_t height)
{
    uint64_t row0, row1, row2, row3, row4, row5, row6, row7;
    int32_t blocks;

    if (2 == height || 6 == height) {
        if (6 == height) {
            /* the 6-row case is four rows followed by the 2-row tail */
            LD4(src, src_stride, row0, row1, row2, row3);
            src += 4 * src_stride;
            SD4(row0, row1, row2, row3, dst, dst_stride);
            dst += 4 * dst_stride;
        }

        LD2(src, src_stride, row0, row1);
        SD(row0, dst);
        dst += dst_stride;
        SD(row1, dst);
        return;
    }

    if (!(height % 8)) {
        /* eight rows per iteration: all loads are issued before the stores */
        blocks = height >> 3;
        while (blocks--) {
            LD4(src, src_stride, row0, row1, row2, row3);
            src += 4 * src_stride;
            LD4(src, src_stride, row4, row5, row6, row7);
            src += 4 * src_stride;

            SD4(row0, row1, row2, row3, dst, dst_stride);
            dst += 4 * dst_stride;
            SD4(row4, row5, row6, row7, dst, dst_stride);
            dst += 4 * dst_stride;
        }
    } else if (!(height % 4)) {
        /* four rows per iteration */
        blocks = height >> 2;
        while (blocks--) {
            LD4(src, src_stride, row0, row1, row2, row3);
            src += 4 * src_stride;
            SD4(row0, row1, row2, row3, dst, dst_stride);
            dst += 4 * dst_stride;
        }
    }
}
145cabdff1aSopenharmony_ci
146cabdff1aSopenharmony_cistatic void copy_width12_msa(uint8_t *src, int32_t src_stride,
147cabdff1aSopenharmony_ci                             uint8_t *dst, int32_t dst_stride,
148cabdff1aSopenharmony_ci                             int32_t height)
149cabdff1aSopenharmony_ci{
150cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
151cabdff1aSopenharmony_ci
152cabdff1aSopenharmony_ci    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
153cabdff1aSopenharmony_ci    src += (8 * src_stride);
154cabdff1aSopenharmony_ci    ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
155cabdff1aSopenharmony_ci    dst += (8 * dst_stride);
156cabdff1aSopenharmony_ci    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
157cabdff1aSopenharmony_ci    ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
158cabdff1aSopenharmony_ci}
159cabdff1aSopenharmony_ci
160cabdff1aSopenharmony_cistatic void copy_width16_msa(uint8_t *src, int32_t src_stride,
161cabdff1aSopenharmony_ci                             uint8_t *dst, int32_t dst_stride,
162cabdff1aSopenharmony_ci                             int32_t height)
163cabdff1aSopenharmony_ci{
164cabdff1aSopenharmony_ci    int32_t cnt;
165cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
166cabdff1aSopenharmony_ci
167cabdff1aSopenharmony_ci    if (12 == height) {
168cabdff1aSopenharmony_ci        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
169cabdff1aSopenharmony_ci        src += (8 * src_stride);
170cabdff1aSopenharmony_ci        ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
171cabdff1aSopenharmony_ci        dst += (8 * dst_stride);
172cabdff1aSopenharmony_ci        LD_UB4(src, src_stride, src0, src1, src2, src3);
173cabdff1aSopenharmony_ci        src += (4 * src_stride);
174cabdff1aSopenharmony_ci        ST_UB4(src0, src1, src2, src3, dst, dst_stride);
175cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
176cabdff1aSopenharmony_ci    } else if (0 == (height % 8)) {
177cabdff1aSopenharmony_ci        for (cnt = (height >> 3); cnt--;) {
178cabdff1aSopenharmony_ci            LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6,
179cabdff1aSopenharmony_ci                   src7);
180cabdff1aSopenharmony_ci            src += (8 * src_stride);
181cabdff1aSopenharmony_ci            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst,
182cabdff1aSopenharmony_ci                   dst_stride);
183cabdff1aSopenharmony_ci            dst += (8 * dst_stride);
184cabdff1aSopenharmony_ci        }
185cabdff1aSopenharmony_ci    } else if (0 == (height % 4)) {
186cabdff1aSopenharmony_ci        for (cnt = (height >> 2); cnt--;) {
187cabdff1aSopenharmony_ci            LD_UB4(src, src_stride, src0, src1, src2, src3);
188cabdff1aSopenharmony_ci            src += (4 * src_stride);
189cabdff1aSopenharmony_ci
190cabdff1aSopenharmony_ci            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
191cabdff1aSopenharmony_ci            dst += (4 * dst_stride);
192cabdff1aSopenharmony_ci        }
193cabdff1aSopenharmony_ci    }
194cabdff1aSopenharmony_ci}
195cabdff1aSopenharmony_ci
196cabdff1aSopenharmony_cistatic void copy_width24_msa(uint8_t *src, int32_t src_stride,
197cabdff1aSopenharmony_ci                             uint8_t *dst, int32_t dst_stride,
198cabdff1aSopenharmony_ci                             int32_t height)
199cabdff1aSopenharmony_ci{
200cabdff1aSopenharmony_ci    int32_t cnt;
201cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
202cabdff1aSopenharmony_ci    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
203cabdff1aSopenharmony_ci
204cabdff1aSopenharmony_ci    for (cnt = 4; cnt--;) {
205cabdff1aSopenharmony_ci        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
206cabdff1aSopenharmony_ci        LD4(src + 16, src_stride, out0, out1, out2, out3);
207cabdff1aSopenharmony_ci        src += (4 * src_stride);
208cabdff1aSopenharmony_ci        LD4(src + 16, src_stride, out4, out5, out6, out7);
209cabdff1aSopenharmony_ci        src += (4 * src_stride);
210cabdff1aSopenharmony_ci
211cabdff1aSopenharmony_ci        ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
212cabdff1aSopenharmony_ci        SD4(out0, out1, out2, out3, dst + 16, dst_stride);
213cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
214cabdff1aSopenharmony_ci        SD4(out4, out5, out6, out7, dst + 16, dst_stride);
215cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
216cabdff1aSopenharmony_ci    }
217cabdff1aSopenharmony_ci}
218cabdff1aSopenharmony_ci
219cabdff1aSopenharmony_cistatic void copy_width32_msa(uint8_t *src, int32_t src_stride,
220cabdff1aSopenharmony_ci                             uint8_t *dst, int32_t dst_stride,
221cabdff1aSopenharmony_ci                             int32_t height)
222cabdff1aSopenharmony_ci{
223cabdff1aSopenharmony_ci    int32_t cnt;
224cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
225cabdff1aSopenharmony_ci
226cabdff1aSopenharmony_ci    for (cnt = (height >> 2); cnt--;) {
227cabdff1aSopenharmony_ci        LD_UB4(src, src_stride, src0, src1, src2, src3);
228cabdff1aSopenharmony_ci        LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
229cabdff1aSopenharmony_ci        src += (4 * src_stride);
230cabdff1aSopenharmony_ci        ST_UB4(src0, src1, src2, src3, dst, dst_stride);
231cabdff1aSopenharmony_ci        ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
232cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
233cabdff1aSopenharmony_ci    }
234cabdff1aSopenharmony_ci}
235cabdff1aSopenharmony_ci
236cabdff1aSopenharmony_cistatic void copy_width48_msa(uint8_t *src, int32_t src_stride,
237cabdff1aSopenharmony_ci                             uint8_t *dst, int32_t dst_stride,
238cabdff1aSopenharmony_ci                             int32_t height)
239cabdff1aSopenharmony_ci{
240cabdff1aSopenharmony_ci    int32_t cnt;
241cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
242cabdff1aSopenharmony_ci    v16u8 src11;
243cabdff1aSopenharmony_ci
244cabdff1aSopenharmony_ci    for (cnt = (height >> 2); cnt--;) {
245cabdff1aSopenharmony_ci        LD_UB4(src, src_stride, src0, src1, src2, src3);
246cabdff1aSopenharmony_ci        LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
247cabdff1aSopenharmony_ci        LD_UB4(src + 32, src_stride, src8, src9, src10, src11);
248cabdff1aSopenharmony_ci        src += (4 * src_stride);
249cabdff1aSopenharmony_ci
250cabdff1aSopenharmony_ci        ST_UB4(src0, src1, src2, src3, dst, dst_stride);
251cabdff1aSopenharmony_ci        ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
252cabdff1aSopenharmony_ci        ST_UB4(src8, src9, src10, src11, dst + 32, dst_stride);
253cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
254cabdff1aSopenharmony_ci    }
255cabdff1aSopenharmony_ci}
256cabdff1aSopenharmony_ci
257cabdff1aSopenharmony_cistatic void copy_width64_msa(uint8_t *src, int32_t src_stride,
258cabdff1aSopenharmony_ci                             uint8_t *dst, int32_t dst_stride,
259cabdff1aSopenharmony_ci                             int32_t height)
260cabdff1aSopenharmony_ci{
261cabdff1aSopenharmony_ci    int32_t cnt;
262cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
263cabdff1aSopenharmony_ci    v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
264cabdff1aSopenharmony_ci
265cabdff1aSopenharmony_ci    for (cnt = (height >> 2); cnt--;) {
266cabdff1aSopenharmony_ci        LD_UB4(src, 16, src0, src1, src2, src3);
267cabdff1aSopenharmony_ci        src += src_stride;
268cabdff1aSopenharmony_ci        LD_UB4(src, 16, src4, src5, src6, src7);
269cabdff1aSopenharmony_ci        src += src_stride;
270cabdff1aSopenharmony_ci        LD_UB4(src, 16, src8, src9, src10, src11);
271cabdff1aSopenharmony_ci        src += src_stride;
272cabdff1aSopenharmony_ci        LD_UB4(src, 16, src12, src13, src14, src15);
273cabdff1aSopenharmony_ci        src += src_stride;
274cabdff1aSopenharmony_ci
275cabdff1aSopenharmony_ci        ST_UB4(src0, src1, src2, src3, dst, 16);
276cabdff1aSopenharmony_ci        dst += dst_stride;
277cabdff1aSopenharmony_ci        ST_UB4(src4, src5, src6, src7, dst, 16);
278cabdff1aSopenharmony_ci        dst += dst_stride;
279cabdff1aSopenharmony_ci        ST_UB4(src8, src9, src10, src11, dst, 16);
280cabdff1aSopenharmony_ci        dst += dst_stride;
281cabdff1aSopenharmony_ci        ST_UB4(src12, src13, src14, src15, dst, 16);
282cabdff1aSopenharmony_ci        dst += dst_stride;
283cabdff1aSopenharmony_ci    }
284cabdff1aSopenharmony_ci}
285cabdff1aSopenharmony_ci
286cabdff1aSopenharmony_cistatic void common_hz_8t_4x4_msa(uint8_t *src, int32_t src_stride,
287cabdff1aSopenharmony_ci                                 uint8_t *dst, int32_t dst_stride,
288cabdff1aSopenharmony_ci                                 const int8_t *filter)
289cabdff1aSopenharmony_ci{
290cabdff1aSopenharmony_ci    v16u8 mask0, mask1, mask2, mask3, out;
291cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
292cabdff1aSopenharmony_ci    v8i16 filt, out0, out1;
293cabdff1aSopenharmony_ci
294cabdff1aSopenharmony_ci    mask0 = LD_UB(&ff_hevc_mask_arr[16]);
295cabdff1aSopenharmony_ci    src -= 3;
296cabdff1aSopenharmony_ci
297cabdff1aSopenharmony_ci    /* rearranging filter */
298cabdff1aSopenharmony_ci    filt = LD_SH(filter);
299cabdff1aSopenharmony_ci    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
300cabdff1aSopenharmony_ci
301cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
302cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
303cabdff1aSopenharmony_ci    mask3 = mask0 + 6;
304cabdff1aSopenharmony_ci
305cabdff1aSopenharmony_ci    LD_SB4(src, src_stride, src0, src1, src2, src3);
306cabdff1aSopenharmony_ci    XORI_B4_128_SB(src0, src1, src2, src3);
307cabdff1aSopenharmony_ci    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
308cabdff1aSopenharmony_ci                               mask3, filt0, filt1, filt2, filt3, out0, out1);
309cabdff1aSopenharmony_ci    SRARI_H2_SH(out0, out1, 6);
310cabdff1aSopenharmony_ci    SAT_SH2_SH(out0, out1, 7);
311cabdff1aSopenharmony_ci    out = PCKEV_XORI128_UB(out0, out1);
312cabdff1aSopenharmony_ci    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
313cabdff1aSopenharmony_ci}
314cabdff1aSopenharmony_ci
315cabdff1aSopenharmony_cistatic void common_hz_8t_4x8_msa(uint8_t *src, int32_t src_stride,
316cabdff1aSopenharmony_ci                                 uint8_t *dst, int32_t dst_stride,
317cabdff1aSopenharmony_ci                                 const int8_t *filter)
318cabdff1aSopenharmony_ci{
319cabdff1aSopenharmony_ci    v16i8 filt0, filt1, filt2, filt3;
320cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3;
321cabdff1aSopenharmony_ci    v16u8 mask0, mask1, mask2, mask3, out;
322cabdff1aSopenharmony_ci    v8i16 filt, out0, out1, out2, out3;
323cabdff1aSopenharmony_ci
324cabdff1aSopenharmony_ci    mask0 = LD_UB(&ff_hevc_mask_arr[16]);
325cabdff1aSopenharmony_ci    src -= 3;
326cabdff1aSopenharmony_ci
327cabdff1aSopenharmony_ci    /* rearranging filter */
328cabdff1aSopenharmony_ci    filt = LD_SH(filter);
329cabdff1aSopenharmony_ci    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
330cabdff1aSopenharmony_ci
331cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
332cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
333cabdff1aSopenharmony_ci    mask3 = mask0 + 6;
334cabdff1aSopenharmony_ci
335cabdff1aSopenharmony_ci    LD_SB4(src, src_stride, src0, src1, src2, src3);
336cabdff1aSopenharmony_ci    XORI_B4_128_SB(src0, src1, src2, src3);
337cabdff1aSopenharmony_ci    src += (4 * src_stride);
338cabdff1aSopenharmony_ci    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
339cabdff1aSopenharmony_ci                               mask3, filt0, filt1, filt2, filt3, out0, out1);
340cabdff1aSopenharmony_ci    LD_SB4(src, src_stride, src0, src1, src2, src3);
341cabdff1aSopenharmony_ci    XORI_B4_128_SB(src0, src1, src2, src3);
342cabdff1aSopenharmony_ci    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
343cabdff1aSopenharmony_ci                               mask3, filt0, filt1, filt2, filt3, out2, out3);
344cabdff1aSopenharmony_ci    SRARI_H4_SH(out0, out1, out2, out3, 6);
345cabdff1aSopenharmony_ci    SAT_SH4_SH(out0, out1, out2, out3, 7);
346cabdff1aSopenharmony_ci    out = PCKEV_XORI128_UB(out0, out1);
347cabdff1aSopenharmony_ci    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
348cabdff1aSopenharmony_ci    out = PCKEV_XORI128_UB(out2, out3);
349cabdff1aSopenharmony_ci    ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
350cabdff1aSopenharmony_ci}
351cabdff1aSopenharmony_ci
352cabdff1aSopenharmony_cistatic void common_hz_8t_4x16_msa(uint8_t *src, int32_t src_stride,
353cabdff1aSopenharmony_ci                                  uint8_t *dst, int32_t dst_stride,
354cabdff1aSopenharmony_ci                                  const int8_t *filter)
355cabdff1aSopenharmony_ci{
356cabdff1aSopenharmony_ci    v16u8 mask0, mask1, mask2, mask3, out;
357cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
358cabdff1aSopenharmony_ci    v8i16 filt, out0, out1, out2, out3;
359cabdff1aSopenharmony_ci
360cabdff1aSopenharmony_ci    mask0 = LD_UB(&ff_hevc_mask_arr[16]);
361cabdff1aSopenharmony_ci    src -= 3;
362cabdff1aSopenharmony_ci
363cabdff1aSopenharmony_ci    /* rearranging filter */
364cabdff1aSopenharmony_ci    filt = LD_SH(filter);
365cabdff1aSopenharmony_ci    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
366cabdff1aSopenharmony_ci
367cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
368cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
369cabdff1aSopenharmony_ci    mask3 = mask0 + 6;
370cabdff1aSopenharmony_ci
371cabdff1aSopenharmony_ci    LD_SB4(src, src_stride, src0, src1, src2, src3);
372cabdff1aSopenharmony_ci    XORI_B4_128_SB(src0, src1, src2, src3);
373cabdff1aSopenharmony_ci    src += (4 * src_stride);
374cabdff1aSopenharmony_ci    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
375cabdff1aSopenharmony_ci                               mask3, filt0, filt1, filt2, filt3, out0, out1);
376cabdff1aSopenharmony_ci    LD_SB4(src, src_stride, src0, src1, src2, src3);
377cabdff1aSopenharmony_ci    XORI_B4_128_SB(src0, src1, src2, src3);
378cabdff1aSopenharmony_ci    src += (4 * src_stride);
379cabdff1aSopenharmony_ci    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
380cabdff1aSopenharmony_ci                               mask3, filt0, filt1, filt2, filt3, out2, out3);
381cabdff1aSopenharmony_ci    SRARI_H4_SH(out0, out1, out2, out3, 6);
382cabdff1aSopenharmony_ci    SAT_SH4_SH(out0, out1, out2, out3, 7);
383cabdff1aSopenharmony_ci    out = PCKEV_XORI128_UB(out0, out1);
384cabdff1aSopenharmony_ci    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
385cabdff1aSopenharmony_ci    out = PCKEV_XORI128_UB(out2, out3);
386cabdff1aSopenharmony_ci    ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
387cabdff1aSopenharmony_ci    dst += (8 * dst_stride);
388cabdff1aSopenharmony_ci
389cabdff1aSopenharmony_ci    LD_SB4(src, src_stride, src0, src1, src2, src3);
390cabdff1aSopenharmony_ci    XORI_B4_128_SB(src0, src1, src2, src3);
391cabdff1aSopenharmony_ci    src += (4 * src_stride);
392cabdff1aSopenharmony_ci    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
393cabdff1aSopenharmony_ci                               mask3, filt0, filt1, filt2, filt3, out0, out1);
394cabdff1aSopenharmony_ci    LD_SB4(src, src_stride, src0, src1, src2, src3);
395cabdff1aSopenharmony_ci    XORI_B4_128_SB(src0, src1, src2, src3);
396cabdff1aSopenharmony_ci    src += (4 * src_stride);
397cabdff1aSopenharmony_ci    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
398cabdff1aSopenharmony_ci                               mask3, filt0, filt1, filt2, filt3, out2, out3);
399cabdff1aSopenharmony_ci
400cabdff1aSopenharmony_ci    SRARI_H4_SH(out0, out1, out2, out3, 6);
401cabdff1aSopenharmony_ci    SAT_SH4_SH(out0, out1, out2, out3, 7);
402cabdff1aSopenharmony_ci    out = PCKEV_XORI128_UB(out0, out1);
403cabdff1aSopenharmony_ci    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
404cabdff1aSopenharmony_ci    out = PCKEV_XORI128_UB(out2, out3);
405cabdff1aSopenharmony_ci    ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
406cabdff1aSopenharmony_ci}
407cabdff1aSopenharmony_ci
/**
 * 8-tap horizontal filter for 4-wide blocks: dispatch on height.
 *
 * Heights 4, 8 and 16 are forwarded to the matching fixed-size routine;
 * any other height is ignored and nothing is written.
 *
 * @param src         source pixels
 * @param src_stride  distance in bytes between source rows
 * @param dst         destination
 * @param dst_stride  distance in bytes between destination rows
 * @param filter      filter coefficients, passed through unchanged
 * @param height      number of rows (4, 8 or 16)
 */
static void common_hz_8t_4w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    switch (height) {
    case 4:
        common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter);
        break;
    case 8:
        common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter);
        break;
    case 16:
        common_hz_8t_4x16_msa(src, src_stride, dst, dst_stride, filter);
        break;
    default:
        /* unsupported height: nothing to do */
        break;
    }
}
420cabdff1aSopenharmony_ci
421cabdff1aSopenharmony_cistatic void common_hz_8t_8w_msa(uint8_t *src, int32_t src_stride,
422cabdff1aSopenharmony_ci                                uint8_t *dst, int32_t dst_stride,
423cabdff1aSopenharmony_ci                                const int8_t *filter, int32_t height)
424cabdff1aSopenharmony_ci{
425cabdff1aSopenharmony_ci    uint32_t loop_cnt;
426cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
427cabdff1aSopenharmony_ci    v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
428cabdff1aSopenharmony_ci    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;
429cabdff1aSopenharmony_ci    v8i16 filt, out0, out1, out2, out3;
430cabdff1aSopenharmony_ci
431cabdff1aSopenharmony_ci    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
432cabdff1aSopenharmony_ci    src -= 3;
433cabdff1aSopenharmony_ci
434cabdff1aSopenharmony_ci    /* rearranging filter */
435cabdff1aSopenharmony_ci    filt = LD_SH(filter);
436cabdff1aSopenharmony_ci    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
437cabdff1aSopenharmony_ci
438cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
439cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
440cabdff1aSopenharmony_ci    mask3 = mask0 + 6;
441cabdff1aSopenharmony_ci
442cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
443cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src0, src1, src2, src3);
444cabdff1aSopenharmony_ci        XORI_B4_128_SB(src0, src1, src2, src3);
445cabdff1aSopenharmony_ci        src += (4 * src_stride);
446cabdff1aSopenharmony_ci
447cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);
448cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);
449cabdff1aSopenharmony_ci        DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
450cabdff1aSopenharmony_ci                    out0, out1, out2, out3);
451cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m);
452cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m);
453cabdff1aSopenharmony_ci        DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2,
454cabdff1aSopenharmony_ci                     out0, out1, out2, out3);
455cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m);
456cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m);
457cabdff1aSopenharmony_ci        DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1,
458cabdff1aSopenharmony_ci                     out0, out1, out2, out3);
459cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m);
460cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m);
461cabdff1aSopenharmony_ci        DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3,
462cabdff1aSopenharmony_ci                     out0, out1, out2, out3);
463cabdff1aSopenharmony_ci
464cabdff1aSopenharmony_ci        SRARI_H4_SH(out0, out1, out2, out3, 6);
465cabdff1aSopenharmony_ci        SAT_SH4_SH(out0, out1, out2, out3, 7);
466cabdff1aSopenharmony_ci        tmp0 = PCKEV_XORI128_UB(out0, out1);
467cabdff1aSopenharmony_ci        tmp1 = PCKEV_XORI128_UB(out2, out3);
468cabdff1aSopenharmony_ci        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
469cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
470cabdff1aSopenharmony_ci    }
471cabdff1aSopenharmony_ci}
472cabdff1aSopenharmony_ci
473cabdff1aSopenharmony_cistatic void common_hz_8t_12w_msa(uint8_t *src, int32_t src_stride,
474cabdff1aSopenharmony_ci                                 uint8_t *dst, int32_t dst_stride,
475cabdff1aSopenharmony_ci                                 const int8_t *filter, int32_t height)
476cabdff1aSopenharmony_ci{
477cabdff1aSopenharmony_ci    uint32_t loop_cnt;
478cabdff1aSopenharmony_ci    v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask00;
479cabdff1aSopenharmony_ci    v16u8 tmp0, tmp1, tmp2;
480cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
481cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
482cabdff1aSopenharmony_ci    v16i8 filt0, filt1, filt2, filt3;
483cabdff1aSopenharmony_ci    v8i16 filt, out0, out1, out2, out3, out4, out5;
484cabdff1aSopenharmony_ci
485cabdff1aSopenharmony_ci    mask00 = LD_UB(&ff_hevc_mask_arr[0]);
486cabdff1aSopenharmony_ci    mask0 = LD_UB(&ff_hevc_mask_arr[16]);
487cabdff1aSopenharmony_ci
488cabdff1aSopenharmony_ci    src = src - 3;
489cabdff1aSopenharmony_ci
490cabdff1aSopenharmony_ci    /* rearranging filter */
491cabdff1aSopenharmony_ci    filt = LD_SH(filter);
492cabdff1aSopenharmony_ci    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
493cabdff1aSopenharmony_ci
494cabdff1aSopenharmony_ci    mask1 = mask00 + 2;
495cabdff1aSopenharmony_ci    mask2 = mask00 + 4;
496cabdff1aSopenharmony_ci    mask3 = mask00 + 6;
497cabdff1aSopenharmony_ci    mask4 = mask0 + 2;
498cabdff1aSopenharmony_ci    mask5 = mask0 + 4;
499cabdff1aSopenharmony_ci    mask6 = mask0 + 6;
500cabdff1aSopenharmony_ci
501cabdff1aSopenharmony_ci    for (loop_cnt = 4; loop_cnt--;) {
502cabdff1aSopenharmony_ci        /* 8 width */
503cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src0, src1, src2, src3);
504cabdff1aSopenharmony_ci        /* 4 width */
505cabdff1aSopenharmony_ci        LD_SB4(src + 8, src_stride, src4, src5, src6, src7);
506cabdff1aSopenharmony_ci
507cabdff1aSopenharmony_ci        XORI_B4_128_SB(src0, src1, src2, src3);
508cabdff1aSopenharmony_ci        XORI_B4_128_SB(src4, src5, src6, src7);
509cabdff1aSopenharmony_ci        src += (4 * src_stride);
510cabdff1aSopenharmony_ci
511cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src1, src1, mask00, mask00, vec0, vec1);
512cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src3, src3, mask00, mask00, vec2, vec3);
513cabdff1aSopenharmony_ci        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0,
514cabdff1aSopenharmony_ci                    out1, out2, out3);
515cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
516cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
517cabdff1aSopenharmony_ci        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, out0,
518cabdff1aSopenharmony_ci                     out1, out2, out3);
519cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
520cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
521cabdff1aSopenharmony_ci        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, out0,
522cabdff1aSopenharmony_ci                     out1, out2, out3);
523cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec5);
524cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec7);
525cabdff1aSopenharmony_ci        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, out0,
526cabdff1aSopenharmony_ci                     out1, out2, out3);
527cabdff1aSopenharmony_ci
528cabdff1aSopenharmony_ci        /* 4 width */
529cabdff1aSopenharmony_ci        VSHF_B2_SB(src4, src5, src6, src7, mask0, mask0, vec0, vec1);
530cabdff1aSopenharmony_ci        DOTP_SB2_SH(vec0, vec1, filt0, filt0, out4, out5);
531cabdff1aSopenharmony_ci        VSHF_B2_SB(src4, src5, src6, src7, mask4, mask4, vec2, vec3);
532cabdff1aSopenharmony_ci        DPADD_SB2_SH(vec2, vec3, filt1, filt1, out4, out5);
533cabdff1aSopenharmony_ci        VSHF_B2_SB(src4, src5, src6, src7, mask5, mask5, vec4, vec5);
534cabdff1aSopenharmony_ci        DPADD_SB2_SH(vec4, vec5, filt2, filt2, out4, out5);
535cabdff1aSopenharmony_ci        VSHF_B2_SB(src4, src5, src6, src7, mask6, mask6, vec6, vec7);
536cabdff1aSopenharmony_ci        DPADD_SB2_SH(vec6, vec7, filt3, filt3, out4, out5);
537cabdff1aSopenharmony_ci
538cabdff1aSopenharmony_ci        SRARI_H4_SH(out0, out1, out2, out3, 6);
539cabdff1aSopenharmony_ci        SRARI_H2_SH(out4, out5, 6);
540cabdff1aSopenharmony_ci        SAT_SH4_SH(out0, out1, out2, out3, 7);
541cabdff1aSopenharmony_ci        SAT_SH2_SH(out4, out5, 7);
542cabdff1aSopenharmony_ci        tmp0 = PCKEV_XORI128_UB(out0, out1);
543cabdff1aSopenharmony_ci        tmp1 = PCKEV_XORI128_UB(out2, out3);
544cabdff1aSopenharmony_ci        tmp2 = PCKEV_XORI128_UB(out4, out5);
545cabdff1aSopenharmony_ci
546cabdff1aSopenharmony_ci        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
547cabdff1aSopenharmony_ci        ST_W4(tmp2, 0, 1, 2, 3, dst + 8, dst_stride);
548cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
549cabdff1aSopenharmony_ci    }
550cabdff1aSopenharmony_ci}
551cabdff1aSopenharmony_ci
552cabdff1aSopenharmony_cistatic void common_hz_8t_16w_msa(uint8_t *src, int32_t src_stride,
553cabdff1aSopenharmony_ci                                 uint8_t *dst, int32_t dst_stride,
554cabdff1aSopenharmony_ci                                 const int8_t *filter, int32_t height)
555cabdff1aSopenharmony_ci{
556cabdff1aSopenharmony_ci    uint32_t loop_cnt;
557cabdff1aSopenharmony_ci    v16u8 mask0, mask1, mask2, mask3, out;
558cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
559cabdff1aSopenharmony_ci    v16i8 filt0, filt1, filt2, filt3;
560cabdff1aSopenharmony_ci    v8i16 filt, out0, out1, out2, out3;
561cabdff1aSopenharmony_ci
562cabdff1aSopenharmony_ci    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
563cabdff1aSopenharmony_ci    src -= 3;
564cabdff1aSopenharmony_ci
565cabdff1aSopenharmony_ci    /* rearranging filter */
566cabdff1aSopenharmony_ci    filt = LD_SH(filter);
567cabdff1aSopenharmony_ci    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
568cabdff1aSopenharmony_ci
569cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
570cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
571cabdff1aSopenharmony_ci    mask3 = mask0 + 6;
572cabdff1aSopenharmony_ci
573cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
574cabdff1aSopenharmony_ci        LD_SB2(src, src_stride, src0, src2);
575cabdff1aSopenharmony_ci        LD_SB2(src + 8, src_stride, src1, src3);
576cabdff1aSopenharmony_ci        src += (2 * src_stride);
577cabdff1aSopenharmony_ci
578cabdff1aSopenharmony_ci        LD_SB2(src, src_stride, src4, src6);
579cabdff1aSopenharmony_ci        LD_SB2(src + 8, src_stride, src5, src7);
580cabdff1aSopenharmony_ci        src += (2 * src_stride);
581cabdff1aSopenharmony_ci
582cabdff1aSopenharmony_ci        XORI_B4_128_SB(src0, src1, src2, src3);
583cabdff1aSopenharmony_ci        XORI_B4_128_SB(src4, src5, src6, src7);
584cabdff1aSopenharmony_ci        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
585cabdff1aSopenharmony_ci                                   mask3, filt0, filt1, filt2, filt3, out0,
586cabdff1aSopenharmony_ci                                   out1, out2, out3);
587cabdff1aSopenharmony_ci        SRARI_H4_SH(out0, out1, out2, out3, 6);
588cabdff1aSopenharmony_ci        SAT_SH4_SH(out0, out1, out2, out3, 7);
589cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(out0, out1);
590cabdff1aSopenharmony_ci        ST_UB(out, dst);
591cabdff1aSopenharmony_ci        dst += dst_stride;
592cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(out2, out3);
593cabdff1aSopenharmony_ci        ST_UB(out, dst);
594cabdff1aSopenharmony_ci        dst += dst_stride;
595cabdff1aSopenharmony_ci
596cabdff1aSopenharmony_ci        HORIZ_8TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
597cabdff1aSopenharmony_ci                                   mask3, filt0, filt1, filt2, filt3, out0,
598cabdff1aSopenharmony_ci                                   out1, out2, out3);
599cabdff1aSopenharmony_ci        SRARI_H4_SH(out0, out1, out2, out3, 6);
600cabdff1aSopenharmony_ci        SAT_SH4_SH(out0, out1, out2, out3, 7);
601cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(out0, out1);
602cabdff1aSopenharmony_ci        ST_UB(out, dst);
603cabdff1aSopenharmony_ci        dst += dst_stride;
604cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(out2, out3);
605cabdff1aSopenharmony_ci        ST_UB(out, dst);
606cabdff1aSopenharmony_ci        dst += dst_stride;
607cabdff1aSopenharmony_ci    }
608cabdff1aSopenharmony_ci}
609cabdff1aSopenharmony_ci
610cabdff1aSopenharmony_cistatic void common_hz_8t_24w_msa(uint8_t *src, int32_t src_stride,
611cabdff1aSopenharmony_ci                                 uint8_t *dst, int32_t dst_stride,
612cabdff1aSopenharmony_ci                                 const int8_t *filter, int32_t height)
613cabdff1aSopenharmony_ci{
614cabdff1aSopenharmony_ci    uint32_t loop_cnt;
615cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
616cabdff1aSopenharmony_ci    v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7, out;
617cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
618cabdff1aSopenharmony_ci    v16i8 vec11;
619cabdff1aSopenharmony_ci    v8i16 out0, out1, out2, out3, out8, out9, filt;
620cabdff1aSopenharmony_ci
621cabdff1aSopenharmony_ci    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
622cabdff1aSopenharmony_ci    src -= 3;
623cabdff1aSopenharmony_ci
624cabdff1aSopenharmony_ci    /* rearranging filter */
625cabdff1aSopenharmony_ci    filt = LD_SH(filter);
626cabdff1aSopenharmony_ci    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
627cabdff1aSopenharmony_ci
628cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
629cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
630cabdff1aSopenharmony_ci    mask3 = mask0 + 6;
631cabdff1aSopenharmony_ci    mask4 = mask0 + 8;
632cabdff1aSopenharmony_ci    mask5 = mask0 + 10;
633cabdff1aSopenharmony_ci    mask6 = mask0 + 12;
634cabdff1aSopenharmony_ci    mask7 = mask0 + 14;
635cabdff1aSopenharmony_ci
636cabdff1aSopenharmony_ci    for (loop_cnt = 16; loop_cnt--;) {
637cabdff1aSopenharmony_ci        LD_SB2(src, src_stride, src0, src2);
638cabdff1aSopenharmony_ci        LD_SB2(src + 16, src_stride, src1, src3);
639cabdff1aSopenharmony_ci        XORI_B4_128_SB(src0, src1, src2, src3);
640cabdff1aSopenharmony_ci        src += (2 * src_stride);
641cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec8);
642cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec9);
643cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src1, src2, src3, mask4, mask4, vec1, vec3);
644cabdff1aSopenharmony_ci        DOTP_SB4_SH(vec0, vec8, vec2, vec9, filt0, filt0, filt0, filt0, out0,
645cabdff1aSopenharmony_ci                    out8, out2, out9);
646cabdff1aSopenharmony_ci        DOTP_SB2_SH(vec1, vec3, filt0, filt0, out1, out3);
647cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec8);
648cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec9);
649cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src1, src2, src3, mask6, mask6, vec1, vec3);
650cabdff1aSopenharmony_ci        DPADD_SB4_SH(vec0, vec8, vec2, vec9, filt2, filt2, filt2, filt2,
651cabdff1aSopenharmony_ci                     out0, out8, out2, out9);
652cabdff1aSopenharmony_ci        DPADD_SB2_SH(vec1, vec3, filt2, filt2, out1, out3);
653cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec10);
654cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec11);
655cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src1, src2, src3, mask5, mask5, vec5, vec7);
656cabdff1aSopenharmony_ci        DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt1, filt1, filt1, filt1,
657cabdff1aSopenharmony_ci                     out0, out8, out2, out9);
658cabdff1aSopenharmony_ci        DPADD_SB2_SH(vec5, vec7, filt1, filt1, out1, out3);
659cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec10);
660cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec11);
661cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src1, src2, src3, mask7, mask7, vec5, vec7);
662cabdff1aSopenharmony_ci        DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt3, filt3, filt3, filt3,
663cabdff1aSopenharmony_ci                     out0, out8, out2, out9);
664cabdff1aSopenharmony_ci        DPADD_SB2_SH(vec5, vec7, filt3, filt3, out1, out3);
665cabdff1aSopenharmony_ci        SRARI_H4_SH(out0, out8, out2, out9, 6);
666cabdff1aSopenharmony_ci        SRARI_H2_SH(out1, out3, 6);
667cabdff1aSopenharmony_ci        SAT_SH4_SH(out0, out8, out2, out9, 7);
668cabdff1aSopenharmony_ci        SAT_SH2_SH(out1, out3, 7);
669cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(out8, out9);
670cabdff1aSopenharmony_ci        ST_D2(out, 0, 1, dst + 16, dst_stride);
671cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(out0, out1);
672cabdff1aSopenharmony_ci        ST_UB(out, dst);
673cabdff1aSopenharmony_ci        dst += dst_stride;
674cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(out2, out3);
675cabdff1aSopenharmony_ci        ST_UB(out, dst);
676cabdff1aSopenharmony_ci        dst += dst_stride;
677cabdff1aSopenharmony_ci    }
678cabdff1aSopenharmony_ci}
679cabdff1aSopenharmony_ci
680cabdff1aSopenharmony_cistatic void common_hz_8t_32w_msa(uint8_t *src, int32_t src_stride,
681cabdff1aSopenharmony_ci                                 uint8_t *dst, int32_t dst_stride,
682cabdff1aSopenharmony_ci                                 const int8_t *filter, int32_t height)
683cabdff1aSopenharmony_ci{
684cabdff1aSopenharmony_ci    uint32_t loop_cnt;
685cabdff1aSopenharmony_ci    v16u8 mask0, mask1, mask2, mask3, out;
686cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
687cabdff1aSopenharmony_ci    v16i8 filt0, filt1, filt2, filt3;
688cabdff1aSopenharmony_ci    v8i16 filt, out0, out1, out2, out3;
689cabdff1aSopenharmony_ci
690cabdff1aSopenharmony_ci    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
691cabdff1aSopenharmony_ci    src -= 3;
692cabdff1aSopenharmony_ci
693cabdff1aSopenharmony_ci    /* rearranging filter */
694cabdff1aSopenharmony_ci    filt = LD_SH(filter);
695cabdff1aSopenharmony_ci    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
696cabdff1aSopenharmony_ci
697cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
698cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
699cabdff1aSopenharmony_ci    mask3 = mask0 + 6;
700cabdff1aSopenharmony_ci
701cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 1); loop_cnt--;) {
702cabdff1aSopenharmony_ci        src0 = LD_SB(src);
703cabdff1aSopenharmony_ci        src1 = LD_SB(src + 8);
704cabdff1aSopenharmony_ci        src2 = LD_SB(src + 16);
705cabdff1aSopenharmony_ci        src3 = LD_SB(src + 24);
706cabdff1aSopenharmony_ci        src += src_stride;
707cabdff1aSopenharmony_ci        XORI_B4_128_SB(src0, src1, src2, src3);
708cabdff1aSopenharmony_ci
709cabdff1aSopenharmony_ci        src4 = LD_SB(src);
710cabdff1aSopenharmony_ci        src5 = LD_SB(src + 8);
711cabdff1aSopenharmony_ci        src6 = LD_SB(src + 16);
712cabdff1aSopenharmony_ci        src7 = LD_SB(src + 24);
713cabdff1aSopenharmony_ci        src += src_stride;
714cabdff1aSopenharmony_ci        XORI_B4_128_SB(src4, src5, src6, src7);
715cabdff1aSopenharmony_ci
716cabdff1aSopenharmony_ci        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
717cabdff1aSopenharmony_ci                                   mask3, filt0, filt1, filt2, filt3, out0,
718cabdff1aSopenharmony_ci                                   out1, out2, out3);
719cabdff1aSopenharmony_ci        SRARI_H4_SH(out0, out1, out2, out3, 6);
720cabdff1aSopenharmony_ci        SAT_SH4_SH(out0, out1, out2, out3, 7);
721cabdff1aSopenharmony_ci
722cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(out0, out1);
723cabdff1aSopenharmony_ci        ST_UB(out, dst);
724cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(out2, out3);
725cabdff1aSopenharmony_ci        ST_UB(out, dst + 16);
726cabdff1aSopenharmony_ci        dst += dst_stride;
727cabdff1aSopenharmony_ci
728cabdff1aSopenharmony_ci        HORIZ_8TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
729cabdff1aSopenharmony_ci                                   mask3, filt0, filt1, filt2, filt3, out0,
730cabdff1aSopenharmony_ci                                   out1, out2, out3);
731cabdff1aSopenharmony_ci        SRARI_H4_SH(out0, out1, out2, out3, 6);
732cabdff1aSopenharmony_ci        SAT_SH4_SH(out0, out1, out2, out3, 7);
733cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(out0, out1);
734cabdff1aSopenharmony_ci        ST_UB(out, dst);
735cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(out2, out3);
736cabdff1aSopenharmony_ci        ST_UB(out, dst + 16);
737cabdff1aSopenharmony_ci        dst += dst_stride;
738cabdff1aSopenharmony_ci    }
739cabdff1aSopenharmony_ci}
740cabdff1aSopenharmony_ci
741cabdff1aSopenharmony_cistatic void common_hz_8t_48w_msa(uint8_t *src, int32_t src_stride,
742cabdff1aSopenharmony_ci                                 uint8_t *dst, int32_t dst_stride,
743cabdff1aSopenharmony_ci                                 const int8_t *filter, int32_t height)
744cabdff1aSopenharmony_ci{
745cabdff1aSopenharmony_ci    uint32_t loop_cnt;
746cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3, vec0, vec1, vec2;
747cabdff1aSopenharmony_ci    v16i8 src4;
748cabdff1aSopenharmony_ci    v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7, out;
749cabdff1aSopenharmony_ci    v8i16 filt, out0, out1, out2, out3;
750cabdff1aSopenharmony_ci
751cabdff1aSopenharmony_ci    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
752cabdff1aSopenharmony_ci    src -= 3;
753cabdff1aSopenharmony_ci
754cabdff1aSopenharmony_ci    /* rearranging filter */
755cabdff1aSopenharmony_ci    filt = LD_SH(filter);
756cabdff1aSopenharmony_ci    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
757cabdff1aSopenharmony_ci
758cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
759cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
760cabdff1aSopenharmony_ci    mask3 = mask0 + 6;
761cabdff1aSopenharmony_ci    mask4 = mask0 + 8;
762cabdff1aSopenharmony_ci    mask5 = mask0 + 10;
763cabdff1aSopenharmony_ci    mask6 = mask0 + 12;
764cabdff1aSopenharmony_ci    mask7 = mask0 + 14;
765cabdff1aSopenharmony_ci
766cabdff1aSopenharmony_ci    for (loop_cnt = 64; loop_cnt--;) {
767cabdff1aSopenharmony_ci        src0 = LD_SB(src);
768cabdff1aSopenharmony_ci        src1 = LD_SB(src + 8);
769cabdff1aSopenharmony_ci        src2 = LD_SB(src + 16);
770cabdff1aSopenharmony_ci        src3 = LD_SB(src + 32);
771cabdff1aSopenharmony_ci        src4 = LD_SB(src + 40);
772cabdff1aSopenharmony_ci        src += src_stride;
773cabdff1aSopenharmony_ci
774cabdff1aSopenharmony_ci        XORI_B4_128_SB(src0, src1, src2, src3);
775cabdff1aSopenharmony_ci        src4 = (v16i8) __msa_xori_b((v16u8) src4, 128);
776cabdff1aSopenharmony_ci
777cabdff1aSopenharmony_ci        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask0, mask0, mask0,
778cabdff1aSopenharmony_ci                   vec0, vec1, vec2);
779cabdff1aSopenharmony_ci        DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
780cabdff1aSopenharmony_ci        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask1, mask1, mask1,
781cabdff1aSopenharmony_ci                   vec0, vec1, vec2);
782cabdff1aSopenharmony_ci        DPADD_SB2_SH(vec0, vec1, filt1, filt1, out0, out1);
783cabdff1aSopenharmony_ci        out2 = __msa_dpadd_s_h(out2, vec2, filt1);
784cabdff1aSopenharmony_ci        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask2, mask2, mask2,
785cabdff1aSopenharmony_ci                   vec0, vec1, vec2);
786cabdff1aSopenharmony_ci        DPADD_SB2_SH(vec0, vec1, filt2, filt2, out0, out1);
787cabdff1aSopenharmony_ci        out2 = __msa_dpadd_s_h(out2, vec2, filt2);
788cabdff1aSopenharmony_ci
789cabdff1aSopenharmony_ci        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask3, mask3, mask3,
790cabdff1aSopenharmony_ci                   vec0, vec1, vec2);
791cabdff1aSopenharmony_ci        DPADD_SB2_SH(vec0, vec1, filt3, filt3, out0, out1);
792cabdff1aSopenharmony_ci        out2 = __msa_dpadd_s_h(out2, vec2, filt3);
793cabdff1aSopenharmony_ci
794cabdff1aSopenharmony_ci        SRARI_H2_SH(out0, out1, 6);
795cabdff1aSopenharmony_ci        out3 = __msa_srari_h(out2, 6);
796cabdff1aSopenharmony_ci        SAT_SH3_SH(out0, out1, out3, 7);
797cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(out0, out1);
798cabdff1aSopenharmony_ci        ST_UB(out, dst);
799cabdff1aSopenharmony_ci
800cabdff1aSopenharmony_ci        VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask4, mask0, mask0,
801cabdff1aSopenharmony_ci                   vec0, vec1, vec2);
802cabdff1aSopenharmony_ci        DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
803cabdff1aSopenharmony_ci        VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask5, mask1, mask1,
804cabdff1aSopenharmony_ci                   vec0, vec1, vec2);
805cabdff1aSopenharmony_ci        DPADD_SB2_SH(vec0, vec1, filt1, filt1, out0, out1);
806cabdff1aSopenharmony_ci        out2 = __msa_dpadd_s_h(out2, vec2, filt1);
807cabdff1aSopenharmony_ci        VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask6, mask2, mask2,
808cabdff1aSopenharmony_ci                   vec0, vec1, vec2);
809cabdff1aSopenharmony_ci        DPADD_SB2_SH(vec0, vec1, filt2, filt2, out0, out1);
810cabdff1aSopenharmony_ci        out2 = __msa_dpadd_s_h(out2, vec2, filt2);
811cabdff1aSopenharmony_ci        VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask7, mask3, mask3,
812cabdff1aSopenharmony_ci                   vec0, vec1, vec2);
813cabdff1aSopenharmony_ci        DPADD_SB2_SH(vec0, vec1, filt3, filt3, out0, out1);
814cabdff1aSopenharmony_ci        out2 = __msa_dpadd_s_h(out2, vec2, filt3);
815cabdff1aSopenharmony_ci
816cabdff1aSopenharmony_ci        SRARI_H2_SH(out0, out1, 6);
817cabdff1aSopenharmony_ci        out2 = __msa_srari_h(out2, 6);
818cabdff1aSopenharmony_ci        SAT_SH3_SH(out0, out1, out2, 7);
819cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(out3, out0);
820cabdff1aSopenharmony_ci        ST_UB(out, dst + 16);
821cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(out1, out2);
822cabdff1aSopenharmony_ci        ST_UB(out, dst + 32);
823cabdff1aSopenharmony_ci        dst += dst_stride;
824cabdff1aSopenharmony_ci    }
825cabdff1aSopenharmony_ci}
826cabdff1aSopenharmony_ci
827cabdff1aSopenharmony_cistatic void common_hz_8t_64w_msa(uint8_t *src, int32_t src_stride,
828cabdff1aSopenharmony_ci                                 uint8_t *dst, int32_t dst_stride,
829cabdff1aSopenharmony_ci                                 const int8_t *filter, int32_t height)
830cabdff1aSopenharmony_ci{
831cabdff1aSopenharmony_ci    int32_t loop_cnt;
832cabdff1aSopenharmony_ci    v16u8 mask0, mask1, mask2, mask3, out;
833cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
834cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
835cabdff1aSopenharmony_ci    v16i8 filt0, filt1, filt2, filt3;
836cabdff1aSopenharmony_ci    v8i16 res0, res1, res2, res3, filt;
837cabdff1aSopenharmony_ci
838cabdff1aSopenharmony_ci    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
839cabdff1aSopenharmony_ci    src -= 3;
840cabdff1aSopenharmony_ci
841cabdff1aSopenharmony_ci    /* rearranging filter */
842cabdff1aSopenharmony_ci    filt = LD_SH(filter);
843cabdff1aSopenharmony_ci    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
844cabdff1aSopenharmony_ci
845cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
846cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
847cabdff1aSopenharmony_ci    mask3 = mask0 + 6;
848cabdff1aSopenharmony_ci
849cabdff1aSopenharmony_ci    for (loop_cnt = height; loop_cnt--;) {
850cabdff1aSopenharmony_ci        LD_SB8(src, 8, src0, src1, src2, src3, src4, src5, src6, src7);
851cabdff1aSopenharmony_ci        src += src_stride;
852cabdff1aSopenharmony_ci
853cabdff1aSopenharmony_ci        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
854cabdff1aSopenharmony_ci
855cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
856cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
857cabdff1aSopenharmony_ci        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0,
858cabdff1aSopenharmony_ci                    res1, res2, res3);
859cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
860cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
861cabdff1aSopenharmony_ci        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, res0,
862cabdff1aSopenharmony_ci                     res1, res2, res3);
863cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
864cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
865cabdff1aSopenharmony_ci        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, res0,
866cabdff1aSopenharmony_ci                     res1, res2, res3);
867cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec5);
868cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec7);
869cabdff1aSopenharmony_ci        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, res0,
870cabdff1aSopenharmony_ci                     res1, res2, res3);
871cabdff1aSopenharmony_ci
872cabdff1aSopenharmony_ci        SRARI_H4_SH(res0, res1, res2, res3, 6);
873cabdff1aSopenharmony_ci        SAT_SH4_SH(res0, res1, res2, res3, 7);
874cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(res0, res1);
875cabdff1aSopenharmony_ci        ST_UB(out, dst);
876cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(res2, res3);
877cabdff1aSopenharmony_ci        ST_UB(out, dst + 16);
878cabdff1aSopenharmony_ci
879cabdff1aSopenharmony_ci        VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
880cabdff1aSopenharmony_ci        VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
881cabdff1aSopenharmony_ci        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0,
882cabdff1aSopenharmony_ci                    res1, res2, res3);
883cabdff1aSopenharmony_ci        VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec0, vec1);
884cabdff1aSopenharmony_ci        VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec2, vec3);
885cabdff1aSopenharmony_ci        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, res0,
886cabdff1aSopenharmony_ci                     res1, res2, res3);
887cabdff1aSopenharmony_ci        VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
888cabdff1aSopenharmony_ci        VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
889cabdff1aSopenharmony_ci        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, res0,
890cabdff1aSopenharmony_ci                     res1, res2, res3);
891cabdff1aSopenharmony_ci        VSHF_B2_SB(src4, src4, src5, src5, mask3, mask3, vec4, vec5);
892cabdff1aSopenharmony_ci        VSHF_B2_SB(src6, src6, src7, src7, mask3, mask3, vec6, vec7);
893cabdff1aSopenharmony_ci        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, res0,
894cabdff1aSopenharmony_ci                     res1, res2, res3);
895cabdff1aSopenharmony_ci
896cabdff1aSopenharmony_ci        SRARI_H4_SH(res0, res1, res2, res3, 6);
897cabdff1aSopenharmony_ci        SAT_SH4_SH(res0, res1, res2, res3, 7);
898cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(res0, res1);
899cabdff1aSopenharmony_ci        ST_UB(out, dst + 32);
900cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(res2, res3);
901cabdff1aSopenharmony_ci        ST_UB(out, dst + 48);
902cabdff1aSopenharmony_ci        dst += dst_stride;
903cabdff1aSopenharmony_ci    }
904cabdff1aSopenharmony_ci}
905cabdff1aSopenharmony_ci
906cabdff1aSopenharmony_cistatic void common_vt_8t_4w_msa(uint8_t *src, int32_t src_stride,
907cabdff1aSopenharmony_ci                                uint8_t *dst, int32_t dst_stride,
908cabdff1aSopenharmony_ci                                const int8_t *filter, int32_t height)
909cabdff1aSopenharmony_ci{
910cabdff1aSopenharmony_ci    uint32_t loop_cnt;
911cabdff1aSopenharmony_ci    v16u8 out0, out1;
912cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
913cabdff1aSopenharmony_ci    v16i8 src11, src12, src13, src14;
914cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
915cabdff1aSopenharmony_ci    v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
916cabdff1aSopenharmony_ci    v16i8 src1110_r, src1211_r, src1312_r, src1413_r, src12111110, src14131312;
917cabdff1aSopenharmony_ci    v16i8 src10998, filt0, filt1, filt2, filt3;
918cabdff1aSopenharmony_ci    v8i16 filt, out10, out32, out54, out76;
919cabdff1aSopenharmony_ci
920cabdff1aSopenharmony_ci    src -= (3 * src_stride);
921cabdff1aSopenharmony_ci
922cabdff1aSopenharmony_ci    filt = LD_SH(filter);
923cabdff1aSopenharmony_ci    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
924cabdff1aSopenharmony_ci
925cabdff1aSopenharmony_ci    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
926cabdff1aSopenharmony_ci    src += (7 * src_stride);
927cabdff1aSopenharmony_ci
928cabdff1aSopenharmony_ci    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
929cabdff1aSopenharmony_ci               src54_r, src21_r);
930cabdff1aSopenharmony_ci    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
931cabdff1aSopenharmony_ci    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
932cabdff1aSopenharmony_ci               src4332, src6554);
933cabdff1aSopenharmony_ci    XORI_B3_128_SB(src2110, src4332, src6554);
934cabdff1aSopenharmony_ci
935cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 3); loop_cnt--;) {
936cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src7, src8, src9, src10);
937cabdff1aSopenharmony_ci        src += (4 * src_stride);
938cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src11, src12, src13, src14);
939cabdff1aSopenharmony_ci        src += (4 * src_stride);
940cabdff1aSopenharmony_ci
941cabdff1aSopenharmony_ci        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
942cabdff1aSopenharmony_ci                   src87_r, src98_r, src109_r);
943cabdff1aSopenharmony_ci        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
944cabdff1aSopenharmony_ci                   src1110_r, src1211_r, src1312_r, src1413_r);
945cabdff1aSopenharmony_ci        ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
946cabdff1aSopenharmony_ci        ILVR_D2_SB(src1211_r, src1110_r, src1413_r, src1312_r,
947cabdff1aSopenharmony_ci                   src12111110, src14131312);
948cabdff1aSopenharmony_ci        XORI_B2_128_SB(src8776, src10998);
949cabdff1aSopenharmony_ci        XORI_B2_128_SB(src12111110, src14131312);
950cabdff1aSopenharmony_ci
951cabdff1aSopenharmony_ci        DOTP_SB2_SH(src2110, src4332, filt0, filt0, out10, out32);
952cabdff1aSopenharmony_ci        DOTP_SB2_SH(src6554, src8776, filt0, filt0, out54, out76);
953cabdff1aSopenharmony_ci        DPADD_SB2_SH(src4332, src6554, filt1, filt1, out10, out32);
954cabdff1aSopenharmony_ci        DPADD_SB2_SH(src8776, src10998, filt1, filt1, out54, out76);
955cabdff1aSopenharmony_ci        DPADD_SB2_SH(src6554, src8776, filt2, filt2, out10, out32);
956cabdff1aSopenharmony_ci        DPADD_SB2_SH(src10998, src12111110, filt2, filt2, out54, out76);
957cabdff1aSopenharmony_ci        DPADD_SB2_SH(src8776, src10998, filt3, filt3, out10, out32);
958cabdff1aSopenharmony_ci        DPADD_SB2_SH(src12111110, src14131312, filt3, filt3, out54, out76);
959cabdff1aSopenharmony_ci        SRARI_H2_SH(out10, out32, 6);
960cabdff1aSopenharmony_ci        SRARI_H2_SH(out54, out76, 6);
961cabdff1aSopenharmony_ci        SAT_SH2_SH(out10, out32, 7);
962cabdff1aSopenharmony_ci        SAT_SH2_SH(out54, out76, 7);
963cabdff1aSopenharmony_ci        out0 = PCKEV_XORI128_UB(out10, out32);
964cabdff1aSopenharmony_ci        out1 = PCKEV_XORI128_UB(out54, out76);
965cabdff1aSopenharmony_ci        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
966cabdff1aSopenharmony_ci        dst += (8 * dst_stride);
967cabdff1aSopenharmony_ci
968cabdff1aSopenharmony_ci        src2110 = src10998;
969cabdff1aSopenharmony_ci        src4332 = src12111110;
970cabdff1aSopenharmony_ci        src6554 = src14131312;
971cabdff1aSopenharmony_ci        src6 = src14;
972cabdff1aSopenharmony_ci    }
973cabdff1aSopenharmony_ci}
974cabdff1aSopenharmony_ci
975cabdff1aSopenharmony_cistatic void common_vt_8t_8w_msa(uint8_t *src, int32_t src_stride,
976cabdff1aSopenharmony_ci                                uint8_t *dst, int32_t dst_stride,
977cabdff1aSopenharmony_ci                                const int8_t *filter, int32_t height)
978cabdff1aSopenharmony_ci{
979cabdff1aSopenharmony_ci    uint32_t loop_cnt;
980cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
981cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
982cabdff1aSopenharmony_ci    v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
983cabdff1aSopenharmony_ci    v16u8 tmp0, tmp1;
984cabdff1aSopenharmony_ci    v8i16 filt, out0_r, out1_r, out2_r, out3_r;
985cabdff1aSopenharmony_ci
986cabdff1aSopenharmony_ci    src -= (3 * src_stride);
987cabdff1aSopenharmony_ci
988cabdff1aSopenharmony_ci    filt = LD_SH(filter);
989cabdff1aSopenharmony_ci    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
990cabdff1aSopenharmony_ci
991cabdff1aSopenharmony_ci    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
992cabdff1aSopenharmony_ci    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
993cabdff1aSopenharmony_ci    src += (7 * src_stride);
994cabdff1aSopenharmony_ci    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
995cabdff1aSopenharmony_ci               src54_r, src21_r);
996cabdff1aSopenharmony_ci    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
997cabdff1aSopenharmony_ci
998cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
999cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src7, src8, src9, src10);
1000cabdff1aSopenharmony_ci        XORI_B4_128_SB(src7, src8, src9, src10);
1001cabdff1aSopenharmony_ci        src += (4 * src_stride);
1002cabdff1aSopenharmony_ci
1003cabdff1aSopenharmony_ci        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1004cabdff1aSopenharmony_ci                   src87_r, src98_r, src109_r);
1005cabdff1aSopenharmony_ci        DOTP_SB4_SH(src10_r, src21_r, src32_r, src43_r, filt0, filt0, filt0,
1006cabdff1aSopenharmony_ci                    filt0, out0_r, out1_r, out2_r, out3_r);
1007cabdff1aSopenharmony_ci        DPADD_SB4_SH(src32_r, src43_r, src54_r, src65_r, filt1, filt1, filt1,
1008cabdff1aSopenharmony_ci                     filt1, out0_r, out1_r, out2_r, out3_r);
1009cabdff1aSopenharmony_ci        DPADD_SB4_SH(src54_r, src65_r, src76_r, src87_r, filt2, filt2, filt2,
1010cabdff1aSopenharmony_ci                     filt2, out0_r, out1_r, out2_r, out3_r);
1011cabdff1aSopenharmony_ci        DPADD_SB4_SH(src76_r, src87_r, src98_r, src109_r, filt3, filt3, filt3,
1012cabdff1aSopenharmony_ci                     filt3, out0_r, out1_r, out2_r, out3_r);
1013cabdff1aSopenharmony_ci        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
1014cabdff1aSopenharmony_ci        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1015cabdff1aSopenharmony_ci        tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
1016cabdff1aSopenharmony_ci        tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
1017cabdff1aSopenharmony_ci        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
1018cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
1019cabdff1aSopenharmony_ci
1020cabdff1aSopenharmony_ci        src10_r = src54_r;
1021cabdff1aSopenharmony_ci        src32_r = src76_r;
1022cabdff1aSopenharmony_ci        src54_r = src98_r;
1023cabdff1aSopenharmony_ci        src21_r = src65_r;
1024cabdff1aSopenharmony_ci        src43_r = src87_r;
1025cabdff1aSopenharmony_ci        src65_r = src109_r;
1026cabdff1aSopenharmony_ci        src6 = src10;
1027cabdff1aSopenharmony_ci    }
1028cabdff1aSopenharmony_ci}
1029cabdff1aSopenharmony_ci
1030cabdff1aSopenharmony_cistatic void common_vt_8t_12w_msa(uint8_t *src, int32_t src_stride,
1031cabdff1aSopenharmony_ci                                 uint8_t *dst, int32_t dst_stride,
1032cabdff1aSopenharmony_ci                                 const int8_t *filter, int32_t height)
1033cabdff1aSopenharmony_ci{
1034cabdff1aSopenharmony_ci    uint32_t loop_cnt;
1035cabdff1aSopenharmony_ci    uint32_t out2, out3;
1036cabdff1aSopenharmony_ci    uint64_t out0, out1;
1037cabdff1aSopenharmony_ci    v16u8 tmp0, tmp1, tmp2, tmp3;
1038cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1039cabdff1aSopenharmony_ci    v16i8 filt0, filt1, filt2, filt3;
1040cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
1041cabdff1aSopenharmony_ci    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
1042cabdff1aSopenharmony_ci    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
1043cabdff1aSopenharmony_ci    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
1044cabdff1aSopenharmony_ci
1045cabdff1aSopenharmony_ci    src -= (3 * src_stride);
1046cabdff1aSopenharmony_ci
1047cabdff1aSopenharmony_ci    filt = LD_SH(filter);
1048cabdff1aSopenharmony_ci    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1049cabdff1aSopenharmony_ci
1050cabdff1aSopenharmony_ci    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1051cabdff1aSopenharmony_ci    src += (7 * src_stride);
1052cabdff1aSopenharmony_ci
1053cabdff1aSopenharmony_ci    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1054cabdff1aSopenharmony_ci
1055cabdff1aSopenharmony_ci    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
1056cabdff1aSopenharmony_ci               src54_r, src21_r);
1057cabdff1aSopenharmony_ci    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1058cabdff1aSopenharmony_ci    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
1059cabdff1aSopenharmony_ci               src54_l, src21_l);
1060cabdff1aSopenharmony_ci    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1061cabdff1aSopenharmony_ci
1062cabdff1aSopenharmony_ci    for (loop_cnt = 4; loop_cnt--;) {
1063cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src7, src8, src9, src10);
1064cabdff1aSopenharmony_ci        XORI_B4_128_SB(src7, src8, src9, src10);
1065cabdff1aSopenharmony_ci        src += (4 * src_stride);
1066cabdff1aSopenharmony_ci
1067cabdff1aSopenharmony_ci        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1068cabdff1aSopenharmony_ci                   src87_r, src98_r, src109_r);
1069cabdff1aSopenharmony_ci        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
1070cabdff1aSopenharmony_ci                   src87_l, src98_l, src109_l);
1071cabdff1aSopenharmony_ci        out0_r = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
1072cabdff1aSopenharmony_ci                                   filt1, filt2, filt3);
1073cabdff1aSopenharmony_ci        out1_r = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
1074cabdff1aSopenharmony_ci                                   filt1, filt2, filt3);
1075cabdff1aSopenharmony_ci        out2_r = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
1076cabdff1aSopenharmony_ci                                   filt1, filt2, filt3);
1077cabdff1aSopenharmony_ci        out3_r = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0,
1078cabdff1aSopenharmony_ci                                   filt1, filt2, filt3);
1079cabdff1aSopenharmony_ci        out0_l = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l, filt0,
1080cabdff1aSopenharmony_ci                                   filt1, filt2, filt3);
1081cabdff1aSopenharmony_ci        out1_l = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l, filt0,
1082cabdff1aSopenharmony_ci                                   filt1, filt2, filt3);
1083cabdff1aSopenharmony_ci        out2_l = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l, filt0,
1084cabdff1aSopenharmony_ci                                   filt1, filt2, filt3);
1085cabdff1aSopenharmony_ci        out3_l = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l, filt0,
1086cabdff1aSopenharmony_ci                                   filt1, filt2, filt3);
1087cabdff1aSopenharmony_ci        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
1088cabdff1aSopenharmony_ci        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
1089cabdff1aSopenharmony_ci        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1090cabdff1aSopenharmony_ci        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1091cabdff1aSopenharmony_ci        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1092cabdff1aSopenharmony_ci                    out3_r, tmp0, tmp1, tmp2, tmp3);
1093cabdff1aSopenharmony_ci        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
1094cabdff1aSopenharmony_ci
1095cabdff1aSopenharmony_ci        out0 = __msa_copy_u_d((v2i64) tmp0, 0);
1096cabdff1aSopenharmony_ci        out1 = __msa_copy_u_d((v2i64) tmp1, 0);
1097cabdff1aSopenharmony_ci        out2 = __msa_copy_u_w((v4i32) tmp0, 2);
1098cabdff1aSopenharmony_ci        out3 = __msa_copy_u_w((v4i32) tmp1, 2);
1099cabdff1aSopenharmony_ci        SD(out0, dst);
1100cabdff1aSopenharmony_ci        SW(out2, (dst + 8));
1101cabdff1aSopenharmony_ci        dst += dst_stride;
1102cabdff1aSopenharmony_ci        SD(out1, dst);
1103cabdff1aSopenharmony_ci        SW(out3, (dst + 8));
1104cabdff1aSopenharmony_ci        dst += dst_stride;
1105cabdff1aSopenharmony_ci        out0 = __msa_copy_u_d((v2i64) tmp2, 0);
1106cabdff1aSopenharmony_ci        out1 = __msa_copy_u_d((v2i64) tmp3, 0);
1107cabdff1aSopenharmony_ci        out2 = __msa_copy_u_w((v4i32) tmp2, 2);
1108cabdff1aSopenharmony_ci        out3 = __msa_copy_u_w((v4i32) tmp3, 2);
1109cabdff1aSopenharmony_ci        SD(out0, dst);
1110cabdff1aSopenharmony_ci        SW(out2, (dst + 8));
1111cabdff1aSopenharmony_ci        dst += dst_stride;
1112cabdff1aSopenharmony_ci        SD(out1, dst);
1113cabdff1aSopenharmony_ci        SW(out3, (dst + 8));
1114cabdff1aSopenharmony_ci        dst += dst_stride;
1115cabdff1aSopenharmony_ci
1116cabdff1aSopenharmony_ci        src10_r = src54_r;
1117cabdff1aSopenharmony_ci        src32_r = src76_r;
1118cabdff1aSopenharmony_ci        src54_r = src98_r;
1119cabdff1aSopenharmony_ci        src21_r = src65_r;
1120cabdff1aSopenharmony_ci        src43_r = src87_r;
1121cabdff1aSopenharmony_ci        src65_r = src109_r;
1122cabdff1aSopenharmony_ci        src10_l = src54_l;
1123cabdff1aSopenharmony_ci        src32_l = src76_l;
1124cabdff1aSopenharmony_ci        src54_l = src98_l;
1125cabdff1aSopenharmony_ci        src21_l = src65_l;
1126cabdff1aSopenharmony_ci        src43_l = src87_l;
1127cabdff1aSopenharmony_ci        src65_l = src109_l;
1128cabdff1aSopenharmony_ci        src6 = src10;
1129cabdff1aSopenharmony_ci    }
1130cabdff1aSopenharmony_ci}
1131cabdff1aSopenharmony_ci
1132cabdff1aSopenharmony_cistatic void common_vt_8t_16w_msa(uint8_t *src, int32_t src_stride,
1133cabdff1aSopenharmony_ci                                 uint8_t *dst, int32_t dst_stride,
1134cabdff1aSopenharmony_ci                                 const int8_t *filter, int32_t height)
1135cabdff1aSopenharmony_ci{
1136cabdff1aSopenharmony_ci    uint32_t loop_cnt;
1137cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1138cabdff1aSopenharmony_ci    v16i8 filt0, filt1, filt2, filt3;
1139cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
1140cabdff1aSopenharmony_ci    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
1141cabdff1aSopenharmony_ci    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
1142cabdff1aSopenharmony_ci    v16u8 tmp0, tmp1, tmp2, tmp3;
1143cabdff1aSopenharmony_ci    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
1144cabdff1aSopenharmony_ci
1145cabdff1aSopenharmony_ci    src -= (3 * src_stride);
1146cabdff1aSopenharmony_ci
1147cabdff1aSopenharmony_ci    filt = LD_SH(filter);
1148cabdff1aSopenharmony_ci    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1149cabdff1aSopenharmony_ci
1150cabdff1aSopenharmony_ci    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1151cabdff1aSopenharmony_ci    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1152cabdff1aSopenharmony_ci    src += (7 * src_stride);
1153cabdff1aSopenharmony_ci    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
1154cabdff1aSopenharmony_ci               src54_r, src21_r);
1155cabdff1aSopenharmony_ci    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1156cabdff1aSopenharmony_ci    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
1157cabdff1aSopenharmony_ci               src54_l, src21_l);
1158cabdff1aSopenharmony_ci    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1159cabdff1aSopenharmony_ci
1160cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
1161cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src7, src8, src9, src10);
1162cabdff1aSopenharmony_ci        XORI_B4_128_SB(src7, src8, src9, src10);
1163cabdff1aSopenharmony_ci        src += (4 * src_stride);
1164cabdff1aSopenharmony_ci
1165cabdff1aSopenharmony_ci        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1166cabdff1aSopenharmony_ci                   src87_r, src98_r, src109_r);
1167cabdff1aSopenharmony_ci        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
1168cabdff1aSopenharmony_ci                   src87_l, src98_l, src109_l);
1169cabdff1aSopenharmony_ci        out0_r = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
1170cabdff1aSopenharmony_ci                                   filt1, filt2, filt3);
1171cabdff1aSopenharmony_ci        out1_r = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
1172cabdff1aSopenharmony_ci                                   filt1, filt2, filt3);
1173cabdff1aSopenharmony_ci        out2_r = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
1174cabdff1aSopenharmony_ci                                   filt1, filt2, filt3);
1175cabdff1aSopenharmony_ci        out3_r = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0,
1176cabdff1aSopenharmony_ci                                   filt1, filt2, filt3);
1177cabdff1aSopenharmony_ci        out0_l = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l, filt0,
1178cabdff1aSopenharmony_ci                                   filt1, filt2, filt3);
1179cabdff1aSopenharmony_ci        out1_l = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l, filt0,
1180cabdff1aSopenharmony_ci                                   filt1, filt2, filt3);
1181cabdff1aSopenharmony_ci        out2_l = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l, filt0,
1182cabdff1aSopenharmony_ci                                   filt1, filt2, filt3);
1183cabdff1aSopenharmony_ci        out3_l = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l, filt0,
1184cabdff1aSopenharmony_ci                                   filt1, filt2, filt3);
1185cabdff1aSopenharmony_ci        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
1186cabdff1aSopenharmony_ci        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
1187cabdff1aSopenharmony_ci        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1188cabdff1aSopenharmony_ci        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1189cabdff1aSopenharmony_ci        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1190cabdff1aSopenharmony_ci                    out3_r, tmp0, tmp1, tmp2, tmp3);
1191cabdff1aSopenharmony_ci        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
1192cabdff1aSopenharmony_ci        ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
1193cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
1194cabdff1aSopenharmony_ci
1195cabdff1aSopenharmony_ci        src10_r = src54_r;
1196cabdff1aSopenharmony_ci        src32_r = src76_r;
1197cabdff1aSopenharmony_ci        src54_r = src98_r;
1198cabdff1aSopenharmony_ci        src21_r = src65_r;
1199cabdff1aSopenharmony_ci        src43_r = src87_r;
1200cabdff1aSopenharmony_ci        src65_r = src109_r;
1201cabdff1aSopenharmony_ci        src10_l = src54_l;
1202cabdff1aSopenharmony_ci        src32_l = src76_l;
1203cabdff1aSopenharmony_ci        src54_l = src98_l;
1204cabdff1aSopenharmony_ci        src21_l = src65_l;
1205cabdff1aSopenharmony_ci        src43_l = src87_l;
1206cabdff1aSopenharmony_ci        src65_l = src109_l;
1207cabdff1aSopenharmony_ci        src6 = src10;
1208cabdff1aSopenharmony_ci    }
1209cabdff1aSopenharmony_ci}
1210cabdff1aSopenharmony_ci
1211cabdff1aSopenharmony_cistatic void common_vt_8t_16w_mult_msa(uint8_t *src, int32_t src_stride,
1212cabdff1aSopenharmony_ci                                      uint8_t *dst, int32_t dst_stride,
1213cabdff1aSopenharmony_ci                                      const int8_t *filter, int32_t height,
1214cabdff1aSopenharmony_ci                                      int32_t width)
1215cabdff1aSopenharmony_ci{
1216cabdff1aSopenharmony_ci    uint8_t *src_tmp;
1217cabdff1aSopenharmony_ci    uint8_t *dst_tmp;
1218cabdff1aSopenharmony_ci    uint32_t loop_cnt, cnt;
1219cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1220cabdff1aSopenharmony_ci    v16i8 filt0, filt1, filt2, filt3;
1221cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
1222cabdff1aSopenharmony_ci    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
1223cabdff1aSopenharmony_ci    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
1224cabdff1aSopenharmony_ci    v16u8 tmp0, tmp1, tmp2, tmp3;
1225cabdff1aSopenharmony_ci    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
1226cabdff1aSopenharmony_ci
1227cabdff1aSopenharmony_ci    src -= (3 * src_stride);
1228cabdff1aSopenharmony_ci
1229cabdff1aSopenharmony_ci    filt = LD_SH(filter);
1230cabdff1aSopenharmony_ci    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1231cabdff1aSopenharmony_ci
1232cabdff1aSopenharmony_ci    for (cnt = (width >> 4); cnt--;) {
1233cabdff1aSopenharmony_ci        src_tmp = src;
1234cabdff1aSopenharmony_ci        dst_tmp = dst;
1235cabdff1aSopenharmony_ci
1236cabdff1aSopenharmony_ci        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1237cabdff1aSopenharmony_ci        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1238cabdff1aSopenharmony_ci        src_tmp += (7 * src_stride);
1239cabdff1aSopenharmony_ci        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
1240cabdff1aSopenharmony_ci                   src32_r, src54_r, src21_r);
1241cabdff1aSopenharmony_ci        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1242cabdff1aSopenharmony_ci        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
1243cabdff1aSopenharmony_ci                   src32_l, src54_l, src21_l);
1244cabdff1aSopenharmony_ci        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1245cabdff1aSopenharmony_ci
1246cabdff1aSopenharmony_ci        for (loop_cnt = (height >> 2); loop_cnt--;) {
1247cabdff1aSopenharmony_ci            LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
1248cabdff1aSopenharmony_ci            XORI_B4_128_SB(src7, src8, src9, src10);
1249cabdff1aSopenharmony_ci            src_tmp += (4 * src_stride);
1250cabdff1aSopenharmony_ci            ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1251cabdff1aSopenharmony_ci                       src87_r, src98_r, src109_r);
1252cabdff1aSopenharmony_ci            ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
1253cabdff1aSopenharmony_ci                       src87_l, src98_l, src109_l);
1254cabdff1aSopenharmony_ci            out0_r = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r,
1255cabdff1aSopenharmony_ci                                       filt0, filt1, filt2, filt3);
1256cabdff1aSopenharmony_ci            out1_r = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r,
1257cabdff1aSopenharmony_ci                                       filt0, filt1, filt2, filt3);
1258cabdff1aSopenharmony_ci            out2_r = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r,
1259cabdff1aSopenharmony_ci                                       filt0, filt1, filt2, filt3);
1260cabdff1aSopenharmony_ci            out3_r = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r,
1261cabdff1aSopenharmony_ci                                       filt0, filt1, filt2, filt3);
1262cabdff1aSopenharmony_ci            out0_l = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l,
1263cabdff1aSopenharmony_ci                                       filt0, filt1, filt2, filt3);
1264cabdff1aSopenharmony_ci            out1_l = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l,
1265cabdff1aSopenharmony_ci                                       filt0, filt1, filt2, filt3);
1266cabdff1aSopenharmony_ci            out2_l = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l,
1267cabdff1aSopenharmony_ci                                       filt0, filt1, filt2, filt3);
1268cabdff1aSopenharmony_ci            out3_l = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l,
1269cabdff1aSopenharmony_ci                                       filt0, filt1, filt2, filt3);
1270cabdff1aSopenharmony_ci            SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
1271cabdff1aSopenharmony_ci            SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
1272cabdff1aSopenharmony_ci            SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1273cabdff1aSopenharmony_ci            SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1274cabdff1aSopenharmony_ci            PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1275cabdff1aSopenharmony_ci                        out3_r, tmp0, tmp1, tmp2, tmp3);
1276cabdff1aSopenharmony_ci            XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
1277cabdff1aSopenharmony_ci            ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
1278cabdff1aSopenharmony_ci            dst_tmp += (4 * dst_stride);
1279cabdff1aSopenharmony_ci
1280cabdff1aSopenharmony_ci            src10_r = src54_r;
1281cabdff1aSopenharmony_ci            src32_r = src76_r;
1282cabdff1aSopenharmony_ci            src54_r = src98_r;
1283cabdff1aSopenharmony_ci            src21_r = src65_r;
1284cabdff1aSopenharmony_ci            src43_r = src87_r;
1285cabdff1aSopenharmony_ci            src65_r = src109_r;
1286cabdff1aSopenharmony_ci            src10_l = src54_l;
1287cabdff1aSopenharmony_ci            src32_l = src76_l;
1288cabdff1aSopenharmony_ci            src54_l = src98_l;
1289cabdff1aSopenharmony_ci            src21_l = src65_l;
1290cabdff1aSopenharmony_ci            src43_l = src87_l;
1291cabdff1aSopenharmony_ci            src65_l = src109_l;
1292cabdff1aSopenharmony_ci            src6 = src10;
1293cabdff1aSopenharmony_ci        }
1294cabdff1aSopenharmony_ci
1295cabdff1aSopenharmony_ci        src += 16;
1296cabdff1aSopenharmony_ci        dst += 16;
1297cabdff1aSopenharmony_ci    }
1298cabdff1aSopenharmony_ci}
1299cabdff1aSopenharmony_ci
/*
 * Vertical 8-tap filter for a block that is 24 pixels wide.
 *
 * The block is split into a 16-pixel-wide left part, handled by the
 * multi-strip kernel with width 16, and an 8-pixel-wide right part starting
 * at column 16, handled by the 8-wide kernel. Both parts share the same
 * strides, filter and height.
 */
static void common_vt_8t_24w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint8_t *right_src = src + 16;
    uint8_t *right_dst = dst + 16;

    /* Columns 0..15. */
    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride,
                              filter, height, 16);

    /* Columns 16..23. */
    common_vt_8t_8w_msa(right_src, src_stride, right_dst, dst_stride,
                        filter, height);
}
1310cabdff1aSopenharmony_ci
/*
 * Vertical 8-tap filter for a block that is 32 pixels wide.
 * Forwards to the multi-strip 16-wide kernel with width fixed at 32.
 */
static void common_vt_8t_32w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    const int32_t block_width = 32;

    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride,
                              filter, height, block_width);
}
1318cabdff1aSopenharmony_ci
/*
 * Vertical 8-tap filter for a block that is 48 pixels wide.
 * Forwards to the multi-strip 16-wide kernel with width fixed at 48.
 */
static void common_vt_8t_48w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    const int32_t block_width = 48;

    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride,
                              filter, height, block_width);
}
1326cabdff1aSopenharmony_ci
/*
 * Vertical 8-tap filter for a block that is 64 pixels wide.
 * Forwards to the multi-strip 16-wide kernel with width fixed at 64.
 */
static void common_vt_8t_64w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    const int32_t block_width = 64;

    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride,
                              filter, height, block_width);
}
1334cabdff1aSopenharmony_ci
1335cabdff1aSopenharmony_cistatic void hevc_hv_uni_8t_4w_msa(uint8_t *src,
1336cabdff1aSopenharmony_ci                                  int32_t src_stride,
1337cabdff1aSopenharmony_ci                                  uint8_t *dst,
1338cabdff1aSopenharmony_ci                                  int32_t dst_stride,
1339cabdff1aSopenharmony_ci                                  const int8_t *filter_x,
1340cabdff1aSopenharmony_ci                                  const int8_t *filter_y,
1341cabdff1aSopenharmony_ci                                  int32_t height)
1342cabdff1aSopenharmony_ci{
1343cabdff1aSopenharmony_ci    uint32_t loop_cnt;
1344cabdff1aSopenharmony_ci    v16u8 out0, out1;
1345cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1346cabdff1aSopenharmony_ci    v16i8 src9, src10, src11, src12, src13, src14;
1347cabdff1aSopenharmony_ci    v8i16 filt0, filt1, filt2, filt3;
1348cabdff1aSopenharmony_ci    v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
1349cabdff1aSopenharmony_ci    v16i8 mask1, mask2, mask3;
1350cabdff1aSopenharmony_ci    v8i16 filter_vec;
1351cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1352cabdff1aSopenharmony_ci    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1353cabdff1aSopenharmony_ci    v8i16 dst30, dst41, dst52, dst63, dst66, dst117, dst128, dst139, dst1410;
1354cabdff1aSopenharmony_ci    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst1110_r, dst1312_r;
1355cabdff1aSopenharmony_ci    v8i16 dst21_r, dst43_r, dst65_r, dst87_r, dst109_r, dst1211_r, dst1413_r;
1356cabdff1aSopenharmony_ci    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
1357cabdff1aSopenharmony_ci    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
1358cabdff1aSopenharmony_ci
1359cabdff1aSopenharmony_ci    src -= ((3 * src_stride) + 3);
1360cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_x);
1361cabdff1aSopenharmony_ci    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1362cabdff1aSopenharmony_ci
1363cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_y);
1364cabdff1aSopenharmony_ci    UNPCK_R_SB_SH(filter_vec, filter_vec);
1365cabdff1aSopenharmony_ci
1366cabdff1aSopenharmony_ci    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1367cabdff1aSopenharmony_ci
1368cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
1369cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
1370cabdff1aSopenharmony_ci    mask3 = mask0 + 6;
1371cabdff1aSopenharmony_ci
1372cabdff1aSopenharmony_ci    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1373cabdff1aSopenharmony_ci    src += (7 * src_stride);
1374cabdff1aSopenharmony_ci    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1375cabdff1aSopenharmony_ci
1376cabdff1aSopenharmony_ci    VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1377cabdff1aSopenharmony_ci    VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1378cabdff1aSopenharmony_ci    VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
1379cabdff1aSopenharmony_ci               vec8, vec9, vec10, vec11);
1380cabdff1aSopenharmony_ci    VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
1381cabdff1aSopenharmony_ci               vec12, vec13, vec14, vec15);
1382cabdff1aSopenharmony_ci
1383cabdff1aSopenharmony_ci    dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1384cabdff1aSopenharmony_ci                              filt3);
1385cabdff1aSopenharmony_ci    dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1386cabdff1aSopenharmony_ci                              filt3);
1387cabdff1aSopenharmony_ci    dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1388cabdff1aSopenharmony_ci                              filt3);
1389cabdff1aSopenharmony_ci    dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
1390cabdff1aSopenharmony_ci                              filt3);
1391cabdff1aSopenharmony_ci
1392cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r);
1393cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r);
1394cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r);
1395cabdff1aSopenharmony_ci
1396cabdff1aSopenharmony_ci    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
1397cabdff1aSopenharmony_ci
1398cabdff1aSopenharmony_ci    for (loop_cnt = height >> 3; loop_cnt--;) {
1399cabdff1aSopenharmony_ci        LD_SB8(src, src_stride, src7, src8, src9, src10, src11, src12, src13,
1400cabdff1aSopenharmony_ci               src14);
1401cabdff1aSopenharmony_ci        src += (8 * src_stride);
1402cabdff1aSopenharmony_ci        XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);
1403cabdff1aSopenharmony_ci
1404cabdff1aSopenharmony_ci        VSHF_B4_SB(src7, src11, mask0, mask1, mask2, mask3,
1405cabdff1aSopenharmony_ci                   vec0, vec1, vec2, vec3);
1406cabdff1aSopenharmony_ci        VSHF_B4_SB(src8, src12, mask0, mask1, mask2, mask3,
1407cabdff1aSopenharmony_ci                   vec4, vec5, vec6, vec7);
1408cabdff1aSopenharmony_ci        VSHF_B4_SB(src9, src13, mask0, mask1, mask2, mask3,
1409cabdff1aSopenharmony_ci                   vec8, vec9, vec10, vec11);
1410cabdff1aSopenharmony_ci        VSHF_B4_SB(src10, src14, mask0, mask1, mask2, mask3,
1411cabdff1aSopenharmony_ci                   vec12, vec13, vec14, vec15);
1412cabdff1aSopenharmony_ci
1413cabdff1aSopenharmony_ci        dst117 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1414cabdff1aSopenharmony_ci                                   filt3);
1415cabdff1aSopenharmony_ci        dst128 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1416cabdff1aSopenharmony_ci                                   filt3);
1417cabdff1aSopenharmony_ci        dst139 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1,
1418cabdff1aSopenharmony_ci                                   filt2, filt3);
1419cabdff1aSopenharmony_ci        dst1410 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1420cabdff1aSopenharmony_ci                                   filt2, filt3);
1421cabdff1aSopenharmony_ci
1422cabdff1aSopenharmony_ci        dst76_r = __msa_ilvr_h(dst117, dst66);
1423cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst128, dst117, dst87_r, dst1211_r);
1424cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst139, dst128, dst98_r, dst1312_r);
1425cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst1410, dst139, dst109_r, dst1413_r);
1426cabdff1aSopenharmony_ci        dst117 = (v8i16) __msa_splati_d((v2i64) dst117, 1);
1427cabdff1aSopenharmony_ci        dst1110_r = __msa_ilvr_h(dst117, dst1410);
1428cabdff1aSopenharmony_ci
1429cabdff1aSopenharmony_ci        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
1430cabdff1aSopenharmony_ci                                filt_h1, filt_h2, filt_h3);
1431cabdff1aSopenharmony_ci        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
1432cabdff1aSopenharmony_ci                                filt_h1, filt_h2, filt_h3);
1433cabdff1aSopenharmony_ci        dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
1434cabdff1aSopenharmony_ci                                filt_h1, filt_h2, filt_h3);
1435cabdff1aSopenharmony_ci        dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
1436cabdff1aSopenharmony_ci                                filt_h1, filt_h2, filt_h3);
1437cabdff1aSopenharmony_ci        dst4_r = HEVC_FILT_8TAP(dst54_r, dst76_r, dst98_r, dst1110_r, filt_h0,
1438cabdff1aSopenharmony_ci                                filt_h1, filt_h2, filt_h3);
1439cabdff1aSopenharmony_ci        dst5_r = HEVC_FILT_8TAP(dst65_r, dst87_r, dst109_r, dst1211_r, filt_h0,
1440cabdff1aSopenharmony_ci                                filt_h1, filt_h2, filt_h3);
1441cabdff1aSopenharmony_ci        dst6_r = HEVC_FILT_8TAP(dst76_r, dst98_r, dst1110_r, dst1312_r, filt_h0,
1442cabdff1aSopenharmony_ci                                filt_h1, filt_h2, filt_h3);
1443cabdff1aSopenharmony_ci        dst7_r = HEVC_FILT_8TAP(dst87_r, dst109_r, dst1211_r, dst1413_r,
1444cabdff1aSopenharmony_ci                                filt_h0, filt_h1, filt_h2, filt_h3);
1445cabdff1aSopenharmony_ci
1446cabdff1aSopenharmony_ci        SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
1447cabdff1aSopenharmony_ci        SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
1448cabdff1aSopenharmony_ci        SRARI_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 6);
1449cabdff1aSopenharmony_ci        SRARI_W4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 6);
1450cabdff1aSopenharmony_ci        SAT_SW4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 7);
1451cabdff1aSopenharmony_ci        SAT_SW4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 7);
1452cabdff1aSopenharmony_ci        PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
1453cabdff1aSopenharmony_ci        PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst4_r, dst5_r);
1454cabdff1aSopenharmony_ci        out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
1455cabdff1aSopenharmony_ci        out1 = PCKEV_XORI128_UB(dst4_r, dst5_r);
1456cabdff1aSopenharmony_ci        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
1457cabdff1aSopenharmony_ci        dst += (8 * dst_stride);
1458cabdff1aSopenharmony_ci
1459cabdff1aSopenharmony_ci        dst10_r = dst98_r;
1460cabdff1aSopenharmony_ci        dst32_r = dst1110_r;
1461cabdff1aSopenharmony_ci        dst54_r = dst1312_r;
1462cabdff1aSopenharmony_ci        dst21_r = dst109_r;
1463cabdff1aSopenharmony_ci        dst43_r = dst1211_r;
1464cabdff1aSopenharmony_ci        dst65_r = dst1413_r;
1465cabdff1aSopenharmony_ci        dst66 = (v8i16) __msa_splati_d((v2i64) dst1410, 1);
1466cabdff1aSopenharmony_ci    }
1467cabdff1aSopenharmony_ci}
1468cabdff1aSopenharmony_ci
1469cabdff1aSopenharmony_cistatic void hevc_hv_uni_8t_8multx2mult_msa(uint8_t *src,
1470cabdff1aSopenharmony_ci                                           int32_t src_stride,
1471cabdff1aSopenharmony_ci                                           uint8_t *dst,
1472cabdff1aSopenharmony_ci                                           int32_t dst_stride,
1473cabdff1aSopenharmony_ci                                           const int8_t *filter_x,
1474cabdff1aSopenharmony_ci                                           const int8_t *filter_y,
1475cabdff1aSopenharmony_ci                                           int32_t height, int32_t width)
1476cabdff1aSopenharmony_ci{
1477cabdff1aSopenharmony_ci    uint32_t loop_cnt, cnt;
1478cabdff1aSopenharmony_ci    uint8_t *src_tmp;
1479cabdff1aSopenharmony_ci    uint8_t *dst_tmp;
1480cabdff1aSopenharmony_ci    v16u8 out;
1481cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1482cabdff1aSopenharmony_ci    v8i16 filt0, filt1, filt2, filt3;
1483cabdff1aSopenharmony_ci    v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
1484cabdff1aSopenharmony_ci    v16i8 mask1, mask2, mask3;
1485cabdff1aSopenharmony_ci    v8i16 filter_vec;
1486cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1487cabdff1aSopenharmony_ci    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1488cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
1489cabdff1aSopenharmony_ci    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
1490cabdff1aSopenharmony_ci    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
1491cabdff1aSopenharmony_ci    v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
1492cabdff1aSopenharmony_ci    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
1493cabdff1aSopenharmony_ci    v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
1494cabdff1aSopenharmony_ci    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
1495cabdff1aSopenharmony_ci
1496cabdff1aSopenharmony_ci    src -= ((3 * src_stride) + 3);
1497cabdff1aSopenharmony_ci
1498cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_x);
1499cabdff1aSopenharmony_ci    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1500cabdff1aSopenharmony_ci
1501cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_y);
1502cabdff1aSopenharmony_ci    UNPCK_R_SB_SH(filter_vec, filter_vec);
1503cabdff1aSopenharmony_ci
1504cabdff1aSopenharmony_ci    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1505cabdff1aSopenharmony_ci
1506cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
1507cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
1508cabdff1aSopenharmony_ci    mask3 = mask0 + 6;
1509cabdff1aSopenharmony_ci
1510cabdff1aSopenharmony_ci    for (cnt = width >> 3; cnt--;) {
1511cabdff1aSopenharmony_ci        src_tmp = src;
1512cabdff1aSopenharmony_ci        dst_tmp = dst;
1513cabdff1aSopenharmony_ci
1514cabdff1aSopenharmony_ci        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1515cabdff1aSopenharmony_ci        src_tmp += (7 * src_stride);
1516cabdff1aSopenharmony_ci        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1517cabdff1aSopenharmony_ci
1518cabdff1aSopenharmony_ci        /* row 0 row 1 row 2 row 3 */
1519cabdff1aSopenharmony_ci        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1520cabdff1aSopenharmony_ci                   vec0, vec1, vec2, vec3);
1521cabdff1aSopenharmony_ci        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1522cabdff1aSopenharmony_ci                   vec4, vec5, vec6, vec7);
1523cabdff1aSopenharmony_ci        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1524cabdff1aSopenharmony_ci                   vec8, vec9, vec10, vec11);
1525cabdff1aSopenharmony_ci        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1526cabdff1aSopenharmony_ci                   vec12, vec13, vec14, vec15);
1527cabdff1aSopenharmony_ci        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1528cabdff1aSopenharmony_ci                                 filt3);
1529cabdff1aSopenharmony_ci        dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1530cabdff1aSopenharmony_ci                                 filt3);
1531cabdff1aSopenharmony_ci        dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1532cabdff1aSopenharmony_ci                                 filt3);
1533cabdff1aSopenharmony_ci        dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1534cabdff1aSopenharmony_ci                                 filt2, filt3);
1535cabdff1aSopenharmony_ci
1536cabdff1aSopenharmony_ci        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
1537cabdff1aSopenharmony_ci                   vec0, vec1, vec2, vec3);
1538cabdff1aSopenharmony_ci        VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
1539cabdff1aSopenharmony_ci                   vec4, vec5, vec6, vec7);
1540cabdff1aSopenharmony_ci        VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
1541cabdff1aSopenharmony_ci                   vec8, vec9, vec10, vec11);
1542cabdff1aSopenharmony_ci        dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1543cabdff1aSopenharmony_ci                                 filt3);
1544cabdff1aSopenharmony_ci        dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1545cabdff1aSopenharmony_ci                                 filt3);
1546cabdff1aSopenharmony_ci        dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1547cabdff1aSopenharmony_ci                                 filt3);
1548cabdff1aSopenharmony_ci
1549cabdff1aSopenharmony_ci        for (loop_cnt = height >> 1; loop_cnt--;) {
1550cabdff1aSopenharmony_ci            LD_SB2(src_tmp, src_stride, src7, src8);
1551cabdff1aSopenharmony_ci            XORI_B2_128_SB(src7, src8);
1552cabdff1aSopenharmony_ci            src_tmp += 2 * src_stride;
1553cabdff1aSopenharmony_ci
1554cabdff1aSopenharmony_ci            ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
1555cabdff1aSopenharmony_ci                       dst10_r, dst32_r, dst54_r, dst21_r);
1556cabdff1aSopenharmony_ci            ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
1557cabdff1aSopenharmony_ci                       dst10_l, dst32_l, dst54_l, dst21_l);
1558cabdff1aSopenharmony_ci            ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
1559cabdff1aSopenharmony_ci            ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
1560cabdff1aSopenharmony_ci
1561cabdff1aSopenharmony_ci            VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
1562cabdff1aSopenharmony_ci                       vec0, vec1, vec2, vec3);
1563cabdff1aSopenharmony_ci            dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
1564cabdff1aSopenharmony_ci                                     filt2, filt3);
1565cabdff1aSopenharmony_ci
1566cabdff1aSopenharmony_ci            ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
1567cabdff1aSopenharmony_ci            dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
1568cabdff1aSopenharmony_ci                                    filt_h0, filt_h1, filt_h2, filt_h3);
1569cabdff1aSopenharmony_ci            dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
1570cabdff1aSopenharmony_ci                                    filt_h0, filt_h1, filt_h2, filt_h3);
1571cabdff1aSopenharmony_ci            dst0_r >>= 6;
1572cabdff1aSopenharmony_ci            dst0_l >>= 6;
1573cabdff1aSopenharmony_ci
1574cabdff1aSopenharmony_ci            VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
1575cabdff1aSopenharmony_ci                       vec0, vec1, vec2, vec3);
1576cabdff1aSopenharmony_ci            dst8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
1577cabdff1aSopenharmony_ci                                     filt2, filt3);
1578cabdff1aSopenharmony_ci
1579cabdff1aSopenharmony_ci            ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
1580cabdff1aSopenharmony_ci            dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
1581cabdff1aSopenharmony_ci                                    filt_h0, filt_h1, filt_h2, filt_h3);
1582cabdff1aSopenharmony_ci            dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
1583cabdff1aSopenharmony_ci                                    filt_h0, filt_h1, filt_h2, filt_h3);
1584cabdff1aSopenharmony_ci            dst1_r >>= 6;
1585cabdff1aSopenharmony_ci            dst1_l >>= 6;
1586cabdff1aSopenharmony_ci            SRARI_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 6);
1587cabdff1aSopenharmony_ci            SAT_SW4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 7);
1588cabdff1aSopenharmony_ci
1589cabdff1aSopenharmony_ci            PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0, dst1);
1590cabdff1aSopenharmony_ci            out = PCKEV_XORI128_UB(dst0, dst1);
1591cabdff1aSopenharmony_ci            ST_D2(out, 0, 1, dst_tmp, dst_stride);
1592cabdff1aSopenharmony_ci            dst_tmp += (2 * dst_stride);
1593cabdff1aSopenharmony_ci
1594cabdff1aSopenharmony_ci            dst0 = dst2;
1595cabdff1aSopenharmony_ci            dst1 = dst3;
1596cabdff1aSopenharmony_ci            dst2 = dst4;
1597cabdff1aSopenharmony_ci            dst3 = dst5;
1598cabdff1aSopenharmony_ci            dst4 = dst6;
1599cabdff1aSopenharmony_ci            dst5 = dst7;
1600cabdff1aSopenharmony_ci            dst6 = dst8;
1601cabdff1aSopenharmony_ci        }
1602cabdff1aSopenharmony_ci
1603cabdff1aSopenharmony_ci        src += 8;
1604cabdff1aSopenharmony_ci        dst += 8;
1605cabdff1aSopenharmony_ci    }
1606cabdff1aSopenharmony_ci}
1607cabdff1aSopenharmony_ci
/*
 * 8-column variant of the 2-D 8-tap uni filter: forwards all arguments to
 * hevc_hv_uni_8t_8multx2mult_msa() with a fixed width of 8 (one strip).
 */
static void hevc_hv_uni_8t_8w_msa(uint8_t *src,
                                  int32_t src_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 8);
}
1619cabdff1aSopenharmony_ci
1620cabdff1aSopenharmony_cistatic void hevc_hv_uni_8t_12w_msa(uint8_t *src,
1621cabdff1aSopenharmony_ci                                   int32_t src_stride,
1622cabdff1aSopenharmony_ci                                   uint8_t *dst,
1623cabdff1aSopenharmony_ci                                   int32_t dst_stride,
1624cabdff1aSopenharmony_ci                                   const int8_t *filter_x,
1625cabdff1aSopenharmony_ci                                   const int8_t *filter_y,
1626cabdff1aSopenharmony_ci                                   int32_t height)
1627cabdff1aSopenharmony_ci{
1628cabdff1aSopenharmony_ci    uint32_t loop_cnt;
1629cabdff1aSopenharmony_ci    uint8_t *src_tmp, *dst_tmp;
1630cabdff1aSopenharmony_ci    v16u8 out0, out1;
1631cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1632cabdff1aSopenharmony_ci    v16i8 src11, src12, src13, src14;
1633cabdff1aSopenharmony_ci    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1634cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1635cabdff1aSopenharmony_ci    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1636cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
1637cabdff1aSopenharmony_ci    v8i16 dst30, dst41, dst52, dst63, dst66, dst117, dst128, dst139, dst1410;
1638cabdff1aSopenharmony_ci    v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
1639cabdff1aSopenharmony_ci    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst21_r, dst43_r, dst65_r;
1640cabdff1aSopenharmony_ci    v8i16 dst10_l, dst32_l, dst54_l, dst76_l, dst21_l, dst43_l, dst65_l;
1641cabdff1aSopenharmony_ci    v8i16 dst87_r, dst98_r, dst1110_r, dst1312_r, dst109_r, dst1211_r;
1642cabdff1aSopenharmony_ci    v8i16 dst1413_r, dst87_l, filter_vec;
1643cabdff1aSopenharmony_ci    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
1644cabdff1aSopenharmony_ci    v4i32 dst0_l, dst1_l;
1645cabdff1aSopenharmony_ci
1646cabdff1aSopenharmony_ci    src -= ((3 * src_stride) + 3);
1647cabdff1aSopenharmony_ci
1648cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_x);
1649cabdff1aSopenharmony_ci    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1650cabdff1aSopenharmony_ci
1651cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_y);
1652cabdff1aSopenharmony_ci    UNPCK_R_SB_SH(filter_vec, filter_vec);
1653cabdff1aSopenharmony_ci
1654cabdff1aSopenharmony_ci    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1655cabdff1aSopenharmony_ci
1656cabdff1aSopenharmony_ci    mask0 = LD_SB(ff_hevc_mask_arr);
1657cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
1658cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
1659cabdff1aSopenharmony_ci    mask3 = mask0 + 6;
1660cabdff1aSopenharmony_ci
1661cabdff1aSopenharmony_ci    src_tmp = src;
1662cabdff1aSopenharmony_ci    dst_tmp = dst;
1663cabdff1aSopenharmony_ci
1664cabdff1aSopenharmony_ci    LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1665cabdff1aSopenharmony_ci    src_tmp += (7 * src_stride);
1666cabdff1aSopenharmony_ci    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1667cabdff1aSopenharmony_ci
1668cabdff1aSopenharmony_ci    /* row 0 row 1 row 2 row 3 */
1669cabdff1aSopenharmony_ci    VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1670cabdff1aSopenharmony_ci    VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1671cabdff1aSopenharmony_ci    VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
1672cabdff1aSopenharmony_ci               vec11);
1673cabdff1aSopenharmony_ci    VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
1674cabdff1aSopenharmony_ci               vec15);
1675cabdff1aSopenharmony_ci    dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1676cabdff1aSopenharmony_ci                             filt3);
1677cabdff1aSopenharmony_ci    dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1678cabdff1aSopenharmony_ci                             filt3);
1679cabdff1aSopenharmony_ci    dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1680cabdff1aSopenharmony_ci                             filt3);
1681cabdff1aSopenharmony_ci    dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1682cabdff1aSopenharmony_ci                             filt2, filt3);
1683cabdff1aSopenharmony_ci
1684cabdff1aSopenharmony_ci    VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1685cabdff1aSopenharmony_ci    VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1686cabdff1aSopenharmony_ci    VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
1687cabdff1aSopenharmony_ci               vec11);
1688cabdff1aSopenharmony_ci    dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1689cabdff1aSopenharmony_ci                             filt3);
1690cabdff1aSopenharmony_ci    dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1691cabdff1aSopenharmony_ci                             filt3);
1692cabdff1aSopenharmony_ci    dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1693cabdff1aSopenharmony_ci                             filt3);
1694cabdff1aSopenharmony_ci
1695cabdff1aSopenharmony_ci    for (loop_cnt = 8; loop_cnt--;) {
1696cabdff1aSopenharmony_ci        LD_SB2(src_tmp, src_stride, src7, src8);
1697cabdff1aSopenharmony_ci        XORI_B2_128_SB(src7, src8);
1698cabdff1aSopenharmony_ci        src_tmp += 2 * src_stride;
1699cabdff1aSopenharmony_ci
1700cabdff1aSopenharmony_ci        ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_r,
1701cabdff1aSopenharmony_ci                   dst32_r, dst54_r, dst21_r);
1702cabdff1aSopenharmony_ci        ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_l,
1703cabdff1aSopenharmony_ci                   dst32_l, dst54_l, dst21_l);
1704cabdff1aSopenharmony_ci        ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
1705cabdff1aSopenharmony_ci        ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
1706cabdff1aSopenharmony_ci
1707cabdff1aSopenharmony_ci        VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
1708cabdff1aSopenharmony_ci                   vec3);
1709cabdff1aSopenharmony_ci        dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1710cabdff1aSopenharmony_ci                                 filt3);
1711cabdff1aSopenharmony_ci
1712cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
1713cabdff1aSopenharmony_ci        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
1714cabdff1aSopenharmony_ci                                filt_h0, filt_h1, filt_h2, filt_h3);
1715cabdff1aSopenharmony_ci        dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
1716cabdff1aSopenharmony_ci                                filt_h0, filt_h1, filt_h2, filt_h3);
1717cabdff1aSopenharmony_ci        dst0_r >>= 6;
1718cabdff1aSopenharmony_ci        dst0_l >>= 6;
1719cabdff1aSopenharmony_ci
1720cabdff1aSopenharmony_ci        VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
1721cabdff1aSopenharmony_ci                   vec3);
1722cabdff1aSopenharmony_ci        dst8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1723cabdff1aSopenharmony_ci                                 filt3);
1724cabdff1aSopenharmony_ci
1725cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
1726cabdff1aSopenharmony_ci        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
1727cabdff1aSopenharmony_ci                                filt_h0, filt_h1, filt_h2, filt_h3);
1728cabdff1aSopenharmony_ci        dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
1729cabdff1aSopenharmony_ci                                filt_h0, filt_h1, filt_h2, filt_h3);
1730cabdff1aSopenharmony_ci        dst1_r >>= 6;
1731cabdff1aSopenharmony_ci        dst1_l >>= 6;
1732cabdff1aSopenharmony_ci        SRARI_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 6);
1733cabdff1aSopenharmony_ci        SAT_SW4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 7);
1734cabdff1aSopenharmony_ci
1735cabdff1aSopenharmony_ci        PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0, dst1);
1736cabdff1aSopenharmony_ci        out0 = PCKEV_XORI128_UB(dst0, dst1);
1737cabdff1aSopenharmony_ci        ST_D2(out0, 0, 1, dst_tmp, dst_stride);
1738cabdff1aSopenharmony_ci        dst_tmp += (2 * dst_stride);
1739cabdff1aSopenharmony_ci
1740cabdff1aSopenharmony_ci        dst0 = dst2;
1741cabdff1aSopenharmony_ci        dst1 = dst3;
1742cabdff1aSopenharmony_ci        dst2 = dst4;
1743cabdff1aSopenharmony_ci        dst3 = dst5;
1744cabdff1aSopenharmony_ci        dst4 = dst6;
1745cabdff1aSopenharmony_ci        dst5 = dst7;
1746cabdff1aSopenharmony_ci        dst6 = dst8;
1747cabdff1aSopenharmony_ci    }
1748cabdff1aSopenharmony_ci
1749cabdff1aSopenharmony_ci    src += 8;
1750cabdff1aSopenharmony_ci    dst += 8;
1751cabdff1aSopenharmony_ci
1752cabdff1aSopenharmony_ci    mask4 = LD_SB(ff_hevc_mask_arr + 16);
1753cabdff1aSopenharmony_ci    mask5 = mask4 + 2;
1754cabdff1aSopenharmony_ci    mask6 = mask4 + 4;
1755cabdff1aSopenharmony_ci    mask7 = mask4 + 6;
1756cabdff1aSopenharmony_ci
1757cabdff1aSopenharmony_ci    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1758cabdff1aSopenharmony_ci    src += (7 * src_stride);
1759cabdff1aSopenharmony_ci    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1760cabdff1aSopenharmony_ci
1761cabdff1aSopenharmony_ci    VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
1762cabdff1aSopenharmony_ci    VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
1763cabdff1aSopenharmony_ci    VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
1764cabdff1aSopenharmony_ci               vec11);
1765cabdff1aSopenharmony_ci    VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14,
1766cabdff1aSopenharmony_ci               vec15);
1767cabdff1aSopenharmony_ci
1768cabdff1aSopenharmony_ci    dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1769cabdff1aSopenharmony_ci                              filt3);
1770cabdff1aSopenharmony_ci    dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1771cabdff1aSopenharmony_ci                              filt3);
1772cabdff1aSopenharmony_ci    dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1773cabdff1aSopenharmony_ci                              filt3);
1774cabdff1aSopenharmony_ci    dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
1775cabdff1aSopenharmony_ci                              filt3);
1776cabdff1aSopenharmony_ci
1777cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r);
1778cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r);
1779cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r);
1780cabdff1aSopenharmony_ci
1781cabdff1aSopenharmony_ci    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
1782cabdff1aSopenharmony_ci
1783cabdff1aSopenharmony_ci    for (loop_cnt = 2; loop_cnt--;) {
1784cabdff1aSopenharmony_ci        LD_SB8(src, src_stride, src7, src8, src9, src10, src11, src12, src13,
1785cabdff1aSopenharmony_ci               src14);
1786cabdff1aSopenharmony_ci        src += (8 * src_stride);
1787cabdff1aSopenharmony_ci        XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);
1788cabdff1aSopenharmony_ci
1789cabdff1aSopenharmony_ci        VSHF_B4_SB(src7, src11, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
1790cabdff1aSopenharmony_ci                   vec3);
1791cabdff1aSopenharmony_ci        VSHF_B4_SB(src8, src12, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
1792cabdff1aSopenharmony_ci                   vec7);
1793cabdff1aSopenharmony_ci        VSHF_B4_SB(src9, src13, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
1794cabdff1aSopenharmony_ci                   vec11);
1795cabdff1aSopenharmony_ci        VSHF_B4_SB(src10, src14, mask4, mask5, mask6, mask7, vec12, vec13,
1796cabdff1aSopenharmony_ci                   vec14, vec15);
1797cabdff1aSopenharmony_ci
1798cabdff1aSopenharmony_ci        dst117 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1799cabdff1aSopenharmony_ci                                   filt3);
1800cabdff1aSopenharmony_ci        dst128 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1801cabdff1aSopenharmony_ci                                   filt3);
1802cabdff1aSopenharmony_ci        dst139 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1,
1803cabdff1aSopenharmony_ci                                   filt2, filt3);
1804cabdff1aSopenharmony_ci        dst1410 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1805cabdff1aSopenharmony_ci                                   filt2, filt3);
1806cabdff1aSopenharmony_ci
1807cabdff1aSopenharmony_ci        dst76_r = __msa_ilvr_h(dst117, dst66);
1808cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst128, dst117, dst87_r, dst1211_r);
1809cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst139, dst128, dst98_r, dst1312_r);
1810cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst1410, dst139, dst109_r, dst1413_r);
1811cabdff1aSopenharmony_ci        dst117 = (v8i16) __msa_splati_d((v2i64) dst117, 1);
1812cabdff1aSopenharmony_ci        dst1110_r = __msa_ilvr_h(dst117, dst1410);
1813cabdff1aSopenharmony_ci
1814cabdff1aSopenharmony_ci        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
1815cabdff1aSopenharmony_ci                                filt_h1, filt_h2, filt_h3);
1816cabdff1aSopenharmony_ci        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
1817cabdff1aSopenharmony_ci                                filt_h1, filt_h2, filt_h3);
1818cabdff1aSopenharmony_ci        dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
1819cabdff1aSopenharmony_ci                                filt_h1, filt_h2, filt_h3);
1820cabdff1aSopenharmony_ci        dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
1821cabdff1aSopenharmony_ci                                filt_h1, filt_h2, filt_h3);
1822cabdff1aSopenharmony_ci        dst4_r = HEVC_FILT_8TAP(dst54_r, dst76_r, dst98_r, dst1110_r, filt_h0,
1823cabdff1aSopenharmony_ci                                filt_h1, filt_h2, filt_h3);
1824cabdff1aSopenharmony_ci        dst5_r = HEVC_FILT_8TAP(dst65_r, dst87_r, dst109_r, dst1211_r, filt_h0,
1825cabdff1aSopenharmony_ci                                filt_h1, filt_h2, filt_h3);
1826cabdff1aSopenharmony_ci        dst6_r = HEVC_FILT_8TAP(dst76_r, dst98_r, dst1110_r, dst1312_r, filt_h0,
1827cabdff1aSopenharmony_ci                                filt_h1, filt_h2, filt_h3);
1828cabdff1aSopenharmony_ci        dst7_r = HEVC_FILT_8TAP(dst87_r, dst109_r, dst1211_r, dst1413_r,
1829cabdff1aSopenharmony_ci                                filt_h0, filt_h1, filt_h2, filt_h3);
1830cabdff1aSopenharmony_ci
1831cabdff1aSopenharmony_ci        SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
1832cabdff1aSopenharmony_ci        SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
1833cabdff1aSopenharmony_ci        SRARI_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 6);
1834cabdff1aSopenharmony_ci        SRARI_W4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 6);
1835cabdff1aSopenharmony_ci        SAT_SW4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 7);
1836cabdff1aSopenharmony_ci        SAT_SW4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 7);
1837cabdff1aSopenharmony_ci        PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
1838cabdff1aSopenharmony_ci        PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst4_r, dst5_r);
1839cabdff1aSopenharmony_ci        out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
1840cabdff1aSopenharmony_ci        out1 = PCKEV_XORI128_UB(dst4_r, dst5_r);
1841cabdff1aSopenharmony_ci        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
1842cabdff1aSopenharmony_ci        dst += (8 * dst_stride);
1843cabdff1aSopenharmony_ci
1844cabdff1aSopenharmony_ci        dst10_r = dst98_r;
1845cabdff1aSopenharmony_ci        dst32_r = dst1110_r;
1846cabdff1aSopenharmony_ci        dst54_r = dst1312_r;
1847cabdff1aSopenharmony_ci        dst21_r = dst109_r;
1848cabdff1aSopenharmony_ci        dst43_r = dst1211_r;
1849cabdff1aSopenharmony_ci        dst65_r = dst1413_r;
1850cabdff1aSopenharmony_ci        dst66 = (v8i16) __msa_splati_d((v2i64) dst1410, 1);
1851cabdff1aSopenharmony_ci    }
1852cabdff1aSopenharmony_ci}
1853cabdff1aSopenharmony_ci
/*
 * 16-column entry point: hands every argument through unchanged to
 * hevc_hv_uni_8t_8multx2mult_msa() with the block width pinned to 16.
 */
static void hevc_hv_uni_8t_16w_msa(uint8_t *src, int32_t src_stride,
                                   uint8_t *dst, int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    const int32_t width = 16;

    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, width);
}
1865cabdff1aSopenharmony_ci
/*
 * 24-column entry point: hands every argument through unchanged to
 * hevc_hv_uni_8t_8multx2mult_msa() with the block width pinned to 24.
 */
static void hevc_hv_uni_8t_24w_msa(uint8_t *src, int32_t src_stride,
                                   uint8_t *dst, int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    const int32_t width = 24;

    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, width);
}
1877cabdff1aSopenharmony_ci
/*
 * 32-column entry point: hands every argument through unchanged to
 * hevc_hv_uni_8t_8multx2mult_msa() with the block width pinned to 32.
 */
static void hevc_hv_uni_8t_32w_msa(uint8_t *src, int32_t src_stride,
                                   uint8_t *dst, int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    const int32_t width = 32;

    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, width);
}
1889cabdff1aSopenharmony_ci
/*
 * 48-column entry point: hands every argument through unchanged to
 * hevc_hv_uni_8t_8multx2mult_msa() with the block width pinned to 48.
 */
static void hevc_hv_uni_8t_48w_msa(uint8_t *src, int32_t src_stride,
                                   uint8_t *dst, int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    const int32_t width = 48;

    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, width);
}
1901cabdff1aSopenharmony_ci
/*
 * 64-column entry point: hands every argument through unchanged to
 * hevc_hv_uni_8t_8multx2mult_msa() with the block width pinned to 64.
 */
static void hevc_hv_uni_8t_64w_msa(uint8_t *src, int32_t src_stride,
                                   uint8_t *dst, int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    const int32_t width = 64;

    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, width);
}
1913cabdff1aSopenharmony_ci
1914cabdff1aSopenharmony_cistatic void common_hz_4t_4x2_msa(uint8_t *src, int32_t src_stride,
1915cabdff1aSopenharmony_ci                                 uint8_t *dst, int32_t dst_stride,
1916cabdff1aSopenharmony_ci                                 const int8_t *filter)
1917cabdff1aSopenharmony_ci{
1918cabdff1aSopenharmony_ci    v16i8 filt0, filt1, src0, src1, mask0, mask1, vec0, vec1;
1919cabdff1aSopenharmony_ci    v16u8 out;
1920cabdff1aSopenharmony_ci    v8i16 filt, res0;
1921cabdff1aSopenharmony_ci
1922cabdff1aSopenharmony_ci    mask0 = LD_SB(&ff_hevc_mask_arr[16]);
1923cabdff1aSopenharmony_ci    src -= 1;
1924cabdff1aSopenharmony_ci
1925cabdff1aSopenharmony_ci    /* rearranging filter */
1926cabdff1aSopenharmony_ci    filt = LD_SH(filter);
1927cabdff1aSopenharmony_ci    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
1928cabdff1aSopenharmony_ci
1929cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
1930cabdff1aSopenharmony_ci
1931cabdff1aSopenharmony_ci    LD_SB2(src, src_stride, src0, src1);
1932cabdff1aSopenharmony_ci    XORI_B2_128_SB(src0, src1);
1933cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
1934cabdff1aSopenharmony_ci    res0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
1935cabdff1aSopenharmony_ci    res0 = __msa_srari_h(res0, 6);
1936cabdff1aSopenharmony_ci    res0 = __msa_sat_s_h(res0, 7);
1937cabdff1aSopenharmony_ci    out = PCKEV_XORI128_UB(res0, res0);
1938cabdff1aSopenharmony_ci    ST_W2(out, 0, 1, dst, dst_stride);
1939cabdff1aSopenharmony_ci}
1940cabdff1aSopenharmony_ci
1941cabdff1aSopenharmony_cistatic void common_hz_4t_4x4_msa(uint8_t *src, int32_t src_stride,
1942cabdff1aSopenharmony_ci                                 uint8_t *dst, int32_t dst_stride,
1943cabdff1aSopenharmony_ci                                 const int8_t *filter)
1944cabdff1aSopenharmony_ci{
1945cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
1946cabdff1aSopenharmony_ci    v8i16 filt, out0, out1;
1947cabdff1aSopenharmony_ci    v16u8 out;
1948cabdff1aSopenharmony_ci
1949cabdff1aSopenharmony_ci    mask0 = LD_SB(&ff_hevc_mask_arr[16]);
1950cabdff1aSopenharmony_ci    src -= 1;
1951cabdff1aSopenharmony_ci
1952cabdff1aSopenharmony_ci    /* rearranging filter */
1953cabdff1aSopenharmony_ci    filt = LD_SH(filter);
1954cabdff1aSopenharmony_ci    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
1955cabdff1aSopenharmony_ci
1956cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
1957cabdff1aSopenharmony_ci
1958cabdff1aSopenharmony_ci    LD_SB4(src, src_stride, src0, src1, src2, src3);
1959cabdff1aSopenharmony_ci    XORI_B4_128_SB(src0, src1, src2, src3);
1960cabdff1aSopenharmony_ci    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
1961cabdff1aSopenharmony_ci                               filt0, filt1, out0, out1);
1962cabdff1aSopenharmony_ci    SRARI_H2_SH(out0, out1, 6);
1963cabdff1aSopenharmony_ci    SAT_SH2_SH(out0, out1, 7);
1964cabdff1aSopenharmony_ci    out = PCKEV_XORI128_UB(out0, out1);
1965cabdff1aSopenharmony_ci    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
1966cabdff1aSopenharmony_ci}
1967cabdff1aSopenharmony_ci
1968cabdff1aSopenharmony_cistatic void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride,
1969cabdff1aSopenharmony_ci                                 uint8_t *dst, int32_t dst_stride,
1970cabdff1aSopenharmony_ci                                 const int8_t *filter)
1971cabdff1aSopenharmony_ci{
1972cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
1973cabdff1aSopenharmony_ci    v16u8 out;
1974cabdff1aSopenharmony_ci    v8i16 filt, out0, out1, out2, out3;
1975cabdff1aSopenharmony_ci
1976cabdff1aSopenharmony_ci    mask0 = LD_SB(&ff_hevc_mask_arr[16]);
1977cabdff1aSopenharmony_ci    src -= 1;
1978cabdff1aSopenharmony_ci
1979cabdff1aSopenharmony_ci    /* rearranging filter */
1980cabdff1aSopenharmony_ci    filt = LD_SH(filter);
1981cabdff1aSopenharmony_ci    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
1982cabdff1aSopenharmony_ci
1983cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
1984cabdff1aSopenharmony_ci
1985cabdff1aSopenharmony_ci    LD_SB4(src, src_stride, src0, src1, src2, src3);
1986cabdff1aSopenharmony_ci    src += (4 * src_stride);
1987cabdff1aSopenharmony_ci
1988cabdff1aSopenharmony_ci    XORI_B4_128_SB(src0, src1, src2, src3);
1989cabdff1aSopenharmony_ci    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
1990cabdff1aSopenharmony_ci                               filt0, filt1, out0, out1);
1991cabdff1aSopenharmony_ci    LD_SB4(src, src_stride, src0, src1, src2, src3);
1992cabdff1aSopenharmony_ci    XORI_B4_128_SB(src0, src1, src2, src3);
1993cabdff1aSopenharmony_ci    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
1994cabdff1aSopenharmony_ci                               filt0, filt1, out2, out3);
1995cabdff1aSopenharmony_ci    SRARI_H4_SH(out0, out1, out2, out3, 6);
1996cabdff1aSopenharmony_ci    SAT_SH4_SH(out0, out1, out2, out3, 7);
1997cabdff1aSopenharmony_ci    out = PCKEV_XORI128_UB(out0, out1);
1998cabdff1aSopenharmony_ci    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
1999cabdff1aSopenharmony_ci    out = PCKEV_XORI128_UB(out2, out3);
2000cabdff1aSopenharmony_ci    ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
2001cabdff1aSopenharmony_ci}
2002cabdff1aSopenharmony_ci
2003cabdff1aSopenharmony_cistatic void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride,
2004cabdff1aSopenharmony_ci                                  uint8_t *dst, int32_t dst_stride,
2005cabdff1aSopenharmony_ci                                  const int8_t *filter)
2006cabdff1aSopenharmony_ci{
2007cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2008cabdff1aSopenharmony_ci    v16i8 filt0, filt1, mask0, mask1;
2009cabdff1aSopenharmony_ci    v16u8 out;
2010cabdff1aSopenharmony_ci    v8i16 filt, out0, out1, out2, out3;
2011cabdff1aSopenharmony_ci
2012cabdff1aSopenharmony_ci    mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2013cabdff1aSopenharmony_ci    src -= 1;
2014cabdff1aSopenharmony_ci
2015cabdff1aSopenharmony_ci    /* rearranging filter */
2016cabdff1aSopenharmony_ci    filt = LD_SH(filter);
2017cabdff1aSopenharmony_ci    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2018cabdff1aSopenharmony_ci
2019cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
2020cabdff1aSopenharmony_ci
2021cabdff1aSopenharmony_ci    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2022cabdff1aSopenharmony_ci    src += (8 * src_stride);
2023cabdff1aSopenharmony_ci    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2024cabdff1aSopenharmony_ci    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
2025cabdff1aSopenharmony_ci                               filt0, filt1, out0, out1);
2026cabdff1aSopenharmony_ci    HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
2027cabdff1aSopenharmony_ci                               filt0, filt1, out2, out3);
2028cabdff1aSopenharmony_ci    SRARI_H4_SH(out0, out1, out2, out3, 6);
2029cabdff1aSopenharmony_ci    SAT_SH4_SH(out0, out1, out2, out3, 7);
2030cabdff1aSopenharmony_ci    out = PCKEV_XORI128_UB(out0, out1);
2031cabdff1aSopenharmony_ci    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2032cabdff1aSopenharmony_ci    out = PCKEV_XORI128_UB(out2, out3);
2033cabdff1aSopenharmony_ci    ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
2034cabdff1aSopenharmony_ci    dst += (8 * dst_stride);
2035cabdff1aSopenharmony_ci
2036cabdff1aSopenharmony_ci    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2037cabdff1aSopenharmony_ci    src += (8 * src_stride);
2038cabdff1aSopenharmony_ci    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2039cabdff1aSopenharmony_ci    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
2040cabdff1aSopenharmony_ci                               filt0, filt1, out0, out1);
2041cabdff1aSopenharmony_ci    HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
2042cabdff1aSopenharmony_ci                               filt0, filt1, out2, out3);
2043cabdff1aSopenharmony_ci    SRARI_H4_SH(out0, out1, out2, out3, 6);
2044cabdff1aSopenharmony_ci    SAT_SH4_SH(out0, out1, out2, out3, 7);
2045cabdff1aSopenharmony_ci    out = PCKEV_XORI128_UB(out0, out1);
2046cabdff1aSopenharmony_ci    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2047cabdff1aSopenharmony_ci    out = PCKEV_XORI128_UB(out2, out3);
2048cabdff1aSopenharmony_ci    ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
2049cabdff1aSopenharmony_ci}
2050cabdff1aSopenharmony_ci
/*
 * Height dispatcher for the 4-column horizontal 4-tap filter.
 *
 * Heights 2, 4, 8 and 16 each have a dedicated kernel. Any other height
 * falls through without touching dst.
 */
static void common_hz_4t_4w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    switch (height) {
    case 2:
        common_hz_4t_4x2_msa(src, src_stride, dst, dst_stride, filter);
        break;
    case 4:
        common_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter);
        break;
    case 8:
        common_hz_4t_4x8_msa(src, src_stride, dst, dst_stride, filter);
        break;
    case 16:
        common_hz_4t_4x16_msa(src, src_stride, dst, dst_stride, filter);
        break;
    default:
        /* no kernel for this height: nothing is written */
        break;
    }
}
2065cabdff1aSopenharmony_ci
2066cabdff1aSopenharmony_cistatic void common_hz_4t_6w_msa(uint8_t *src, int32_t src_stride,
2067cabdff1aSopenharmony_ci                                uint8_t *dst, int32_t dst_stride,
2068cabdff1aSopenharmony_ci                                const int8_t *filter, int32_t height)
2069cabdff1aSopenharmony_ci{
2070cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
2071cabdff1aSopenharmony_ci    v16u8 out4, out5;
2072cabdff1aSopenharmony_ci    v8i16 filt, out0, out1, out2, out3;
2073cabdff1aSopenharmony_ci
2074cabdff1aSopenharmony_ci    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2075cabdff1aSopenharmony_ci    src -= 1;
2076cabdff1aSopenharmony_ci
2077cabdff1aSopenharmony_ci    /* rearranging filter */
2078cabdff1aSopenharmony_ci    filt = LD_SH(filter);
2079cabdff1aSopenharmony_ci    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2080cabdff1aSopenharmony_ci
2081cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
2082cabdff1aSopenharmony_ci
2083cabdff1aSopenharmony_ci    LD_SB4(src, src_stride, src0, src1, src2, src3);
2084cabdff1aSopenharmony_ci    src += (4 * src_stride);
2085cabdff1aSopenharmony_ci
2086cabdff1aSopenharmony_ci    XORI_B4_128_SB(src0, src1, src2, src3);
2087cabdff1aSopenharmony_ci    HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
2088cabdff1aSopenharmony_ci                               filt1, out0, out1, out2, out3);
2089cabdff1aSopenharmony_ci    SRARI_H4_SH(out0, out1, out2, out3, 6);
2090cabdff1aSopenharmony_ci    SAT_SH4_SH(out0, out1, out2, out3, 7);
2091cabdff1aSopenharmony_ci    out4 = PCKEV_XORI128_UB(out0, out1);
2092cabdff1aSopenharmony_ci    out5 = PCKEV_XORI128_UB(out2, out3);
2093cabdff1aSopenharmony_ci    ST_W2(out4, 0, 2, dst, dst_stride);
2094cabdff1aSopenharmony_ci    ST_H2(out4, 2, 6, dst + 4, dst_stride);
2095cabdff1aSopenharmony_ci    ST_W2(out5, 0, 2, dst + 2 * dst_stride, dst_stride);
2096cabdff1aSopenharmony_ci    ST_H2(out5, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2097cabdff1aSopenharmony_ci    dst += (4 * dst_stride);
2098cabdff1aSopenharmony_ci
2099cabdff1aSopenharmony_ci    LD_SB4(src, src_stride, src0, src1, src2, src3);
2100cabdff1aSopenharmony_ci    src += (4 * src_stride);
2101cabdff1aSopenharmony_ci
2102cabdff1aSopenharmony_ci    XORI_B4_128_SB(src0, src1, src2, src3);
2103cabdff1aSopenharmony_ci    HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
2104cabdff1aSopenharmony_ci                               filt1, out0, out1, out2, out3);
2105cabdff1aSopenharmony_ci    SRARI_H4_SH(out0, out1, out2, out3, 6);
2106cabdff1aSopenharmony_ci    SAT_SH4_SH(out0, out1, out2, out3, 7);
2107cabdff1aSopenharmony_ci    out4 = PCKEV_XORI128_UB(out0, out1);
2108cabdff1aSopenharmony_ci    out5 = PCKEV_XORI128_UB(out2, out3);
2109cabdff1aSopenharmony_ci    ST_W2(out4, 0, 2, dst, dst_stride);
2110cabdff1aSopenharmony_ci    ST_H2(out4, 2, 6, dst + 4, dst_stride);
2111cabdff1aSopenharmony_ci    ST_W2(out5, 0, 2, dst + 2 * dst_stride, dst_stride);
2112cabdff1aSopenharmony_ci    ST_H2(out5, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2113cabdff1aSopenharmony_ci}
2114cabdff1aSopenharmony_ci
2115cabdff1aSopenharmony_cistatic void common_hz_4t_8x2mult_msa(uint8_t *src, int32_t src_stride,
2116cabdff1aSopenharmony_ci                                     uint8_t *dst, int32_t dst_stride,
2117cabdff1aSopenharmony_ci                                     const int8_t *filter, int32_t height)
2118cabdff1aSopenharmony_ci{
2119cabdff1aSopenharmony_ci    uint32_t loop_cnt;
2120cabdff1aSopenharmony_ci    v16i8 src0, src1, filt0, filt1, mask0, mask1;
2121cabdff1aSopenharmony_ci    v16u8 out;
2122cabdff1aSopenharmony_ci    v8i16 filt, vec0, vec1, vec2, vec3;
2123cabdff1aSopenharmony_ci
2124cabdff1aSopenharmony_ci    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2125cabdff1aSopenharmony_ci    src -= 1;
2126cabdff1aSopenharmony_ci
2127cabdff1aSopenharmony_ci    filt = LD_SH(filter);
2128cabdff1aSopenharmony_ci    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2129cabdff1aSopenharmony_ci
2130cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
2131cabdff1aSopenharmony_ci
2132cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 1); loop_cnt--;) {
2133cabdff1aSopenharmony_ci        LD_SB2(src, src_stride, src0, src1);
2134cabdff1aSopenharmony_ci        src += (2 * src_stride);
2135cabdff1aSopenharmony_ci
2136cabdff1aSopenharmony_ci        XORI_B2_128_SB(src0, src1);
2137cabdff1aSopenharmony_ci        VSHF_B2_SH(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
2138cabdff1aSopenharmony_ci        DOTP_SB2_SH(vec0, vec1, filt0, filt0, vec0, vec1);
2139cabdff1aSopenharmony_ci        VSHF_B2_SH(src0, src0, src1, src1, mask1, mask1, vec2, vec3);
2140cabdff1aSopenharmony_ci        DPADD_SB2_SH(vec2, vec3, filt1, filt1, vec0, vec1);
2141cabdff1aSopenharmony_ci        SRARI_H2_SH(vec0, vec1, 6);
2142cabdff1aSopenharmony_ci        SAT_SH2_SH(vec0, vec1, 7);
2143cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(vec0, vec1);
2144cabdff1aSopenharmony_ci        ST_D2(out, 0, 1, dst, dst_stride);
2145cabdff1aSopenharmony_ci        dst += (2 * dst_stride);
2146cabdff1aSopenharmony_ci    }
2147cabdff1aSopenharmony_ci}
2148cabdff1aSopenharmony_ci
2149cabdff1aSopenharmony_cistatic void common_hz_4t_8x4mult_msa(uint8_t *src, int32_t src_stride,
2150cabdff1aSopenharmony_ci                                     uint8_t *dst, int32_t dst_stride,
2151cabdff1aSopenharmony_ci                                     const int8_t *filter, int32_t height)
2152cabdff1aSopenharmony_ci{
2153cabdff1aSopenharmony_ci    uint32_t loop_cnt;
2154cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
2155cabdff1aSopenharmony_ci    v16u8 tmp0, tmp1;
2156cabdff1aSopenharmony_ci    v8i16 filt, out0, out1, out2, out3;
2157cabdff1aSopenharmony_ci
2158cabdff1aSopenharmony_ci    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2159cabdff1aSopenharmony_ci    src -= 1;
2160cabdff1aSopenharmony_ci
2161cabdff1aSopenharmony_ci    /* rearranging filter */
2162cabdff1aSopenharmony_ci    filt = LD_SH(filter);
2163cabdff1aSopenharmony_ci    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2164cabdff1aSopenharmony_ci
2165cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
2166cabdff1aSopenharmony_ci
2167cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
2168cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src0, src1, src2, src3);
2169cabdff1aSopenharmony_ci        src += (4 * src_stride);
2170cabdff1aSopenharmony_ci
2171cabdff1aSopenharmony_ci        XORI_B4_128_SB(src0, src1, src2, src3);
2172cabdff1aSopenharmony_ci        HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
2173cabdff1aSopenharmony_ci                                   filt1, out0, out1, out2, out3);
2174cabdff1aSopenharmony_ci        SRARI_H4_SH(out0, out1, out2, out3, 6);
2175cabdff1aSopenharmony_ci        SAT_SH4_SH(out0, out1, out2, out3, 7);
2176cabdff1aSopenharmony_ci        tmp0 = PCKEV_XORI128_UB(out0, out1);
2177cabdff1aSopenharmony_ci        tmp1 = PCKEV_XORI128_UB(out2, out3);
2178cabdff1aSopenharmony_ci        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
2179cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
2180cabdff1aSopenharmony_ci    }
2181cabdff1aSopenharmony_ci}
2182cabdff1aSopenharmony_ci
/*
 * Height dispatcher for the 8-column horizontal 4-tap filter.
 *
 * Heights 2 and 6 go to the two-rows-per-iteration kernel; every other
 * height goes to the four-rows-per-iteration kernel.
 */
static void common_hz_4t_8w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    if ((2 != height) && (6 != height)) {
        common_hz_4t_8x4mult_msa(src, src_stride, dst, dst_stride, filter,
                                 height);
        return;
    }

    common_hz_4t_8x2mult_msa(src, src_stride, dst, dst_stride, filter,
                             height);
}
2195cabdff1aSopenharmony_ci
2196cabdff1aSopenharmony_cistatic void common_hz_4t_12w_msa(uint8_t *src, int32_t src_stride,
2197cabdff1aSopenharmony_ci                                 uint8_t *dst, int32_t dst_stride,
2198cabdff1aSopenharmony_ci                                 const int8_t *filter, int32_t height)
2199cabdff1aSopenharmony_ci{
2200cabdff1aSopenharmony_ci    uint32_t loop_cnt;
2201cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1, mask2, mask3;
2202cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
2203cabdff1aSopenharmony_ci    v16i8 vec10, vec11;
2204cabdff1aSopenharmony_ci    v16u8 tmp0, tmp1;
2205cabdff1aSopenharmony_ci    v8i16 filt, out0, out1, out2, out3, out4, out5;
2206cabdff1aSopenharmony_ci
2207cabdff1aSopenharmony_ci    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2208cabdff1aSopenharmony_ci    mask2 = LD_SB(&ff_hevc_mask_arr[32]);
2209cabdff1aSopenharmony_ci
2210cabdff1aSopenharmony_ci    src -= 1;
2211cabdff1aSopenharmony_ci
2212cabdff1aSopenharmony_ci    /* rearranging filter */
2213cabdff1aSopenharmony_ci    filt = LD_SH(filter);
2214cabdff1aSopenharmony_ci    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2215cabdff1aSopenharmony_ci
2216cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
2217cabdff1aSopenharmony_ci    mask3 = mask2 + 2;
2218cabdff1aSopenharmony_ci
2219cabdff1aSopenharmony_ci    for (loop_cnt = 4; loop_cnt--;) {
2220cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src0, src1, src2, src3);
2221cabdff1aSopenharmony_ci        src += (4 * src_stride);
2222cabdff1aSopenharmony_ci
2223cabdff1aSopenharmony_ci        XORI_B4_128_SB(src0, src1, src2, src3);
2224cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec0, vec1);
2225cabdff1aSopenharmony_ci        DOTP_SB2_SH(vec0, vec1, filt0, filt0, out0, out1);
2226cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec2, vec3);
2227cabdff1aSopenharmony_ci        DPADD_SB2_SH(vec2, vec3, filt1, filt1, out0, out1);
2228cabdff1aSopenharmony_ci        SRARI_H2_SH(out0, out1, 6);
2229cabdff1aSopenharmony_ci        SAT_SH2_SH(out0, out1, 7);
2230cabdff1aSopenharmony_ci        tmp0 = PCKEV_XORI128_UB(out0, out1);
2231cabdff1aSopenharmony_ci        ST_W4(tmp0, 0, 1, 2, 3, dst + 8, dst_stride);
2232cabdff1aSopenharmony_ci
2233cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec4, vec5);
2234cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec7);
2235cabdff1aSopenharmony_ci        DOTP_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
2236cabdff1aSopenharmony_ci                    out2, out3, out4, out5);
2237cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec8, vec9);
2238cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec10, vec11);
2239cabdff1aSopenharmony_ci        DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt1, filt1, filt1, filt1,
2240cabdff1aSopenharmony_ci                     out2, out3, out4, out5);
2241cabdff1aSopenharmony_ci        SRARI_H4_SH(out2, out3, out4, out5, 6);
2242cabdff1aSopenharmony_ci        SAT_SH4_SH(out2, out3, out4, out5, 7);
2243cabdff1aSopenharmony_ci        tmp0 = PCKEV_XORI128_UB(out2, out3);
2244cabdff1aSopenharmony_ci        tmp1 = PCKEV_XORI128_UB(out4, out5);
2245cabdff1aSopenharmony_ci        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
2246cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
2247cabdff1aSopenharmony_ci    }
2248cabdff1aSopenharmony_ci}
2249cabdff1aSopenharmony_ci
2250cabdff1aSopenharmony_cistatic void common_hz_4t_16w_msa(uint8_t *src, int32_t src_stride,
2251cabdff1aSopenharmony_ci                                 uint8_t *dst, int32_t dst_stride,
2252cabdff1aSopenharmony_ci                                 const int8_t *filter, int32_t height)
2253cabdff1aSopenharmony_ci{
2254cabdff1aSopenharmony_ci    uint32_t loop_cnt;
2255cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2256cabdff1aSopenharmony_ci    v16i8 filt0, filt1, mask0, mask1;
2257cabdff1aSopenharmony_ci    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;
2258cabdff1aSopenharmony_ci    v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
2259cabdff1aSopenharmony_ci    v16u8 out;
2260cabdff1aSopenharmony_ci
2261cabdff1aSopenharmony_ci    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2262cabdff1aSopenharmony_ci    src -= 1;
2263cabdff1aSopenharmony_ci
2264cabdff1aSopenharmony_ci    /* rearranging filter */
2265cabdff1aSopenharmony_ci    filt = LD_SH(filter);
2266cabdff1aSopenharmony_ci    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2267cabdff1aSopenharmony_ci
2268cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
2269cabdff1aSopenharmony_ci
2270cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
2271cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src0, src2, src4, src6);
2272cabdff1aSopenharmony_ci        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
2273cabdff1aSopenharmony_ci        src += (4 * src_stride);
2274cabdff1aSopenharmony_ci
2275cabdff1aSopenharmony_ci        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2276cabdff1aSopenharmony_ci
2277cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);
2278cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);
2279cabdff1aSopenharmony_ci        DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2280cabdff1aSopenharmony_ci                    out0, out1, out2, out3);
2281cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);
2282cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);
2283cabdff1aSopenharmony_ci        DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2284cabdff1aSopenharmony_ci                     out0, out1, out2, out3);
2285cabdff1aSopenharmony_ci        SRARI_H4_SH(out0, out1, out2, out3, 6);
2286cabdff1aSopenharmony_ci        SAT_SH4_SH(out0, out1, out2, out3, 7);
2287cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(out0, out1);
2288cabdff1aSopenharmony_ci        ST_UB(out, dst);
2289cabdff1aSopenharmony_ci        dst += dst_stride;
2290cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(out2, out3);
2291cabdff1aSopenharmony_ci        ST_UB(out, dst);
2292cabdff1aSopenharmony_ci        dst += dst_stride;
2293cabdff1aSopenharmony_ci
2294cabdff1aSopenharmony_ci        VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0_m, vec1_m);
2295cabdff1aSopenharmony_ci        VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2_m, vec3_m);
2296cabdff1aSopenharmony_ci        DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2297cabdff1aSopenharmony_ci                    out4, out5, out6, out7);
2298cabdff1aSopenharmony_ci        VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec0_m, vec1_m);
2299cabdff1aSopenharmony_ci        VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec2_m, vec3_m);
2300cabdff1aSopenharmony_ci        DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2301cabdff1aSopenharmony_ci                     out4, out5, out6, out7);
2302cabdff1aSopenharmony_ci        SRARI_H4_SH(out4, out5, out6, out7, 6);
2303cabdff1aSopenharmony_ci        SAT_SH4_SH(out4, out5, out6, out7, 7);
2304cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(out4, out5);
2305cabdff1aSopenharmony_ci        ST_UB(out, dst);
2306cabdff1aSopenharmony_ci        dst += dst_stride;
2307cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(out6, out7);
2308cabdff1aSopenharmony_ci        ST_UB(out, dst);
2309cabdff1aSopenharmony_ci        dst += dst_stride;
2310cabdff1aSopenharmony_ci    }
2311cabdff1aSopenharmony_ci}
2312cabdff1aSopenharmony_ci
2313cabdff1aSopenharmony_cistatic void common_hz_4t_24w_msa(uint8_t *src, int32_t src_stride,
2314cabdff1aSopenharmony_ci                                 uint8_t *dst, int32_t dst_stride,
2315cabdff1aSopenharmony_ci                                 const int8_t *filter, int32_t height)
2316cabdff1aSopenharmony_ci{
2317cabdff1aSopenharmony_ci    uint8_t *dst1 = dst + 16;
2318cabdff1aSopenharmony_ci    uint32_t loop_cnt;
2319cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2320cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2321cabdff1aSopenharmony_ci    v16i8 filt0, filt1, mask0, mask1, mask00, mask11;
2322cabdff1aSopenharmony_ci    v8i16 filt, out0, out1, out2, out3;
2323cabdff1aSopenharmony_ci    v16u8 tmp0, tmp1;
2324cabdff1aSopenharmony_ci
2325cabdff1aSopenharmony_ci    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2326cabdff1aSopenharmony_ci    src -= 1;
2327cabdff1aSopenharmony_ci
2328cabdff1aSopenharmony_ci    /* rearranging filter */
2329cabdff1aSopenharmony_ci    filt = LD_SH(filter);
2330cabdff1aSopenharmony_ci    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2331cabdff1aSopenharmony_ci
2332cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
2333cabdff1aSopenharmony_ci    mask00 = mask0 + 8;
2334cabdff1aSopenharmony_ci    mask11 = mask0 + 10;
2335cabdff1aSopenharmony_ci
2336cabdff1aSopenharmony_ci    for (loop_cnt = 8; loop_cnt--;) {
2337cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src0, src2, src4, src6);
2338cabdff1aSopenharmony_ci        LD_SB4(src + 16, src_stride, src1, src3, src5, src7);
2339cabdff1aSopenharmony_ci        src += (4 * src_stride);
2340cabdff1aSopenharmony_ci
2341cabdff1aSopenharmony_ci        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2342cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask00, vec0, vec1);
2343cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src2, src3, mask0, mask00, vec2, vec3);
2344cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask11, vec4, vec5);
2345cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src2, src3, mask1, mask11, vec6, vec7);
2346cabdff1aSopenharmony_ci        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2347cabdff1aSopenharmony_ci                    out0, out1, out2, out3);
2348cabdff1aSopenharmony_ci        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2349cabdff1aSopenharmony_ci                     out0, out1, out2, out3);
2350cabdff1aSopenharmony_ci        SRARI_H4_SH(out0, out1, out2, out3, 6);
2351cabdff1aSopenharmony_ci        SAT_SH4_SH(out0, out1, out2, out3, 7);
2352cabdff1aSopenharmony_ci        tmp0 = PCKEV_XORI128_UB(out0, out1);
2353cabdff1aSopenharmony_ci        ST_UB(tmp0, dst);
2354cabdff1aSopenharmony_ci        dst += dst_stride;
2355cabdff1aSopenharmony_ci        tmp0 = PCKEV_XORI128_UB(out2, out3);
2356cabdff1aSopenharmony_ci        ST_UB(tmp0, dst);
2357cabdff1aSopenharmony_ci        dst += dst_stride;
2358cabdff1aSopenharmony_ci
2359cabdff1aSopenharmony_ci        VSHF_B2_SB(src4, src4, src4, src5, mask0, mask00, vec0, vec1);
2360cabdff1aSopenharmony_ci        VSHF_B2_SB(src6, src6, src6, src7, mask0, mask00, vec2, vec3);
2361cabdff1aSopenharmony_ci        VSHF_B2_SB(src4, src4, src4, src5, mask1, mask11, vec4, vec5);
2362cabdff1aSopenharmony_ci        VSHF_B2_SB(src6, src6, src6, src7, mask1, mask11, vec6, vec7);
2363cabdff1aSopenharmony_ci        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2364cabdff1aSopenharmony_ci                    out0, out1, out2, out3);
2365cabdff1aSopenharmony_ci        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2366cabdff1aSopenharmony_ci                     out0, out1, out2, out3);
2367cabdff1aSopenharmony_ci        SRARI_H4_SH(out0, out1, out2, out3, 6);
2368cabdff1aSopenharmony_ci        SAT_SH4_SH(out0, out1, out2, out3, 7);
2369cabdff1aSopenharmony_ci        tmp0 = PCKEV_XORI128_UB(out0, out1);
2370cabdff1aSopenharmony_ci        ST_UB(tmp0, dst);
2371cabdff1aSopenharmony_ci        dst += dst_stride;
2372cabdff1aSopenharmony_ci        tmp0 = PCKEV_XORI128_UB(out2, out3);
2373cabdff1aSopenharmony_ci        ST_UB(tmp0, dst);
2374cabdff1aSopenharmony_ci        dst += dst_stride;
2375cabdff1aSopenharmony_ci
2376cabdff1aSopenharmony_ci        /* 8 width */
2377cabdff1aSopenharmony_ci        VSHF_B2_SB(src1, src1, src3, src3, mask0, mask0, vec0, vec1);
2378cabdff1aSopenharmony_ci        VSHF_B2_SB(src5, src5, src7, src7, mask0, mask0, vec2, vec3);
2379cabdff1aSopenharmony_ci        VSHF_B2_SB(src1, src1, src3, src3, mask1, mask1, vec4, vec5);
2380cabdff1aSopenharmony_ci        VSHF_B2_SB(src5, src5, src7, src7, mask1, mask1, vec6, vec7);
2381cabdff1aSopenharmony_ci
2382cabdff1aSopenharmony_ci        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2383cabdff1aSopenharmony_ci                    out0, out1, out2, out3);
2384cabdff1aSopenharmony_ci        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2385cabdff1aSopenharmony_ci                     out0, out1, out2, out3);
2386cabdff1aSopenharmony_ci
2387cabdff1aSopenharmony_ci        SRARI_H4_SH(out0, out1, out2, out3, 6);
2388cabdff1aSopenharmony_ci        SAT_SH4_SH(out0, out1, out2, out3, 7);
2389cabdff1aSopenharmony_ci        tmp0 = PCKEV_XORI128_UB(out0, out1);
2390cabdff1aSopenharmony_ci        tmp1 = PCKEV_XORI128_UB(out2, out3);
2391cabdff1aSopenharmony_ci        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst1, dst_stride);
2392cabdff1aSopenharmony_ci        dst1 += (4 * dst_stride);
2393cabdff1aSopenharmony_ci    }
2394cabdff1aSopenharmony_ci}
2395cabdff1aSopenharmony_ci
2396cabdff1aSopenharmony_cistatic void common_hz_4t_32w_msa(uint8_t *src, int32_t src_stride,
2397cabdff1aSopenharmony_ci                                 uint8_t *dst, int32_t dst_stride,
2398cabdff1aSopenharmony_ci                                 const int8_t *filter, int32_t height)
2399cabdff1aSopenharmony_ci{
2400cabdff1aSopenharmony_ci    uint32_t loop_cnt;
2401cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2402cabdff1aSopenharmony_ci    v16i8 filt0, filt1, mask0, mask1;
2403cabdff1aSopenharmony_ci    v16u8 out;
2404cabdff1aSopenharmony_ci    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;
2405cabdff1aSopenharmony_ci    v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
2406cabdff1aSopenharmony_ci
2407cabdff1aSopenharmony_ci    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2408cabdff1aSopenharmony_ci    src -= 1;
2409cabdff1aSopenharmony_ci
2410cabdff1aSopenharmony_ci    /* rearranging filter */
2411cabdff1aSopenharmony_ci    filt = LD_SH(filter);
2412cabdff1aSopenharmony_ci    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2413cabdff1aSopenharmony_ci
2414cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
2415cabdff1aSopenharmony_ci
2416cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 1); loop_cnt--;) {
2417cabdff1aSopenharmony_ci        src0 = LD_SB(src);
2418cabdff1aSopenharmony_ci        src1 = LD_SB(src + 8);
2419cabdff1aSopenharmony_ci        src2 = LD_SB(src + 16);
2420cabdff1aSopenharmony_ci        src3 = LD_SB(src + 24);
2421cabdff1aSopenharmony_ci        src += src_stride;
2422cabdff1aSopenharmony_ci        src4 = LD_SB(src);
2423cabdff1aSopenharmony_ci        src5 = LD_SB(src + 8);
2424cabdff1aSopenharmony_ci        src6 = LD_SB(src + 16);
2425cabdff1aSopenharmony_ci        src7 = LD_SB(src + 24);
2426cabdff1aSopenharmony_ci        src += src_stride;
2427cabdff1aSopenharmony_ci
2428cabdff1aSopenharmony_ci        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2429cabdff1aSopenharmony_ci
2430cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);
2431cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);
2432cabdff1aSopenharmony_ci        DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2433cabdff1aSopenharmony_ci                    out0, out1, out2, out3);
2434cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);
2435cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);
2436cabdff1aSopenharmony_ci        DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2437cabdff1aSopenharmony_ci                     out0, out1, out2, out3);
2438cabdff1aSopenharmony_ci
2439cabdff1aSopenharmony_ci        VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0_m, vec1_m);
2440cabdff1aSopenharmony_ci        VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2_m, vec3_m);
2441cabdff1aSopenharmony_ci        DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2442cabdff1aSopenharmony_ci                    out4, out5, out6, out7);
2443cabdff1aSopenharmony_ci        VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec0_m, vec1_m);
2444cabdff1aSopenharmony_ci        VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec2_m, vec3_m);
2445cabdff1aSopenharmony_ci        DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2446cabdff1aSopenharmony_ci                     out4, out5, out6, out7);
2447cabdff1aSopenharmony_ci        SRARI_H4_SH(out0, out1, out2, out3, 6);
2448cabdff1aSopenharmony_ci        SRARI_H4_SH(out4, out5, out6, out7, 6);
2449cabdff1aSopenharmony_ci        SAT_SH4_SH(out0, out1, out2, out3, 7);
2450cabdff1aSopenharmony_ci        SAT_SH4_SH(out4, out5, out6, out7, 7);
2451cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(out0, out1);
2452cabdff1aSopenharmony_ci        ST_UB(out, dst);
2453cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(out2, out3);
2454cabdff1aSopenharmony_ci        ST_UB(out, dst + 16);
2455cabdff1aSopenharmony_ci        dst += dst_stride;
2456cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(out4, out5);
2457cabdff1aSopenharmony_ci        ST_UB(out, dst);
2458cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(out6, out7);
2459cabdff1aSopenharmony_ci        ST_UB(out, dst + 16);
2460cabdff1aSopenharmony_ci        dst += dst_stride;
2461cabdff1aSopenharmony_ci    }
2462cabdff1aSopenharmony_ci}
2463cabdff1aSopenharmony_ci
2464cabdff1aSopenharmony_cistatic void common_vt_4t_4x2_msa(uint8_t *src, int32_t src_stride,
2465cabdff1aSopenharmony_ci                                 uint8_t *dst, int32_t dst_stride,
2466cabdff1aSopenharmony_ci                                 const int8_t *filter)
2467cabdff1aSopenharmony_ci{
2468cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src10_r, src32_r, src21_r, src43_r;
2469cabdff1aSopenharmony_ci    v16i8 src2110, src4332, filt0, filt1;
2470cabdff1aSopenharmony_ci    v16u8 out;
2471cabdff1aSopenharmony_ci    v8i16 filt, out10;
2472cabdff1aSopenharmony_ci
2473cabdff1aSopenharmony_ci    src -= src_stride;
2474cabdff1aSopenharmony_ci
2475cabdff1aSopenharmony_ci    filt = LD_SH(filter);
2476cabdff1aSopenharmony_ci    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2477cabdff1aSopenharmony_ci
2478cabdff1aSopenharmony_ci    LD_SB3(src, src_stride, src0, src1, src2);
2479cabdff1aSopenharmony_ci    src += (3 * src_stride);
2480cabdff1aSopenharmony_ci
2481cabdff1aSopenharmony_ci    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2482cabdff1aSopenharmony_ci    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
2483cabdff1aSopenharmony_ci    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2484cabdff1aSopenharmony_ci    LD_SB2(src, src_stride, src3, src4);
2485cabdff1aSopenharmony_ci    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2486cabdff1aSopenharmony_ci    src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
2487cabdff1aSopenharmony_ci    src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
2488cabdff1aSopenharmony_ci    out10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
2489cabdff1aSopenharmony_ci    out10 = __msa_srari_h(out10, 6);
2490cabdff1aSopenharmony_ci    out10 = __msa_sat_s_h(out10, 7);
2491cabdff1aSopenharmony_ci    out = PCKEV_XORI128_UB(out10, out10);
2492cabdff1aSopenharmony_ci    ST_W2(out, 0, 1, dst, dst_stride);
2493cabdff1aSopenharmony_ci}
2494cabdff1aSopenharmony_ci
2495cabdff1aSopenharmony_cistatic void common_vt_4t_4x4multiple_msa(uint8_t *src, int32_t src_stride,
2496cabdff1aSopenharmony_ci                                         uint8_t *dst, int32_t dst_stride,
2497cabdff1aSopenharmony_ci                                         const int8_t *filter, int32_t height)
2498cabdff1aSopenharmony_ci{
2499cabdff1aSopenharmony_ci    uint32_t loop_cnt;
2500cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5;
2501cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
2502cabdff1aSopenharmony_ci    v16i8 src2110, src4332, filt0, filt1;
2503cabdff1aSopenharmony_ci    v8i16 filt, out10, out32;
2504cabdff1aSopenharmony_ci    v16u8 out;
2505cabdff1aSopenharmony_ci
2506cabdff1aSopenharmony_ci    src -= src_stride;
2507cabdff1aSopenharmony_ci
2508cabdff1aSopenharmony_ci    filt = LD_SH(filter);
2509cabdff1aSopenharmony_ci    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2510cabdff1aSopenharmony_ci
2511cabdff1aSopenharmony_ci    LD_SB3(src, src_stride, src0, src1, src2);
2512cabdff1aSopenharmony_ci    src += (3 * src_stride);
2513cabdff1aSopenharmony_ci
2514cabdff1aSopenharmony_ci    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2515cabdff1aSopenharmony_ci
2516cabdff1aSopenharmony_ci    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
2517cabdff1aSopenharmony_ci    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2518cabdff1aSopenharmony_ci
2519cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
2520cabdff1aSopenharmony_ci        LD_SB3(src, src_stride, src3, src4, src5);
2521cabdff1aSopenharmony_ci        src += (3 * src_stride);
2522cabdff1aSopenharmony_ci        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2523cabdff1aSopenharmony_ci        src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
2524cabdff1aSopenharmony_ci        src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
2525cabdff1aSopenharmony_ci        out10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
2526cabdff1aSopenharmony_ci
2527cabdff1aSopenharmony_ci        src2 = LD_SB(src);
2528cabdff1aSopenharmony_ci        src += (src_stride);
2529cabdff1aSopenharmony_ci        ILVR_B2_SB(src5, src4, src2, src5, src54_r, src65_r);
2530cabdff1aSopenharmony_ci        src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_r, (v2i64) src54_r);
2531cabdff1aSopenharmony_ci        src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2532cabdff1aSopenharmony_ci        out32 = HEVC_FILT_4TAP_SH(src4332, src2110, filt0, filt1);
2533cabdff1aSopenharmony_ci        SRARI_H2_SH(out10, out32, 6);
2534cabdff1aSopenharmony_ci        SAT_SH2_SH(out10, out32, 7);
2535cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(out10, out32);
2536cabdff1aSopenharmony_ci        ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2537cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
2538cabdff1aSopenharmony_ci    }
2539cabdff1aSopenharmony_ci}
2540cabdff1aSopenharmony_ci
/*
 * Vertical 4-tap filter for 4-column blocks: dispatches on height.
 *
 * A height of exactly 2 uses the dedicated 4x2 routine; every other height
 * is forwarded to the routine that processes four rows per iteration.
 */
static void common_vt_4t_4w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    if (height != 2) {
        common_vt_4t_4x4multiple_msa(src, src_stride, dst, dst_stride, filter,
                                     height);
        return;
    }

    common_vt_4t_4x2_msa(src, src_stride, dst, dst_stride, filter);
}
2552cabdff1aSopenharmony_ci
2553cabdff1aSopenharmony_cistatic void common_vt_4t_6w_msa(uint8_t *src, int32_t src_stride,
2554cabdff1aSopenharmony_ci                                uint8_t *dst, int32_t dst_stride,
2555cabdff1aSopenharmony_ci                                const int8_t *filter, int32_t height)
2556cabdff1aSopenharmony_ci{
2557cabdff1aSopenharmony_ci    v16u8 out0, out1;
2558cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6;
2559cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
2560cabdff1aSopenharmony_ci    v8i16 dst0_r, dst1_r, dst2_r, dst3_r, filt0, filt1, filter_vec;
2561cabdff1aSopenharmony_ci
2562cabdff1aSopenharmony_ci    src -= src_stride;
2563cabdff1aSopenharmony_ci
2564cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
2565cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2566cabdff1aSopenharmony_ci
2567cabdff1aSopenharmony_ci    LD_SB3(src, src_stride, src0, src1, src2);
2568cabdff1aSopenharmony_ci    src += (3 * src_stride);
2569cabdff1aSopenharmony_ci    XORI_B3_128_SB(src0, src1, src2);
2570cabdff1aSopenharmony_ci    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2571cabdff1aSopenharmony_ci
2572cabdff1aSopenharmony_ci    LD_SB2(src, src_stride, src3, src4);
2573cabdff1aSopenharmony_ci    src += (2 * src_stride);
2574cabdff1aSopenharmony_ci    XORI_B2_128_SB(src3, src4);
2575cabdff1aSopenharmony_ci    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2576cabdff1aSopenharmony_ci
2577cabdff1aSopenharmony_ci    dst0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
2578cabdff1aSopenharmony_ci    dst1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
2579cabdff1aSopenharmony_ci
2580cabdff1aSopenharmony_ci    LD_SB2(src, src_stride, src5, src6);
2581cabdff1aSopenharmony_ci    src += (2 * src_stride);
2582cabdff1aSopenharmony_ci    XORI_B2_128_SB(src5, src6);
2583cabdff1aSopenharmony_ci    ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
2584cabdff1aSopenharmony_ci
2585cabdff1aSopenharmony_ci    dst2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
2586cabdff1aSopenharmony_ci    dst3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
2587cabdff1aSopenharmony_ci
2588cabdff1aSopenharmony_ci    SRARI_H4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 6);
2589cabdff1aSopenharmony_ci    SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
2590cabdff1aSopenharmony_ci    out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
2591cabdff1aSopenharmony_ci    out1 = PCKEV_XORI128_UB(dst2_r, dst3_r);
2592cabdff1aSopenharmony_ci    ST_W2(out0, 0, 2, dst, dst_stride);
2593cabdff1aSopenharmony_ci    ST_H2(out0, 2, 6, dst + 4, dst_stride);
2594cabdff1aSopenharmony_ci    ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
2595cabdff1aSopenharmony_ci    ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2596cabdff1aSopenharmony_ci    dst += (4 * dst_stride);
2597cabdff1aSopenharmony_ci
2598cabdff1aSopenharmony_ci    LD_SB2(src, src_stride, src3, src4);
2599cabdff1aSopenharmony_ci    src += (2 * src_stride);
2600cabdff1aSopenharmony_ci    XORI_B2_128_SB(src3, src4);
2601cabdff1aSopenharmony_ci    ILVR_B2_SB(src3, src6, src4, src3, src32_r, src43_r);
2602cabdff1aSopenharmony_ci
2603cabdff1aSopenharmony_ci    dst0_r = HEVC_FILT_4TAP_SH(src54_r, src32_r, filt0, filt1);
2604cabdff1aSopenharmony_ci    dst1_r = HEVC_FILT_4TAP_SH(src65_r, src43_r, filt0, filt1);
2605cabdff1aSopenharmony_ci
2606cabdff1aSopenharmony_ci    LD_SB2(src, src_stride, src5, src6);
2607cabdff1aSopenharmony_ci    src += (2 * src_stride);
2608cabdff1aSopenharmony_ci    XORI_B2_128_SB(src5, src6);
2609cabdff1aSopenharmony_ci    ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
2610cabdff1aSopenharmony_ci
2611cabdff1aSopenharmony_ci    dst2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
2612cabdff1aSopenharmony_ci    dst3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
2613cabdff1aSopenharmony_ci
2614cabdff1aSopenharmony_ci    SRARI_H4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 6);
2615cabdff1aSopenharmony_ci    SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
2616cabdff1aSopenharmony_ci    out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
2617cabdff1aSopenharmony_ci    out1 = PCKEV_XORI128_UB(dst2_r, dst3_r);
2618cabdff1aSopenharmony_ci    ST_W2(out0, 0, 2, dst, dst_stride);
2619cabdff1aSopenharmony_ci    ST_H2(out0, 2, 6, dst + 4, dst_stride);
2620cabdff1aSopenharmony_ci    ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
2621cabdff1aSopenharmony_ci    ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2622cabdff1aSopenharmony_ci}
2623cabdff1aSopenharmony_ci
2624cabdff1aSopenharmony_cistatic void common_vt_4t_8x2_msa(uint8_t *src, int32_t src_stride,
2625cabdff1aSopenharmony_ci                                 uint8_t *dst, int32_t dst_stride,
2626cabdff1aSopenharmony_ci                                 const int8_t *filter)
2627cabdff1aSopenharmony_ci{
2628cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4;
2629cabdff1aSopenharmony_ci    v8i16 src01, src12, src23, src34, tmp0, tmp1, filt, filt0, filt1;
2630cabdff1aSopenharmony_ci    v16u8 out;
2631cabdff1aSopenharmony_ci
2632cabdff1aSopenharmony_ci    src -= src_stride;
2633cabdff1aSopenharmony_ci
2634cabdff1aSopenharmony_ci    /* rearranging filter_y */
2635cabdff1aSopenharmony_ci    filt = LD_SH(filter);
2636cabdff1aSopenharmony_ci    SPLATI_H2_SH(filt, 0, 1, filt0, filt1);
2637cabdff1aSopenharmony_ci
2638cabdff1aSopenharmony_ci    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2639cabdff1aSopenharmony_ci    XORI_B5_128_SB(src0, src1, src2, src3, src4);
2640cabdff1aSopenharmony_ci    ILVR_B2_SH(src1, src0, src3, src2, src01, src23);
2641cabdff1aSopenharmony_ci    tmp0 = HEVC_FILT_4TAP_SH(src01, src23, filt0, filt1);
2642cabdff1aSopenharmony_ci    ILVR_B2_SH(src2, src1, src4, src3, src12, src34);
2643cabdff1aSopenharmony_ci    tmp1 = HEVC_FILT_4TAP_SH(src12, src34, filt0, filt1);
2644cabdff1aSopenharmony_ci    SRARI_H2_SH(tmp0, tmp1, 6);
2645cabdff1aSopenharmony_ci    SAT_SH2_SH(tmp0, tmp1, 7);
2646cabdff1aSopenharmony_ci    out = PCKEV_XORI128_UB(tmp0, tmp1);
2647cabdff1aSopenharmony_ci    ST_D2(out, 0, 1, dst, dst_stride);
2648cabdff1aSopenharmony_ci}
2649cabdff1aSopenharmony_ci
2650cabdff1aSopenharmony_cistatic void common_vt_4t_8x6_msa(uint8_t *src, int32_t src_stride,
2651cabdff1aSopenharmony_ci                                 uint8_t *dst, int32_t dst_stride,
2652cabdff1aSopenharmony_ci                                 const int8_t *filter)
2653cabdff1aSopenharmony_ci{
2654cabdff1aSopenharmony_ci    uint32_t loop_cnt;
2655cabdff1aSopenharmony_ci    uint64_t out0, out1, out2;
2656cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5;
2657cabdff1aSopenharmony_ci    v8i16 vec0, vec1, vec2, vec3, vec4, tmp0, tmp1, tmp2;
2658cabdff1aSopenharmony_ci    v8i16 filt, filt0, filt1;
2659cabdff1aSopenharmony_ci
2660cabdff1aSopenharmony_ci    src -= src_stride;
2661cabdff1aSopenharmony_ci
2662cabdff1aSopenharmony_ci    /* rearranging filter_y */
2663cabdff1aSopenharmony_ci    filt = LD_SH(filter);
2664cabdff1aSopenharmony_ci    SPLATI_H2_SH(filt, 0, 1, filt0, filt1);
2665cabdff1aSopenharmony_ci
2666cabdff1aSopenharmony_ci    LD_SB3(src, src_stride, src0, src1, src2);
2667cabdff1aSopenharmony_ci    src += (3 * src_stride);
2668cabdff1aSopenharmony_ci
2669cabdff1aSopenharmony_ci    XORI_B3_128_SB(src0, src1, src2);
2670cabdff1aSopenharmony_ci    ILVR_B2_SH(src1, src0, src2, src1, vec0, vec2);
2671cabdff1aSopenharmony_ci
2672cabdff1aSopenharmony_ci    for (loop_cnt = 2; loop_cnt--;) {
2673cabdff1aSopenharmony_ci        LD_SB3(src, src_stride, src3, src4, src5);
2674cabdff1aSopenharmony_ci        src += (3 * src_stride);
2675cabdff1aSopenharmony_ci
2676cabdff1aSopenharmony_ci        XORI_B3_128_SB(src3, src4, src5);
2677cabdff1aSopenharmony_ci        ILVR_B3_SH(src3, src2, src4, src3, src5, src4, vec1, vec3, vec4);
2678cabdff1aSopenharmony_ci        tmp0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2679cabdff1aSopenharmony_ci        tmp1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
2680cabdff1aSopenharmony_ci        tmp2 = HEVC_FILT_4TAP_SH(vec1, vec4, filt0, filt1);
2681cabdff1aSopenharmony_ci        SRARI_H2_SH(tmp0, tmp1, 6);
2682cabdff1aSopenharmony_ci        tmp2 = __msa_srari_h(tmp2, 6);
2683cabdff1aSopenharmony_ci        SAT_SH3_SH(tmp0, tmp1, tmp2, 7);
2684cabdff1aSopenharmony_ci        PCKEV_B2_SH(tmp1, tmp0, tmp2, tmp2, tmp0, tmp2);
2685cabdff1aSopenharmony_ci        XORI_B2_128_SH(tmp0, tmp2);
2686cabdff1aSopenharmony_ci
2687cabdff1aSopenharmony_ci        out0 = __msa_copy_u_d((v2i64) tmp0, 0);
2688cabdff1aSopenharmony_ci        out1 = __msa_copy_u_d((v2i64) tmp0, 1);
2689cabdff1aSopenharmony_ci        out2 = __msa_copy_u_d((v2i64) tmp2, 0);
2690cabdff1aSopenharmony_ci        SD(out0, dst);
2691cabdff1aSopenharmony_ci        dst += dst_stride;
2692cabdff1aSopenharmony_ci        SD(out1, dst);
2693cabdff1aSopenharmony_ci        dst += dst_stride;
2694cabdff1aSopenharmony_ci        SD(out2, dst);
2695cabdff1aSopenharmony_ci        dst += dst_stride;
2696cabdff1aSopenharmony_ci
2697cabdff1aSopenharmony_ci        src2 = src5;
2698cabdff1aSopenharmony_ci        vec0 = vec3;
2699cabdff1aSopenharmony_ci        vec2 = vec4;
2700cabdff1aSopenharmony_ci    }
2701cabdff1aSopenharmony_ci}
2702cabdff1aSopenharmony_ci
2703cabdff1aSopenharmony_cistatic void common_vt_4t_8x4mult_msa(uint8_t *src, int32_t src_stride,
2704cabdff1aSopenharmony_ci                                     uint8_t *dst, int32_t dst_stride,
2705cabdff1aSopenharmony_ci                                     const int8_t *filter, int32_t height)
2706cabdff1aSopenharmony_ci{
2707cabdff1aSopenharmony_ci    uint32_t loop_cnt;
2708cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src7, src8, src9, src10;
2709cabdff1aSopenharmony_ci    v16i8 src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1;
2710cabdff1aSopenharmony_ci    v16u8 tmp0, tmp1;
2711cabdff1aSopenharmony_ci    v8i16 filt, out0_r, out1_r, out2_r, out3_r;
2712cabdff1aSopenharmony_ci
2713cabdff1aSopenharmony_ci    src -= src_stride;
2714cabdff1aSopenharmony_ci
2715cabdff1aSopenharmony_ci    filt = LD_SH(filter);
2716cabdff1aSopenharmony_ci    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2717cabdff1aSopenharmony_ci
2718cabdff1aSopenharmony_ci    LD_SB3(src, src_stride, src0, src1, src2);
2719cabdff1aSopenharmony_ci    src += (3 * src_stride);
2720cabdff1aSopenharmony_ci
2721cabdff1aSopenharmony_ci    XORI_B3_128_SB(src0, src1, src2);
2722cabdff1aSopenharmony_ci    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2723cabdff1aSopenharmony_ci
2724cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
2725cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src7, src8, src9, src10);
2726cabdff1aSopenharmony_ci        src += (4 * src_stride);
2727cabdff1aSopenharmony_ci
2728cabdff1aSopenharmony_ci        XORI_B4_128_SB(src7, src8, src9, src10);
2729cabdff1aSopenharmony_ci        ILVR_B4_SB(src7, src2, src8, src7, src9, src8, src10, src9,
2730cabdff1aSopenharmony_ci                   src72_r, src87_r, src98_r, src109_r);
2731cabdff1aSopenharmony_ci        out0_r = HEVC_FILT_4TAP_SH(src10_r, src72_r, filt0, filt1);
2732cabdff1aSopenharmony_ci        out1_r = HEVC_FILT_4TAP_SH(src21_r, src87_r, filt0, filt1);
2733cabdff1aSopenharmony_ci        out2_r = HEVC_FILT_4TAP_SH(src72_r, src98_r, filt0, filt1);
2734cabdff1aSopenharmony_ci        out3_r = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
2735cabdff1aSopenharmony_ci        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
2736cabdff1aSopenharmony_ci        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2737cabdff1aSopenharmony_ci        tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
2738cabdff1aSopenharmony_ci        tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
2739cabdff1aSopenharmony_ci        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
2740cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
2741cabdff1aSopenharmony_ci
2742cabdff1aSopenharmony_ci        src10_r = src98_r;
2743cabdff1aSopenharmony_ci        src21_r = src109_r;
2744cabdff1aSopenharmony_ci        src2 = src10;
2745cabdff1aSopenharmony_ci    }
2746cabdff1aSopenharmony_ci}
2747cabdff1aSopenharmony_ci
/*
 * Vertical 4-tap filter for 8-column blocks: dispatches on height.
 *
 * Heights 2 and 6 have dedicated routines; every other height is forwarded
 * to the routine that processes four rows per iteration.
 */
static void common_vt_4t_8w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    switch (height) {
    case 2:
        common_vt_4t_8x2_msa(src, src_stride, dst, dst_stride, filter);
        break;
    case 6:
        common_vt_4t_8x6_msa(src, src_stride, dst, dst_stride, filter);
        break;
    default:
        common_vt_4t_8x4mult_msa(src, src_stride, dst, dst_stride,
                                 filter, height);
        break;
    }
}
2761cabdff1aSopenharmony_ci
2762cabdff1aSopenharmony_cistatic void common_vt_4t_12w_msa(uint8_t *src, int32_t src_stride,
2763cabdff1aSopenharmony_ci                                 uint8_t *dst, int32_t dst_stride,
2764cabdff1aSopenharmony_ci                                 const int8_t *filter, int32_t height)
2765cabdff1aSopenharmony_ci{
2766cabdff1aSopenharmony_ci    uint32_t loop_cnt;
2767cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6;
2768cabdff1aSopenharmony_ci    v16u8 out0, out1;
2769cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
2770cabdff1aSopenharmony_ci    v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
2771cabdff1aSopenharmony_ci    v16i8 src2110, src4332, src6554;
2772cabdff1aSopenharmony_ci    v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, filt0, filt1;
2773cabdff1aSopenharmony_ci    v8i16 filter_vec;
2774cabdff1aSopenharmony_ci
2775cabdff1aSopenharmony_ci    src -= (1 * src_stride);
2776cabdff1aSopenharmony_ci
2777cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter);
2778cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2779cabdff1aSopenharmony_ci
2780cabdff1aSopenharmony_ci    LD_SB3(src, src_stride, src0, src1, src2);
2781cabdff1aSopenharmony_ci    src += (3 * src_stride);
2782cabdff1aSopenharmony_ci
2783cabdff1aSopenharmony_ci    XORI_B3_128_SB(src0, src1, src2);
2784cabdff1aSopenharmony_ci    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2785cabdff1aSopenharmony_ci    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
2786cabdff1aSopenharmony_ci    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
2787cabdff1aSopenharmony_ci
2788cabdff1aSopenharmony_ci    for (loop_cnt = 4; loop_cnt--;) {
2789cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src3, src4, src5, src6);
2790cabdff1aSopenharmony_ci        src += (4 * src_stride);
2791cabdff1aSopenharmony_ci
2792cabdff1aSopenharmony_ci        XORI_B4_128_SB(src3, src4, src5, src6);
2793cabdff1aSopenharmony_ci        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2794cabdff1aSopenharmony_ci        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
2795cabdff1aSopenharmony_ci        src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
2796cabdff1aSopenharmony_ci        ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
2797cabdff1aSopenharmony_ci        ILVL_B2_SB(src5, src4, src6, src5, src54_l, src65_l);
2798cabdff1aSopenharmony_ci        src6554 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
2799cabdff1aSopenharmony_ci
2800cabdff1aSopenharmony_ci        dst0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
2801cabdff1aSopenharmony_ci        dst1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
2802cabdff1aSopenharmony_ci        dst0_l = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
2803cabdff1aSopenharmony_ci        dst2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
2804cabdff1aSopenharmony_ci        dst3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
2805cabdff1aSopenharmony_ci        dst1_l = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1);
2806cabdff1aSopenharmony_ci
2807cabdff1aSopenharmony_ci        SRARI_H4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 6);
2808cabdff1aSopenharmony_ci        SRARI_H2_SH(dst0_l, dst1_l, 6);
2809cabdff1aSopenharmony_ci        SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
2810cabdff1aSopenharmony_ci        SAT_SH2_SH(dst0_l, dst1_l, 7);
2811cabdff1aSopenharmony_ci        out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
2812cabdff1aSopenharmony_ci        out1 = PCKEV_XORI128_UB(dst2_r, dst3_r);
2813cabdff1aSopenharmony_ci        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
2814cabdff1aSopenharmony_ci        out0 = PCKEV_XORI128_UB(dst0_l, dst1_l);
2815cabdff1aSopenharmony_ci        ST_W4(out0, 0, 1, 2, 3, dst + 8, dst_stride);
2816cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
2817cabdff1aSopenharmony_ci
2818cabdff1aSopenharmony_ci        src2 = src6;
2819cabdff1aSopenharmony_ci        src10_r = src54_r;
2820cabdff1aSopenharmony_ci        src21_r = src65_r;
2821cabdff1aSopenharmony_ci        src2110 = src6554;
2822cabdff1aSopenharmony_ci    }
2823cabdff1aSopenharmony_ci}
2824cabdff1aSopenharmony_ci
2825cabdff1aSopenharmony_cistatic void common_vt_4t_16w_msa(uint8_t *src, int32_t src_stride,
2826cabdff1aSopenharmony_ci                                 uint8_t *dst, int32_t dst_stride,
2827cabdff1aSopenharmony_ci                                 const int8_t *filter, int32_t height)
2828cabdff1aSopenharmony_ci{
2829cabdff1aSopenharmony_ci    uint32_t loop_cnt;
2830cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6;
2831cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l;
2832cabdff1aSopenharmony_ci    v16i8 src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1;
2833cabdff1aSopenharmony_ci    v16u8 tmp0, tmp1, tmp2, tmp3;
2834cabdff1aSopenharmony_ci    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
2835cabdff1aSopenharmony_ci
2836cabdff1aSopenharmony_ci    src -= src_stride;
2837cabdff1aSopenharmony_ci
2838cabdff1aSopenharmony_ci    filt = LD_SH(filter);
2839cabdff1aSopenharmony_ci    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2840cabdff1aSopenharmony_ci
2841cabdff1aSopenharmony_ci    LD_SB3(src, src_stride, src0, src1, src2);
2842cabdff1aSopenharmony_ci    src += (3 * src_stride);
2843cabdff1aSopenharmony_ci
2844cabdff1aSopenharmony_ci    XORI_B3_128_SB(src0, src1, src2);
2845cabdff1aSopenharmony_ci    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2846cabdff1aSopenharmony_ci    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
2847cabdff1aSopenharmony_ci
2848cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
2849cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src3, src4, src5, src6);
2850cabdff1aSopenharmony_ci        src += (4 * src_stride);
2851cabdff1aSopenharmony_ci
2852cabdff1aSopenharmony_ci        XORI_B4_128_SB(src3, src4, src5, src6);
2853cabdff1aSopenharmony_ci        ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
2854cabdff1aSopenharmony_ci                   src32_r, src43_r, src54_r, src65_r);
2855cabdff1aSopenharmony_ci        ILVL_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
2856cabdff1aSopenharmony_ci                   src32_l, src43_l, src54_l, src65_l);
2857cabdff1aSopenharmony_ci        out0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
2858cabdff1aSopenharmony_ci        out1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
2859cabdff1aSopenharmony_ci        out2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
2860cabdff1aSopenharmony_ci        out3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
2861cabdff1aSopenharmony_ci        out0_l = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
2862cabdff1aSopenharmony_ci        out1_l = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
2863cabdff1aSopenharmony_ci        out2_l = HEVC_FILT_4TAP_SH(src32_l, src54_l, filt0, filt1);
2864cabdff1aSopenharmony_ci        out3_l = HEVC_FILT_4TAP_SH(src43_l, src65_l, filt0, filt1);
2865cabdff1aSopenharmony_ci        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
2866cabdff1aSopenharmony_ci        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
2867cabdff1aSopenharmony_ci        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2868cabdff1aSopenharmony_ci        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
2869cabdff1aSopenharmony_ci        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
2870cabdff1aSopenharmony_ci                    out3_r, tmp0, tmp1, tmp2, tmp3);
2871cabdff1aSopenharmony_ci        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
2872cabdff1aSopenharmony_ci        ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
2873cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
2874cabdff1aSopenharmony_ci
2875cabdff1aSopenharmony_ci        src10_r = src54_r;
2876cabdff1aSopenharmony_ci        src21_r = src65_r;
2877cabdff1aSopenharmony_ci        src10_l = src54_l;
2878cabdff1aSopenharmony_ci        src21_l = src65_l;
2879cabdff1aSopenharmony_ci        src2 = src6;
2880cabdff1aSopenharmony_ci    }
2881cabdff1aSopenharmony_ci}
2882cabdff1aSopenharmony_ci
2883cabdff1aSopenharmony_cistatic void common_vt_4t_24w_msa(uint8_t *src, int32_t src_stride,
2884cabdff1aSopenharmony_ci                                 uint8_t *dst, int32_t dst_stride,
2885cabdff1aSopenharmony_ci                                 const int8_t *filter, int32_t height)
2886cabdff1aSopenharmony_ci{
2887cabdff1aSopenharmony_ci    uint32_t loop_cnt;
2888cabdff1aSopenharmony_ci    uint64_t out0, out1;
2889cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2890cabdff1aSopenharmony_ci    v16i8 src11, filt0, filt1;
2891cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
2892cabdff1aSopenharmony_ci    v16i8 src109_r, src10_l, src32_l, src21_l, src43_l;
2893cabdff1aSopenharmony_ci    v16u8 out;
2894cabdff1aSopenharmony_ci    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l;
2895cabdff1aSopenharmony_ci
2896cabdff1aSopenharmony_ci    src -= src_stride;
2897cabdff1aSopenharmony_ci
2898cabdff1aSopenharmony_ci    filt = LD_SH(filter);
2899cabdff1aSopenharmony_ci    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2900cabdff1aSopenharmony_ci
2901cabdff1aSopenharmony_ci    /* 16 width */
2902cabdff1aSopenharmony_ci    LD_SB3(src, src_stride, src0, src1, src2);
2903cabdff1aSopenharmony_ci    XORI_B3_128_SB(src0, src1, src2);
2904cabdff1aSopenharmony_ci    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2905cabdff1aSopenharmony_ci    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
2906cabdff1aSopenharmony_ci
2907cabdff1aSopenharmony_ci    /* 8 width */
2908cabdff1aSopenharmony_ci    LD_SB3(src + 16, src_stride, src6, src7, src8);
2909cabdff1aSopenharmony_ci    src += (3 * src_stride);
2910cabdff1aSopenharmony_ci    XORI_B3_128_SB(src6, src7, src8);
2911cabdff1aSopenharmony_ci    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
2912cabdff1aSopenharmony_ci
2913cabdff1aSopenharmony_ci    for (loop_cnt = 8; loop_cnt--;) {
2914cabdff1aSopenharmony_ci        /* 16 width */
2915cabdff1aSopenharmony_ci        LD_SB2(src, src_stride, src3, src4);
2916cabdff1aSopenharmony_ci        XORI_B2_128_SB(src3, src4);
2917cabdff1aSopenharmony_ci        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2918cabdff1aSopenharmony_ci        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
2919cabdff1aSopenharmony_ci
2920cabdff1aSopenharmony_ci        /* 8 width */
2921cabdff1aSopenharmony_ci        LD_SB2(src + 16, src_stride, src9, src10);
2922cabdff1aSopenharmony_ci        src += (2 * src_stride);
2923cabdff1aSopenharmony_ci        XORI_B2_128_SB(src9, src10);
2924cabdff1aSopenharmony_ci        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
2925cabdff1aSopenharmony_ci
2926cabdff1aSopenharmony_ci        /* 16 width */
2927cabdff1aSopenharmony_ci        out0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
2928cabdff1aSopenharmony_ci        out0_l = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
2929cabdff1aSopenharmony_ci        out1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
2930cabdff1aSopenharmony_ci        out1_l = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
2931cabdff1aSopenharmony_ci
2932cabdff1aSopenharmony_ci        /* 8 width */
2933cabdff1aSopenharmony_ci        out2_r = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
2934cabdff1aSopenharmony_ci        out3_r = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
2935cabdff1aSopenharmony_ci
2936cabdff1aSopenharmony_ci        /* 16 + 8 width */
2937cabdff1aSopenharmony_ci        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
2938cabdff1aSopenharmony_ci        SRARI_H2_SH(out0_l, out1_l, 6);
2939cabdff1aSopenharmony_ci        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2940cabdff1aSopenharmony_ci        SAT_SH2_SH(out0_l, out1_l, 7);
2941cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(out0_r, out0_l);
2942cabdff1aSopenharmony_ci        ST_UB(out, dst);
2943cabdff1aSopenharmony_ci        PCKEV_B2_SH(out2_r, out2_r, out3_r, out3_r, out2_r, out3_r);
2944cabdff1aSopenharmony_ci        XORI_B2_128_SH(out2_r, out3_r);
2945cabdff1aSopenharmony_ci        out0 = __msa_copy_u_d((v2i64) out2_r, 0);
2946cabdff1aSopenharmony_ci        out1 = __msa_copy_u_d((v2i64) out3_r, 0);
2947cabdff1aSopenharmony_ci        SD(out0, dst + 16);
2948cabdff1aSopenharmony_ci        dst += dst_stride;
2949cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(out1_r, out1_l);
2950cabdff1aSopenharmony_ci        ST_UB(out, dst);
2951cabdff1aSopenharmony_ci        SD(out1, dst + 16);
2952cabdff1aSopenharmony_ci        dst += dst_stride;
2953cabdff1aSopenharmony_ci
2954cabdff1aSopenharmony_ci        /* 16 width */
2955cabdff1aSopenharmony_ci        LD_SB2(src, src_stride, src5, src2);
2956cabdff1aSopenharmony_ci        XORI_B2_128_SB(src5, src2);
2957cabdff1aSopenharmony_ci        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
2958cabdff1aSopenharmony_ci        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
2959cabdff1aSopenharmony_ci
2960cabdff1aSopenharmony_ci        /* 8 width */
2961cabdff1aSopenharmony_ci        LD_SB2(src + 16, src_stride, src11, src8);
2962cabdff1aSopenharmony_ci        src += (2 * src_stride);
2963cabdff1aSopenharmony_ci        XORI_B2_128_SB(src11, src8);
2964cabdff1aSopenharmony_ci        ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
2965cabdff1aSopenharmony_ci
2966cabdff1aSopenharmony_ci        /* 16 width */
2967cabdff1aSopenharmony_ci        out0_r = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
2968cabdff1aSopenharmony_ci        out0_l = HEVC_FILT_4TAP_SH(src32_l, src10_l, filt0, filt1);
2969cabdff1aSopenharmony_ci        out1_r = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
2970cabdff1aSopenharmony_ci        out1_l = HEVC_FILT_4TAP_SH(src43_l, src21_l, filt0, filt1);
2971cabdff1aSopenharmony_ci
2972cabdff1aSopenharmony_ci        /* 8 width */
2973cabdff1aSopenharmony_ci        out2_r = HEVC_FILT_4TAP_SH(src98_r, src76_r, filt0, filt1);
2974cabdff1aSopenharmony_ci        out3_r = HEVC_FILT_4TAP_SH(src109_r, src87_r, filt0, filt1);
2975cabdff1aSopenharmony_ci
2976cabdff1aSopenharmony_ci        /* 16 + 8 width */
2977cabdff1aSopenharmony_ci        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
2978cabdff1aSopenharmony_ci        SRARI_H2_SH(out0_l, out1_l, 6);
2979cabdff1aSopenharmony_ci        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2980cabdff1aSopenharmony_ci        SAT_SH2_SH(out0_l, out1_l, 7);
2981cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(out0_r, out0_l);
2982cabdff1aSopenharmony_ci        ST_UB(out, dst);
2983cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(out2_r, out2_r);
2984cabdff1aSopenharmony_ci        ST_D1(out, 0, dst + 16);
2985cabdff1aSopenharmony_ci        dst += dst_stride;
2986cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(out1_r, out1_l);
2987cabdff1aSopenharmony_ci        ST_UB(out, dst);
2988cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(out3_r, out3_r);
2989cabdff1aSopenharmony_ci        ST_D1(out, 0, dst + 16);
2990cabdff1aSopenharmony_ci        dst += dst_stride;
2991cabdff1aSopenharmony_ci    }
2992cabdff1aSopenharmony_ci}
2993cabdff1aSopenharmony_ci
2994cabdff1aSopenharmony_cistatic void common_vt_4t_32w_msa(uint8_t *src, int32_t src_stride,
2995cabdff1aSopenharmony_ci                                 uint8_t *dst, int32_t dst_stride,
2996cabdff1aSopenharmony_ci                                 const int8_t *filter, int32_t height)
2997cabdff1aSopenharmony_ci{
2998cabdff1aSopenharmony_ci    uint32_t loop_cnt;
2999cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
3000cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src76_r, src98_r;
3001cabdff1aSopenharmony_ci    v16i8 src21_r, src43_r, src87_r, src109_r;
3002cabdff1aSopenharmony_ci    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
3003cabdff1aSopenharmony_ci    v16i8 src10_l, src32_l, src76_l, src98_l;
3004cabdff1aSopenharmony_ci    v16i8 src21_l, src43_l, src87_l, src109_l;
3005cabdff1aSopenharmony_ci    v8i16 filt;
3006cabdff1aSopenharmony_ci    v16i8 filt0, filt1;
3007cabdff1aSopenharmony_ci    v16u8 out;
3008cabdff1aSopenharmony_ci
3009cabdff1aSopenharmony_ci    src -= src_stride;
3010cabdff1aSopenharmony_ci
3011cabdff1aSopenharmony_ci    filt = LD_SH(filter);
3012cabdff1aSopenharmony_ci    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
3013cabdff1aSopenharmony_ci
3014cabdff1aSopenharmony_ci    /* 16 width */
3015cabdff1aSopenharmony_ci    LD_SB3(src, src_stride, src0, src1, src2);
3016cabdff1aSopenharmony_ci    XORI_B3_128_SB(src0, src1, src2);
3017cabdff1aSopenharmony_ci
3018cabdff1aSopenharmony_ci    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3019cabdff1aSopenharmony_ci    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
3020cabdff1aSopenharmony_ci
3021cabdff1aSopenharmony_ci    /* next 16 width */
3022cabdff1aSopenharmony_ci    LD_SB3(src + 16, src_stride, src6, src7, src8);
3023cabdff1aSopenharmony_ci    src += (3 * src_stride);
3024cabdff1aSopenharmony_ci
3025cabdff1aSopenharmony_ci    XORI_B3_128_SB(src6, src7, src8);
3026cabdff1aSopenharmony_ci    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3027cabdff1aSopenharmony_ci    ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
3028cabdff1aSopenharmony_ci
3029cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 1); loop_cnt--;) {
3030cabdff1aSopenharmony_ci        /* 16 width */
3031cabdff1aSopenharmony_ci        LD_SB2(src, src_stride, src3, src4);
3032cabdff1aSopenharmony_ci        XORI_B2_128_SB(src3, src4);
3033cabdff1aSopenharmony_ci        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3034cabdff1aSopenharmony_ci        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
3035cabdff1aSopenharmony_ci
3036cabdff1aSopenharmony_ci        /* 16 width */
3037cabdff1aSopenharmony_ci        out0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3038cabdff1aSopenharmony_ci        out0_l = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
3039cabdff1aSopenharmony_ci        out1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
3040cabdff1aSopenharmony_ci        out1_l = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
3041cabdff1aSopenharmony_ci
3042cabdff1aSopenharmony_ci        /* 16 width */
3043cabdff1aSopenharmony_ci        SRARI_H4_SH(out0_r, out1_r, out0_l, out1_l, 6);
3044cabdff1aSopenharmony_ci        SAT_SH4_SH(out0_r, out1_r, out0_l, out1_l, 7);
3045cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(out0_r, out0_l);
3046cabdff1aSopenharmony_ci        ST_UB(out, dst);
3047cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(out1_r, out1_l);
3048cabdff1aSopenharmony_ci        ST_UB(out, dst + dst_stride);
3049cabdff1aSopenharmony_ci
3050cabdff1aSopenharmony_ci        src10_r = src32_r;
3051cabdff1aSopenharmony_ci        src21_r = src43_r;
3052cabdff1aSopenharmony_ci        src10_l = src32_l;
3053cabdff1aSopenharmony_ci        src21_l = src43_l;
3054cabdff1aSopenharmony_ci        src2 = src4;
3055cabdff1aSopenharmony_ci
3056cabdff1aSopenharmony_ci        /* next 16 width */
3057cabdff1aSopenharmony_ci        LD_SB2(src + 16, src_stride, src9, src10);
3058cabdff1aSopenharmony_ci        src += (2 * src_stride);
3059cabdff1aSopenharmony_ci        XORI_B2_128_SB(src9, src10);
3060cabdff1aSopenharmony_ci        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
3061cabdff1aSopenharmony_ci        ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
3062cabdff1aSopenharmony_ci
3063cabdff1aSopenharmony_ci        /* next 16 width */
3064cabdff1aSopenharmony_ci        out2_r = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
3065cabdff1aSopenharmony_ci        out2_l = HEVC_FILT_4TAP_SH(src76_l, src98_l, filt0, filt1);
3066cabdff1aSopenharmony_ci        out3_r = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
3067cabdff1aSopenharmony_ci        out3_l = HEVC_FILT_4TAP_SH(src87_l, src109_l, filt0, filt1);
3068cabdff1aSopenharmony_ci
3069cabdff1aSopenharmony_ci        /* next 16 width */
3070cabdff1aSopenharmony_ci        SRARI_H4_SH(out2_r, out3_r, out2_l, out3_l, 6);
3071cabdff1aSopenharmony_ci        SAT_SH4_SH(out2_r, out3_r, out2_l, out3_l, 7);
3072cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(out2_r, out2_l);
3073cabdff1aSopenharmony_ci        ST_UB(out, dst + 16);
3074cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(out3_r, out3_l);
3075cabdff1aSopenharmony_ci        ST_UB(out, dst + 16 + dst_stride);
3076cabdff1aSopenharmony_ci
3077cabdff1aSopenharmony_ci        dst += 2 * dst_stride;
3078cabdff1aSopenharmony_ci
3079cabdff1aSopenharmony_ci        src76_r = src98_r;
3080cabdff1aSopenharmony_ci        src87_r = src109_r;
3081cabdff1aSopenharmony_ci        src76_l = src98_l;
3082cabdff1aSopenharmony_ci        src87_l = src109_l;
3083cabdff1aSopenharmony_ci        src8 = src10;
3084cabdff1aSopenharmony_ci    }
3085cabdff1aSopenharmony_ci}
3086cabdff1aSopenharmony_ci
3087cabdff1aSopenharmony_cistatic void hevc_hv_uni_4t_4x2_msa(uint8_t *src,
3088cabdff1aSopenharmony_ci                                   int32_t src_stride,
3089cabdff1aSopenharmony_ci                                   uint8_t *dst,
3090cabdff1aSopenharmony_ci                                   int32_t dst_stride,
3091cabdff1aSopenharmony_ci                                   const int8_t *filter_x,
3092cabdff1aSopenharmony_ci                                   const int8_t *filter_y)
3093cabdff1aSopenharmony_ci{
3094cabdff1aSopenharmony_ci    v16u8 out;
3095cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4;
3096cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
3097cabdff1aSopenharmony_ci    v8i16 filt_h0, filt_h1;
3098cabdff1aSopenharmony_ci    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
3099cabdff1aSopenharmony_ci    v16i8 mask1;
3100cabdff1aSopenharmony_ci    v8i16 filter_vec, tmp;
3101cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3102cabdff1aSopenharmony_ci    v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43;
3103cabdff1aSopenharmony_ci    v4i32 dst0, dst1;
3104cabdff1aSopenharmony_ci
3105cabdff1aSopenharmony_ci    src -= (src_stride + 1);
3106cabdff1aSopenharmony_ci
3107cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_x);
3108cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3109cabdff1aSopenharmony_ci
3110cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_y);
3111cabdff1aSopenharmony_ci    UNPCK_R_SB_SH(filter_vec, filter_vec);
3112cabdff1aSopenharmony_ci
3113cabdff1aSopenharmony_ci    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3114cabdff1aSopenharmony_ci
3115cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
3116cabdff1aSopenharmony_ci
3117cabdff1aSopenharmony_ci    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3118cabdff1aSopenharmony_ci    XORI_B5_128_SB(src0, src1, src2, src3, src4);
3119cabdff1aSopenharmony_ci
3120cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1);
3121cabdff1aSopenharmony_ci    VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3);
3122cabdff1aSopenharmony_ci    VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);
3123cabdff1aSopenharmony_ci
3124cabdff1aSopenharmony_ci    dst20 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3125cabdff1aSopenharmony_ci    dst31 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3126cabdff1aSopenharmony_ci    dst42 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3127cabdff1aSopenharmony_ci
3128cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst31, dst20, dst10, dst32);
3129cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst42, dst31, dst21, dst43);
3130cabdff1aSopenharmony_ci
3131cabdff1aSopenharmony_ci    dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
3132cabdff1aSopenharmony_ci    dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
3133cabdff1aSopenharmony_ci    dst0 >>= 6;
3134cabdff1aSopenharmony_ci    dst1 >>= 6;
3135cabdff1aSopenharmony_ci    tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
3136cabdff1aSopenharmony_ci    tmp = __msa_srari_h(tmp, 6);
3137cabdff1aSopenharmony_ci    tmp = __msa_sat_s_h(tmp, 7);
3138cabdff1aSopenharmony_ci    out = PCKEV_XORI128_UB(tmp, tmp);
3139cabdff1aSopenharmony_ci    ST_W2(out, 0, 1, dst, dst_stride);
3140cabdff1aSopenharmony_ci}
3141cabdff1aSopenharmony_ci
3142cabdff1aSopenharmony_cistatic void hevc_hv_uni_4t_4x4_msa(uint8_t *src,
3143cabdff1aSopenharmony_ci                                   int32_t src_stride,
3144cabdff1aSopenharmony_ci                                   uint8_t *dst,
3145cabdff1aSopenharmony_ci                                   int32_t dst_stride,
3146cabdff1aSopenharmony_ci                                   const int8_t *filter_x,
3147cabdff1aSopenharmony_ci                                   const int8_t *filter_y)
3148cabdff1aSopenharmony_ci{
3149cabdff1aSopenharmony_ci    v16u8 out;
3150cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6;
3151cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
3152cabdff1aSopenharmony_ci    v8i16 filt_h0, filt_h1;
3153cabdff1aSopenharmony_ci    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
3154cabdff1aSopenharmony_ci    v16i8 mask1;
3155cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3156cabdff1aSopenharmony_ci    v8i16 filter_vec, tmp0, tmp1;
3157cabdff1aSopenharmony_ci    v8i16 dst30, dst41, dst52, dst63;
3158cabdff1aSopenharmony_ci    v8i16 dst10, dst32, dst54, dst21, dst43, dst65;
3159cabdff1aSopenharmony_ci    v4i32 dst0, dst1, dst2, dst3;
3160cabdff1aSopenharmony_ci
3161cabdff1aSopenharmony_ci    src -= (src_stride + 1);
3162cabdff1aSopenharmony_ci
3163cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_x);
3164cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3165cabdff1aSopenharmony_ci
3166cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_y);
3167cabdff1aSopenharmony_ci    UNPCK_R_SB_SH(filter_vec, filter_vec);
3168cabdff1aSopenharmony_ci
3169cabdff1aSopenharmony_ci    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3170cabdff1aSopenharmony_ci
3171cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
3172cabdff1aSopenharmony_ci
3173cabdff1aSopenharmony_ci    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
3174cabdff1aSopenharmony_ci    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
3175cabdff1aSopenharmony_ci
3176cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1);
3177cabdff1aSopenharmony_ci    VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3);
3178cabdff1aSopenharmony_ci    VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5);
3179cabdff1aSopenharmony_ci    VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);
3180cabdff1aSopenharmony_ci
3181cabdff1aSopenharmony_ci    dst30 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3182cabdff1aSopenharmony_ci    dst41 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3183cabdff1aSopenharmony_ci    dst52 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3184cabdff1aSopenharmony_ci    dst63 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3185cabdff1aSopenharmony_ci
3186cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst41, dst30, dst10, dst43);
3187cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst52, dst41, dst21, dst54);
3188cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst63, dst52, dst32, dst65);
3189cabdff1aSopenharmony_ci    dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
3190cabdff1aSopenharmony_ci    dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
3191cabdff1aSopenharmony_ci    dst2 = HEVC_FILT_4TAP(dst32, dst54, filt_h0, filt_h1);
3192cabdff1aSopenharmony_ci    dst3 = HEVC_FILT_4TAP(dst43, dst65, filt_h0, filt_h1);
3193cabdff1aSopenharmony_ci    SRA_4V(dst0, dst1, dst2, dst3, 6);
3194cabdff1aSopenharmony_ci    PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
3195cabdff1aSopenharmony_ci    SRARI_H2_SH(tmp0, tmp1, 6);
3196cabdff1aSopenharmony_ci    SAT_SH2_SH(tmp0, tmp1, 7);
3197cabdff1aSopenharmony_ci    out = PCKEV_XORI128_UB(tmp0, tmp1);
3198cabdff1aSopenharmony_ci    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
3199cabdff1aSopenharmony_ci}
3200cabdff1aSopenharmony_ci
3201cabdff1aSopenharmony_cistatic void hevc_hv_uni_4t_4multx8mult_msa(uint8_t *src,
3202cabdff1aSopenharmony_ci                                           int32_t src_stride,
3203cabdff1aSopenharmony_ci                                           uint8_t *dst,
3204cabdff1aSopenharmony_ci                                           int32_t dst_stride,
3205cabdff1aSopenharmony_ci                                           const int8_t *filter_x,
3206cabdff1aSopenharmony_ci                                           const int8_t *filter_y,
3207cabdff1aSopenharmony_ci                                           int32_t height)
3208cabdff1aSopenharmony_ci{
3209cabdff1aSopenharmony_ci    uint32_t loop_cnt;
3210cabdff1aSopenharmony_ci    v16u8 out0, out1;
3211cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5;
3212cabdff1aSopenharmony_ci    v16i8 src6, src7, src8, src9, src10;
3213cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
3214cabdff1aSopenharmony_ci    v8i16 filt_h0, filt_h1;
3215cabdff1aSopenharmony_ci    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
3216cabdff1aSopenharmony_ci    v16i8 mask1;
3217cabdff1aSopenharmony_ci    v8i16 filter_vec, tmp0, tmp1, tmp2, tmp3;
3218cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3219cabdff1aSopenharmony_ci    v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
3220cabdff1aSopenharmony_ci    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
3221cabdff1aSopenharmony_ci    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
3222cabdff1aSopenharmony_ci    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
3223cabdff1aSopenharmony_ci    v8i16 dst98_r, dst109_r;
3224cabdff1aSopenharmony_ci
3225cabdff1aSopenharmony_ci    src -= (src_stride + 1);
3226cabdff1aSopenharmony_ci
3227cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_x);
3228cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3229cabdff1aSopenharmony_ci
3230cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_y);
3231cabdff1aSopenharmony_ci    UNPCK_R_SB_SH(filter_vec, filter_vec);
3232cabdff1aSopenharmony_ci
3233cabdff1aSopenharmony_ci    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3234cabdff1aSopenharmony_ci
3235cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
3236cabdff1aSopenharmony_ci
3237cabdff1aSopenharmony_ci    LD_SB3(src, src_stride, src0, src1, src2);
3238cabdff1aSopenharmony_ci    src += (3 * src_stride);
3239cabdff1aSopenharmony_ci
3240cabdff1aSopenharmony_ci    XORI_B3_128_SB(src0, src1, src2);
3241cabdff1aSopenharmony_ci
3242cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
3243cabdff1aSopenharmony_ci    VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3);
3244cabdff1aSopenharmony_ci    dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3245cabdff1aSopenharmony_ci    dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3246cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
3247cabdff1aSopenharmony_ci    dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
3248cabdff1aSopenharmony_ci
3249cabdff1aSopenharmony_ci    for (loop_cnt = height >> 3; loop_cnt--;) {
3250cabdff1aSopenharmony_ci        LD_SB8(src, src_stride,
3251cabdff1aSopenharmony_ci               src3, src4, src5, src6, src7, src8, src9, src10);
3252cabdff1aSopenharmony_ci        src += (8 * src_stride);
3253cabdff1aSopenharmony_ci
3254cabdff1aSopenharmony_ci        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
3255cabdff1aSopenharmony_ci
3256cabdff1aSopenharmony_ci        VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
3257cabdff1aSopenharmony_ci        VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
3258cabdff1aSopenharmony_ci        VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
3259cabdff1aSopenharmony_ci        VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);
3260cabdff1aSopenharmony_ci
3261cabdff1aSopenharmony_ci        dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3262cabdff1aSopenharmony_ci        dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3263cabdff1aSopenharmony_ci        dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3264cabdff1aSopenharmony_ci        dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3265cabdff1aSopenharmony_ci
3266cabdff1aSopenharmony_ci        dst32_r = __msa_ilvr_h(dst73, dst22);
3267cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
3268cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
3269cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
3270cabdff1aSopenharmony_ci        dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
3271cabdff1aSopenharmony_ci        dst76_r = __msa_ilvr_h(dst22, dst106);
3272cabdff1aSopenharmony_ci
3273cabdff1aSopenharmony_ci        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3274cabdff1aSopenharmony_ci        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3275cabdff1aSopenharmony_ci        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3276cabdff1aSopenharmony_ci        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3277cabdff1aSopenharmony_ci        dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
3278cabdff1aSopenharmony_ci        dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
3279cabdff1aSopenharmony_ci        dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
3280cabdff1aSopenharmony_ci        dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
3281cabdff1aSopenharmony_ci        SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
3282cabdff1aSopenharmony_ci        SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
3283cabdff1aSopenharmony_ci        PCKEV_H4_SH(dst1_r, dst0_r, dst3_r, dst2_r,
3284cabdff1aSopenharmony_ci                    dst5_r, dst4_r, dst7_r, dst6_r,
3285cabdff1aSopenharmony_ci                    tmp0, tmp1, tmp2, tmp3);
3286cabdff1aSopenharmony_ci        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
3287cabdff1aSopenharmony_ci        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
3288cabdff1aSopenharmony_ci        out0 = PCKEV_XORI128_UB(tmp0, tmp1);
3289cabdff1aSopenharmony_ci        out1 = PCKEV_XORI128_UB(tmp2, tmp3);
3290cabdff1aSopenharmony_ci        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
3291cabdff1aSopenharmony_ci        dst += (8 * dst_stride);
3292cabdff1aSopenharmony_ci
3293cabdff1aSopenharmony_ci        dst10_r = dst98_r;
3294cabdff1aSopenharmony_ci        dst21_r = dst109_r;
3295cabdff1aSopenharmony_ci        dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
3296cabdff1aSopenharmony_ci    }
3297cabdff1aSopenharmony_ci}
3298cabdff1aSopenharmony_ci
/*
 * Height dispatcher for the 4-column two-stage 4-tap filter.
 *
 * height == 2 and height == 4 go to the dedicated fixed-size routines; any
 * height that is a multiple of 8 goes to the looping routine. Every other
 * height falls through and nothing is written.
 */
static void hevc_hv_uni_4t_4w_msa(uint8_t *src,
                                  int32_t src_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    if (height == 2) {
        hevc_hv_uni_4t_4x2_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y);
        return;
    }

    if (height == 4) {
        hevc_hv_uni_4t_4x4_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y);
        return;
    }

    if ((height % 8) == 0) {
        hevc_hv_uni_4t_4multx8mult_msa(src, src_stride, dst, dst_stride,
                                       filter_x, filter_y, height);
    }
}
3318cabdff1aSopenharmony_ci
3319cabdff1aSopenharmony_cistatic void hevc_hv_uni_4t_6w_msa(uint8_t *src,
3320cabdff1aSopenharmony_ci                                  int32_t src_stride,
3321cabdff1aSopenharmony_ci                                  uint8_t *dst,
3322cabdff1aSopenharmony_ci                                  int32_t dst_stride,
3323cabdff1aSopenharmony_ci                                  const int8_t *filter_x,
3324cabdff1aSopenharmony_ci                                  const int8_t *filter_y,
3325cabdff1aSopenharmony_ci                                  int32_t height)
3326cabdff1aSopenharmony_ci{
3327cabdff1aSopenharmony_ci    v16u8 out0, out1, out2;
3328cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6;
3329cabdff1aSopenharmony_ci    v16i8 src7, src8, src9, src10;
3330cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
3331cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3332cabdff1aSopenharmony_ci    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3333cabdff1aSopenharmony_ci    v16i8 mask1;
3334cabdff1aSopenharmony_ci    v8i16 filt_h0, filt_h1, filter_vec;
3335cabdff1aSopenharmony_ci    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
3336cabdff1aSopenharmony_ci    v8i16 dsth10, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
3337cabdff1aSopenharmony_ci    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3338cabdff1aSopenharmony_ci    v4i32 dst4_r, dst5_r, dst6_r, dst7_r;
3339cabdff1aSopenharmony_ci    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3340cabdff1aSopenharmony_ci    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3341cabdff1aSopenharmony_ci    v8i16 dst54_r, dst76_r, dst98_r, dst65_r, dst87_r, dst109_r;
3342cabdff1aSopenharmony_ci    v8i16 dst98_l, dst65_l, dst54_l, dst76_l, dst87_l, dst109_l;
3343cabdff1aSopenharmony_ci    v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
3344cabdff1aSopenharmony_ci
3345cabdff1aSopenharmony_ci    src -= (src_stride + 1);
3346cabdff1aSopenharmony_ci
3347cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_x);
3348cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3349cabdff1aSopenharmony_ci
3350cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_y);
3351cabdff1aSopenharmony_ci    UNPCK_R_SB_SH(filter_vec, filter_vec);
3352cabdff1aSopenharmony_ci
3353cabdff1aSopenharmony_ci    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3354cabdff1aSopenharmony_ci
3355cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
3356cabdff1aSopenharmony_ci
3357cabdff1aSopenharmony_ci    LD_SB3(src, src_stride, src0, src1, src2);
3358cabdff1aSopenharmony_ci    src += (3 * src_stride);
3359cabdff1aSopenharmony_ci
3360cabdff1aSopenharmony_ci    XORI_B3_128_SB(src0, src1, src2);
3361cabdff1aSopenharmony_ci
3362cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3363cabdff1aSopenharmony_ci    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3364cabdff1aSopenharmony_ci    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3365cabdff1aSopenharmony_ci
3366cabdff1aSopenharmony_ci    dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3367cabdff1aSopenharmony_ci    dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3368cabdff1aSopenharmony_ci    dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3369cabdff1aSopenharmony_ci
3370cabdff1aSopenharmony_ci    ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
3371cabdff1aSopenharmony_ci    ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
3372cabdff1aSopenharmony_ci
3373cabdff1aSopenharmony_ci    LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
3374cabdff1aSopenharmony_ci    XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
3375cabdff1aSopenharmony_ci
3376cabdff1aSopenharmony_ci    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3377cabdff1aSopenharmony_ci    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
3378cabdff1aSopenharmony_ci    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
3379cabdff1aSopenharmony_ci    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
3380cabdff1aSopenharmony_ci
3381cabdff1aSopenharmony_ci    dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3382cabdff1aSopenharmony_ci    dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3383cabdff1aSopenharmony_ci    dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3384cabdff1aSopenharmony_ci    dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3385cabdff1aSopenharmony_ci
3386cabdff1aSopenharmony_ci    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
3387cabdff1aSopenharmony_ci    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
3388cabdff1aSopenharmony_ci    VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
3389cabdff1aSopenharmony_ci    VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);
3390cabdff1aSopenharmony_ci
3391cabdff1aSopenharmony_ci    dsth7 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3392cabdff1aSopenharmony_ci    dsth8 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3393cabdff1aSopenharmony_ci    dsth9 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3394cabdff1aSopenharmony_ci    dsth10 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3395cabdff1aSopenharmony_ci
3396cabdff1aSopenharmony_ci    ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
3397cabdff1aSopenharmony_ci    ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
3398cabdff1aSopenharmony_ci    ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
3399cabdff1aSopenharmony_ci    ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
3400cabdff1aSopenharmony_ci    ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
3401cabdff1aSopenharmony_ci    ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
3402cabdff1aSopenharmony_ci    ILVRL_H2_SH(dsth9, dsth8, dst98_r, dst98_l);
3403cabdff1aSopenharmony_ci    ILVRL_H2_SH(dsth10, dsth9, dst109_r, dst109_l);
3404cabdff1aSopenharmony_ci
3405cabdff1aSopenharmony_ci    PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
3406cabdff1aSopenharmony_ci    PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
3407cabdff1aSopenharmony_ci    dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);
3408cabdff1aSopenharmony_ci
3409cabdff1aSopenharmony_ci    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3410cabdff1aSopenharmony_ci    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3411cabdff1aSopenharmony_ci    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3412cabdff1aSopenharmony_ci    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3413cabdff1aSopenharmony_ci    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
3414cabdff1aSopenharmony_ci    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
3415cabdff1aSopenharmony_ci    dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
3416cabdff1aSopenharmony_ci    dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
3417cabdff1aSopenharmony_ci    dst0_l = HEVC_FILT_4TAP(dst1021_l, dst3243_l, filt_h0, filt_h1);
3418cabdff1aSopenharmony_ci    dst1_l = HEVC_FILT_4TAP(dst3243_l, dst5465_l, filt_h0, filt_h1);
3419cabdff1aSopenharmony_ci    dst2_l = HEVC_FILT_4TAP(dst5465_l, dst7687_l, filt_h0, filt_h1);
3420cabdff1aSopenharmony_ci    dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
3421cabdff1aSopenharmony_ci    SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
3422cabdff1aSopenharmony_ci    SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
3423cabdff1aSopenharmony_ci    SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
3424cabdff1aSopenharmony_ci    PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
3425cabdff1aSopenharmony_ci    PCKEV_H2_SH(dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3);
3426cabdff1aSopenharmony_ci    PCKEV_H2_SH(dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5);
3427cabdff1aSopenharmony_ci    SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
3428cabdff1aSopenharmony_ci    SRARI_H2_SH(tmp4, tmp5, 6);
3429cabdff1aSopenharmony_ci    SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3,7);
3430cabdff1aSopenharmony_ci    SAT_SH2_SH(tmp4, tmp5,7);
3431cabdff1aSopenharmony_ci    out0 = PCKEV_XORI128_UB(tmp0, tmp1);
3432cabdff1aSopenharmony_ci    out1 = PCKEV_XORI128_UB(tmp2, tmp3);
3433cabdff1aSopenharmony_ci    out2 = PCKEV_XORI128_UB(tmp4, tmp5);
3434cabdff1aSopenharmony_ci    ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
3435cabdff1aSopenharmony_ci    ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride);
3436cabdff1aSopenharmony_ci}
3437cabdff1aSopenharmony_ci
3438cabdff1aSopenharmony_cistatic void hevc_hv_uni_4t_8x2_msa(uint8_t *src,
3439cabdff1aSopenharmony_ci                                   int32_t src_stride,
3440cabdff1aSopenharmony_ci                                   uint8_t *dst,
3441cabdff1aSopenharmony_ci                                   int32_t dst_stride,
3442cabdff1aSopenharmony_ci                                   const int8_t *filter_x,
3443cabdff1aSopenharmony_ci                                   const int8_t *filter_y)
3444cabdff1aSopenharmony_ci{
3445cabdff1aSopenharmony_ci    v16u8 out;
3446cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4;
3447cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
3448cabdff1aSopenharmony_ci    v8i16 filt_h0, filt_h1, filter_vec;
3449cabdff1aSopenharmony_ci    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3450cabdff1aSopenharmony_ci    v16i8 mask1;
3451cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
3452cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, dst4;
3453cabdff1aSopenharmony_ci    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
3454cabdff1aSopenharmony_ci    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3455cabdff1aSopenharmony_ci    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3456cabdff1aSopenharmony_ci    v8i16 out0_r, out1_r;
3457cabdff1aSopenharmony_ci
3458cabdff1aSopenharmony_ci    src -= (src_stride + 1);
3459cabdff1aSopenharmony_ci
3460cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_x);
3461cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3462cabdff1aSopenharmony_ci
3463cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_y);
3464cabdff1aSopenharmony_ci    UNPCK_R_SB_SH(filter_vec, filter_vec);
3465cabdff1aSopenharmony_ci
3466cabdff1aSopenharmony_ci    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3467cabdff1aSopenharmony_ci
3468cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
3469cabdff1aSopenharmony_ci
3470cabdff1aSopenharmony_ci    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3471cabdff1aSopenharmony_ci    XORI_B5_128_SB(src0, src1, src2, src3, src4);
3472cabdff1aSopenharmony_ci
3473cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3474cabdff1aSopenharmony_ci    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3475cabdff1aSopenharmony_ci    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3476cabdff1aSopenharmony_ci    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
3477cabdff1aSopenharmony_ci    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
3478cabdff1aSopenharmony_ci
3479cabdff1aSopenharmony_ci    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3480cabdff1aSopenharmony_ci    dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3481cabdff1aSopenharmony_ci    dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3482cabdff1aSopenharmony_ci    dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3483cabdff1aSopenharmony_ci    dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
3484cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3485cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3486cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3487cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3488cabdff1aSopenharmony_ci    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3489cabdff1aSopenharmony_ci    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3490cabdff1aSopenharmony_ci    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3491cabdff1aSopenharmony_ci    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3492cabdff1aSopenharmony_ci    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3493cabdff1aSopenharmony_ci    PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, out0_r, out1_r);
3494cabdff1aSopenharmony_ci    SRARI_H2_SH(out0_r, out1_r, 6);
3495cabdff1aSopenharmony_ci    SAT_SH2_SH(out0_r, out1_r, 7);
3496cabdff1aSopenharmony_ci    out = PCKEV_XORI128_UB(out0_r, out1_r);
3497cabdff1aSopenharmony_ci    ST_D2(out, 0, 1, dst, dst_stride);
3498cabdff1aSopenharmony_ci}
3499cabdff1aSopenharmony_ci
3500cabdff1aSopenharmony_cistatic void hevc_hv_uni_4t_8multx4_msa(uint8_t *src,
3501cabdff1aSopenharmony_ci                                       int32_t src_stride,
3502cabdff1aSopenharmony_ci                                       uint8_t *dst,
3503cabdff1aSopenharmony_ci                                       int32_t dst_stride,
3504cabdff1aSopenharmony_ci                                       const int8_t *filter_x,
3505cabdff1aSopenharmony_ci                                       const int8_t *filter_y,
3506cabdff1aSopenharmony_ci                                       int32_t width8mult)
3507cabdff1aSopenharmony_ci{
3508cabdff1aSopenharmony_ci    uint32_t cnt;
3509cabdff1aSopenharmony_ci    v16u8 out0, out1;
3510cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
3511cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3512cabdff1aSopenharmony_ci    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec;
3513cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
3514cabdff1aSopenharmony_ci    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3515cabdff1aSopenharmony_ci    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
3516cabdff1aSopenharmony_ci    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
3517cabdff1aSopenharmony_ci
3518cabdff1aSopenharmony_ci    src -= (src_stride + 1);
3519cabdff1aSopenharmony_ci
3520cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_x);
3521cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3522cabdff1aSopenharmony_ci
3523cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_y);
3524cabdff1aSopenharmony_ci    UNPCK_R_SB_SH(filter_vec, filter_vec);
3525cabdff1aSopenharmony_ci
3526cabdff1aSopenharmony_ci    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3527cabdff1aSopenharmony_ci
3528cabdff1aSopenharmony_ci    mask0 = LD_SB(ff_hevc_mask_arr);
3529cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
3530cabdff1aSopenharmony_ci
3531cabdff1aSopenharmony_ci    for (cnt = width8mult; cnt--;) {
3532cabdff1aSopenharmony_ci        LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
3533cabdff1aSopenharmony_ci        src += 8;
3534cabdff1aSopenharmony_ci        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
3535cabdff1aSopenharmony_ci
3536cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3537cabdff1aSopenharmony_ci        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3538cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3539cabdff1aSopenharmony_ci
3540cabdff1aSopenharmony_ci        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3541cabdff1aSopenharmony_ci        dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3542cabdff1aSopenharmony_ci        dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3543cabdff1aSopenharmony_ci
3544cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3545cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3546cabdff1aSopenharmony_ci
3547cabdff1aSopenharmony_ci        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3548cabdff1aSopenharmony_ci        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
3549cabdff1aSopenharmony_ci        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
3550cabdff1aSopenharmony_ci        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
3551cabdff1aSopenharmony_ci
3552cabdff1aSopenharmony_ci        dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3553cabdff1aSopenharmony_ci        dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3554cabdff1aSopenharmony_ci        dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3555cabdff1aSopenharmony_ci        dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3556cabdff1aSopenharmony_ci
3557cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3558cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3559cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
3560cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
3561cabdff1aSopenharmony_ci
3562cabdff1aSopenharmony_ci        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3563cabdff1aSopenharmony_ci        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3564cabdff1aSopenharmony_ci        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3565cabdff1aSopenharmony_ci        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3566cabdff1aSopenharmony_ci        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3567cabdff1aSopenharmony_ci        dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
3568cabdff1aSopenharmony_ci        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3569cabdff1aSopenharmony_ci        dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
3570cabdff1aSopenharmony_ci
3571cabdff1aSopenharmony_ci        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3572cabdff1aSopenharmony_ci        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
3573cabdff1aSopenharmony_ci
3574cabdff1aSopenharmony_ci        PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
3575cabdff1aSopenharmony_ci                    dst3_r, tmp0, tmp1, tmp2, tmp3);
3576cabdff1aSopenharmony_ci        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
3577cabdff1aSopenharmony_ci        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
3578cabdff1aSopenharmony_ci        out0 = PCKEV_XORI128_UB(tmp0, tmp1);
3579cabdff1aSopenharmony_ci        out1 = PCKEV_XORI128_UB(tmp2, tmp3);
3580cabdff1aSopenharmony_ci        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
3581cabdff1aSopenharmony_ci        dst += 8;
3582cabdff1aSopenharmony_ci    }
3583cabdff1aSopenharmony_ci}
3584cabdff1aSopenharmony_ci
3585cabdff1aSopenharmony_cistatic void hevc_hv_uni_4t_8x6_msa(uint8_t *src,
3586cabdff1aSopenharmony_ci                                   int32_t src_stride,
3587cabdff1aSopenharmony_ci                                   uint8_t *dst,
3588cabdff1aSopenharmony_ci                                   int32_t dst_stride,
3589cabdff1aSopenharmony_ci                                   const int8_t *filter_x,
3590cabdff1aSopenharmony_ci                                   const int8_t *filter_y)
3591cabdff1aSopenharmony_ci{
3592cabdff1aSopenharmony_ci    v16u8 out0, out1, out2;
3593cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3594cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
3595cabdff1aSopenharmony_ci    v8i16 filt_h0, filt_h1, filter_vec;
3596cabdff1aSopenharmony_ci    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3597cabdff1aSopenharmony_ci    v16i8 mask1;
3598cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
3599cabdff1aSopenharmony_ci    v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
3600cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
3601cabdff1aSopenharmony_ci    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3602cabdff1aSopenharmony_ci    v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
3603cabdff1aSopenharmony_ci    v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
3604cabdff1aSopenharmony_ci    v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
3605cabdff1aSopenharmony_ci    v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
3606cabdff1aSopenharmony_ci    v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
3607cabdff1aSopenharmony_ci    v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r;
3608cabdff1aSopenharmony_ci
3609cabdff1aSopenharmony_ci    src -= (src_stride + 1);
3610cabdff1aSopenharmony_ci
3611cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_x);
3612cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3613cabdff1aSopenharmony_ci
3614cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_y);
3615cabdff1aSopenharmony_ci    UNPCK_R_SB_SH(filter_vec, filter_vec);
3616cabdff1aSopenharmony_ci
3617cabdff1aSopenharmony_ci    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3618cabdff1aSopenharmony_ci
3619cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
3620cabdff1aSopenharmony_ci
3621cabdff1aSopenharmony_ci    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3622cabdff1aSopenharmony_ci    src += (5 * src_stride);
3623cabdff1aSopenharmony_ci    LD_SB4(src, src_stride, src5, src6, src7, src8);
3624cabdff1aSopenharmony_ci
3625cabdff1aSopenharmony_ci    XORI_B5_128_SB(src0, src1, src2, src3, src4);
3626cabdff1aSopenharmony_ci    XORI_B4_128_SB(src5, src6, src7, src8);
3627cabdff1aSopenharmony_ci
3628cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3629cabdff1aSopenharmony_ci    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3630cabdff1aSopenharmony_ci    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3631cabdff1aSopenharmony_ci    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
3632cabdff1aSopenharmony_ci    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
3633cabdff1aSopenharmony_ci    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
3634cabdff1aSopenharmony_ci    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
3635cabdff1aSopenharmony_ci    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
3636cabdff1aSopenharmony_ci    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);
3637cabdff1aSopenharmony_ci
3638cabdff1aSopenharmony_ci    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3639cabdff1aSopenharmony_ci    dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3640cabdff1aSopenharmony_ci    dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3641cabdff1aSopenharmony_ci    dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3642cabdff1aSopenharmony_ci    dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
3643cabdff1aSopenharmony_ci    dst5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1);
3644cabdff1aSopenharmony_ci    dst6 = HEVC_FILT_4TAP_SH(vec12, vec13, filt0, filt1);
3645cabdff1aSopenharmony_ci    dst7 = HEVC_FILT_4TAP_SH(vec14, vec15, filt0, filt1);
3646cabdff1aSopenharmony_ci    dst8 = HEVC_FILT_4TAP_SH(vec16, vec17, filt0, filt1);
3647cabdff1aSopenharmony_ci
3648cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3649cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3650cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3651cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3652cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
3653cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
3654cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
3655cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
3656cabdff1aSopenharmony_ci
3657cabdff1aSopenharmony_ci    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3658cabdff1aSopenharmony_ci    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3659cabdff1aSopenharmony_ci    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3660cabdff1aSopenharmony_ci    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3661cabdff1aSopenharmony_ci    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3662cabdff1aSopenharmony_ci    dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
3663cabdff1aSopenharmony_ci    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3664cabdff1aSopenharmony_ci    dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
3665cabdff1aSopenharmony_ci    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
3666cabdff1aSopenharmony_ci    dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
3667cabdff1aSopenharmony_ci    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
3668cabdff1aSopenharmony_ci    dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
3669cabdff1aSopenharmony_ci
3670cabdff1aSopenharmony_ci    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3671cabdff1aSopenharmony_ci    SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
3672cabdff1aSopenharmony_ci    SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
3673cabdff1aSopenharmony_ci    PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
3674cabdff1aSopenharmony_ci                dst2_l, dst2_r, dst3_l, dst3_r, out0_r, out1_r, out2_r, out3_r);
3675cabdff1aSopenharmony_ci    PCKEV_H2_SH(dst4_l, dst4_r, dst5_l, dst5_r, out4_r, out5_r);
3676cabdff1aSopenharmony_ci    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
3677cabdff1aSopenharmony_ci    SRARI_H2_SH(out4_r, out5_r, 6);
3678cabdff1aSopenharmony_ci    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3679cabdff1aSopenharmony_ci    SAT_SH2_SH(out4_r, out5_r, 7);
3680cabdff1aSopenharmony_ci    out0 = PCKEV_XORI128_UB(out0_r, out1_r);
3681cabdff1aSopenharmony_ci    out1 = PCKEV_XORI128_UB(out2_r, out3_r);
3682cabdff1aSopenharmony_ci    out2 = PCKEV_XORI128_UB(out4_r, out5_r);
3683cabdff1aSopenharmony_ci
3684cabdff1aSopenharmony_ci    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
3685cabdff1aSopenharmony_ci    ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
3686cabdff1aSopenharmony_ci}
3687cabdff1aSopenharmony_ci
3688cabdff1aSopenharmony_cistatic void hevc_hv_uni_4t_8multx4mult_msa(uint8_t *src,
3689cabdff1aSopenharmony_ci                                           int32_t src_stride,
3690cabdff1aSopenharmony_ci                                           uint8_t *dst,
3691cabdff1aSopenharmony_ci                                           int32_t dst_stride,
3692cabdff1aSopenharmony_ci                                           const int8_t *filter_x,
3693cabdff1aSopenharmony_ci                                           const int8_t *filter_y,
3694cabdff1aSopenharmony_ci                                           int32_t height,
3695cabdff1aSopenharmony_ci                                           int32_t width8mult)
3696cabdff1aSopenharmony_ci{
3697cabdff1aSopenharmony_ci    uint32_t loop_cnt, cnt;
3698cabdff1aSopenharmony_ci    uint8_t *src_tmp;
3699cabdff1aSopenharmony_ci    uint8_t *dst_tmp;
3700cabdff1aSopenharmony_ci    v16u8 out0, out1;
3701cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6;
3702cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
3703cabdff1aSopenharmony_ci    v8i16 filt_h0, filt_h1, filter_vec;
3704cabdff1aSopenharmony_ci    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3705cabdff1aSopenharmony_ci    v16i8 mask1;
3706cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3707cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3708cabdff1aSopenharmony_ci    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3709cabdff1aSopenharmony_ci    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3710cabdff1aSopenharmony_ci    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3711cabdff1aSopenharmony_ci    v8i16 dst54_r, dst54_l, dst65_r, dst65_l, dst6;
3712cabdff1aSopenharmony_ci    v8i16 out0_r, out1_r, out2_r, out3_r;
3713cabdff1aSopenharmony_ci
3714cabdff1aSopenharmony_ci    src -= (src_stride + 1);
3715cabdff1aSopenharmony_ci
3716cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_x);
3717cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3718cabdff1aSopenharmony_ci
3719cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_y);
3720cabdff1aSopenharmony_ci    UNPCK_R_SB_SH(filter_vec, filter_vec);
3721cabdff1aSopenharmony_ci
3722cabdff1aSopenharmony_ci    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3723cabdff1aSopenharmony_ci
3724cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
3725cabdff1aSopenharmony_ci
3726cabdff1aSopenharmony_ci    for (cnt = width8mult; cnt--;) {
3727cabdff1aSopenharmony_ci        src_tmp = src;
3728cabdff1aSopenharmony_ci        dst_tmp = dst;
3729cabdff1aSopenharmony_ci
3730cabdff1aSopenharmony_ci        LD_SB3(src_tmp, src_stride, src0, src1, src2);
3731cabdff1aSopenharmony_ci        src_tmp += (3 * src_stride);
3732cabdff1aSopenharmony_ci
3733cabdff1aSopenharmony_ci        XORI_B3_128_SB(src0, src1, src2);
3734cabdff1aSopenharmony_ci
3735cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3736cabdff1aSopenharmony_ci        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3737cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3738cabdff1aSopenharmony_ci
3739cabdff1aSopenharmony_ci        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3740cabdff1aSopenharmony_ci        dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3741cabdff1aSopenharmony_ci        dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3742cabdff1aSopenharmony_ci
3743cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3744cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3745cabdff1aSopenharmony_ci
3746cabdff1aSopenharmony_ci        for (loop_cnt = (height >> 2); loop_cnt--;) {
3747cabdff1aSopenharmony_ci            LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
3748cabdff1aSopenharmony_ci            src_tmp += (4 * src_stride);
3749cabdff1aSopenharmony_ci
3750cabdff1aSopenharmony_ci            XORI_B4_128_SB(src3, src4, src5, src6);
3751cabdff1aSopenharmony_ci
3752cabdff1aSopenharmony_ci            VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3753cabdff1aSopenharmony_ci            VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
3754cabdff1aSopenharmony_ci            VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
3755cabdff1aSopenharmony_ci            VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
3756cabdff1aSopenharmony_ci
3757cabdff1aSopenharmony_ci            dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3758cabdff1aSopenharmony_ci            dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3759cabdff1aSopenharmony_ci            dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3760cabdff1aSopenharmony_ci            dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3761cabdff1aSopenharmony_ci
3762cabdff1aSopenharmony_ci            ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3763cabdff1aSopenharmony_ci            ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3764cabdff1aSopenharmony_ci            ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
3765cabdff1aSopenharmony_ci            ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
3766cabdff1aSopenharmony_ci
3767cabdff1aSopenharmony_ci            dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3768cabdff1aSopenharmony_ci            dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3769cabdff1aSopenharmony_ci            dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3770cabdff1aSopenharmony_ci            dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3771cabdff1aSopenharmony_ci            dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3772cabdff1aSopenharmony_ci            dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
3773cabdff1aSopenharmony_ci            dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3774cabdff1aSopenharmony_ci            dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
3775cabdff1aSopenharmony_ci
3776cabdff1aSopenharmony_ci            SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3777cabdff1aSopenharmony_ci            SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
3778cabdff1aSopenharmony_ci
3779cabdff1aSopenharmony_ci            PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
3780cabdff1aSopenharmony_ci                        dst2_l, dst2_r, dst3_l, dst3_r,
3781cabdff1aSopenharmony_ci                        out0_r, out1_r, out2_r, out3_r);
3782cabdff1aSopenharmony_ci
3783cabdff1aSopenharmony_ci            SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
3784cabdff1aSopenharmony_ci            SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3785cabdff1aSopenharmony_ci            out0 = PCKEV_XORI128_UB(out0_r, out1_r);
3786cabdff1aSopenharmony_ci            out1 = PCKEV_XORI128_UB(out2_r, out3_r);
3787cabdff1aSopenharmony_ci            ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
3788cabdff1aSopenharmony_ci            dst_tmp += (4 * dst_stride);
3789cabdff1aSopenharmony_ci
3790cabdff1aSopenharmony_ci            dst10_r = dst54_r;
3791cabdff1aSopenharmony_ci            dst10_l = dst54_l;
3792cabdff1aSopenharmony_ci            dst21_r = dst65_r;
3793cabdff1aSopenharmony_ci            dst21_l = dst65_l;
3794cabdff1aSopenharmony_ci            dst2 = dst6;
3795cabdff1aSopenharmony_ci        }
3796cabdff1aSopenharmony_ci
3797cabdff1aSopenharmony_ci        src += 8;
3798cabdff1aSopenharmony_ci        dst += 8;
3799cabdff1aSopenharmony_ci    }
3800cabdff1aSopenharmony_ci}
3801cabdff1aSopenharmony_ci
/*
 * 2-D 4-tap uni-prediction for an 8-column block: pick the worker routine
 * from the row count.
 *
 * Heights 2, 4 and 6 have dedicated paths; every other height that is a
 * multiple of four goes to the generic worker.  Any remaining height is
 * left unhandled (nothing is written).
 *
 * The trailing literal 1 handed to the "8mult" workers is presumably the
 * block width expressed in 8-column strips -- confirm against their
 * definitions.
 */
static void hevc_hv_uni_4t_8w_msa(uint8_t *src,
                                  int32_t src_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    switch (height) {
    case 2:
        hevc_hv_uni_4t_8x2_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y);
        break;
    case 4:
        hevc_hv_uni_4t_8multx4_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, 1);
        break;
    case 6:
        hevc_hv_uni_4t_8x6_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y);
        break;
    default:
        /* generic path only understands whole groups of four rows */
        if (!(height % 4)) {
            hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                                           filter_x, filter_y, height, 1);
        }
        break;
    }
}
3824cabdff1aSopenharmony_ci
3825cabdff1aSopenharmony_cistatic void hevc_hv_uni_4t_12w_msa(uint8_t *src,
3826cabdff1aSopenharmony_ci                                   int32_t src_stride,
3827cabdff1aSopenharmony_ci                                   uint8_t *dst,
3828cabdff1aSopenharmony_ci                                   int32_t dst_stride,
3829cabdff1aSopenharmony_ci                                   const int8_t *filter_x,
3830cabdff1aSopenharmony_ci                                   const int8_t *filter_y,
3831cabdff1aSopenharmony_ci                                   int32_t height)
3832cabdff1aSopenharmony_ci{
3833cabdff1aSopenharmony_ci    uint32_t loop_cnt;
3834cabdff1aSopenharmony_ci    uint8_t *src_tmp, *dst_tmp;
3835cabdff1aSopenharmony_ci    v16u8 out0, out1;
3836cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
3837cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3838cabdff1aSopenharmony_ci    v16i8 mask0, mask1, mask2, mask3;
3839cabdff1aSopenharmony_ci    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3;
3840cabdff1aSopenharmony_ci    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
3841cabdff1aSopenharmony_ci    v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
3842cabdff1aSopenharmony_ci    v8i16 dst76_r, dst98_r, dst87_r, dst109_r;
3843cabdff1aSopenharmony_ci    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
3844cabdff1aSopenharmony_ci    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
3845cabdff1aSopenharmony_ci    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3846cabdff1aSopenharmony_ci    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3847cabdff1aSopenharmony_ci
3848cabdff1aSopenharmony_ci    src -= (src_stride + 1);
3849cabdff1aSopenharmony_ci
3850cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_x);
3851cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3852cabdff1aSopenharmony_ci
3853cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_y);
3854cabdff1aSopenharmony_ci    UNPCK_R_SB_SH(filter_vec, filter_vec);
3855cabdff1aSopenharmony_ci
3856cabdff1aSopenharmony_ci    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3857cabdff1aSopenharmony_ci
3858cabdff1aSopenharmony_ci    mask0 = LD_SB(ff_hevc_mask_arr);
3859cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
3860cabdff1aSopenharmony_ci
3861cabdff1aSopenharmony_ci    src_tmp = src;
3862cabdff1aSopenharmony_ci    dst_tmp = dst;
3863cabdff1aSopenharmony_ci
3864cabdff1aSopenharmony_ci    LD_SB3(src_tmp, src_stride, src0, src1, src2);
3865cabdff1aSopenharmony_ci    src_tmp += (3 * src_stride);
3866cabdff1aSopenharmony_ci
3867cabdff1aSopenharmony_ci    XORI_B3_128_SB(src0, src1, src2);
3868cabdff1aSopenharmony_ci
3869cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3870cabdff1aSopenharmony_ci    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3871cabdff1aSopenharmony_ci    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3872cabdff1aSopenharmony_ci
3873cabdff1aSopenharmony_ci    dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3874cabdff1aSopenharmony_ci    dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3875cabdff1aSopenharmony_ci    dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3876cabdff1aSopenharmony_ci
3877cabdff1aSopenharmony_ci    ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
3878cabdff1aSopenharmony_ci    ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
3879cabdff1aSopenharmony_ci
3880cabdff1aSopenharmony_ci    for (loop_cnt = 4; loop_cnt--;) {
3881cabdff1aSopenharmony_ci        LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
3882cabdff1aSopenharmony_ci        src_tmp += (4 * src_stride);
3883cabdff1aSopenharmony_ci        XORI_B4_128_SB(src3, src4, src5, src6);
3884cabdff1aSopenharmony_ci
3885cabdff1aSopenharmony_ci        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3886cabdff1aSopenharmony_ci        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
3887cabdff1aSopenharmony_ci        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
3888cabdff1aSopenharmony_ci        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
3889cabdff1aSopenharmony_ci
3890cabdff1aSopenharmony_ci        dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3891cabdff1aSopenharmony_ci        dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3892cabdff1aSopenharmony_ci        dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3893cabdff1aSopenharmony_ci        dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3894cabdff1aSopenharmony_ci
3895cabdff1aSopenharmony_ci        ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
3896cabdff1aSopenharmony_ci        ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
3897cabdff1aSopenharmony_ci        ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
3898cabdff1aSopenharmony_ci        ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
3899cabdff1aSopenharmony_ci
3900cabdff1aSopenharmony_ci        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3901cabdff1aSopenharmony_ci        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3902cabdff1aSopenharmony_ci        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3903cabdff1aSopenharmony_ci        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3904cabdff1aSopenharmony_ci        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3905cabdff1aSopenharmony_ci        dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
3906cabdff1aSopenharmony_ci        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3907cabdff1aSopenharmony_ci        dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
3908cabdff1aSopenharmony_ci
3909cabdff1aSopenharmony_ci        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3910cabdff1aSopenharmony_ci        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
3911cabdff1aSopenharmony_ci
3912cabdff1aSopenharmony_ci        PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
3913cabdff1aSopenharmony_ci                    dst3_r, tmp0, tmp1, tmp2, tmp3);
3914cabdff1aSopenharmony_ci        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
3915cabdff1aSopenharmony_ci        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
3916cabdff1aSopenharmony_ci        out0 = PCKEV_XORI128_UB(tmp0, tmp1);
3917cabdff1aSopenharmony_ci        out1 = PCKEV_XORI128_UB(tmp2, tmp3);
3918cabdff1aSopenharmony_ci        ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
3919cabdff1aSopenharmony_ci        dst_tmp += (4 * dst_stride);
3920cabdff1aSopenharmony_ci
3921cabdff1aSopenharmony_ci        dst10_r = dst54_r;
3922cabdff1aSopenharmony_ci        dst10_l = dst54_l;
3923cabdff1aSopenharmony_ci        dst21_r = dst65_r;
3924cabdff1aSopenharmony_ci        dst21_l = dst65_l;
3925cabdff1aSopenharmony_ci        dsth2 = dsth6;
3926cabdff1aSopenharmony_ci    }
3927cabdff1aSopenharmony_ci
3928cabdff1aSopenharmony_ci    src += 8;
3929cabdff1aSopenharmony_ci    dst += 8;
3930cabdff1aSopenharmony_ci
3931cabdff1aSopenharmony_ci    mask2 = LD_SB(ff_hevc_mask_arr + 16);
3932cabdff1aSopenharmony_ci    mask3 = mask2 + 2;
3933cabdff1aSopenharmony_ci
3934cabdff1aSopenharmony_ci    LD_SB3(src, src_stride, src0, src1, src2);
3935cabdff1aSopenharmony_ci    src += (3 * src_stride);
3936cabdff1aSopenharmony_ci    XORI_B3_128_SB(src0, src1, src2);
3937cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
3938cabdff1aSopenharmony_ci    VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3);
3939cabdff1aSopenharmony_ci
3940cabdff1aSopenharmony_ci    dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3941cabdff1aSopenharmony_ci    dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3942cabdff1aSopenharmony_ci
3943cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
3944cabdff1aSopenharmony_ci    dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
3945cabdff1aSopenharmony_ci
3946cabdff1aSopenharmony_ci    for (loop_cnt = 2; loop_cnt--;) {
3947cabdff1aSopenharmony_ci        LD_SB8(src, src_stride,
3948cabdff1aSopenharmony_ci               src3, src4, src5, src6, src7, src8, src9, src10);
3949cabdff1aSopenharmony_ci        src += (8 * src_stride);
3950cabdff1aSopenharmony_ci        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
3951cabdff1aSopenharmony_ci        VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
3952cabdff1aSopenharmony_ci        VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
3953cabdff1aSopenharmony_ci        VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
3954cabdff1aSopenharmony_ci        VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);
3955cabdff1aSopenharmony_ci
3956cabdff1aSopenharmony_ci        dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3957cabdff1aSopenharmony_ci        dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3958cabdff1aSopenharmony_ci        dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3959cabdff1aSopenharmony_ci        dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3960cabdff1aSopenharmony_ci
3961cabdff1aSopenharmony_ci        dst32_r = __msa_ilvr_h(dst73, dst22);
3962cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
3963cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
3964cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
3965cabdff1aSopenharmony_ci        dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
3966cabdff1aSopenharmony_ci        dst76_r = __msa_ilvr_h(dst22, dst106);
3967cabdff1aSopenharmony_ci
3968cabdff1aSopenharmony_ci        dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3969cabdff1aSopenharmony_ci        dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3970cabdff1aSopenharmony_ci        dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3971cabdff1aSopenharmony_ci        dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3972cabdff1aSopenharmony_ci        dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
3973cabdff1aSopenharmony_ci        dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
3974cabdff1aSopenharmony_ci        dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
3975cabdff1aSopenharmony_ci        dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
3976cabdff1aSopenharmony_ci        SRA_4V(dst0, dst1, dst2, dst3, 6);
3977cabdff1aSopenharmony_ci        SRA_4V(dst4, dst5, dst6, dst7, 6);
3978cabdff1aSopenharmony_ci        PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
3979cabdff1aSopenharmony_ci                    tmp0, tmp1, tmp2, tmp3);
3980cabdff1aSopenharmony_ci        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
3981cabdff1aSopenharmony_ci        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
3982cabdff1aSopenharmony_ci        out0 = PCKEV_XORI128_UB(tmp0, tmp1);
3983cabdff1aSopenharmony_ci        out1 = PCKEV_XORI128_UB(tmp2, tmp3);
3984cabdff1aSopenharmony_ci        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
3985cabdff1aSopenharmony_ci        dst += (8 * dst_stride);
3986cabdff1aSopenharmony_ci
3987cabdff1aSopenharmony_ci        dst10_r = dst98_r;
3988cabdff1aSopenharmony_ci        dst21_r = dst109_r;
3989cabdff1aSopenharmony_ci        dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
3990cabdff1aSopenharmony_ci    }
3991cabdff1aSopenharmony_ci}
3992cabdff1aSopenharmony_ci
/*
 * 2-D 4-tap uni-prediction for a 16-column block.
 *
 * A 4-row block uses the fixed-height worker; every other height is passed
 * on to the generic worker together with the row count.  Both receive 2 as
 * their last argument (presumably the width in 8-column strips -- confirm
 * against the worker definitions).
 */
static void hevc_hv_uni_4t_16w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    if (4 != height) {
        hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                                       filter_x, filter_y, height, 2);
        return;
    }

    hevc_hv_uni_4t_8multx4_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, 2);
}
4009cabdff1aSopenharmony_ci
/*
 * 2-D 4-tap uni-prediction for a 24-column block: forwards everything to
 * the generic 8-column-multiple worker with a strip count of 3.
 */
static void hevc_hv_uni_4t_24w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    /* 24 columns == three strips of 8 */
    const int32_t strips = 24 / 8;

    hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, strips);
}
4021cabdff1aSopenharmony_ci
/*
 * 2-D 4-tap uni-prediction for a 32-column block: forwards everything to
 * the generic 8-column-multiple worker with a strip count of 4.
 */
static void hevc_hv_uni_4t_32w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    /* 32 columns == four strips of 8 */
    const int32_t strips = 32 / 8;

    hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, strips);
}
4033cabdff1aSopenharmony_ci
/*
 * UNI_MC_COPY(WIDTH) defines the exported function
 * ff_hevc_put_hevc_uni_pel_pixels<WIDTH>_8_msa(), which hands the block
 * straight to copy_width<WIDTH>_msa().
 *
 * Note the argument order swap: the exported function takes (dst, src),
 * the copy helper takes (src, dst).  mx, my and width are part of the
 * exported signature but are not used here.
 *
 * The macro expands to a complete function definition, so invocations are
 * written without a trailing ';' (a lone ';' at file scope is an empty
 * declaration, which ISO C does not permit).
 */
#define UNI_MC_COPY(WIDTH)                                                 \
void ff_hevc_put_hevc_uni_pel_pixels##WIDTH##_8_msa(uint8_t *dst,          \
                                                    ptrdiff_t dst_stride,  \
                                                    uint8_t *src,          \
                                                    ptrdiff_t src_stride,  \
                                                    int height,            \
                                                    intptr_t mx,           \
                                                    intptr_t my,           \
                                                    int width)             \
{                                                                          \
    copy_width##WIDTH##_msa(src, src_stride, dst, dst_stride, height);     \
}

UNI_MC_COPY(8)
UNI_MC_COPY(12)
UNI_MC_COPY(16)
UNI_MC_COPY(24)
UNI_MC_COPY(32)
UNI_MC_COPY(48)
UNI_MC_COPY(64)

#undef UNI_MC_COPY
4056cabdff1aSopenharmony_ci
4057cabdff1aSopenharmony_ci#define UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                           \
4058cabdff1aSopenharmony_civoid ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,           \
4059cabdff1aSopenharmony_ci                                                       ptrdiff_t dst_stride,   \
4060cabdff1aSopenharmony_ci                                                       uint8_t *src,           \
4061cabdff1aSopenharmony_ci                                                       ptrdiff_t src_stride,   \
4062cabdff1aSopenharmony_ci                                                       int height,             \
4063cabdff1aSopenharmony_ci                                                       intptr_t mx,            \
4064cabdff1aSopenharmony_ci                                                       intptr_t my,            \
4065cabdff1aSopenharmony_ci                                                       int width)              \
4066cabdff1aSopenharmony_ci{                                                                              \
4067cabdff1aSopenharmony_ci    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];              \
4068cabdff1aSopenharmony_ci                                                                               \
4069cabdff1aSopenharmony_ci    common_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride,  \
4070cabdff1aSopenharmony_ci                                            filter, height);                   \
4071cabdff1aSopenharmony_ci}
4072cabdff1aSopenharmony_ci
4073cabdff1aSopenharmony_ciUNI_MC(qpel, h, 4, 8, hz, mx);
4074cabdff1aSopenharmony_ciUNI_MC(qpel, h, 8, 8, hz, mx);
4075cabdff1aSopenharmony_ciUNI_MC(qpel, h, 12, 8, hz, mx);
4076cabdff1aSopenharmony_ciUNI_MC(qpel, h, 16, 8, hz, mx);
4077cabdff1aSopenharmony_ciUNI_MC(qpel, h, 24, 8, hz, mx);
4078cabdff1aSopenharmony_ciUNI_MC(qpel, h, 32, 8, hz, mx);
4079cabdff1aSopenharmony_ciUNI_MC(qpel, h, 48, 8, hz, mx);
4080cabdff1aSopenharmony_ciUNI_MC(qpel, h, 64, 8, hz, mx);
4081cabdff1aSopenharmony_ci
4082cabdff1aSopenharmony_ciUNI_MC(qpel, v, 4, 8, vt, my);
4083cabdff1aSopenharmony_ciUNI_MC(qpel, v, 8, 8, vt, my);
4084cabdff1aSopenharmony_ciUNI_MC(qpel, v, 12, 8, vt, my);
4085cabdff1aSopenharmony_ciUNI_MC(qpel, v, 16, 8, vt, my);
4086cabdff1aSopenharmony_ciUNI_MC(qpel, v, 24, 8, vt, my);
4087cabdff1aSopenharmony_ciUNI_MC(qpel, v, 32, 8, vt, my);
4088cabdff1aSopenharmony_ciUNI_MC(qpel, v, 48, 8, vt, my);
4089cabdff1aSopenharmony_ciUNI_MC(qpel, v, 64, 8, vt, my);
4090cabdff1aSopenharmony_ci
4091cabdff1aSopenharmony_ciUNI_MC(epel, h, 4, 4, hz, mx);
4092cabdff1aSopenharmony_ciUNI_MC(epel, h, 6, 4, hz, mx);
4093cabdff1aSopenharmony_ciUNI_MC(epel, h, 8, 4, hz, mx);
4094cabdff1aSopenharmony_ciUNI_MC(epel, h, 12, 4, hz, mx);
4095cabdff1aSopenharmony_ciUNI_MC(epel, h, 16, 4, hz, mx);
4096cabdff1aSopenharmony_ciUNI_MC(epel, h, 24, 4, hz, mx);
4097cabdff1aSopenharmony_ciUNI_MC(epel, h, 32, 4, hz, mx);
4098cabdff1aSopenharmony_ci
4099cabdff1aSopenharmony_ciUNI_MC(epel, v, 4, 4, vt, my);
4100cabdff1aSopenharmony_ciUNI_MC(epel, v, 6, 4, vt, my);
4101cabdff1aSopenharmony_ciUNI_MC(epel, v, 8, 4, vt, my);
4102cabdff1aSopenharmony_ciUNI_MC(epel, v, 12, 4, vt, my);
4103cabdff1aSopenharmony_ciUNI_MC(epel, v, 16, 4, vt, my);
4104cabdff1aSopenharmony_ciUNI_MC(epel, v, 24, 4, vt, my);
4105cabdff1aSopenharmony_ciUNI_MC(epel, v, 32, 4, vt, my);
4106cabdff1aSopenharmony_ci
4107cabdff1aSopenharmony_ci#undef UNI_MC
4108cabdff1aSopenharmony_ci
4109cabdff1aSopenharmony_ci#define UNI_MC_HV(PEL, WIDTH, TAP)                                         \
4110cabdff1aSopenharmony_civoid ff_hevc_put_hevc_uni_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst,          \
4111cabdff1aSopenharmony_ci                                                    ptrdiff_t dst_stride,  \
4112cabdff1aSopenharmony_ci                                                    uint8_t *src,          \
4113cabdff1aSopenharmony_ci                                                    ptrdiff_t src_stride,  \
4114cabdff1aSopenharmony_ci                                                    int height,            \
4115cabdff1aSopenharmony_ci                                                    intptr_t mx,           \
4116cabdff1aSopenharmony_ci                                                    intptr_t my,           \
4117cabdff1aSopenharmony_ci                                                    int width)             \
4118cabdff1aSopenharmony_ci{                                                                          \
4119cabdff1aSopenharmony_ci    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];              \
4120cabdff1aSopenharmony_ci    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];              \
4121cabdff1aSopenharmony_ci                                                                           \
4122cabdff1aSopenharmony_ci    hevc_hv_uni_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride,  \
4123cabdff1aSopenharmony_ci                                        filter_x, filter_y, height);       \
4124cabdff1aSopenharmony_ci}
4125cabdff1aSopenharmony_ci
4126cabdff1aSopenharmony_ciUNI_MC_HV(qpel, 4, 8);
4127cabdff1aSopenharmony_ciUNI_MC_HV(qpel, 8, 8);
4128cabdff1aSopenharmony_ciUNI_MC_HV(qpel, 12, 8);
4129cabdff1aSopenharmony_ciUNI_MC_HV(qpel, 16, 8);
4130cabdff1aSopenharmony_ciUNI_MC_HV(qpel, 24, 8);
4131cabdff1aSopenharmony_ciUNI_MC_HV(qpel, 32, 8);
4132cabdff1aSopenharmony_ciUNI_MC_HV(qpel, 48, 8);
4133cabdff1aSopenharmony_ciUNI_MC_HV(qpel, 64, 8);
4134cabdff1aSopenharmony_ci
4135cabdff1aSopenharmony_ciUNI_MC_HV(epel, 4, 4);
4136cabdff1aSopenharmony_ciUNI_MC_HV(epel, 6, 4);
4137cabdff1aSopenharmony_ciUNI_MC_HV(epel, 8, 4);
4138cabdff1aSopenharmony_ciUNI_MC_HV(epel, 12, 4);
4139cabdff1aSopenharmony_ciUNI_MC_HV(epel, 16, 4);
4140cabdff1aSopenharmony_ciUNI_MC_HV(epel, 24, 4);
4141cabdff1aSopenharmony_ciUNI_MC_HV(epel, 32, 4);
4142cabdff1aSopenharmony_ci
4143cabdff1aSopenharmony_ci#undef UNI_MC_HV
4144