1cabdff1aSopenharmony_ci/*
2cabdff1aSopenharmony_ci * Copyright (c) 2015 - 2017 Shivraj Patil (Shivraj.Patil@imgtec.com)
3cabdff1aSopenharmony_ci *
4cabdff1aSopenharmony_ci * This file is part of FFmpeg.
5cabdff1aSopenharmony_ci *
6cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or
7cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public
8cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either
9cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version.
10cabdff1aSopenharmony_ci *
11cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful,
12cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of
13cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14cabdff1aSopenharmony_ci * Lesser General Public License for more details.
15cabdff1aSopenharmony_ci *
16cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public
17cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software
18cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19cabdff1aSopenharmony_ci */
20cabdff1aSopenharmony_ci
21cabdff1aSopenharmony_ci#include "libavcodec/vp9dsp.h"
22cabdff1aSopenharmony_ci#include "libavutil/mips/generic_macros_msa.h"
23cabdff1aSopenharmony_ci#include "vp9dsp_mips.h"
24cabdff1aSopenharmony_ci
/**
 * 4-tap VP9 loop filter applied to sixteen byte lanes at once.
 *
 * p1_in, p0_in, q0_in, q1_in  pixels on either side of the edge
 * mask_in                     per-lane mask, ANDed into the correction
 * hev_in                      per-lane "high edge variance" mask
 * p1_out .. q1_out            filtered pixels
 *
 * In the visible callers mask_in and hev_in are the 0x00/0xff lane masks
 * produced by LPF_MASK_HEV.
 *
 * The pixels are XORed with 0x80 so that signed saturating byte arithmetic
 * can be used. With sat() denoting that saturation:
 *   filt  = sat(p1 - q1) & hev
 *   filt  = (filt, plus sat(q0 - p0) added three times, saturating
 *            after every addition) & mask
 *   filt1 = sat(filt + 4) >> 3         q0 = sat(q0 - filt1)
 *   filt2 = sat(filt + 3) >> 3         p0 = sat(p0 + filt2)
 *   filt  = (filt1 rounded-shifted right by 1) & ~hev
 *   q1    = sat(q1 - filt)             p1 = sat(p1 + filt)
 * The results are XORed with 0x80 again before being written out.
 *
 * Notes for callers:
 *  - hev_in is overwritten with its bitwise complement.
 *  - The four pixel inputs are copied into locals before any output is
 *    written, so the *_out arguments may be the same variables as the pixel
 *    inputs. They must not be mask_in or hev_in, which are read later.
 *  - Arguments are substituted textually; pass plain variables, and do not
 *    pass variables named like the locals declared below.
 */
#define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in,  \
                           p1_out, p0_out, q0_out, q1_out)               \
{                                                                        \
    v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt, filt1, filt2;         \
    const v16i8 cnst4b = __msa_ldi_b(4);                                 \
    const v16i8 cnst3b = __msa_ldi_b(3);                                 \
                                                                         \
    /* move the pixels into the signed domain */                         \
    p1_m = (v16i8) __msa_xori_b(p1_in, 0x80);                            \
    p0_m = (v16i8) __msa_xori_b(p0_in, 0x80);                            \
    q0_m = (v16i8) __msa_xori_b(q0_in, 0x80);                            \
    q1_m = (v16i8) __msa_xori_b(q1_in, 0x80);                            \
                                                                         \
    filt = __msa_subs_s_b(p1_m, q1_m);                                   \
                                                                         \
    filt = filt & (v16i8) hev_in;                                        \
                                                                         \
    q0_sub_p0 = __msa_subs_s_b(q0_m, p0_m);                              \
    filt = __msa_adds_s_b(filt, q0_sub_p0);                              \
    filt = __msa_adds_s_b(filt, q0_sub_p0);                              \
    filt = __msa_adds_s_b(filt, q0_sub_p0);                              \
    filt = filt & (v16i8) mask_in;                                       \
                                                                         \
    filt1 = __msa_adds_s_b(filt, cnst4b);                                \
    filt1 >>= 3;                                                         \
                                                                         \
    filt2 = __msa_adds_s_b(filt, cnst3b);                                \
    filt2 >>= 3;                                                         \
                                                                         \
    q0_m = __msa_subs_s_b(q0_m, filt1);                                  \
    q0_out = __msa_xori_b((v16u8) q0_m, 0x80);                           \
    p0_m = __msa_adds_s_b(p0_m, filt2);                                  \
    p0_out = __msa_xori_b((v16u8) p0_m, 0x80);                           \
                                                                         \
    filt = __msa_srari_b(filt1, 1);                                      \
    /* NB: this overwrites the caller's hev_in with ~hev_in */           \
    hev_in = __msa_xori_b((v16u8) hev_in, 0xff);                         \
    filt = filt & (v16i8) hev_in;                                        \
                                                                         \
    q1_m = __msa_subs_s_b(q1_m, filt);                                   \
    q1_out = __msa_xori_b((v16u8) q1_m, 0x80);                           \
    p1_m = __msa_adds_s_b(p1_m, filt);                                   \
    p1_out = __msa_xori_b((v16u8) p1_m, 0x80);                           \
}
67cabdff1aSopenharmony_ci
/**
 * Compute the per-lane "flat" mask used to select the 8-tap filter.
 *
 * flat_out is both an input and an output. On entry it holds a running
 * maximum of absolute differences; in the visible callers this is
 * max(|p1 - p0|, |q1 - q0|) as left behind by LPF_MASK_HEV. The macro folds
 * |p2 - p0|, |q2 - q0|, |p3 - p0| and |q3 - q0| into that maximum and then
 * replaces flat_out with 0xff in every lane whose maximum is <= 1 and 0x00
 * elsewhere.
 *
 * The result is finally ANDed with a variable named `mask`, which is NOT a
 * macro parameter: it must exist in the scope where the macro is expanded.
 *
 * Arguments are substituted textually; do not pass variables named like the
 * locals declared below (tmp, zero_in, ...).
 */
#define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out)  \
{                                                                      \
    v16u8 tmp, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0;     \
    v16u8 zero_in = { 0 };                                             \
                                                                       \
    /* tmp = 1 in every lane: the flatness threshold */                \
    tmp = __msa_ori_b(zero_in, 1);                                     \
    p2_a_sub_p0 = __msa_asub_u_b(p2_in, p0_in);                        \
    q2_a_sub_q0 = __msa_asub_u_b(q2_in, q0_in);                        \
    p3_a_sub_p0 = __msa_asub_u_b(p3_in, p0_in);                        \
    q3_a_sub_q0 = __msa_asub_u_b(q3_in, q0_in);                        \
                                                                       \
    p2_a_sub_p0 = __msa_max_u_b(p2_a_sub_p0, q2_a_sub_q0);             \
    flat_out = __msa_max_u_b(p2_a_sub_p0, flat_out);                   \
    p3_a_sub_p0 = __msa_max_u_b(p3_a_sub_p0, q3_a_sub_q0);             \
    flat_out = __msa_max_u_b(p3_a_sub_p0, flat_out);                   \
                                                                       \
    flat_out = (tmp < (v16u8) flat_out);                               \
    flat_out = __msa_xori_b(flat_out, 0xff);                           \
    /* NB: mask is not a parameter, it comes from the caller */        \
    flat_out = flat_out & (mask);                                      \
}
88cabdff1aSopenharmony_ci
/**
 * Compute the second-level per-lane flatness mask over the outer pixels.
 *
 * Takes the maximum of |pN - p0| and |qN - q0| for N = 4..7 in every byte
 * lane. flat2_out is set to 0xff in lanes where that maximum is <= 1 and
 * to 0x00 elsewhere, and is then ANDed with flat_in.
 *
 * flat2_out is a pure output: it is assigned before it is first read.
 *
 * Arguments are substituted textually; do not pass variables named like the
 * locals declared below (tmp, zero_in, ...).
 */
#define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in,  \
                  q5_in, q6_in, q7_in, flat_in, flat2_out)          \
{                                                                   \
    v16u8 tmp, zero_in = { 0 };                                     \
    v16u8 p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0;       \
    v16u8 p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0;       \
                                                                    \
    tmp = __msa_ori_b(zero_in, 1);                                  \
    p4_a_sub_p0 = __msa_asub_u_b(p4_in, p0_in);                     \
    q4_a_sub_q0 = __msa_asub_u_b(q4_in, q0_in);                     \
    p5_a_sub_p0 = __msa_asub_u_b(p5_in, p0_in);                     \
    q5_a_sub_q0 = __msa_asub_u_b(q5_in, q0_in);                     \
    p6_a_sub_p0 = __msa_asub_u_b(p6_in, p0_in);                     \
    q6_a_sub_q0 = __msa_asub_u_b(q6_in, q0_in);                     \
    p7_a_sub_p0 = __msa_asub_u_b(p7_in, p0_in);                     \
    q7_a_sub_q0 = __msa_asub_u_b(q7_in, q0_in);                     \
                                                                    \
    p4_a_sub_p0 = __msa_max_u_b(p4_a_sub_p0, q4_a_sub_q0);          \
    flat2_out = __msa_max_u_b(p5_a_sub_p0, q5_a_sub_q0);            \
    flat2_out = __msa_max_u_b(p4_a_sub_p0, flat2_out);              \
    p6_a_sub_p0 = __msa_max_u_b(p6_a_sub_p0, q6_a_sub_q0);          \
    flat2_out = __msa_max_u_b(p6_a_sub_p0, flat2_out);              \
    p7_a_sub_p0 = __msa_max_u_b(p7_a_sub_p0, q7_a_sub_q0);          \
    flat2_out = __msa_max_u_b(p7_a_sub_p0, flat2_out);              \
                                                                    \
    /* lanes with max difference <= 1 become 0xff */                \
    flat2_out = (tmp < (v16u8) flat2_out);                          \
    flat2_out = __msa_xori_b(flat2_out, 0xff);                      \
    flat2_out = flat2_out & flat_in;                                \
}
118cabdff1aSopenharmony_ci
/**
 * 8-tap VP9 smoothing filter on halfword lanes.
 *
 * The *_in arguments are pixels already widened to 16-bit lanes (v8u16 in
 * the visible callers). Each output is a weighted sum of eight taps,
 * rounded and shifted right by 3 via __msa_srari_h, i.e. (sum + 4) >> 3:
 *
 *   p2_filt8_out : 3*p3 + 2*p2 +   p1 +   p0 +   q0
 *   p1_filt8_out : 2*p3 +   p2 + 2*p1 +   p0 +   q0 +   q1
 *   p0_filt8_out :   p3 +   p2 +   p1 + 2*p0 +   q0 +   q1 +   q2
 *   q0_filt8_out :   p2 +   p1 +   p0 + 2*q0 +   q1 +   q2 +   q3
 *   q1_filt8_out :   p1 +   p0 +   q0 + 2*q1 +   q2 + 2*q3
 *   q2_filt8_out :   p0 +   q0 +   q1 + 2*q2 + 3*q3
 *
 * The partial sums in tmp0..tmp2 are shared between outputs, so the
 * statement order matters. Outputs are written in the order
 * p2, p1, p0, q2, q0, q1 while inputs are still being read, so the output
 * arguments must not be the same variables as the inputs.
 *
 * Arguments are substituted textually and are not parenthesised everywhere
 * (e.g. "p3_in << 1"); pass plain variables, and none named tmp0/tmp1/tmp2.
 */
#define VP9_FILTER8(p3_in, p2_in, p1_in, p0_in,                \
                    q0_in, q1_in, q2_in, q3_in,                \
                    p2_filt8_out, p1_filt8_out, p0_filt8_out,  \
                    q0_filt8_out, q1_filt8_out, q2_filt8_out)  \
{                                                              \
    v8u16 tmp0, tmp1, tmp2;                                    \
                                                               \
    tmp2 = p2_in + p1_in + p0_in;                              \
    tmp0 = p3_in << 1;                                         \
                                                               \
    tmp0 = tmp0 + tmp2 + q0_in;                                \
    tmp1 = tmp0 + p3_in + p2_in;                               \
    p2_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3);     \
                                                               \
    tmp1 = tmp0 + p1_in + q1_in;                               \
    p1_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3);     \
                                                               \
    tmp1 = q2_in + q1_in + q0_in;                              \
    tmp2 = tmp2 + tmp1;                                        \
    tmp0 = tmp2 + (p0_in);                                     \
    tmp0 = tmp0 + (p3_in);                                     \
    p0_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp0, 3);     \
                                                               \
    tmp0 = q2_in + q3_in;                                      \
    tmp0 = p0_in + tmp1 + tmp0;                                \
    tmp1 = q3_in + q3_in;                                      \
    tmp1 = tmp1 + tmp0;                                        \
    q2_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3);     \
                                                               \
    tmp0 = tmp2 + q3_in;                                       \
    tmp1 = tmp0 + q0_in;                                       \
    q0_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3);     \
                                                               \
    tmp1 = tmp0 - p2_in;                                       \
    tmp0 = q1_in + q3_in;                                      \
    tmp1 = tmp0 + tmp1;                                        \
    q1_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3);     \
}
157cabdff1aSopenharmony_ci
/**
 * Derive the per-lane loop filter decisions from eight rows of pixels.
 *
 * Outputs (one byte per lane):
 *   hev_out   all ones where max(|p1 - p0|, |q1 - q0|) > thresh_in,
 *             zero elsewhere
 *   flat_out  max(|p1 - p0|, |q1 - q0|); an intermediate value, not a mask
 *             (the visible callers hand it on to VP9_FLAT4)
 *   mask_out  0xff where the edge should be filtered, 0x00 elsewhere.
 *             A lane passes when
 *               sat(2 * |p0 - q0|) + |p1 - q1| / 2 <= b_limit_in  (saturating)
 *             and each of |p3 - p2|, |p2 - p1|, |p1 - p0|, |q1 - q0|,
 *             |q2 - q1|, |q3 - q2| is <= limit_in.
 *
 * The b_limit test result (0xff on failure) is folded into the same running
 * maximum as the neighbour differences, so a failing lane is only rejected
 * by the final "limit_in < max" comparison when limit_in is below 0xff.
 *
 * Arguments are substituted textually; pass plain variables.
 */
#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in,                   \
                     q0_in, q1_in, q2_in, q3_in,                   \
                     limit_in, b_limit_in, thresh_in,              \
                     hev_out, mask_out, flat_out)                  \
{                                                                  \
    v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m;  \
    v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m;  \
                                                                   \
    /* absolute subtraction of pixel values */                     \
    p3_asub_p2_m = __msa_asub_u_b(p3_in, p2_in);                   \
    p2_asub_p1_m = __msa_asub_u_b(p2_in, p1_in);                   \
    p1_asub_p0_m = __msa_asub_u_b(p1_in, p0_in);                   \
    q1_asub_q0_m = __msa_asub_u_b(q1_in, q0_in);                   \
    q2_asub_q1_m = __msa_asub_u_b(q2_in, q1_in);                   \
    q3_asub_q2_m = __msa_asub_u_b(q3_in, q2_in);                   \
    p0_asub_q0_m = __msa_asub_u_b(p0_in, q0_in);                   \
    p1_asub_q1_m = __msa_asub_u_b(p1_in, q1_in);                   \
                                                                   \
    /* calculation of hev */                                       \
    flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m);          \
    hev_out = thresh_in < (v16u8) flat_out;                        \
                                                                   \
    /* calculation of mask */                                      \
    p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m);     \
    p1_asub_q1_m >>= 1;                                            \
    p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m);     \
                                                                   \
    mask_out = b_limit_in < p0_asub_q0_m;                          \
    /* a failed b_limit test (0xff) dominates the running max */   \
    mask_out = __msa_max_u_b(flat_out, mask_out);                  \
    p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m);      \
    mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out);              \
    q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m);      \
    mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out);              \
                                                                   \
    mask_out = limit_in < (v16u8) mask_out;                        \
    mask_out = __msa_xori_b(mask_out, 0xff);                       \
}
195cabdff1aSopenharmony_ci
196cabdff1aSopenharmony_civoid ff_loop_filter_v_4_8_msa(uint8_t *src, ptrdiff_t pitch,
197cabdff1aSopenharmony_ci                              int32_t b_limit_ptr,
198cabdff1aSopenharmony_ci                              int32_t limit_ptr,
199cabdff1aSopenharmony_ci                              int32_t thresh_ptr)
200cabdff1aSopenharmony_ci{
201cabdff1aSopenharmony_ci    uint64_t p1_d, p0_d, q0_d, q1_d;
202cabdff1aSopenharmony_ci    v16u8 mask, hev, flat, thresh, b_limit, limit;
203cabdff1aSopenharmony_ci    v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out;
204cabdff1aSopenharmony_ci
205cabdff1aSopenharmony_ci    /* load vector elements */
206cabdff1aSopenharmony_ci    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
207cabdff1aSopenharmony_ci
208cabdff1aSopenharmony_ci    thresh = (v16u8) __msa_fill_b(thresh_ptr);
209cabdff1aSopenharmony_ci    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
210cabdff1aSopenharmony_ci    limit = (v16u8) __msa_fill_b(limit_ptr);
211cabdff1aSopenharmony_ci
212cabdff1aSopenharmony_ci    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
213cabdff1aSopenharmony_ci                 hev, mask, flat);
214cabdff1aSopenharmony_ci    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
215cabdff1aSopenharmony_ci                       q1_out);
216cabdff1aSopenharmony_ci
217cabdff1aSopenharmony_ci    p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
218cabdff1aSopenharmony_ci    p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
219cabdff1aSopenharmony_ci    q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
220cabdff1aSopenharmony_ci    q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
221cabdff1aSopenharmony_ci    SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
222cabdff1aSopenharmony_ci}
223cabdff1aSopenharmony_ci
224cabdff1aSopenharmony_ci
225cabdff1aSopenharmony_civoid ff_loop_filter_v_44_16_msa(uint8_t *src, ptrdiff_t pitch,
226cabdff1aSopenharmony_ci                                int32_t b_limit_ptr,
227cabdff1aSopenharmony_ci                                int32_t limit_ptr,
228cabdff1aSopenharmony_ci                                int32_t thresh_ptr)
229cabdff1aSopenharmony_ci{
230cabdff1aSopenharmony_ci    v16u8 mask, hev, flat, thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
231cabdff1aSopenharmony_ci    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
232cabdff1aSopenharmony_ci
233cabdff1aSopenharmony_ci    /* load vector elements */
234cabdff1aSopenharmony_ci    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
235cabdff1aSopenharmony_ci
236cabdff1aSopenharmony_ci    thresh0 = (v16u8) __msa_fill_b(thresh_ptr);
237cabdff1aSopenharmony_ci    thresh1 = (v16u8) __msa_fill_b(thresh_ptr >> 8);
238cabdff1aSopenharmony_ci    thresh0 = (v16u8) __msa_ilvr_d((v2i64) thresh1, (v2i64) thresh0);
239cabdff1aSopenharmony_ci
240cabdff1aSopenharmony_ci    b_limit0 = (v16u8) __msa_fill_b(b_limit_ptr);
241cabdff1aSopenharmony_ci    b_limit1 = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
242cabdff1aSopenharmony_ci    b_limit0 = (v16u8) __msa_ilvr_d((v2i64) b_limit1, (v2i64) b_limit0);
243cabdff1aSopenharmony_ci
244cabdff1aSopenharmony_ci    limit0 = (v16u8) __msa_fill_b(limit_ptr);
245cabdff1aSopenharmony_ci    limit1 = (v16u8) __msa_fill_b(limit_ptr >> 8);
246cabdff1aSopenharmony_ci    limit0 = (v16u8) __msa_ilvr_d((v2i64) limit1, (v2i64) limit0);
247cabdff1aSopenharmony_ci
248cabdff1aSopenharmony_ci    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
249cabdff1aSopenharmony_ci                 hev, mask, flat);
250cabdff1aSopenharmony_ci    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
251cabdff1aSopenharmony_ci
252cabdff1aSopenharmony_ci    ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch);
253cabdff1aSopenharmony_ci}
254cabdff1aSopenharmony_ci
255cabdff1aSopenharmony_civoid ff_loop_filter_v_8_8_msa(uint8_t *src, ptrdiff_t pitch,
256cabdff1aSopenharmony_ci                              int32_t b_limit_ptr,
257cabdff1aSopenharmony_ci                              int32_t limit_ptr,
258cabdff1aSopenharmony_ci                              int32_t thresh_ptr)
259cabdff1aSopenharmony_ci{
260cabdff1aSopenharmony_ci    uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
261cabdff1aSopenharmony_ci    v16u8 mask, hev, flat, thresh, b_limit, limit;
262cabdff1aSopenharmony_ci    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
263cabdff1aSopenharmony_ci    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
264cabdff1aSopenharmony_ci    v8i16 p2_filter8, p1_filter8, p0_filter8;
265cabdff1aSopenharmony_ci    v8i16 q0_filter8, q1_filter8, q2_filter8;
266cabdff1aSopenharmony_ci    v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
267cabdff1aSopenharmony_ci    v16i8 zero = { 0 };
268cabdff1aSopenharmony_ci
269cabdff1aSopenharmony_ci    /* load vector elements */
270cabdff1aSopenharmony_ci    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
271cabdff1aSopenharmony_ci
272cabdff1aSopenharmony_ci    thresh = (v16u8) __msa_fill_b(thresh_ptr);
273cabdff1aSopenharmony_ci    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
274cabdff1aSopenharmony_ci    limit = (v16u8) __msa_fill_b(limit_ptr);
275cabdff1aSopenharmony_ci
276cabdff1aSopenharmony_ci    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
277cabdff1aSopenharmony_ci                 hev, mask, flat);
278cabdff1aSopenharmony_ci    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
279cabdff1aSopenharmony_ci    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
280cabdff1aSopenharmony_ci                       q1_out);
281cabdff1aSopenharmony_ci
282cabdff1aSopenharmony_ci    flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
283cabdff1aSopenharmony_ci
284cabdff1aSopenharmony_ci    /* if flat is zero for all pixels, then no need to calculate other filter */
285cabdff1aSopenharmony_ci    if (__msa_test_bz_v(flat)) {
286cabdff1aSopenharmony_ci        p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
287cabdff1aSopenharmony_ci        p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
288cabdff1aSopenharmony_ci        q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
289cabdff1aSopenharmony_ci        q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
290cabdff1aSopenharmony_ci        SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
291cabdff1aSopenharmony_ci    } else {
292cabdff1aSopenharmony_ci        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
293cabdff1aSopenharmony_ci                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
294cabdff1aSopenharmony_ci                   q2_r, q3_r);
295cabdff1aSopenharmony_ci        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8,
296cabdff1aSopenharmony_ci                    p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);
297cabdff1aSopenharmony_ci
298cabdff1aSopenharmony_ci        /* convert 16 bit output data into 8 bit */
299cabdff1aSopenharmony_ci        PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8,
300cabdff1aSopenharmony_ci                    zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8,
301cabdff1aSopenharmony_ci                    q0_filter8);
302cabdff1aSopenharmony_ci        PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);
303cabdff1aSopenharmony_ci
304cabdff1aSopenharmony_ci        /* store pixel values */
305cabdff1aSopenharmony_ci        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filter8, flat);
306cabdff1aSopenharmony_ci        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filter8, flat);
307cabdff1aSopenharmony_ci        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filter8, flat);
308cabdff1aSopenharmony_ci        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filter8, flat);
309cabdff1aSopenharmony_ci        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filter8, flat);
310cabdff1aSopenharmony_ci        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filter8, flat);
311cabdff1aSopenharmony_ci
312cabdff1aSopenharmony_ci        p2_d = __msa_copy_u_d((v2i64) p2_out, 0);
313cabdff1aSopenharmony_ci        p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
314cabdff1aSopenharmony_ci        p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
315cabdff1aSopenharmony_ci        q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
316cabdff1aSopenharmony_ci        q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
317cabdff1aSopenharmony_ci        q2_d = __msa_copy_u_d((v2i64) q2_out, 0);
318cabdff1aSopenharmony_ci
319cabdff1aSopenharmony_ci        src -= 3 * pitch;
320cabdff1aSopenharmony_ci
321cabdff1aSopenharmony_ci        SD4(p2_d, p1_d, p0_d, q0_d, src, pitch);
322cabdff1aSopenharmony_ci        src += (4 * pitch);
323cabdff1aSopenharmony_ci        SD(q1_d, src);
324cabdff1aSopenharmony_ci        src += pitch;
325cabdff1aSopenharmony_ci        SD(q2_d, src);
326cabdff1aSopenharmony_ci    }
327cabdff1aSopenharmony_ci}
328cabdff1aSopenharmony_ci
329cabdff1aSopenharmony_civoid ff_loop_filter_v_88_16_msa(uint8_t *src, ptrdiff_t pitch,
330cabdff1aSopenharmony_ci                                int32_t b_limit_ptr,
331cabdff1aSopenharmony_ci                                int32_t limit_ptr,
332cabdff1aSopenharmony_ci                                int32_t thresh_ptr)
333cabdff1aSopenharmony_ci{
334cabdff1aSopenharmony_ci    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
335cabdff1aSopenharmony_ci    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
336cabdff1aSopenharmony_ci    v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
337cabdff1aSopenharmony_ci    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
338cabdff1aSopenharmony_ci    v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
339cabdff1aSopenharmony_ci    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
340cabdff1aSopenharmony_ci    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
341cabdff1aSopenharmony_ci    v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
342cabdff1aSopenharmony_ci    v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
343cabdff1aSopenharmony_ci    v16u8 zero = { 0 };
344cabdff1aSopenharmony_ci
345cabdff1aSopenharmony_ci    /* load vector elements */
346cabdff1aSopenharmony_ci    LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
347cabdff1aSopenharmony_ci
348cabdff1aSopenharmony_ci    thresh = (v16u8) __msa_fill_b(thresh_ptr);
349cabdff1aSopenharmony_ci    tmp = (v16u8) __msa_fill_b(thresh_ptr >> 8);
350cabdff1aSopenharmony_ci    thresh = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) thresh);
351cabdff1aSopenharmony_ci
352cabdff1aSopenharmony_ci    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
353cabdff1aSopenharmony_ci    tmp = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
354cabdff1aSopenharmony_ci    b_limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) b_limit);
355cabdff1aSopenharmony_ci
356cabdff1aSopenharmony_ci    limit = (v16u8) __msa_fill_b(limit_ptr);
357cabdff1aSopenharmony_ci    tmp = (v16u8) __msa_fill_b(limit_ptr >> 8);
358cabdff1aSopenharmony_ci    limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) limit);
359cabdff1aSopenharmony_ci
360cabdff1aSopenharmony_ci    /* mask and hev */
361cabdff1aSopenharmony_ci    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
362cabdff1aSopenharmony_ci                 hev, mask, flat);
363cabdff1aSopenharmony_ci    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
364cabdff1aSopenharmony_ci    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
365cabdff1aSopenharmony_ci                       q1_out);
366cabdff1aSopenharmony_ci
367cabdff1aSopenharmony_ci    /* if flat is zero for all pixels, then no need to calculate other filter */
368cabdff1aSopenharmony_ci    if (__msa_test_bz_v(flat)) {
369cabdff1aSopenharmony_ci        ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
370cabdff1aSopenharmony_ci    } else {
371cabdff1aSopenharmony_ci        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
372cabdff1aSopenharmony_ci                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
373cabdff1aSopenharmony_ci                   q2_r, q3_r);
374cabdff1aSopenharmony_ci        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
375cabdff1aSopenharmony_ci                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
376cabdff1aSopenharmony_ci
377cabdff1aSopenharmony_ci        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
378cabdff1aSopenharmony_ci                   p0_l);
379cabdff1aSopenharmony_ci        ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
380cabdff1aSopenharmony_ci                   q3_l);
381cabdff1aSopenharmony_ci        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
382cabdff1aSopenharmony_ci                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
383cabdff1aSopenharmony_ci
384cabdff1aSopenharmony_ci        /* convert 16 bit output data into 8 bit */
385cabdff1aSopenharmony_ci        PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
386cabdff1aSopenharmony_ci                    p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
387cabdff1aSopenharmony_ci                    p0_filt8_r, q0_filt8_r);
388cabdff1aSopenharmony_ci        PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r,
389cabdff1aSopenharmony_ci                    q1_filt8_r, q2_filt8_r);
390cabdff1aSopenharmony_ci
391cabdff1aSopenharmony_ci        /* store pixel values */
392cabdff1aSopenharmony_ci        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
393cabdff1aSopenharmony_ci        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
394cabdff1aSopenharmony_ci        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
395cabdff1aSopenharmony_ci        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
396cabdff1aSopenharmony_ci        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
397cabdff1aSopenharmony_ci        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
398cabdff1aSopenharmony_ci
399cabdff1aSopenharmony_ci        src -= 3 * pitch;
400cabdff1aSopenharmony_ci
401cabdff1aSopenharmony_ci        ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
402cabdff1aSopenharmony_ci        src += (4 * pitch);
403cabdff1aSopenharmony_ci        ST_UB2(q1_out, q2_out, src, pitch);
404cabdff1aSopenharmony_ci        src += (2 * pitch);
405cabdff1aSopenharmony_ci    }
406cabdff1aSopenharmony_ci}
407cabdff1aSopenharmony_ci
408cabdff1aSopenharmony_civoid ff_loop_filter_v_84_16_msa(uint8_t *src, ptrdiff_t pitch,
409cabdff1aSopenharmony_ci                                int32_t b_limit_ptr,
410cabdff1aSopenharmony_ci                                int32_t limit_ptr,
411cabdff1aSopenharmony_ci                                int32_t thresh_ptr)
412cabdff1aSopenharmony_ci{
413cabdff1aSopenharmony_ci    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
414cabdff1aSopenharmony_ci    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
415cabdff1aSopenharmony_ci    v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
416cabdff1aSopenharmony_ci    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
417cabdff1aSopenharmony_ci    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
418cabdff1aSopenharmony_ci    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
419cabdff1aSopenharmony_ci    v16u8 zero = { 0 };
420cabdff1aSopenharmony_ci
421cabdff1aSopenharmony_ci    /* load vector elements */
422cabdff1aSopenharmony_ci    LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
423cabdff1aSopenharmony_ci
424cabdff1aSopenharmony_ci    thresh = (v16u8) __msa_fill_b(thresh_ptr);
425cabdff1aSopenharmony_ci    tmp = (v16u8) __msa_fill_b(thresh_ptr >> 8);
426cabdff1aSopenharmony_ci    thresh = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) thresh);
427cabdff1aSopenharmony_ci
428cabdff1aSopenharmony_ci    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
429cabdff1aSopenharmony_ci    tmp = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
430cabdff1aSopenharmony_ci    b_limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) b_limit);
431cabdff1aSopenharmony_ci
432cabdff1aSopenharmony_ci    limit = (v16u8) __msa_fill_b(limit_ptr);
433cabdff1aSopenharmony_ci    tmp = (v16u8) __msa_fill_b(limit_ptr >> 8);
434cabdff1aSopenharmony_ci    limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) limit);
435cabdff1aSopenharmony_ci
436cabdff1aSopenharmony_ci    /* mask and hev */
437cabdff1aSopenharmony_ci    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
438cabdff1aSopenharmony_ci                 hev, mask, flat);
439cabdff1aSopenharmony_ci    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
440cabdff1aSopenharmony_ci    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
441cabdff1aSopenharmony_ci                       q1_out);
442cabdff1aSopenharmony_ci
443cabdff1aSopenharmony_ci    flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
444cabdff1aSopenharmony_ci
445cabdff1aSopenharmony_ci    /* if flat is zero for all pixels, then no need to calculate other filter */
446cabdff1aSopenharmony_ci    if (__msa_test_bz_v(flat)) {
447cabdff1aSopenharmony_ci        ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
448cabdff1aSopenharmony_ci    } else {
449cabdff1aSopenharmony_ci        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
450cabdff1aSopenharmony_ci                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
451cabdff1aSopenharmony_ci                   q2_r, q3_r);
452cabdff1aSopenharmony_ci        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
453cabdff1aSopenharmony_ci                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
454cabdff1aSopenharmony_ci
455cabdff1aSopenharmony_ci        /* convert 16 bit output data into 8 bit */
456cabdff1aSopenharmony_ci        PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r,
457cabdff1aSopenharmony_ci                    p0_filt8_r, p0_filt8_r, q0_filt8_r, q0_filt8_r,
458cabdff1aSopenharmony_ci                    p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r);
459cabdff1aSopenharmony_ci        PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r,
460cabdff1aSopenharmony_ci                    q1_filt8_r, q2_filt8_r);
461cabdff1aSopenharmony_ci
462cabdff1aSopenharmony_ci        /* store pixel values */
463cabdff1aSopenharmony_ci        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
464cabdff1aSopenharmony_ci        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
465cabdff1aSopenharmony_ci        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
466cabdff1aSopenharmony_ci        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
467cabdff1aSopenharmony_ci        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
468cabdff1aSopenharmony_ci        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
469cabdff1aSopenharmony_ci
470cabdff1aSopenharmony_ci        src -= 3 * pitch;
471cabdff1aSopenharmony_ci
472cabdff1aSopenharmony_ci        ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
473cabdff1aSopenharmony_ci        src += (4 * pitch);
474cabdff1aSopenharmony_ci        ST_UB2(q1_out, q2_out, src, pitch);
475cabdff1aSopenharmony_ci        src += (2 * pitch);
476cabdff1aSopenharmony_ci    }
477cabdff1aSopenharmony_ci}
478cabdff1aSopenharmony_ci
479cabdff1aSopenharmony_civoid ff_loop_filter_v_48_16_msa(uint8_t *src, ptrdiff_t pitch,
480cabdff1aSopenharmony_ci                                int32_t b_limit_ptr,
481cabdff1aSopenharmony_ci                                int32_t limit_ptr,
482cabdff1aSopenharmony_ci                                int32_t thresh_ptr)
483cabdff1aSopenharmony_ci{
484cabdff1aSopenharmony_ci    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
485cabdff1aSopenharmony_ci    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
486cabdff1aSopenharmony_ci    v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
487cabdff1aSopenharmony_ci    v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
488cabdff1aSopenharmony_ci    v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
489cabdff1aSopenharmony_ci    v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
490cabdff1aSopenharmony_ci    v16u8 zero = { 0 };
491cabdff1aSopenharmony_ci
492cabdff1aSopenharmony_ci    /* load vector elements */
493cabdff1aSopenharmony_ci    LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
494cabdff1aSopenharmony_ci
495cabdff1aSopenharmony_ci    thresh = (v16u8) __msa_fill_b(thresh_ptr);
496cabdff1aSopenharmony_ci    tmp = (v16u8) __msa_fill_b(thresh_ptr >> 8);
497cabdff1aSopenharmony_ci    thresh = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) thresh);
498cabdff1aSopenharmony_ci
499cabdff1aSopenharmony_ci    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
500cabdff1aSopenharmony_ci    tmp = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
501cabdff1aSopenharmony_ci    b_limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) b_limit);
502cabdff1aSopenharmony_ci
503cabdff1aSopenharmony_ci    limit = (v16u8) __msa_fill_b(limit_ptr);
504cabdff1aSopenharmony_ci    tmp = (v16u8) __msa_fill_b(limit_ptr >> 8);
505cabdff1aSopenharmony_ci    limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) limit);
506cabdff1aSopenharmony_ci
507cabdff1aSopenharmony_ci    /* mask and hev */
508cabdff1aSopenharmony_ci    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
509cabdff1aSopenharmony_ci                 hev, mask, flat);
510cabdff1aSopenharmony_ci    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
511cabdff1aSopenharmony_ci    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
512cabdff1aSopenharmony_ci                       q1_out);
513cabdff1aSopenharmony_ci
514cabdff1aSopenharmony_ci    flat = (v16u8) __msa_insve_d((v2i64) flat, 0, (v2i64) zero);
515cabdff1aSopenharmony_ci
516cabdff1aSopenharmony_ci    /* if flat is zero for all pixels, then no need to calculate other filter */
517cabdff1aSopenharmony_ci    if (__msa_test_bz_v(flat)) {
518cabdff1aSopenharmony_ci        ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
519cabdff1aSopenharmony_ci    } else {
520cabdff1aSopenharmony_ci        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
521cabdff1aSopenharmony_ci                   p0_l);
522cabdff1aSopenharmony_ci        ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
523cabdff1aSopenharmony_ci                   q3_l);
524cabdff1aSopenharmony_ci        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
525cabdff1aSopenharmony_ci                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
526cabdff1aSopenharmony_ci
527cabdff1aSopenharmony_ci        /* convert 16 bit output data into 8 bit */
528cabdff1aSopenharmony_ci        PCKEV_B4_SH(p2_filt8_l, p2_filt8_l, p1_filt8_l, p1_filt8_l,
529cabdff1aSopenharmony_ci                    p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l,
530cabdff1aSopenharmony_ci                    p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l);
531cabdff1aSopenharmony_ci        PCKEV_B2_SH(q1_filt8_l, q1_filt8_l, q2_filt8_l, q2_filt8_l,
532cabdff1aSopenharmony_ci                    q1_filt8_l, q2_filt8_l);
533cabdff1aSopenharmony_ci
534cabdff1aSopenharmony_ci        /* store pixel values */
535cabdff1aSopenharmony_ci        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_l, flat);
536cabdff1aSopenharmony_ci        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_l, flat);
537cabdff1aSopenharmony_ci        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_l, flat);
538cabdff1aSopenharmony_ci        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_l, flat);
539cabdff1aSopenharmony_ci        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_l, flat);
540cabdff1aSopenharmony_ci        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_l, flat);
541cabdff1aSopenharmony_ci
542cabdff1aSopenharmony_ci        src -= 3 * pitch;
543cabdff1aSopenharmony_ci
544cabdff1aSopenharmony_ci        ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
545cabdff1aSopenharmony_ci        src += (4 * pitch);
546cabdff1aSopenharmony_ci        ST_UB2(q1_out, q2_out, src, pitch);
547cabdff1aSopenharmony_ci        src += (2 * pitch);
548cabdff1aSopenharmony_ci    }
549cabdff1aSopenharmony_ci}
550cabdff1aSopenharmony_ci
551cabdff1aSopenharmony_cistatic int32_t vp9_hz_lpf_t4_and_t8_16w(uint8_t *src, ptrdiff_t pitch,
552cabdff1aSopenharmony_ci                                        uint8_t *filter48,
553cabdff1aSopenharmony_ci                                        int32_t b_limit_ptr,
554cabdff1aSopenharmony_ci                                        int32_t limit_ptr,
555cabdff1aSopenharmony_ci                                        int32_t thresh_ptr)
556cabdff1aSopenharmony_ci{
557cabdff1aSopenharmony_ci    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
558cabdff1aSopenharmony_ci    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
559cabdff1aSopenharmony_ci    v16u8 flat, mask, hev, thresh, b_limit, limit;
560cabdff1aSopenharmony_ci    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
561cabdff1aSopenharmony_ci    v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
562cabdff1aSopenharmony_ci    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
563cabdff1aSopenharmony_ci    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
564cabdff1aSopenharmony_ci    v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
565cabdff1aSopenharmony_ci    v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
566cabdff1aSopenharmony_ci    v16u8 zero = { 0 };
567cabdff1aSopenharmony_ci
568cabdff1aSopenharmony_ci    /* load vector elements */
569cabdff1aSopenharmony_ci    LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
570cabdff1aSopenharmony_ci
571cabdff1aSopenharmony_ci    thresh = (v16u8) __msa_fill_b(thresh_ptr);
572cabdff1aSopenharmony_ci    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
573cabdff1aSopenharmony_ci    limit = (v16u8) __msa_fill_b(limit_ptr);
574cabdff1aSopenharmony_ci
575cabdff1aSopenharmony_ci    /* mask and hev */
576cabdff1aSopenharmony_ci    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
577cabdff1aSopenharmony_ci                 hev, mask, flat);
578cabdff1aSopenharmony_ci    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
579cabdff1aSopenharmony_ci    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
580cabdff1aSopenharmony_ci                       q1_out);
581cabdff1aSopenharmony_ci
582cabdff1aSopenharmony_ci    /* if flat is zero for all pixels, then no need to calculate other filter */
583cabdff1aSopenharmony_ci    if (__msa_test_bz_v(flat)) {
584cabdff1aSopenharmony_ci        ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
585cabdff1aSopenharmony_ci
586cabdff1aSopenharmony_ci        return 1;
587cabdff1aSopenharmony_ci    } else {
588cabdff1aSopenharmony_ci        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
589cabdff1aSopenharmony_ci                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
590cabdff1aSopenharmony_ci                   q2_r, q3_r);
591cabdff1aSopenharmony_ci        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
592cabdff1aSopenharmony_ci                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
593cabdff1aSopenharmony_ci
594cabdff1aSopenharmony_ci        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
595cabdff1aSopenharmony_ci                   p0_l);
596cabdff1aSopenharmony_ci        ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
597cabdff1aSopenharmony_ci                   q3_l);
598cabdff1aSopenharmony_ci        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
599cabdff1aSopenharmony_ci                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
600cabdff1aSopenharmony_ci
601cabdff1aSopenharmony_ci        /* convert 16 bit output data into 8 bit */
602cabdff1aSopenharmony_ci        PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
603cabdff1aSopenharmony_ci                    p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
604cabdff1aSopenharmony_ci                    p0_filt8_r, q0_filt8_r);
605cabdff1aSopenharmony_ci        PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
606cabdff1aSopenharmony_ci                    q2_filt8_r);
607cabdff1aSopenharmony_ci
608cabdff1aSopenharmony_ci        /* store pixel values */
609cabdff1aSopenharmony_ci        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
610cabdff1aSopenharmony_ci        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
611cabdff1aSopenharmony_ci        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
612cabdff1aSopenharmony_ci        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
613cabdff1aSopenharmony_ci        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
614cabdff1aSopenharmony_ci        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
615cabdff1aSopenharmony_ci
616cabdff1aSopenharmony_ci        ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
617cabdff1aSopenharmony_ci        filter48 += (4 * 16);
618cabdff1aSopenharmony_ci        ST_UB2(q1_out, q2_out, filter48, 16);
619cabdff1aSopenharmony_ci        filter48 += (2 * 16);
620cabdff1aSopenharmony_ci        ST_UB(flat, filter48);
621cabdff1aSopenharmony_ci
622cabdff1aSopenharmony_ci        return 0;
623cabdff1aSopenharmony_ci    }
624cabdff1aSopenharmony_ci}
625cabdff1aSopenharmony_ci
626cabdff1aSopenharmony_cistatic void vp9_hz_lpf_t16_16w(uint8_t *src, ptrdiff_t pitch, uint8_t *filter48)
627cabdff1aSopenharmony_ci{
628cabdff1aSopenharmony_ci    v16u8 flat, flat2, filter8;
629cabdff1aSopenharmony_ci    v16i8 zero = { 0 };
630cabdff1aSopenharmony_ci    v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
631cabdff1aSopenharmony_ci    v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in;
632cabdff1aSopenharmony_ci    v8u16 p3_r_in, p2_r_in, p1_r_in, p0_r_in;
633cabdff1aSopenharmony_ci    v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in;
634cabdff1aSopenharmony_ci    v8u16 q3_r_in, q2_r_in, q1_r_in, q0_r_in;
635cabdff1aSopenharmony_ci    v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in;
636cabdff1aSopenharmony_ci    v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in;
637cabdff1aSopenharmony_ci    v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in;
638cabdff1aSopenharmony_ci    v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in;
639cabdff1aSopenharmony_ci    v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
640cabdff1aSopenharmony_ci    v8i16 l_out, r_out;
641cabdff1aSopenharmony_ci
642cabdff1aSopenharmony_ci    flat = LD_UB(filter48 + 96);
643cabdff1aSopenharmony_ci
644cabdff1aSopenharmony_ci    LD_UB8((src - 8 * pitch), pitch, p7, p6, p5, p4, p3, p2, p1, p0);
645cabdff1aSopenharmony_ci    LD_UB8(src, pitch, q0, q1, q2, q3, q4, q5, q6, q7);
646cabdff1aSopenharmony_ci    VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
647cabdff1aSopenharmony_ci
648cabdff1aSopenharmony_ci    /* if flat2 is zero for all pixels, then no need to calculate other filter */
649cabdff1aSopenharmony_ci    if (__msa_test_bz_v(flat2)) {
650cabdff1aSopenharmony_ci        LD_UB4(filter48, 16, p2, p1, p0, q0);
651cabdff1aSopenharmony_ci        LD_UB2(filter48 + 4 * 16, 16, q1, q2);
652cabdff1aSopenharmony_ci
653cabdff1aSopenharmony_ci        src -= 3 * pitch;
654cabdff1aSopenharmony_ci        ST_UB4(p2, p1, p0, q0, src, pitch);
655cabdff1aSopenharmony_ci        src += (4 * pitch);
656cabdff1aSopenharmony_ci        ST_UB2(q1, q2, src, pitch);
657cabdff1aSopenharmony_ci    } else {
658cabdff1aSopenharmony_ci        src -= 7 * pitch;
659cabdff1aSopenharmony_ci
660cabdff1aSopenharmony_ci        ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
661cabdff1aSopenharmony_ci                   zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
662cabdff1aSopenharmony_ci                   p3_r_in, p2_r_in, p1_r_in, p0_r_in);
663cabdff1aSopenharmony_ci
664cabdff1aSopenharmony_ci        q0_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q0);
665cabdff1aSopenharmony_ci
666cabdff1aSopenharmony_ci        tmp0_r = p7_r_in << 3;
667cabdff1aSopenharmony_ci        tmp0_r -= p7_r_in;
668cabdff1aSopenharmony_ci        tmp0_r += p6_r_in;
669cabdff1aSopenharmony_ci        tmp0_r += q0_r_in;
670cabdff1aSopenharmony_ci        tmp1_r = p6_r_in + p5_r_in;
671cabdff1aSopenharmony_ci        tmp1_r += p4_r_in;
672cabdff1aSopenharmony_ci        tmp1_r += p3_r_in;
673cabdff1aSopenharmony_ci        tmp1_r += p2_r_in;
674cabdff1aSopenharmony_ci        tmp1_r += p1_r_in;
675cabdff1aSopenharmony_ci        tmp1_r += p0_r_in;
676cabdff1aSopenharmony_ci        tmp1_r += tmp0_r;
677cabdff1aSopenharmony_ci        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
678cabdff1aSopenharmony_ci
679cabdff1aSopenharmony_ci        ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
680cabdff1aSopenharmony_ci                   p5_l_in, p4_l_in);
681cabdff1aSopenharmony_ci        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
682cabdff1aSopenharmony_ci                   p1_l_in, p0_l_in);
683cabdff1aSopenharmony_ci        q0_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q0);
684cabdff1aSopenharmony_ci
685cabdff1aSopenharmony_ci        tmp0_l = p7_l_in << 3;
686cabdff1aSopenharmony_ci        tmp0_l -= p7_l_in;
687cabdff1aSopenharmony_ci        tmp0_l += p6_l_in;
688cabdff1aSopenharmony_ci        tmp0_l += q0_l_in;
689cabdff1aSopenharmony_ci        tmp1_l = p6_l_in + p5_l_in;
690cabdff1aSopenharmony_ci        tmp1_l += p4_l_in;
691cabdff1aSopenharmony_ci        tmp1_l += p3_l_in;
692cabdff1aSopenharmony_ci        tmp1_l += p2_l_in;
693cabdff1aSopenharmony_ci        tmp1_l += p1_l_in;
694cabdff1aSopenharmony_ci        tmp1_l += p0_l_in;
695cabdff1aSopenharmony_ci        tmp1_l += tmp0_l;
696cabdff1aSopenharmony_ci        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
697cabdff1aSopenharmony_ci
698cabdff1aSopenharmony_ci        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
699cabdff1aSopenharmony_ci        p6 = __msa_bmnz_v(p6, (v16u8) r_out, flat2);
700cabdff1aSopenharmony_ci        ST_UB(p6, src);
701cabdff1aSopenharmony_ci        src += pitch;
702cabdff1aSopenharmony_ci
703cabdff1aSopenharmony_ci        /* p5 */
704cabdff1aSopenharmony_ci        q1_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q1);
705cabdff1aSopenharmony_ci        tmp0_r = p5_r_in - p6_r_in;
706cabdff1aSopenharmony_ci        tmp0_r += q1_r_in;
707cabdff1aSopenharmony_ci        tmp0_r -= p7_r_in;
708cabdff1aSopenharmony_ci        tmp1_r += tmp0_r;
709cabdff1aSopenharmony_ci        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
710cabdff1aSopenharmony_ci
711cabdff1aSopenharmony_ci        q1_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q1);
712cabdff1aSopenharmony_ci        tmp0_l = p5_l_in - p6_l_in;
713cabdff1aSopenharmony_ci        tmp0_l += q1_l_in;
714cabdff1aSopenharmony_ci        tmp0_l -= p7_l_in;
715cabdff1aSopenharmony_ci        tmp1_l += tmp0_l;
716cabdff1aSopenharmony_ci        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
717cabdff1aSopenharmony_ci
718cabdff1aSopenharmony_ci        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
719cabdff1aSopenharmony_ci        p5 = __msa_bmnz_v(p5, (v16u8) r_out, flat2);
720cabdff1aSopenharmony_ci        ST_UB(p5, src);
721cabdff1aSopenharmony_ci        src += pitch;
722cabdff1aSopenharmony_ci
723cabdff1aSopenharmony_ci        /* p4 */
724cabdff1aSopenharmony_ci        q2_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q2);
725cabdff1aSopenharmony_ci        tmp0_r = p4_r_in - p5_r_in;
726cabdff1aSopenharmony_ci        tmp0_r += q2_r_in;
727cabdff1aSopenharmony_ci        tmp0_r -= p7_r_in;
728cabdff1aSopenharmony_ci        tmp1_r += tmp0_r;
729cabdff1aSopenharmony_ci        r_out = (v8i16) __msa_srari_h((v8i16) tmp1_r, 4);
730cabdff1aSopenharmony_ci
731cabdff1aSopenharmony_ci        q2_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q2);
732cabdff1aSopenharmony_ci        tmp0_l = p4_l_in - p5_l_in;
733cabdff1aSopenharmony_ci        tmp0_l += q2_l_in;
734cabdff1aSopenharmony_ci        tmp0_l -= p7_l_in;
735cabdff1aSopenharmony_ci        tmp1_l += tmp0_l;
736cabdff1aSopenharmony_ci        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
737cabdff1aSopenharmony_ci
738cabdff1aSopenharmony_ci        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
739cabdff1aSopenharmony_ci        p4 = __msa_bmnz_v(p4, (v16u8) r_out, flat2);
740cabdff1aSopenharmony_ci        ST_UB(p4, src);
741cabdff1aSopenharmony_ci        src += pitch;
742cabdff1aSopenharmony_ci
743cabdff1aSopenharmony_ci        /* p3 */
744cabdff1aSopenharmony_ci        q3_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q3);
745cabdff1aSopenharmony_ci        tmp0_r = p3_r_in - p4_r_in;
746cabdff1aSopenharmony_ci        tmp0_r += q3_r_in;
747cabdff1aSopenharmony_ci        tmp0_r -= p7_r_in;
748cabdff1aSopenharmony_ci        tmp1_r += tmp0_r;
749cabdff1aSopenharmony_ci        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
750cabdff1aSopenharmony_ci
751cabdff1aSopenharmony_ci        q3_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q3);
752cabdff1aSopenharmony_ci        tmp0_l = p3_l_in - p4_l_in;
753cabdff1aSopenharmony_ci        tmp0_l += q3_l_in;
754cabdff1aSopenharmony_ci        tmp0_l -= p7_l_in;
755cabdff1aSopenharmony_ci        tmp1_l += tmp0_l;
756cabdff1aSopenharmony_ci        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
757cabdff1aSopenharmony_ci
758cabdff1aSopenharmony_ci        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
759cabdff1aSopenharmony_ci        p3 = __msa_bmnz_v(p3, (v16u8) r_out, flat2);
760cabdff1aSopenharmony_ci        ST_UB(p3, src);
761cabdff1aSopenharmony_ci        src += pitch;
762cabdff1aSopenharmony_ci
763cabdff1aSopenharmony_ci        /* p2 */
764cabdff1aSopenharmony_ci        q4_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q4);
765cabdff1aSopenharmony_ci        filter8 = LD_UB(filter48);
766cabdff1aSopenharmony_ci        tmp0_r = p2_r_in - p3_r_in;
767cabdff1aSopenharmony_ci        tmp0_r += q4_r_in;
768cabdff1aSopenharmony_ci        tmp0_r -= p7_r_in;
769cabdff1aSopenharmony_ci        tmp1_r += tmp0_r;
770cabdff1aSopenharmony_ci        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
771cabdff1aSopenharmony_ci
772cabdff1aSopenharmony_ci        q4_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q4);
773cabdff1aSopenharmony_ci        tmp0_l = p2_l_in - p3_l_in;
774cabdff1aSopenharmony_ci        tmp0_l += q4_l_in;
775cabdff1aSopenharmony_ci        tmp0_l -= p7_l_in;
776cabdff1aSopenharmony_ci        tmp1_l += tmp0_l;
777cabdff1aSopenharmony_ci        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
778cabdff1aSopenharmony_ci
779cabdff1aSopenharmony_ci        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
780cabdff1aSopenharmony_ci        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
781cabdff1aSopenharmony_ci        ST_UB(filter8, src);
782cabdff1aSopenharmony_ci        src += pitch;
783cabdff1aSopenharmony_ci
784cabdff1aSopenharmony_ci        /* p1 */
785cabdff1aSopenharmony_ci        q5_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q5);
786cabdff1aSopenharmony_ci        filter8 = LD_UB(filter48 + 16);
787cabdff1aSopenharmony_ci        tmp0_r = p1_r_in - p2_r_in;
788cabdff1aSopenharmony_ci        tmp0_r += q5_r_in;
789cabdff1aSopenharmony_ci        tmp0_r -= p7_r_in;
790cabdff1aSopenharmony_ci        tmp1_r += tmp0_r;
791cabdff1aSopenharmony_ci        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
792cabdff1aSopenharmony_ci
793cabdff1aSopenharmony_ci        q5_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q5);
794cabdff1aSopenharmony_ci        tmp0_l = p1_l_in - p2_l_in;
795cabdff1aSopenharmony_ci        tmp0_l += q5_l_in;
796cabdff1aSopenharmony_ci        tmp0_l -= p7_l_in;
797cabdff1aSopenharmony_ci        tmp1_l += tmp0_l;
798cabdff1aSopenharmony_ci        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
799cabdff1aSopenharmony_ci
800cabdff1aSopenharmony_ci        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
801cabdff1aSopenharmony_ci        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
802cabdff1aSopenharmony_ci        ST_UB(filter8, src);
803cabdff1aSopenharmony_ci        src += pitch;
804cabdff1aSopenharmony_ci
805cabdff1aSopenharmony_ci        /* p0 */
806cabdff1aSopenharmony_ci        q6_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q6);
807cabdff1aSopenharmony_ci        filter8 = LD_UB(filter48 + 32);
808cabdff1aSopenharmony_ci        tmp0_r = p0_r_in - p1_r_in;
809cabdff1aSopenharmony_ci        tmp0_r += q6_r_in;
810cabdff1aSopenharmony_ci        tmp0_r -= p7_r_in;
811cabdff1aSopenharmony_ci        tmp1_r += tmp0_r;
812cabdff1aSopenharmony_ci        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
813cabdff1aSopenharmony_ci
814cabdff1aSopenharmony_ci        q6_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q6);
815cabdff1aSopenharmony_ci        tmp0_l = p0_l_in - p1_l_in;
816cabdff1aSopenharmony_ci        tmp0_l += q6_l_in;
817cabdff1aSopenharmony_ci        tmp0_l -= p7_l_in;
818cabdff1aSopenharmony_ci        tmp1_l += tmp0_l;
819cabdff1aSopenharmony_ci        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
820cabdff1aSopenharmony_ci
821cabdff1aSopenharmony_ci        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
822cabdff1aSopenharmony_ci        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
823cabdff1aSopenharmony_ci        ST_UB(filter8, src);
824cabdff1aSopenharmony_ci        src += pitch;
825cabdff1aSopenharmony_ci
826cabdff1aSopenharmony_ci        /* q0 */
827cabdff1aSopenharmony_ci        q7_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q7);
828cabdff1aSopenharmony_ci        filter8 = LD_UB(filter48 + 48);
829cabdff1aSopenharmony_ci        tmp0_r = q7_r_in - p0_r_in;
830cabdff1aSopenharmony_ci        tmp0_r += q0_r_in;
831cabdff1aSopenharmony_ci        tmp0_r -= p7_r_in;
832cabdff1aSopenharmony_ci        tmp1_r += tmp0_r;
833cabdff1aSopenharmony_ci        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
834cabdff1aSopenharmony_ci
835cabdff1aSopenharmony_ci        q7_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q7);
836cabdff1aSopenharmony_ci        tmp0_l = q7_l_in - p0_l_in;
837cabdff1aSopenharmony_ci        tmp0_l += q0_l_in;
838cabdff1aSopenharmony_ci        tmp0_l -= p7_l_in;
839cabdff1aSopenharmony_ci        tmp1_l += tmp0_l;
840cabdff1aSopenharmony_ci        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
841cabdff1aSopenharmony_ci
842cabdff1aSopenharmony_ci        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
843cabdff1aSopenharmony_ci        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
844cabdff1aSopenharmony_ci        ST_UB(filter8, src);
845cabdff1aSopenharmony_ci        src += pitch;
846cabdff1aSopenharmony_ci
847cabdff1aSopenharmony_ci        /* q1 */
848cabdff1aSopenharmony_ci        filter8 = LD_UB(filter48 + 64);
849cabdff1aSopenharmony_ci        tmp0_r = q7_r_in - q0_r_in;
850cabdff1aSopenharmony_ci        tmp0_r += q1_r_in;
851cabdff1aSopenharmony_ci        tmp0_r -= p6_r_in;
852cabdff1aSopenharmony_ci        tmp1_r += tmp0_r;
853cabdff1aSopenharmony_ci        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
854cabdff1aSopenharmony_ci
855cabdff1aSopenharmony_ci        tmp0_l = q7_l_in - q0_l_in;
856cabdff1aSopenharmony_ci        tmp0_l += q1_l_in;
857cabdff1aSopenharmony_ci        tmp0_l -= p6_l_in;
858cabdff1aSopenharmony_ci        tmp1_l += tmp0_l;
859cabdff1aSopenharmony_ci        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
860cabdff1aSopenharmony_ci
861cabdff1aSopenharmony_ci        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
862cabdff1aSopenharmony_ci        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
863cabdff1aSopenharmony_ci        ST_UB(filter8, src);
864cabdff1aSopenharmony_ci        src += pitch;
865cabdff1aSopenharmony_ci
866cabdff1aSopenharmony_ci        /* q2 */
867cabdff1aSopenharmony_ci        filter8 = LD_UB(filter48 + 80);
868cabdff1aSopenharmony_ci        tmp0_r = q7_r_in - q1_r_in;
869cabdff1aSopenharmony_ci        tmp0_r += q2_r_in;
870cabdff1aSopenharmony_ci        tmp0_r -= p5_r_in;
871cabdff1aSopenharmony_ci        tmp1_r += tmp0_r;
872cabdff1aSopenharmony_ci        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
873cabdff1aSopenharmony_ci
874cabdff1aSopenharmony_ci        tmp0_l = q7_l_in - q1_l_in;
875cabdff1aSopenharmony_ci        tmp0_l += q2_l_in;
876cabdff1aSopenharmony_ci        tmp0_l -= p5_l_in;
877cabdff1aSopenharmony_ci        tmp1_l += tmp0_l;
878cabdff1aSopenharmony_ci        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
879cabdff1aSopenharmony_ci
880cabdff1aSopenharmony_ci        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
881cabdff1aSopenharmony_ci        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
882cabdff1aSopenharmony_ci        ST_UB(filter8, src);
883cabdff1aSopenharmony_ci        src += pitch;
884cabdff1aSopenharmony_ci
885cabdff1aSopenharmony_ci        /* q3 */
886cabdff1aSopenharmony_ci        tmp0_r = q7_r_in - q2_r_in;
887cabdff1aSopenharmony_ci        tmp0_r += q3_r_in;
888cabdff1aSopenharmony_ci        tmp0_r -= p4_r_in;
889cabdff1aSopenharmony_ci        tmp1_r += tmp0_r;
890cabdff1aSopenharmony_ci        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
891cabdff1aSopenharmony_ci
892cabdff1aSopenharmony_ci        tmp0_l = q7_l_in - q2_l_in;
893cabdff1aSopenharmony_ci        tmp0_l += q3_l_in;
894cabdff1aSopenharmony_ci        tmp0_l -= p4_l_in;
895cabdff1aSopenharmony_ci        tmp1_l += tmp0_l;
896cabdff1aSopenharmony_ci        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
897cabdff1aSopenharmony_ci
898cabdff1aSopenharmony_ci        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
899cabdff1aSopenharmony_ci        q3 = __msa_bmnz_v(q3, (v16u8) r_out, flat2);
900cabdff1aSopenharmony_ci        ST_UB(q3, src);
901cabdff1aSopenharmony_ci        src += pitch;
902cabdff1aSopenharmony_ci
903cabdff1aSopenharmony_ci        /* q4 */
904cabdff1aSopenharmony_ci        tmp0_r = q7_r_in - q3_r_in;
905cabdff1aSopenharmony_ci        tmp0_r += q4_r_in;
906cabdff1aSopenharmony_ci        tmp0_r -= p3_r_in;
907cabdff1aSopenharmony_ci        tmp1_r += tmp0_r;
908cabdff1aSopenharmony_ci        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
909cabdff1aSopenharmony_ci
910cabdff1aSopenharmony_ci        tmp0_l = q7_l_in - q3_l_in;
911cabdff1aSopenharmony_ci        tmp0_l += q4_l_in;
912cabdff1aSopenharmony_ci        tmp0_l -= p3_l_in;
913cabdff1aSopenharmony_ci        tmp1_l += tmp0_l;
914cabdff1aSopenharmony_ci        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
915cabdff1aSopenharmony_ci
916cabdff1aSopenharmony_ci        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
917cabdff1aSopenharmony_ci        q4 = __msa_bmnz_v(q4, (v16u8) r_out, flat2);
918cabdff1aSopenharmony_ci        ST_UB(q4, src);
919cabdff1aSopenharmony_ci        src += pitch;
920cabdff1aSopenharmony_ci
921cabdff1aSopenharmony_ci        /* q5 */
922cabdff1aSopenharmony_ci        tmp0_r = q7_r_in - q4_r_in;
923cabdff1aSopenharmony_ci        tmp0_r += q5_r_in;
924cabdff1aSopenharmony_ci        tmp0_r -= p2_r_in;
925cabdff1aSopenharmony_ci        tmp1_r += tmp0_r;
926cabdff1aSopenharmony_ci        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
927cabdff1aSopenharmony_ci
928cabdff1aSopenharmony_ci        tmp0_l = q7_l_in - q4_l_in;
929cabdff1aSopenharmony_ci        tmp0_l += q5_l_in;
930cabdff1aSopenharmony_ci        tmp0_l -= p2_l_in;
931cabdff1aSopenharmony_ci        tmp1_l += tmp0_l;
932cabdff1aSopenharmony_ci        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
933cabdff1aSopenharmony_ci
934cabdff1aSopenharmony_ci        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
935cabdff1aSopenharmony_ci        q5 = __msa_bmnz_v(q5, (v16u8) r_out, flat2);
936cabdff1aSopenharmony_ci        ST_UB(q5, src);
937cabdff1aSopenharmony_ci        src += pitch;
938cabdff1aSopenharmony_ci
939cabdff1aSopenharmony_ci        /* q6 */
940cabdff1aSopenharmony_ci        tmp0_r = q7_r_in - q5_r_in;
941cabdff1aSopenharmony_ci        tmp0_r += q6_r_in;
942cabdff1aSopenharmony_ci        tmp0_r -= p1_r_in;
943cabdff1aSopenharmony_ci        tmp1_r += tmp0_r;
944cabdff1aSopenharmony_ci        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
945cabdff1aSopenharmony_ci
946cabdff1aSopenharmony_ci        tmp0_l = q7_l_in - q5_l_in;
947cabdff1aSopenharmony_ci        tmp0_l += q6_l_in;
948cabdff1aSopenharmony_ci        tmp0_l -= p1_l_in;
949cabdff1aSopenharmony_ci        tmp1_l += tmp0_l;
950cabdff1aSopenharmony_ci        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
951cabdff1aSopenharmony_ci
952cabdff1aSopenharmony_ci        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
953cabdff1aSopenharmony_ci        q6 = __msa_bmnz_v(q6, (v16u8) r_out, flat2);
954cabdff1aSopenharmony_ci        ST_UB(q6, src);
955cabdff1aSopenharmony_ci    }
956cabdff1aSopenharmony_ci}
957cabdff1aSopenharmony_ci
958cabdff1aSopenharmony_civoid ff_loop_filter_v_16_16_msa(uint8_t *src, ptrdiff_t pitch,
959cabdff1aSopenharmony_ci                                int32_t b_limit_ptr,
960cabdff1aSopenharmony_ci                                int32_t limit_ptr,
961cabdff1aSopenharmony_ci                                int32_t thresh_ptr)
962cabdff1aSopenharmony_ci{
963cabdff1aSopenharmony_ci    uint8_t filter48[16 * 8] ALLOC_ALIGNED(ALIGNMENT);
964cabdff1aSopenharmony_ci    uint8_t early_exit = 0;
965cabdff1aSopenharmony_ci
966cabdff1aSopenharmony_ci    early_exit = vp9_hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0],
967cabdff1aSopenharmony_ci                                          b_limit_ptr, limit_ptr, thresh_ptr);
968cabdff1aSopenharmony_ci
969cabdff1aSopenharmony_ci    if (0 == early_exit) {
970cabdff1aSopenharmony_ci        vp9_hz_lpf_t16_16w(src, pitch, filter48);
971cabdff1aSopenharmony_ci    }
972cabdff1aSopenharmony_ci}
973cabdff1aSopenharmony_ci
974cabdff1aSopenharmony_civoid ff_loop_filter_v_16_8_msa(uint8_t *src, ptrdiff_t pitch,
975cabdff1aSopenharmony_ci                               int32_t b_limit_ptr,
976cabdff1aSopenharmony_ci                               int32_t limit_ptr,
977cabdff1aSopenharmony_ci                               int32_t thresh_ptr)
978cabdff1aSopenharmony_ci{
979cabdff1aSopenharmony_ci    uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
980cabdff1aSopenharmony_ci    uint64_t dword0, dword1;
981cabdff1aSopenharmony_ci    v16u8 flat2, mask, hev, flat, thresh, b_limit, limit;
982cabdff1aSopenharmony_ci    v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7;
983cabdff1aSopenharmony_ci    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
984cabdff1aSopenharmony_ci    v16u8 p0_filter16, p1_filter16;
985cabdff1aSopenharmony_ci    v8i16 p2_filter8, p1_filter8, p0_filter8;
986cabdff1aSopenharmony_ci    v8i16 q0_filter8, q1_filter8, q2_filter8;
987cabdff1aSopenharmony_ci    v8u16 p7_r, p6_r, p5_r, p4_r, q7_r, q6_r, q5_r, q4_r;
988cabdff1aSopenharmony_ci    v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
989cabdff1aSopenharmony_ci    v16i8 zero = { 0 };
990cabdff1aSopenharmony_ci    v8u16 tmp0, tmp1, tmp2;
991cabdff1aSopenharmony_ci
992cabdff1aSopenharmony_ci    /* load vector elements */
993cabdff1aSopenharmony_ci    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
994cabdff1aSopenharmony_ci
995cabdff1aSopenharmony_ci    thresh = (v16u8) __msa_fill_b(thresh_ptr);
996cabdff1aSopenharmony_ci    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
997cabdff1aSopenharmony_ci    limit = (v16u8) __msa_fill_b(limit_ptr);
998cabdff1aSopenharmony_ci
999cabdff1aSopenharmony_ci    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1000cabdff1aSopenharmony_ci                 hev, mask, flat);
1001cabdff1aSopenharmony_ci    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1002cabdff1aSopenharmony_ci    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1003cabdff1aSopenharmony_ci                       q1_out);
1004cabdff1aSopenharmony_ci
1005cabdff1aSopenharmony_ci    flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
1006cabdff1aSopenharmony_ci
1007cabdff1aSopenharmony_ci    /* if flat is zero for all pixels, then no need to calculate other filter */
1008cabdff1aSopenharmony_ci    if (__msa_test_bz_v(flat)) {
1009cabdff1aSopenharmony_ci        p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
1010cabdff1aSopenharmony_ci        p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
1011cabdff1aSopenharmony_ci        q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
1012cabdff1aSopenharmony_ci        q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
1013cabdff1aSopenharmony_ci        SD4(p1_d, p0_d, q0_d, q1_d, src - 2 * pitch, pitch);
1014cabdff1aSopenharmony_ci    } else {
1015cabdff1aSopenharmony_ci        /* convert 8 bit input data into 16 bit */
1016cabdff1aSopenharmony_ci        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero,
1017cabdff1aSopenharmony_ci                   q1, zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r,
1018cabdff1aSopenharmony_ci                   q1_r, q2_r, q3_r);
1019cabdff1aSopenharmony_ci        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r,
1020cabdff1aSopenharmony_ci                    p2_filter8, p1_filter8, p0_filter8, q0_filter8,
1021cabdff1aSopenharmony_ci                    q1_filter8, q2_filter8);
1022cabdff1aSopenharmony_ci
1023cabdff1aSopenharmony_ci        /* convert 16 bit output data into 8 bit */
1024cabdff1aSopenharmony_ci        PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8,
1025cabdff1aSopenharmony_ci                    zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8,
1026cabdff1aSopenharmony_ci                    q0_filter8);
1027cabdff1aSopenharmony_ci        PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8,
1028cabdff1aSopenharmony_ci                    q2_filter8);
1029cabdff1aSopenharmony_ci
1030cabdff1aSopenharmony_ci        /* store pixel values */
1031cabdff1aSopenharmony_ci        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filter8, flat);
1032cabdff1aSopenharmony_ci        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filter8, flat);
1033cabdff1aSopenharmony_ci        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filter8, flat);
1034cabdff1aSopenharmony_ci        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filter8, flat);
1035cabdff1aSopenharmony_ci        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filter8, flat);
1036cabdff1aSopenharmony_ci        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filter8, flat);
1037cabdff1aSopenharmony_ci
1038cabdff1aSopenharmony_ci        /* load 16 vector elements */
1039cabdff1aSopenharmony_ci        LD_UB4((src - 8 * pitch), pitch, p7, p6, p5, p4);
1040cabdff1aSopenharmony_ci        LD_UB4(src + (4 * pitch), pitch, q4, q5, q6, q7);
1041cabdff1aSopenharmony_ci
1042cabdff1aSopenharmony_ci        VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
1043cabdff1aSopenharmony_ci
1044cabdff1aSopenharmony_ci        /* if flat2 is zero for all pixels, then no need to calculate other filter */
1045cabdff1aSopenharmony_ci        if (__msa_test_bz_v(flat2)) {
1046cabdff1aSopenharmony_ci            p2_d = __msa_copy_u_d((v2i64) p2_out, 0);
1047cabdff1aSopenharmony_ci            p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
1048cabdff1aSopenharmony_ci            p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
1049cabdff1aSopenharmony_ci            q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
1050cabdff1aSopenharmony_ci            q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
1051cabdff1aSopenharmony_ci            q2_d = __msa_copy_u_d((v2i64) q2_out, 0);
1052cabdff1aSopenharmony_ci
1053cabdff1aSopenharmony_ci            SD4(p2_d, p1_d, p0_d, q0_d, src - 3 * pitch, pitch);
1054cabdff1aSopenharmony_ci            SD(q1_d, src + pitch);
1055cabdff1aSopenharmony_ci            SD(q2_d, src + 2 * pitch);
1056cabdff1aSopenharmony_ci        } else {
1057cabdff1aSopenharmony_ci            /* LSB(right) 8 pixel operation */
1058cabdff1aSopenharmony_ci            ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, q4,
1059cabdff1aSopenharmony_ci                       zero, q5, zero, q6, zero, q7, p7_r, p6_r, p5_r, p4_r,
1060cabdff1aSopenharmony_ci                       q4_r, q5_r, q6_r, q7_r);
1061cabdff1aSopenharmony_ci
1062cabdff1aSopenharmony_ci            tmp0 = p7_r << 3;
1063cabdff1aSopenharmony_ci            tmp0 -= p7_r;
1064cabdff1aSopenharmony_ci            tmp0 += p6_r;
1065cabdff1aSopenharmony_ci            tmp0 += q0_r;
1066cabdff1aSopenharmony_ci
1067cabdff1aSopenharmony_ci            src -= 7 * pitch;
1068cabdff1aSopenharmony_ci
1069cabdff1aSopenharmony_ci            /* calculation of p6 and p5 */
1070cabdff1aSopenharmony_ci            tmp1 = p6_r + p5_r + p4_r + p3_r;
1071cabdff1aSopenharmony_ci            tmp1 += (p2_r + p1_r + p0_r);
1072cabdff1aSopenharmony_ci            tmp1 += tmp0;
1073cabdff1aSopenharmony_ci            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1074cabdff1aSopenharmony_ci            tmp0 = p5_r - p6_r + q1_r - p7_r;
1075cabdff1aSopenharmony_ci            tmp1 += tmp0;
1076cabdff1aSopenharmony_ci            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1077cabdff1aSopenharmony_ci            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
1078cabdff1aSopenharmony_ci                        p0_filter16, p1_filter16);
1079cabdff1aSopenharmony_ci            p0_filter16 = __msa_bmnz_v(p6, p0_filter16, flat2);
1080cabdff1aSopenharmony_ci            p1_filter16 = __msa_bmnz_v(p5, p1_filter16, flat2);
1081cabdff1aSopenharmony_ci            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1082cabdff1aSopenharmony_ci            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1083cabdff1aSopenharmony_ci            SD(dword0, src);
1084cabdff1aSopenharmony_ci            src += pitch;
1085cabdff1aSopenharmony_ci            SD(dword1, src);
1086cabdff1aSopenharmony_ci            src += pitch;
1087cabdff1aSopenharmony_ci
1088cabdff1aSopenharmony_ci            /* calculation of p4 and p3 */
1089cabdff1aSopenharmony_ci            tmp0 = p4_r - p5_r + q2_r - p7_r;
1090cabdff1aSopenharmony_ci            tmp2 = p3_r - p4_r + q3_r - p7_r;
1091cabdff1aSopenharmony_ci            tmp1 += tmp0;
1092cabdff1aSopenharmony_ci            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1093cabdff1aSopenharmony_ci            tmp1 += tmp2;
1094cabdff1aSopenharmony_ci            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1095cabdff1aSopenharmony_ci            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
1096cabdff1aSopenharmony_ci                        p0_filter16, p1_filter16);
1097cabdff1aSopenharmony_ci            p0_filter16 = __msa_bmnz_v(p4, p0_filter16, flat2);
1098cabdff1aSopenharmony_ci            p1_filter16 = __msa_bmnz_v(p3, p1_filter16, flat2);
1099cabdff1aSopenharmony_ci            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1100cabdff1aSopenharmony_ci            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1101cabdff1aSopenharmony_ci            SD(dword0, src);
1102cabdff1aSopenharmony_ci            src += pitch;
1103cabdff1aSopenharmony_ci            SD(dword1, src);
1104cabdff1aSopenharmony_ci            src += pitch;
1105cabdff1aSopenharmony_ci
1106cabdff1aSopenharmony_ci            /* calculation of p2 and p1 */
1107cabdff1aSopenharmony_ci            tmp0 = p2_r - p3_r + q4_r - p7_r;
1108cabdff1aSopenharmony_ci            tmp2 = p1_r - p2_r + q5_r - p7_r;
1109cabdff1aSopenharmony_ci            tmp1 += tmp0;
1110cabdff1aSopenharmony_ci            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1111cabdff1aSopenharmony_ci            tmp1 += tmp2;
1112cabdff1aSopenharmony_ci            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1113cabdff1aSopenharmony_ci            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
1114cabdff1aSopenharmony_ci                        p0_filter16, p1_filter16);
1115cabdff1aSopenharmony_ci            p0_filter16 = __msa_bmnz_v(p2_out, p0_filter16, flat2);
1116cabdff1aSopenharmony_ci            p1_filter16 = __msa_bmnz_v(p1_out, p1_filter16, flat2);
1117cabdff1aSopenharmony_ci            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1118cabdff1aSopenharmony_ci            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1119cabdff1aSopenharmony_ci            SD(dword0, src);
1120cabdff1aSopenharmony_ci            src += pitch;
1121cabdff1aSopenharmony_ci            SD(dword1, src);
1122cabdff1aSopenharmony_ci            src += pitch;
1123cabdff1aSopenharmony_ci
1124cabdff1aSopenharmony_ci            /* calculation of p0 and q0 */
1125cabdff1aSopenharmony_ci            tmp0 = (p0_r - p1_r) + (q6_r - p7_r);
1126cabdff1aSopenharmony_ci            tmp2 = (q7_r - p0_r) + (q0_r - p7_r);
1127cabdff1aSopenharmony_ci            tmp1 += tmp0;
1128cabdff1aSopenharmony_ci            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1129cabdff1aSopenharmony_ci            tmp1 += tmp2;
1130cabdff1aSopenharmony_ci            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1131cabdff1aSopenharmony_ci            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
1132cabdff1aSopenharmony_ci                        p0_filter16, p1_filter16);
1133cabdff1aSopenharmony_ci            p0_filter16 = __msa_bmnz_v(p0_out, p0_filter16, flat2);
1134cabdff1aSopenharmony_ci            p1_filter16 = __msa_bmnz_v(q0_out, p1_filter16, flat2);
1135cabdff1aSopenharmony_ci            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1136cabdff1aSopenharmony_ci            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1137cabdff1aSopenharmony_ci            SD(dword0, src);
1138cabdff1aSopenharmony_ci            src += pitch;
1139cabdff1aSopenharmony_ci            SD(dword1, src);
1140cabdff1aSopenharmony_ci            src += pitch;
1141cabdff1aSopenharmony_ci
1142cabdff1aSopenharmony_ci            /* calculation of q1 and q2 */
1143cabdff1aSopenharmony_ci            tmp0 = q7_r - q0_r + q1_r - p6_r;
1144cabdff1aSopenharmony_ci            tmp2 = q7_r - q1_r + q2_r - p5_r;
1145cabdff1aSopenharmony_ci            tmp1 += tmp0;
1146cabdff1aSopenharmony_ci            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1147cabdff1aSopenharmony_ci            tmp1 += tmp2;
1148cabdff1aSopenharmony_ci            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1149cabdff1aSopenharmony_ci            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
1150cabdff1aSopenharmony_ci                        p0_filter16, p1_filter16);
1151cabdff1aSopenharmony_ci            p0_filter16 = __msa_bmnz_v(q1_out, p0_filter16, flat2);
1152cabdff1aSopenharmony_ci            p1_filter16 = __msa_bmnz_v(q2_out, p1_filter16, flat2);
1153cabdff1aSopenharmony_ci            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1154cabdff1aSopenharmony_ci            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1155cabdff1aSopenharmony_ci            SD(dword0, src);
1156cabdff1aSopenharmony_ci            src += pitch;
1157cabdff1aSopenharmony_ci            SD(dword1, src);
1158cabdff1aSopenharmony_ci            src += pitch;
1159cabdff1aSopenharmony_ci
1160cabdff1aSopenharmony_ci            /* calculation of q3 and q4 */
1161cabdff1aSopenharmony_ci            tmp0 = (q7_r - q2_r) + (q3_r - p4_r);
1162cabdff1aSopenharmony_ci            tmp2 = (q7_r - q3_r) + (q4_r - p3_r);
1163cabdff1aSopenharmony_ci            tmp1 += tmp0;
1164cabdff1aSopenharmony_ci            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1165cabdff1aSopenharmony_ci            tmp1 += tmp2;
1166cabdff1aSopenharmony_ci            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1167cabdff1aSopenharmony_ci            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
1168cabdff1aSopenharmony_ci                        p0_filter16, p1_filter16);
1169cabdff1aSopenharmony_ci            p0_filter16 = __msa_bmnz_v(q3, p0_filter16, flat2);
1170cabdff1aSopenharmony_ci            p1_filter16 = __msa_bmnz_v(q4, p1_filter16, flat2);
1171cabdff1aSopenharmony_ci            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1172cabdff1aSopenharmony_ci            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1173cabdff1aSopenharmony_ci            SD(dword0, src);
1174cabdff1aSopenharmony_ci            src += pitch;
1175cabdff1aSopenharmony_ci            SD(dword1, src);
1176cabdff1aSopenharmony_ci            src += pitch;
1177cabdff1aSopenharmony_ci
1178cabdff1aSopenharmony_ci            /* calculation of q5 and q6 */
1179cabdff1aSopenharmony_ci            tmp0 = (q7_r - q4_r) + (q5_r - p2_r);
1180cabdff1aSopenharmony_ci            tmp2 = (q7_r - q5_r) + (q6_r - p1_r);
1181cabdff1aSopenharmony_ci            tmp1 += tmp0;
1182cabdff1aSopenharmony_ci            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1183cabdff1aSopenharmony_ci            tmp1 += tmp2;
1184cabdff1aSopenharmony_ci            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1185cabdff1aSopenharmony_ci            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
1186cabdff1aSopenharmony_ci                        p0_filter16, p1_filter16);
1187cabdff1aSopenharmony_ci            p0_filter16 = __msa_bmnz_v(q5, p0_filter16, flat2);
1188cabdff1aSopenharmony_ci            p1_filter16 = __msa_bmnz_v(q6, p1_filter16, flat2);
1189cabdff1aSopenharmony_ci            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1190cabdff1aSopenharmony_ci            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1191cabdff1aSopenharmony_ci            SD(dword0, src);
1192cabdff1aSopenharmony_ci            src += pitch;
1193cabdff1aSopenharmony_ci            SD(dword1, src);
1194cabdff1aSopenharmony_ci        }
1195cabdff1aSopenharmony_ci    }
1196cabdff1aSopenharmony_ci}
1197cabdff1aSopenharmony_ci
1198cabdff1aSopenharmony_civoid ff_loop_filter_h_4_8_msa(uint8_t *src, ptrdiff_t pitch,
1199cabdff1aSopenharmony_ci                              int32_t b_limit_ptr,
1200cabdff1aSopenharmony_ci                              int32_t limit_ptr,
1201cabdff1aSopenharmony_ci                              int32_t thresh_ptr)
1202cabdff1aSopenharmony_ci{
1203cabdff1aSopenharmony_ci    v16u8 mask, hev, flat, limit, thresh, b_limit;
1204cabdff1aSopenharmony_ci    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1205cabdff1aSopenharmony_ci    v8i16 vec0, vec1, vec2, vec3;
1206cabdff1aSopenharmony_ci
1207cabdff1aSopenharmony_ci    LD_UB8((src - 4), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
1208cabdff1aSopenharmony_ci
1209cabdff1aSopenharmony_ci    thresh = (v16u8) __msa_fill_b(thresh_ptr);
1210cabdff1aSopenharmony_ci    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1211cabdff1aSopenharmony_ci    limit = (v16u8) __msa_fill_b(limit_ptr);
1212cabdff1aSopenharmony_ci
1213cabdff1aSopenharmony_ci    TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3,
1214cabdff1aSopenharmony_ci                       p3, p2, p1, p0, q0, q1, q2, q3);
1215cabdff1aSopenharmony_ci    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1216cabdff1aSopenharmony_ci                 hev, mask, flat);
1217cabdff1aSopenharmony_ci    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
1218cabdff1aSopenharmony_ci    ILVR_B2_SH(p0, p1, q1, q0, vec0, vec1);
1219cabdff1aSopenharmony_ci    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1220cabdff1aSopenharmony_ci
1221cabdff1aSopenharmony_ci    src -= 2;
1222cabdff1aSopenharmony_ci    ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
1223cabdff1aSopenharmony_ci}
1224cabdff1aSopenharmony_ci
1225cabdff1aSopenharmony_civoid ff_loop_filter_h_44_16_msa(uint8_t *src, ptrdiff_t pitch,
1226cabdff1aSopenharmony_ci                                int32_t b_limit_ptr,
1227cabdff1aSopenharmony_ci                                int32_t limit_ptr,
1228cabdff1aSopenharmony_ci                                int32_t thresh_ptr)
1229cabdff1aSopenharmony_ci{
1230cabdff1aSopenharmony_ci    v16u8 mask, hev, flat;
1231cabdff1aSopenharmony_ci    v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
1232cabdff1aSopenharmony_ci    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1233cabdff1aSopenharmony_ci    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
1234cabdff1aSopenharmony_ci    v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
1235cabdff1aSopenharmony_ci    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
1236cabdff1aSopenharmony_ci
1237cabdff1aSopenharmony_ci    LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
1238cabdff1aSopenharmony_ci    LD_UB8(src - 4 + (8 * pitch), pitch,
1239cabdff1aSopenharmony_ci           row8, row9, row10, row11, row12, row13, row14, row15);
1240cabdff1aSopenharmony_ci
1241cabdff1aSopenharmony_ci    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
1242cabdff1aSopenharmony_ci                        row8, row9, row10, row11, row12, row13, row14, row15,
1243cabdff1aSopenharmony_ci                        p3, p2, p1, p0, q0, q1, q2, q3);
1244cabdff1aSopenharmony_ci
1245cabdff1aSopenharmony_ci    thresh0 = (v16u8) __msa_fill_b(thresh_ptr);
1246cabdff1aSopenharmony_ci    thresh1 = (v16u8) __msa_fill_b(thresh_ptr >> 8);
1247cabdff1aSopenharmony_ci    thresh0 = (v16u8) __msa_ilvr_d((v2i64) thresh1, (v2i64) thresh0);
1248cabdff1aSopenharmony_ci
1249cabdff1aSopenharmony_ci    b_limit0 = (v16u8) __msa_fill_b(b_limit_ptr);
1250cabdff1aSopenharmony_ci    b_limit1 = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
1251cabdff1aSopenharmony_ci    b_limit0 = (v16u8) __msa_ilvr_d((v2i64) b_limit1, (v2i64) b_limit0);
1252cabdff1aSopenharmony_ci
1253cabdff1aSopenharmony_ci    limit0 = (v16u8) __msa_fill_b(limit_ptr);
1254cabdff1aSopenharmony_ci    limit1 = (v16u8) __msa_fill_b(limit_ptr >> 8);
1255cabdff1aSopenharmony_ci    limit0 = (v16u8) __msa_ilvr_d((v2i64) limit1, (v2i64) limit0);
1256cabdff1aSopenharmony_ci
1257cabdff1aSopenharmony_ci    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
1258cabdff1aSopenharmony_ci                 hev, mask, flat);
1259cabdff1aSopenharmony_ci    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
1260cabdff1aSopenharmony_ci    ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
1261cabdff1aSopenharmony_ci    ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3);
1262cabdff1aSopenharmony_ci    ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
1263cabdff1aSopenharmony_ci    ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5);
1264cabdff1aSopenharmony_ci
1265cabdff1aSopenharmony_ci    src -= 2;
1266cabdff1aSopenharmony_ci
1267cabdff1aSopenharmony_ci    ST_W8(tmp2, tmp3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
1268cabdff1aSopenharmony_ci    ST_W8(tmp4, tmp5, 0, 1, 2, 3, 0, 1, 2, 3, src + 8 * pitch, pitch);
1269cabdff1aSopenharmony_ci}
1270cabdff1aSopenharmony_ci
1271cabdff1aSopenharmony_civoid ff_loop_filter_h_8_8_msa(uint8_t *src, ptrdiff_t pitch,
1272cabdff1aSopenharmony_ci                              int32_t b_limit_ptr,
1273cabdff1aSopenharmony_ci                              int32_t limit_ptr,
1274cabdff1aSopenharmony_ci                              int32_t thresh_ptr)
1275cabdff1aSopenharmony_ci{
1276cabdff1aSopenharmony_ci    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1277cabdff1aSopenharmony_ci    v16u8 p1_out, p0_out, q0_out, q1_out;
1278cabdff1aSopenharmony_ci    v16u8 flat, mask, hev, thresh, b_limit, limit;
1279cabdff1aSopenharmony_ci    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
1280cabdff1aSopenharmony_ci    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
1281cabdff1aSopenharmony_ci    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
1282cabdff1aSopenharmony_ci    v16u8 zero = { 0 };
1283cabdff1aSopenharmony_ci    v8i16 vec0, vec1, vec2, vec3, vec4;
1284cabdff1aSopenharmony_ci
1285cabdff1aSopenharmony_ci    /* load vector elements */
1286cabdff1aSopenharmony_ci    LD_UB8(src - 4, pitch, p3, p2, p1, p0, q0, q1, q2, q3);
1287cabdff1aSopenharmony_ci
1288cabdff1aSopenharmony_ci    TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3,
1289cabdff1aSopenharmony_ci                       p3, p2, p1, p0, q0, q1, q2, q3);
1290cabdff1aSopenharmony_ci
1291cabdff1aSopenharmony_ci    thresh = (v16u8) __msa_fill_b(thresh_ptr);
1292cabdff1aSopenharmony_ci    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1293cabdff1aSopenharmony_ci    limit = (v16u8) __msa_fill_b(limit_ptr);
1294cabdff1aSopenharmony_ci
1295cabdff1aSopenharmony_ci    /* mask and hev */
1296cabdff1aSopenharmony_ci    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1297cabdff1aSopenharmony_ci                 hev, mask, flat);
1298cabdff1aSopenharmony_ci    /* flat4 */
1299cabdff1aSopenharmony_ci    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1300cabdff1aSopenharmony_ci    /* filter4 */
1301cabdff1aSopenharmony_ci    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1302cabdff1aSopenharmony_ci                       q1_out);
1303cabdff1aSopenharmony_ci
1304cabdff1aSopenharmony_ci    flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
1305cabdff1aSopenharmony_ci
1306cabdff1aSopenharmony_ci    /* if flat is zero for all pixels, then no need to calculate other filter */
1307cabdff1aSopenharmony_ci    if (__msa_test_bz_v(flat)) {
1308cabdff1aSopenharmony_ci        /* Store 4 pixels p1-_q1 */
1309cabdff1aSopenharmony_ci        ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1310cabdff1aSopenharmony_ci        ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1311cabdff1aSopenharmony_ci
1312cabdff1aSopenharmony_ci        src -= 2;
1313cabdff1aSopenharmony_ci        ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
1314cabdff1aSopenharmony_ci    } else {
1315cabdff1aSopenharmony_ci        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
1316cabdff1aSopenharmony_ci                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
1317cabdff1aSopenharmony_ci                   q3_r);
1318cabdff1aSopenharmony_ci        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
1319cabdff1aSopenharmony_ci                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
1320cabdff1aSopenharmony_ci        /* convert 16 bit output data into 8 bit */
1321cabdff1aSopenharmony_ci        PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r, p0_filt8_r,
1322cabdff1aSopenharmony_ci                    p0_filt8_r, q0_filt8_r, q0_filt8_r, p2_filt8_r, p1_filt8_r,
1323cabdff1aSopenharmony_ci                    p0_filt8_r, q0_filt8_r);
1324cabdff1aSopenharmony_ci        PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r, q1_filt8_r,
1325cabdff1aSopenharmony_ci                    q2_filt8_r);
1326cabdff1aSopenharmony_ci
1327cabdff1aSopenharmony_ci        /* store pixel values */
1328cabdff1aSopenharmony_ci        p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
1329cabdff1aSopenharmony_ci        p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
1330cabdff1aSopenharmony_ci        p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
1331cabdff1aSopenharmony_ci        q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
1332cabdff1aSopenharmony_ci        q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
1333cabdff1aSopenharmony_ci        q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
1334cabdff1aSopenharmony_ci
1335cabdff1aSopenharmony_ci        /* Store 6 pixels p2-_q2 */
1336cabdff1aSopenharmony_ci        ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
1337cabdff1aSopenharmony_ci        ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1338cabdff1aSopenharmony_ci        vec4 = (v8i16) __msa_ilvr_b((v16i8) q2, (v16i8) q1);
1339cabdff1aSopenharmony_ci
1340cabdff1aSopenharmony_ci        src -= 3;
1341cabdff1aSopenharmony_ci        ST_W4(vec2, 0, 1, 2, 3, src, pitch);
1342cabdff1aSopenharmony_ci        ST_H4(vec4, 0, 1, 2, 3, src + 4, pitch);
1343cabdff1aSopenharmony_ci        src += (4 * pitch);
1344cabdff1aSopenharmony_ci        ST_W4(vec3, 0, 1, 2, 3, src, pitch);
1345cabdff1aSopenharmony_ci        ST_H4(vec4, 4, 5, 6, 7, src + 4, pitch);
1346cabdff1aSopenharmony_ci    }
1347cabdff1aSopenharmony_ci}
1348cabdff1aSopenharmony_ci
1349cabdff1aSopenharmony_civoid ff_loop_filter_h_88_16_msa(uint8_t *src, ptrdiff_t pitch,
1350cabdff1aSopenharmony_ci                                int32_t b_limit_ptr,
1351cabdff1aSopenharmony_ci                                int32_t limit_ptr,
1352cabdff1aSopenharmony_ci                                int32_t thresh_ptr)
1353cabdff1aSopenharmony_ci{
1354cabdff1aSopenharmony_ci    uint8_t *temp_src;
1355cabdff1aSopenharmony_ci    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1356cabdff1aSopenharmony_ci    v16u8 p1_out, p0_out, q0_out, q1_out;
1357cabdff1aSopenharmony_ci    v16u8 flat, mask, hev, thresh, b_limit, limit;
1358cabdff1aSopenharmony_ci    v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
1359cabdff1aSopenharmony_ci    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
1360cabdff1aSopenharmony_ci    v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
1361cabdff1aSopenharmony_ci    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
1362cabdff1aSopenharmony_ci    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
1363cabdff1aSopenharmony_ci    v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
1364cabdff1aSopenharmony_ci    v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
1365cabdff1aSopenharmony_ci    v16u8 zero = { 0 };
1366cabdff1aSopenharmony_ci    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1367cabdff1aSopenharmony_ci
1368cabdff1aSopenharmony_ci    temp_src = src - 4;
1369cabdff1aSopenharmony_ci
1370cabdff1aSopenharmony_ci    LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
1371cabdff1aSopenharmony_ci    temp_src += (8 * pitch);
1372cabdff1aSopenharmony_ci    LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);
1373cabdff1aSopenharmony_ci
1374cabdff1aSopenharmony_ci    /* transpose 16x8 matrix into 8x16 */
1375cabdff1aSopenharmony_ci    TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7,
1376cabdff1aSopenharmony_ci                        q3, q2, q1, q0, row12, row13, row14, row15,
1377cabdff1aSopenharmony_ci                        p3, p2, p1, p0, q0, q1, q2, q3);
1378cabdff1aSopenharmony_ci
1379cabdff1aSopenharmony_ci    thresh = (v16u8) __msa_fill_b(thresh_ptr);
1380cabdff1aSopenharmony_ci    vec0 = (v8i16) __msa_fill_b(thresh_ptr >> 8);
1381cabdff1aSopenharmony_ci    thresh = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) thresh);
1382cabdff1aSopenharmony_ci
1383cabdff1aSopenharmony_ci    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1384cabdff1aSopenharmony_ci    vec0 = (v8i16) __msa_fill_b(b_limit_ptr >> 8);
1385cabdff1aSopenharmony_ci    b_limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) b_limit);
1386cabdff1aSopenharmony_ci
1387cabdff1aSopenharmony_ci    limit = (v16u8) __msa_fill_b(limit_ptr);
1388cabdff1aSopenharmony_ci    vec0 = (v8i16) __msa_fill_b(limit_ptr >> 8);
1389cabdff1aSopenharmony_ci    limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) limit);
1390cabdff1aSopenharmony_ci
1391cabdff1aSopenharmony_ci    /* mask and hev */
1392cabdff1aSopenharmony_ci    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1393cabdff1aSopenharmony_ci                 hev, mask, flat);
1394cabdff1aSopenharmony_ci    /* flat4 */
1395cabdff1aSopenharmony_ci    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1396cabdff1aSopenharmony_ci    /* filter4 */
1397cabdff1aSopenharmony_ci    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1398cabdff1aSopenharmony_ci                       q1_out);
1399cabdff1aSopenharmony_ci
1400cabdff1aSopenharmony_ci    /* if flat is zero for all pixels, then no need to calculate other filter */
1401cabdff1aSopenharmony_ci    if (__msa_test_bz_v(flat)) {
1402cabdff1aSopenharmony_ci        ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1403cabdff1aSopenharmony_ci        ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1404cabdff1aSopenharmony_ci        ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1405cabdff1aSopenharmony_ci        ILVRL_H2_SH(vec1, vec0, vec4, vec5);
1406cabdff1aSopenharmony_ci
1407cabdff1aSopenharmony_ci        src -= 2;
1408cabdff1aSopenharmony_ci        ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
1409cabdff1aSopenharmony_ci        ST_W8(vec4, vec5, 0, 1, 2, 3, 0, 1, 2, 3, src + 8 * pitch, pitch);
1410cabdff1aSopenharmony_ci    } else {
1411cabdff1aSopenharmony_ci        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
1412cabdff1aSopenharmony_ci                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
1413cabdff1aSopenharmony_ci                   q3_r);
1414cabdff1aSopenharmony_ci        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
1415cabdff1aSopenharmony_ci                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
1416cabdff1aSopenharmony_ci
1417cabdff1aSopenharmony_ci        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
1418cabdff1aSopenharmony_ci                   p0_l);
1419cabdff1aSopenharmony_ci        ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
1420cabdff1aSopenharmony_ci                   q3_l);
1421cabdff1aSopenharmony_ci
1422cabdff1aSopenharmony_ci        /* filter8 */
1423cabdff1aSopenharmony_ci        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
1424cabdff1aSopenharmony_ci                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
1425cabdff1aSopenharmony_ci
1426cabdff1aSopenharmony_ci        /* convert 16 bit output data into 8 bit */
1427cabdff1aSopenharmony_ci        PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
1428cabdff1aSopenharmony_ci                    p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
1429cabdff1aSopenharmony_ci                    p0_filt8_r, q0_filt8_r);
1430cabdff1aSopenharmony_ci        PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
1431cabdff1aSopenharmony_ci                    q2_filt8_r);
1432cabdff1aSopenharmony_ci
1433cabdff1aSopenharmony_ci        /* store pixel values */
1434cabdff1aSopenharmony_ci        p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
1435cabdff1aSopenharmony_ci        p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
1436cabdff1aSopenharmony_ci        p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
1437cabdff1aSopenharmony_ci        q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
1438cabdff1aSopenharmony_ci        q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
1439cabdff1aSopenharmony_ci        q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
1440cabdff1aSopenharmony_ci
1441cabdff1aSopenharmony_ci        ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
1442cabdff1aSopenharmony_ci        ILVRL_H2_SH(vec1, vec0, vec3, vec4);
1443cabdff1aSopenharmony_ci        ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
1444cabdff1aSopenharmony_ci        ILVRL_H2_SH(vec1, vec0, vec6, vec7);
1445cabdff1aSopenharmony_ci        ILVRL_B2_SH(q2, q1, vec2, vec5);
1446cabdff1aSopenharmony_ci
1447cabdff1aSopenharmony_ci        src -= 3;
1448cabdff1aSopenharmony_ci        ST_W4(vec3, 0, 1, 2, 3, src, pitch);
1449cabdff1aSopenharmony_ci        ST_H4(vec2, 0, 1, 2, 3, src + 4, pitch);
1450cabdff1aSopenharmony_ci        src += (4 * pitch);
1451cabdff1aSopenharmony_ci        ST_W4(vec4, 0, 1, 2, 3, src, pitch);
1452cabdff1aSopenharmony_ci        ST_H4(vec2, 4, 5, 6, 7, src + 4, pitch);
1453cabdff1aSopenharmony_ci        src += (4 * pitch);
1454cabdff1aSopenharmony_ci        ST_W4(vec6, 0, 1, 2, 3, src, pitch);
1455cabdff1aSopenharmony_ci        ST_H4(vec5, 0, 1, 2, 3, src + 4, pitch);
1456cabdff1aSopenharmony_ci        src += (4 * pitch);
1457cabdff1aSopenharmony_ci        ST_W4(vec7, 0, 1, 2, 3, src, pitch);
1458cabdff1aSopenharmony_ci        ST_H4(vec5, 4, 5, 6, 7, src + 4, pitch);
1459cabdff1aSopenharmony_ci    }
1460cabdff1aSopenharmony_ci}
1461cabdff1aSopenharmony_ci
/*
 * Loop-filter the vertical edge at column `src` over 16 consecutive rows.
 *
 * Eight columns starting at src - 4 are loaded for 16 rows and transposed so
 * that p3..p0 / q0..q3 each hold one pixel column, one row per byte lane.
 * The filter4 result is computed for all 16 lanes; the filter8 result is
 * computed from the right (low) halves only and, because the upper half of
 * `flat` is cleared before the blend, it can only replace pixels in
 * lanes 0..7.
 *
 * src          pointer to the first pixel on the right side of the edge (q0)
 * pitch        distance in bytes between consecutive rows
 * b_limit_ptr,
 * limit_ptr,
 * thresh_ptr   each packs two 8-bit values: the low byte is broadcast to
 *              lanes 0..7 and bits 8..15 are broadcast to lanes 8..15
 */
1462cabdff1aSopenharmony_civoid ff_loop_filter_h_84_16_msa(uint8_t *src, ptrdiff_t pitch,
1463cabdff1aSopenharmony_ci                                int32_t b_limit_ptr,
1464cabdff1aSopenharmony_ci                                int32_t limit_ptr,
1465cabdff1aSopenharmony_ci                                int32_t thresh_ptr)
1466cabdff1aSopenharmony_ci{
1467cabdff1aSopenharmony_ci    uint8_t *temp_src;
1468cabdff1aSopenharmony_ci    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1469cabdff1aSopenharmony_ci    v16u8 p1_out, p0_out, q0_out, q1_out;
1470cabdff1aSopenharmony_ci    v16u8 flat, mask, hev, thresh, b_limit, limit;
1471cabdff1aSopenharmony_ci    v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
1472cabdff1aSopenharmony_ci    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
1473cabdff1aSopenharmony_ci    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
1474cabdff1aSopenharmony_ci    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
1475cabdff1aSopenharmony_ci    v16u8 zero = { 0 };
1476cabdff1aSopenharmony_ci    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1477cabdff1aSopenharmony_ci
    /* start 4 pixels left of the edge so p3..q3 are covered */
1478cabdff1aSopenharmony_ci    temp_src = src - 4;
1479cabdff1aSopenharmony_ci
    /* p0..p3 / q3..q0 are only scratch names here; they hold raw rows
     * until the transpose below assigns them their real meaning */
1480cabdff1aSopenharmony_ci    LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
1481cabdff1aSopenharmony_ci    temp_src += (8 * pitch);
1482cabdff1aSopenharmony_ci    LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);
1483cabdff1aSopenharmony_ci
1484cabdff1aSopenharmony_ci    /* transpose 16x8 matrix into 8x16 */
1485cabdff1aSopenharmony_ci    TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7,
1486cabdff1aSopenharmony_ci                        q3, q2, q1, q0, row12, row13, row14, row15,
1487cabdff1aSopenharmony_ci                        p3, p2, p1, p0, q0, q1, q2, q3);
1488cabdff1aSopenharmony_ci
    /* build per-half parameter vectors: low doubleword from byte 0 of the
     * packed argument, high doubleword from byte 1 */
1489cabdff1aSopenharmony_ci    thresh = (v16u8) __msa_fill_b(thresh_ptr);
1490cabdff1aSopenharmony_ci    vec0 = (v8i16) __msa_fill_b(thresh_ptr >> 8);
1491cabdff1aSopenharmony_ci    thresh = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) thresh);
1492cabdff1aSopenharmony_ci
1493cabdff1aSopenharmony_ci    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1494cabdff1aSopenharmony_ci    vec0 = (v8i16) __msa_fill_b(b_limit_ptr >> 8);
1495cabdff1aSopenharmony_ci    b_limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) b_limit);
1496cabdff1aSopenharmony_ci
1497cabdff1aSopenharmony_ci    limit = (v16u8) __msa_fill_b(limit_ptr);
1498cabdff1aSopenharmony_ci    vec0 = (v8i16) __msa_fill_b(limit_ptr >> 8);
1499cabdff1aSopenharmony_ci    limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) limit);
1500cabdff1aSopenharmony_ci
1501cabdff1aSopenharmony_ci    /* mask and hev */
1502cabdff1aSopenharmony_ci    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1503cabdff1aSopenharmony_ci                 hev, mask, flat);
1504cabdff1aSopenharmony_ci    /* flat4 */
1505cabdff1aSopenharmony_ci    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1506cabdff1aSopenharmony_ci    /* filter4 */
1507cabdff1aSopenharmony_ci    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1508cabdff1aSopenharmony_ci                       q1_out);
1509cabdff1aSopenharmony_ci
    /* keep flat for lanes 0..7 only; lanes 8..15 become zero so those rows
     * always keep the filter4 result */
1510cabdff1aSopenharmony_ci    flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
1511cabdff1aSopenharmony_ci
1512cabdff1aSopenharmony_ci    /* if flat is zero for all pixels, then no need to calculate other filter */
1513cabdff1aSopenharmony_ci    if (__msa_test_bz_v(flat)) {
        /* interleave p1 p0 q0 q1 back into row order: vec2/vec3 carry
         * lanes 0..7, vec4/vec5 carry lanes 8..15 */
1514cabdff1aSopenharmony_ci        ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1515cabdff1aSopenharmony_ci        ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1516cabdff1aSopenharmony_ci        ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1517cabdff1aSopenharmony_ci        ILVRL_H2_SH(vec1, vec0, vec4, vec5);
1518cabdff1aSopenharmony_ci
        /* only p1 p0 q0 q1 changed: one 4-byte word per row at src - 2 */
1519cabdff1aSopenharmony_ci        src -= 2;
1520cabdff1aSopenharmony_ci        ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
1521cabdff1aSopenharmony_ci        ST_W8(vec4, vec5, 0, 1, 2, 3, 0, 1, 2, 3, src + 8 * pitch, pitch);
1522cabdff1aSopenharmony_ci    } else {
        /* widen the right (low) halves to 16 bit for the filter8 sums */
1523cabdff1aSopenharmony_ci        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
1524cabdff1aSopenharmony_ci                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
1525cabdff1aSopenharmony_ci                   q3_r);
1526cabdff1aSopenharmony_ci        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
1527cabdff1aSopenharmony_ci                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
1528cabdff1aSopenharmony_ci
1529cabdff1aSopenharmony_ci        /* convert 16 bit output data into 8 bit */
        /* the same vector is passed for both pack inputs; the duplicate in
         * the upper half is never selected because flat is zero there */
1530cabdff1aSopenharmony_ci        PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r,
1531cabdff1aSopenharmony_ci                    p0_filt8_r, p0_filt8_r, q0_filt8_r, q0_filt8_r,
1532cabdff1aSopenharmony_ci                    p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r);
1533cabdff1aSopenharmony_ci        PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r,
1534cabdff1aSopenharmony_ci                    q1_filt8_r, q2_filt8_r);
1535cabdff1aSopenharmony_ci
1536cabdff1aSopenharmony_ci        /* store pixel values */
        /* bmnz: take the filter8 byte where flat is set, otherwise keep the
         * first operand (unfiltered p2/q2, filter4 output for p1..q1) */
1537cabdff1aSopenharmony_ci        p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
1538cabdff1aSopenharmony_ci        p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
1539cabdff1aSopenharmony_ci        p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
1540cabdff1aSopenharmony_ci        q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
1541cabdff1aSopenharmony_ci        q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
1542cabdff1aSopenharmony_ci        q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
1543cabdff1aSopenharmony_ci
        /* vec3, vec4, vec6, vec7: p2 p1 p0 q0 words for rows 0-3, 4-7,
         * 8-11, 12-15; vec2, vec5: q1 q2 halfwords for rows 0-7, 8-15 */
1544cabdff1aSopenharmony_ci        ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
1545cabdff1aSopenharmony_ci        ILVRL_H2_SH(vec1, vec0, vec3, vec4);
1546cabdff1aSopenharmony_ci        ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
1547cabdff1aSopenharmony_ci        ILVRL_H2_SH(vec1, vec0, vec6, vec7);
1548cabdff1aSopenharmony_ci        ILVRL_B2_SH(q2, q1, vec2, vec5);
1549cabdff1aSopenharmony_ci
        /* six bytes per row starting at p2: a word at src - 3 followed by a
         * halfword at src + 1, four rows per store pair */
1550cabdff1aSopenharmony_ci        src -= 3;
1551cabdff1aSopenharmony_ci        ST_W4(vec3, 0, 1, 2, 3, src, pitch);
1552cabdff1aSopenharmony_ci        ST_H4(vec2, 0, 1, 2, 3, src + 4, pitch);
1553cabdff1aSopenharmony_ci        src += (4 * pitch);
1554cabdff1aSopenharmony_ci        ST_W4(vec4, 0, 1, 2, 3, src, pitch);
1555cabdff1aSopenharmony_ci        ST_H4(vec2, 4, 5, 6, 7, src + 4, pitch);
1556cabdff1aSopenharmony_ci        src += (4 * pitch);
1557cabdff1aSopenharmony_ci        ST_W4(vec6, 0, 1, 2, 3, src, pitch);
1558cabdff1aSopenharmony_ci        ST_H4(vec5, 0, 1, 2, 3, src + 4, pitch);
1559cabdff1aSopenharmony_ci        src += (4 * pitch);
1560cabdff1aSopenharmony_ci        ST_W4(vec7, 0, 1, 2, 3, src, pitch);
1561cabdff1aSopenharmony_ci        ST_H4(vec5, 4, 5, 6, 7, src + 4, pitch);
1562cabdff1aSopenharmony_ci    }
1563cabdff1aSopenharmony_ci}
1564cabdff1aSopenharmony_ci
/*
 * Loop-filter the vertical edge at column `src` over 16 consecutive rows.
 *
 * Eight columns starting at src - 4 are loaded for 16 rows and transposed so
 * that p3..p0 / q0..q3 each hold one pixel column, one row per byte lane.
 * The filter4 result is computed for all 16 lanes; the filter8 result is
 * computed from the left (high) halves only and, because the lower half of
 * `flat` is cleared before the blend, it can only replace pixels in
 * lanes 8..15.
 *
 * src          pointer to the first pixel on the right side of the edge (q0)
 * pitch        distance in bytes between consecutive rows
 * b_limit_ptr,
 * limit_ptr,
 * thresh_ptr   each packs two 8-bit values: the low byte is broadcast to
 *              lanes 0..7 and bits 8..15 are broadcast to lanes 8..15
 */
1565cabdff1aSopenharmony_civoid ff_loop_filter_h_48_16_msa(uint8_t *src, ptrdiff_t pitch,
1566cabdff1aSopenharmony_ci                                int32_t b_limit_ptr,
1567cabdff1aSopenharmony_ci                                int32_t limit_ptr,
1568cabdff1aSopenharmony_ci                                int32_t thresh_ptr)
1569cabdff1aSopenharmony_ci{
1570cabdff1aSopenharmony_ci    uint8_t *temp_src;
1571cabdff1aSopenharmony_ci    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1572cabdff1aSopenharmony_ci    v16u8 p1_out, p0_out, q0_out, q1_out;
1573cabdff1aSopenharmony_ci    v16u8 flat, mask, hev, thresh, b_limit, limit;
1574cabdff1aSopenharmony_ci    v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
1575cabdff1aSopenharmony_ci    v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
1576cabdff1aSopenharmony_ci    v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
1577cabdff1aSopenharmony_ci    v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
1578cabdff1aSopenharmony_ci    v16u8 zero = { 0 };
1579cabdff1aSopenharmony_ci    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1580cabdff1aSopenharmony_ci
    /* start 4 pixels left of the edge so p3..q3 are covered */
1581cabdff1aSopenharmony_ci    temp_src = src - 4;
1582cabdff1aSopenharmony_ci
    /* p0..p3 / q3..q0 are only scratch names here; they hold raw rows
     * until the transpose below assigns them their real meaning */
1583cabdff1aSopenharmony_ci    LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
1584cabdff1aSopenharmony_ci    temp_src += (8 * pitch);
1585cabdff1aSopenharmony_ci    LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);
1586cabdff1aSopenharmony_ci
1587cabdff1aSopenharmony_ci    /* transpose 16x8 matrix into 8x16 */
1588cabdff1aSopenharmony_ci    TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7,
1589cabdff1aSopenharmony_ci                        q3, q2, q1, q0, row12, row13, row14, row15,
1590cabdff1aSopenharmony_ci                        p3, p2, p1, p0, q0, q1, q2, q3);
1591cabdff1aSopenharmony_ci
    /* build per-half parameter vectors: low doubleword from byte 0 of the
     * packed argument, high doubleword from byte 1 */
1592cabdff1aSopenharmony_ci    thresh = (v16u8) __msa_fill_b(thresh_ptr);
1593cabdff1aSopenharmony_ci    vec0 = (v8i16) __msa_fill_b(thresh_ptr >> 8);
1594cabdff1aSopenharmony_ci    thresh = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) thresh);
1595cabdff1aSopenharmony_ci
1596cabdff1aSopenharmony_ci    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1597cabdff1aSopenharmony_ci    vec0 = (v8i16) __msa_fill_b(b_limit_ptr >> 8);
1598cabdff1aSopenharmony_ci    b_limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) b_limit);
1599cabdff1aSopenharmony_ci
1600cabdff1aSopenharmony_ci    limit = (v16u8) __msa_fill_b(limit_ptr);
1601cabdff1aSopenharmony_ci    vec0 = (v8i16) __msa_fill_b(limit_ptr >> 8);
1602cabdff1aSopenharmony_ci    limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) limit);
1603cabdff1aSopenharmony_ci
1604cabdff1aSopenharmony_ci    /* mask and hev */
1605cabdff1aSopenharmony_ci    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1606cabdff1aSopenharmony_ci                 hev, mask, flat);
1607cabdff1aSopenharmony_ci    /* flat4 */
1608cabdff1aSopenharmony_ci    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1609cabdff1aSopenharmony_ci    /* filter4 */
1610cabdff1aSopenharmony_ci    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1611cabdff1aSopenharmony_ci                       q1_out);
1612cabdff1aSopenharmony_ci
    /* overwrite doubleword 0 of flat with zero: lanes 0..7 always keep the
     * filter4 result, lanes 8..15 keep their flat decision */
1613cabdff1aSopenharmony_ci    flat = (v16u8) __msa_insve_d((v2i64) flat, 0, (v2i64) zero);
1614cabdff1aSopenharmony_ci
1615cabdff1aSopenharmony_ci    /* if flat is zero for all pixels, then no need to calculate other filter */
1616cabdff1aSopenharmony_ci    if (__msa_test_bz_v(flat)) {
        /* interleave p1 p0 q0 q1 back into row order: vec2/vec3 carry
         * lanes 0..7, vec4/vec5 carry lanes 8..15 */
1617cabdff1aSopenharmony_ci        ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1618cabdff1aSopenharmony_ci        ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1619cabdff1aSopenharmony_ci        ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1620cabdff1aSopenharmony_ci        ILVRL_H2_SH(vec1, vec0, vec4, vec5);
1621cabdff1aSopenharmony_ci
        /* only p1 p0 q0 q1 changed: one 4-byte word per row at src - 2 */
1622cabdff1aSopenharmony_ci        src -= 2;
1623cabdff1aSopenharmony_ci        ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
1624cabdff1aSopenharmony_ci        ST_W8(vec4, vec5, 0, 1, 2, 3, 0, 1, 2, 3, src + 8 * pitch, pitch);
1625cabdff1aSopenharmony_ci    } else {
        /* widen the left (high) halves to 16 bit for the filter8 sums */
1626cabdff1aSopenharmony_ci        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
1627cabdff1aSopenharmony_ci                   p0_l);
1628cabdff1aSopenharmony_ci        ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
1629cabdff1aSopenharmony_ci                   q3_l);
1630cabdff1aSopenharmony_ci
1631cabdff1aSopenharmony_ci        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
1632cabdff1aSopenharmony_ci                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
1633cabdff1aSopenharmony_ci
1634cabdff1aSopenharmony_ci        /* convert 16 bit output data into 8 bit */
        /* the same vector is passed for both pack inputs, so the packed
         * bytes also land in the upper half, where flat may select them;
         * the copy in the lower half is never selected (flat is zero) */
1635cabdff1aSopenharmony_ci        PCKEV_B4_SH(p2_filt8_l, p2_filt8_l, p1_filt8_l, p1_filt8_l,
1636cabdff1aSopenharmony_ci                    p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l,
1637cabdff1aSopenharmony_ci                    p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l);
1638cabdff1aSopenharmony_ci        PCKEV_B2_SH(q1_filt8_l, q1_filt8_l, q2_filt8_l, q2_filt8_l,
1639cabdff1aSopenharmony_ci                    q1_filt8_l, q2_filt8_l);
1640cabdff1aSopenharmony_ci
1641cabdff1aSopenharmony_ci        /* store pixel values */
        /* bmnz: take the filter8 byte where flat is set, otherwise keep the
         * first operand (unfiltered p2/q2, filter4 output for p1..q1) */
1642cabdff1aSopenharmony_ci        p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_l, flat);
1643cabdff1aSopenharmony_ci        p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_l, flat);
1644cabdff1aSopenharmony_ci        p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_l, flat);
1645cabdff1aSopenharmony_ci        q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_l, flat);
1646cabdff1aSopenharmony_ci        q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_l, flat);
1647cabdff1aSopenharmony_ci        q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_l, flat);
1648cabdff1aSopenharmony_ci
        /* vec3, vec4, vec6, vec7: p2 p1 p0 q0 words for rows 0-3, 4-7,
         * 8-11, 12-15; vec2, vec5: q1 q2 halfwords for rows 0-7, 8-15 */
1649cabdff1aSopenharmony_ci        ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
1650cabdff1aSopenharmony_ci        ILVRL_H2_SH(vec1, vec0, vec3, vec4);
1651cabdff1aSopenharmony_ci        ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
1652cabdff1aSopenharmony_ci        ILVRL_H2_SH(vec1, vec0, vec6, vec7);
1653cabdff1aSopenharmony_ci        ILVRL_B2_SH(q2, q1, vec2, vec5);
1654cabdff1aSopenharmony_ci
        /* six bytes per row starting at p2: a word at src - 3 followed by a
         * halfword at src + 1, four rows per store pair */
1655cabdff1aSopenharmony_ci        src -= 3;
1656cabdff1aSopenharmony_ci        ST_W4(vec3, 0, 1, 2, 3, src, pitch);
1657cabdff1aSopenharmony_ci        ST_H4(vec2, 0, 1, 2, 3, src + 4, pitch);
1658cabdff1aSopenharmony_ci        src += (4 * pitch);
1659cabdff1aSopenharmony_ci        ST_W4(vec4, 0, 1, 2, 3, src, pitch);
1660cabdff1aSopenharmony_ci        ST_H4(vec2, 4, 5, 6, 7, src + 4, pitch);
1661cabdff1aSopenharmony_ci        src += (4 * pitch);
1662cabdff1aSopenharmony_ci        ST_W4(vec6, 0, 1, 2, 3, src, pitch);
1663cabdff1aSopenharmony_ci        ST_H4(vec5, 0, 1, 2, 3, src + 4, pitch);
1664cabdff1aSopenharmony_ci        src += (4 * pitch);
1665cabdff1aSopenharmony_ci        ST_W4(vec7, 0, 1, 2, 3, src, pitch);
1666cabdff1aSopenharmony_ci        ST_H4(vec5, 4, 5, 6, 7, src + 4, pitch);
1667cabdff1aSopenharmony_ci    }
1668cabdff1aSopenharmony_ci}
1669cabdff1aSopenharmony_ci
/*
 * Transpose a block that is 16 bytes wide and 8 rows high into 16 output
 * rows.
 *
 * input      first of 8 source rows, each loaded as one 16-byte vector
 * in_pitch   distance in bytes between source rows
 * output     destination for 16 rows, each written as one 16-byte vector
 * out_pitch  distance in bytes between destination rows
 *
 * NOTE(review): every store writes a full vector, so the destination must
 * have room for 16 bytes per row; only the low 8 bytes of each output row
 * appear to carry transposed data - confirm against the callers.
 */
1670cabdff1aSopenharmony_cistatic void vp9_transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch,
1671cabdff1aSopenharmony_ci                                       uint8_t *output, int32_t out_pitch)
1672cabdff1aSopenharmony_ci{
1673cabdff1aSopenharmony_ci    v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org;
1674cabdff1aSopenharmony_ci    v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1675cabdff1aSopenharmony_ci    v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
1676cabdff1aSopenharmony_ci    v16i8 zeros = { 0 };
1677cabdff1aSopenharmony_ci
1678cabdff1aSopenharmony_ci    LD_UB8(input, in_pitch,
1679cabdff1aSopenharmony_ci           p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org);
1680cabdff1aSopenharmony_ci    /* 8x8 transpose */
    /* first 8x8 block, handled by the helper macro: yields p7..p0 */
1681cabdff1aSopenharmony_ci    TRANSPOSE8x8_UB_UB(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org,
1682cabdff1aSopenharmony_ci                       p0_org, p7, p6, p5, p4, p3, p2, p1, p0);
1683cabdff1aSopenharmony_ci    /* 8x8 transpose */
    /* second 8x8 block, built by hand from the left (high) halves of the
     * source rows via byte and word interleaves: yields q0, q2, q4, q6,
     * each holding two output rows */
1684cabdff1aSopenharmony_ci    ILVL_B4_SB(p5_org, p7_org, p4_org, p6_org, p1_org, p3_org, p0_org, p2_org,
1685cabdff1aSopenharmony_ci               tmp0, tmp1, tmp2, tmp3);
1686cabdff1aSopenharmony_ci    ILVR_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp4, tmp6);
1687cabdff1aSopenharmony_ci    ILVL_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp7);
1688cabdff1aSopenharmony_ci    ILVR_W2_UB(tmp6, tmp4, tmp7, tmp5, q0, q4);
1689cabdff1aSopenharmony_ci    ILVL_W2_UB(tmp6, tmp4, tmp7, tmp5, q2, q6);
    /* shift each even result by 8 bytes (filling with zeros) to bring the
     * odd output row down into the low half */
1690cabdff1aSopenharmony_ci    SLDI_B4_UB(zeros, q0, zeros, q2, zeros, q4, zeros, q6, 8, q1, q3, q5, q7);
1691cabdff1aSopenharmony_ci
1692cabdff1aSopenharmony_ci    ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
1693cabdff1aSopenharmony_ci    output += (8 * out_pitch);
1694cabdff1aSopenharmony_ci    ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
1695cabdff1aSopenharmony_ci}
1696cabdff1aSopenharmony_ci
/*
 * Transpose 16 source rows into 8 output rows of 16 bytes each.
 *
 * input      first of 16 source rows, each loaded as one 16-byte vector
 * in_pitch   distance in bytes between source rows
 * output     destination for 8 rows, each written as one 16-byte vector
 * out_pitch  distance in bytes between destination rows
 *
 * NOTE(review): presumably only the low 8 bytes of each source row take
 * part in the transpose (16x8 -> 8x16 as in the macro name) - confirm
 * against the TRANSPOSE16x8_UB_UB definition.
 */
1697cabdff1aSopenharmony_cistatic void vp9_transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch,
1698cabdff1aSopenharmony_ci                                       uint8_t *output, int32_t out_pitch)
1699cabdff1aSopenharmony_ci{
1700cabdff1aSopenharmony_ci    v16u8 p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o;
1701cabdff1aSopenharmony_ci    v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
1702cabdff1aSopenharmony_ci
    /* rows 0..7 go to p7..p0, rows 8..15 go to q0..q7 */
1703cabdff1aSopenharmony_ci    LD_UB8(input, in_pitch, p7, p6, p5, p4, p3, p2, p1, p0);
1704cabdff1aSopenharmony_ci    LD_UB8(input + (8 * in_pitch), in_pitch, q0, q1, q2, q3, q4, q5, q6, q7);
1705cabdff1aSopenharmony_ci    TRANSPOSE16x8_UB_UB(p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5,
1706cabdff1aSopenharmony_ci                        q6, q7, p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o);
1707cabdff1aSopenharmony_ci    ST_UB8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o, output, out_pitch);
1708cabdff1aSopenharmony_ci}
1709cabdff1aSopenharmony_ci
/*
 * Transpose a 16x16 byte block.
 *
 * input      first of 16 source rows, each loaded as one 16-byte vector
 * in_pitch   distance in bytes between source rows
 * output     destination for 16 rows, each written as one 16-byte vector
 * out_pitch  distance in bytes between destination rows
 *
 * The first 8 output rows (p7..p0) come from TRANSPOSE16x8_UB_UB. The last
 * 8 output rows (q0..q7) are produced by the hand-written interleave
 * network below, which starts from the high doublewords of the source rows.
 */
1710cabdff1aSopenharmony_cistatic void vp9_transpose_16x16(uint8_t *input, int32_t in_pitch,
1711cabdff1aSopenharmony_ci                                uint8_t *output, int32_t out_pitch)
1712cabdff1aSopenharmony_ci{
1713cabdff1aSopenharmony_ci    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
1714cabdff1aSopenharmony_ci    v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
1715cabdff1aSopenharmony_ci    v8i16 tmp0, tmp1, tmp4, tmp5, tmp6, tmp7;
1716cabdff1aSopenharmony_ci    v4i32 tmp2, tmp3;
1717cabdff1aSopenharmony_ci    v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
1718cabdff1aSopenharmony_ci
1719cabdff1aSopenharmony_ci    LD_UB8(input, in_pitch, row0, row1, row2, row3, row4, row5, row6, row7);
1720cabdff1aSopenharmony_ci    input += (8 * in_pitch);
1721cabdff1aSopenharmony_ci    LD_UB8(input, in_pitch,
1722cabdff1aSopenharmony_ci           row8, row9, row10, row11, row12, row13, row14, row15);
1723cabdff1aSopenharmony_ci
1724cabdff1aSopenharmony_ci    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
1725cabdff1aSopenharmony_ci                        row8, row9, row10, row11, row12, row13, row14, row15,
1726cabdff1aSopenharmony_ci                        p7, p6, p5, p4, p3, p2, p1, p0);
1727cabdff1aSopenharmony_ci
1728cabdff1aSopenharmony_ci    /* transpose 16x8 matrix into 8x16 */
1729cabdff1aSopenharmony_ci    /* total 8 intermediate register and 32 instructions */
    /* pair the high doubleword of row N (low half of the result) with the
     * high doubleword of row N + 8 (high half of the result) */
1730cabdff1aSopenharmony_ci    q7 = (v16u8) __msa_ilvod_d((v2i64) row8, (v2i64) row0);
1731cabdff1aSopenharmony_ci    q6 = (v16u8) __msa_ilvod_d((v2i64) row9, (v2i64) row1);
1732cabdff1aSopenharmony_ci    q5 = (v16u8) __msa_ilvod_d((v2i64) row10, (v2i64) row2);
1733cabdff1aSopenharmony_ci    q4 = (v16u8) __msa_ilvod_d((v2i64) row11, (v2i64) row3);
1734cabdff1aSopenharmony_ci    q3 = (v16u8) __msa_ilvod_d((v2i64) row12, (v2i64) row4);
1735cabdff1aSopenharmony_ci    q2 = (v16u8) __msa_ilvod_d((v2i64) row13, (v2i64) row5);
1736cabdff1aSopenharmony_ci    q1 = (v16u8) __msa_ilvod_d((v2i64) row14, (v2i64) row6);
1737cabdff1aSopenharmony_ci    q0 = (v16u8) __msa_ilvod_d((v2i64) row15, (v2i64) row7);
1738cabdff1aSopenharmony_ci
    /* byte stage: even bytes go to tmp0/tmp1 (and q5/q7 below), odd bytes
     * go to tmp4..tmp7 */
1739cabdff1aSopenharmony_ci    ILVEV_B2_SH(q7, q6, q5, q4, tmp0, tmp1);
1740cabdff1aSopenharmony_ci    tmp4 = (v8i16) __msa_ilvod_b((v16i8) q6, (v16i8) q7);
1741cabdff1aSopenharmony_ci    tmp5 = (v8i16) __msa_ilvod_b((v16i8) q4, (v16i8) q5);
1742cabdff1aSopenharmony_ci
    /* q5 and q7 are reused as temporaries from here on; their earlier
     * contents were consumed by the statements above */
1743cabdff1aSopenharmony_ci    ILVEV_B2_UB(q3, q2, q1, q0, q5, q7);
1744cabdff1aSopenharmony_ci    tmp6 = (v8i16) __msa_ilvod_b((v16i8) q2, (v16i8) q3);
1745cabdff1aSopenharmony_ci    tmp7 = (v8i16) __msa_ilvod_b((v16i8) q0, (v16i8) q1);
1746cabdff1aSopenharmony_ci
    /* halfword and word stages: each group of four statements below
     * finishes two output rows */
1747cabdff1aSopenharmony_ci    ILVEV_H2_SW(tmp0, tmp1, q5, q7, tmp2, tmp3);
1748cabdff1aSopenharmony_ci    q0 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
1749cabdff1aSopenharmony_ci    q4 = (v16u8) __msa_ilvod_w(tmp3, tmp2);
1750cabdff1aSopenharmony_ci
1751cabdff1aSopenharmony_ci    tmp2 = (v4i32) __msa_ilvod_h(tmp1, tmp0);
1752cabdff1aSopenharmony_ci    tmp3 = (v4i32) __msa_ilvod_h((v8i16) q7, (v8i16) q5);
1753cabdff1aSopenharmony_ci    q2 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
1754cabdff1aSopenharmony_ci    q6 = (v16u8) __msa_ilvod_w(tmp3, tmp2);
1755cabdff1aSopenharmony_ci
1756cabdff1aSopenharmony_ci    ILVEV_H2_SW(tmp4, tmp5, tmp6, tmp7, tmp2, tmp3);
1757cabdff1aSopenharmony_ci    q1 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
1758cabdff1aSopenharmony_ci    q5 = (v16u8) __msa_ilvod_w(tmp3, tmp2);
1759cabdff1aSopenharmony_ci
1760cabdff1aSopenharmony_ci    tmp2 = (v4i32) __msa_ilvod_h(tmp5, tmp4);
1761cabdff1aSopenharmony_ci    tmp3 = (v4i32) __msa_ilvod_h(tmp7, tmp6);
1762cabdff1aSopenharmony_ci    q3 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
1763cabdff1aSopenharmony_ci    q7 = (v16u8) __msa_ilvod_w(tmp3, tmp2);
1764cabdff1aSopenharmony_ci
1765cabdff1aSopenharmony_ci    ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
1766cabdff1aSopenharmony_ci    output += (8 * out_pitch);
1767cabdff1aSopenharmony_ci    ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
1768cabdff1aSopenharmony_ci}
1769cabdff1aSopenharmony_ci
/*
 * Filter4 / filter8 stage working on a buffer with a fixed 16-byte stride
 * in which p3..q3 are stored as consecutive rows around `src`.
 * NOTE(review): presumably this is a transposed copy of the picture data
 * prepared by the caller - confirm.
 *
 * src          points at the q0 row of the 16-byte-stride buffer; p3 is
 *              read from src - 4 * 16
 * filter48     scratch output: rows 0..3 receive p2 p1 p0 q0, rows 4..5
 *              receive q1 q2 and row 6 receives the flat mask (written only
 *              on the return-0 path)
 * src_org      pointer into the destination picture at the edge column
 * pitch_org    distance in bytes between rows at src_org
 * b_limit_ptr,
 * limit_ptr,
 * thresh_ptr   filter parameters; only the low 8 bits of each are used
 *
 * Returns 1 when flat is zero for all 8 lanes: the filter4 result (p1 p0 q0
 * q1) has then been written to src_org - 2 for 8 rows. Returns 0 otherwise:
 * the blended results are left in filter48 and nothing has been written to
 * src_org.
 */
1770cabdff1aSopenharmony_cistatic int32_t vp9_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48,
1771cabdff1aSopenharmony_ci                                       uint8_t *src_org, int32_t pitch_org,
1772cabdff1aSopenharmony_ci                                       int32_t b_limit_ptr,
1773cabdff1aSopenharmony_ci                                       int32_t limit_ptr,
1774cabdff1aSopenharmony_ci                                       int32_t thresh_ptr)
1775cabdff1aSopenharmony_ci{
1776cabdff1aSopenharmony_ci    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1777cabdff1aSopenharmony_ci    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
1778cabdff1aSopenharmony_ci    v16u8 flat, mask, hev, thresh, b_limit, limit;
1779cabdff1aSopenharmony_ci    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
1780cabdff1aSopenharmony_ci    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
1781cabdff1aSopenharmony_ci    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
1782cabdff1aSopenharmony_ci    v16i8 zero = { 0 };
1783cabdff1aSopenharmony_ci    v8i16 vec0, vec1, vec2, vec3;
1784cabdff1aSopenharmony_ci
1785cabdff1aSopenharmony_ci    /* load vector elements */
1786cabdff1aSopenharmony_ci    LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);
1787cabdff1aSopenharmony_ci
    /* one parameter value for all lanes (no per-half split here) */
1788cabdff1aSopenharmony_ci    thresh = (v16u8) __msa_fill_b(thresh_ptr);
1789cabdff1aSopenharmony_ci    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1790cabdff1aSopenharmony_ci    limit = (v16u8) __msa_fill_b(limit_ptr);
1791cabdff1aSopenharmony_ci
1792cabdff1aSopenharmony_ci    /* mask and hev */
1793cabdff1aSopenharmony_ci    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1794cabdff1aSopenharmony_ci                 hev, mask, flat);
1795cabdff1aSopenharmony_ci    /* flat4 */
1796cabdff1aSopenharmony_ci    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1797cabdff1aSopenharmony_ci    /* filter4 */
1798cabdff1aSopenharmony_ci    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1799cabdff1aSopenharmony_ci                       q1_out);
1800cabdff1aSopenharmony_ci
    /* only lanes 0..7 are processed: clear the upper half of flat */
1801cabdff1aSopenharmony_ci    flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
1802cabdff1aSopenharmony_ci
1803cabdff1aSopenharmony_ci    /* if flat is zero for all pixels, then no need to calculate other filter */
1804cabdff1aSopenharmony_ci    if (__msa_test_bz_v(flat)) {
        /* interleave p1 p0 q0 q1 of the low 8 lanes back into row order and
         * write one 4-byte word per row straight to the picture */
1805cabdff1aSopenharmony_ci        ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1806cabdff1aSopenharmony_ci        ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1807cabdff1aSopenharmony_ci        ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, (src_org - 2), pitch_org);
1808cabdff1aSopenharmony_ci        return 1;
1809cabdff1aSopenharmony_ci    } else {
        /* widen the right (low) halves to 16 bit for the filter8 sums */
1810cabdff1aSopenharmony_ci        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
1811cabdff1aSopenharmony_ci                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
1812cabdff1aSopenharmony_ci                   q3_r);
1813cabdff1aSopenharmony_ci        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
1814cabdff1aSopenharmony_ci                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
1815cabdff1aSopenharmony_ci
1816cabdff1aSopenharmony_ci        /* convert 16 bit output data into 8 bit */
        /* the *_r input vectors are no longer needed and are reused to hold
         * the packed 8-bit results */
1817cabdff1aSopenharmony_ci        p2_r = (v8u16) __msa_pckev_b((v16i8) p2_filt8_r, (v16i8) p2_filt8_r);
1818cabdff1aSopenharmony_ci        p1_r = (v8u16) __msa_pckev_b((v16i8) p1_filt8_r, (v16i8) p1_filt8_r);
1819cabdff1aSopenharmony_ci        p0_r = (v8u16) __msa_pckev_b((v16i8) p0_filt8_r, (v16i8) p0_filt8_r);
1820cabdff1aSopenharmony_ci        q0_r = (v8u16) __msa_pckev_b((v16i8) q0_filt8_r, (v16i8) q0_filt8_r);
1821cabdff1aSopenharmony_ci        q1_r = (v8u16) __msa_pckev_b((v16i8) q1_filt8_r, (v16i8) q1_filt8_r);
1822cabdff1aSopenharmony_ci        q2_r = (v8u16) __msa_pckev_b((v16i8) q2_filt8_r, (v16i8) q2_filt8_r);
1823cabdff1aSopenharmony_ci
1824cabdff1aSopenharmony_ci        /* store pixel values */
        /* bmnz: take the filter8 byte where flat is set, otherwise keep the
         * first operand (unfiltered p2/q2, filter4 output for p1..q1) */
1825cabdff1aSopenharmony_ci        p2_out = __msa_bmnz_v(p2, (v16u8) p2_r, flat);
1826cabdff1aSopenharmony_ci        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_r, flat);
1827cabdff1aSopenharmony_ci        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_r, flat);
1828cabdff1aSopenharmony_ci        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_r, flat);
1829cabdff1aSopenharmony_ci        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_r, flat);
1830cabdff1aSopenharmony_ci        q2_out = __msa_bmnz_v(q2, (v16u8) q2_r, flat);
1831cabdff1aSopenharmony_ci
        /* filter48 layout (16 bytes per row): p2, p1, p0, q0, q1, q2, flat */
1832cabdff1aSopenharmony_ci        ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
1833cabdff1aSopenharmony_ci        filter48 += (4 * 16);
1834cabdff1aSopenharmony_ci        ST_UB2(q1_out, q2_out, filter48, 16);
1835cabdff1aSopenharmony_ci        filter48 += (2 * 16);
1836cabdff1aSopenharmony_ci        ST_UB(flat, filter48);
1837cabdff1aSopenharmony_ci
1838cabdff1aSopenharmony_ci        return 0;
1839cabdff1aSopenharmony_ci    }
1840cabdff1aSopenharmony_ci}
1841cabdff1aSopenharmony_ci
1842cabdff1aSopenharmony_cistatic int32_t vp9_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, ptrdiff_t pitch,
1843cabdff1aSopenharmony_ci                                 uint8_t *filter48)
1844cabdff1aSopenharmony_ci{
1845cabdff1aSopenharmony_ci    v16i8 zero = { 0 };
1846cabdff1aSopenharmony_ci    v16u8 filter8, flat, flat2;
1847cabdff1aSopenharmony_ci    v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
1848cabdff1aSopenharmony_ci    v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in;
1849cabdff1aSopenharmony_ci    v8u16 p3_r_in, p2_r_in, p1_r_in, p0_r_in;
1850cabdff1aSopenharmony_ci    v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in;
1851cabdff1aSopenharmony_ci    v8u16 q3_r_in, q2_r_in, q1_r_in, q0_r_in;
1852cabdff1aSopenharmony_ci    v8u16 tmp0_r, tmp1_r;
1853cabdff1aSopenharmony_ci    v8i16 r_out;
1854cabdff1aSopenharmony_ci
1855cabdff1aSopenharmony_ci    flat = LD_UB(filter48 + 6 * 16);
1856cabdff1aSopenharmony_ci
1857cabdff1aSopenharmony_ci    LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
1858cabdff1aSopenharmony_ci    LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);
1859cabdff1aSopenharmony_ci
1860cabdff1aSopenharmony_ci    VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
1861cabdff1aSopenharmony_ci
1862cabdff1aSopenharmony_ci    /* if flat2 is zero for all pixels, then no need to calculate other filter */
1863cabdff1aSopenharmony_ci    if (__msa_test_bz_v(flat2)) {
1864cabdff1aSopenharmony_ci        v8i16 vec0, vec1, vec2, vec3, vec4;
1865cabdff1aSopenharmony_ci
1866cabdff1aSopenharmony_ci        LD_UB4(filter48, 16, p2, p1, p0, q0);
1867cabdff1aSopenharmony_ci        LD_UB2(filter48 + 4 * 16, 16, q1, q2);
1868cabdff1aSopenharmony_ci
1869cabdff1aSopenharmony_ci        ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
1870cabdff1aSopenharmony_ci        ILVRL_H2_SH(vec1, vec0, vec3, vec4);
1871cabdff1aSopenharmony_ci        vec2 = (v8i16) __msa_ilvr_b((v16i8) q2, (v16i8) q1);
1872cabdff1aSopenharmony_ci
1873cabdff1aSopenharmony_ci        src_org -= 3;
1874cabdff1aSopenharmony_ci        ST_W4(vec3, 0, 1, 2, 3, src_org, pitch);
1875cabdff1aSopenharmony_ci        ST_H4(vec2, 0, 1, 2, 3, (src_org + 4), pitch);
1876cabdff1aSopenharmony_ci        src_org += (4 * pitch);
1877cabdff1aSopenharmony_ci        ST_W4(vec4, 0, 1, 2, 3, src_org, pitch);
1878cabdff1aSopenharmony_ci        ST_H4(vec2, 4, 5, 6, 7, (src_org + 4), pitch);
1879cabdff1aSopenharmony_ci
1880cabdff1aSopenharmony_ci        return 1;
1881cabdff1aSopenharmony_ci    } else {
1882cabdff1aSopenharmony_ci        src -= 7 * 16;
1883cabdff1aSopenharmony_ci
1884cabdff1aSopenharmony_ci        ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
1885cabdff1aSopenharmony_ci                   zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
1886cabdff1aSopenharmony_ci                   p3_r_in, p2_r_in, p1_r_in, p0_r_in);
1887cabdff1aSopenharmony_ci        q0_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q0);
1888cabdff1aSopenharmony_ci
1889cabdff1aSopenharmony_ci        tmp0_r = p7_r_in << 3;
1890cabdff1aSopenharmony_ci        tmp0_r -= p7_r_in;
1891cabdff1aSopenharmony_ci        tmp0_r += p6_r_in;
1892cabdff1aSopenharmony_ci        tmp0_r += q0_r_in;
1893cabdff1aSopenharmony_ci        tmp1_r = p6_r_in + p5_r_in;
1894cabdff1aSopenharmony_ci        tmp1_r += p4_r_in;
1895cabdff1aSopenharmony_ci        tmp1_r += p3_r_in;
1896cabdff1aSopenharmony_ci        tmp1_r += p2_r_in;
1897cabdff1aSopenharmony_ci        tmp1_r += p1_r_in;
1898cabdff1aSopenharmony_ci        tmp1_r += p0_r_in;
1899cabdff1aSopenharmony_ci        tmp1_r += tmp0_r;
1900cabdff1aSopenharmony_ci
1901cabdff1aSopenharmony_ci        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1902cabdff1aSopenharmony_ci        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1903cabdff1aSopenharmony_ci        p6 = __msa_bmnz_v(p6, (v16u8) r_out, flat2);
1904cabdff1aSopenharmony_ci        ST_D1(p6, 0, src);
1905cabdff1aSopenharmony_ci        src += 16;
1906cabdff1aSopenharmony_ci
1907cabdff1aSopenharmony_ci        /* p5 */
1908cabdff1aSopenharmony_ci        q1_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q1);
1909cabdff1aSopenharmony_ci        tmp0_r = p5_r_in - p6_r_in;
1910cabdff1aSopenharmony_ci        tmp0_r += q1_r_in;
1911cabdff1aSopenharmony_ci        tmp0_r -= p7_r_in;
1912cabdff1aSopenharmony_ci        tmp1_r += tmp0_r;
1913cabdff1aSopenharmony_ci        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1914cabdff1aSopenharmony_ci        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1915cabdff1aSopenharmony_ci        p5 = __msa_bmnz_v(p5, (v16u8) r_out, flat2);
1916cabdff1aSopenharmony_ci        ST_D1(p5, 0, src);
1917cabdff1aSopenharmony_ci        src += 16;
1918cabdff1aSopenharmony_ci
1919cabdff1aSopenharmony_ci        /* p4 */
1920cabdff1aSopenharmony_ci        q2_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q2);
1921cabdff1aSopenharmony_ci        tmp0_r = p4_r_in - p5_r_in;
1922cabdff1aSopenharmony_ci        tmp0_r += q2_r_in;
1923cabdff1aSopenharmony_ci        tmp0_r -= p7_r_in;
1924cabdff1aSopenharmony_ci        tmp1_r += tmp0_r;
1925cabdff1aSopenharmony_ci        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1926cabdff1aSopenharmony_ci        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1927cabdff1aSopenharmony_ci        p4 = __msa_bmnz_v(p4, (v16u8) r_out, flat2);
1928cabdff1aSopenharmony_ci        ST_D1(p4, 0, src);
1929cabdff1aSopenharmony_ci        src += 16;
1930cabdff1aSopenharmony_ci
1931cabdff1aSopenharmony_ci        /* p3 */
1932cabdff1aSopenharmony_ci        q3_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q3);
1933cabdff1aSopenharmony_ci        tmp0_r = p3_r_in - p4_r_in;
1934cabdff1aSopenharmony_ci        tmp0_r += q3_r_in;
1935cabdff1aSopenharmony_ci        tmp0_r -= p7_r_in;
1936cabdff1aSopenharmony_ci        tmp1_r += tmp0_r;
1937cabdff1aSopenharmony_ci        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1938cabdff1aSopenharmony_ci        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1939cabdff1aSopenharmony_ci        p3 = __msa_bmnz_v(p3, (v16u8) r_out, flat2);
1940cabdff1aSopenharmony_ci        ST_D1(p3, 0, src);
1941cabdff1aSopenharmony_ci        src += 16;
1942cabdff1aSopenharmony_ci
1943cabdff1aSopenharmony_ci        /* p2 */
1944cabdff1aSopenharmony_ci        q4_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q4);
1945cabdff1aSopenharmony_ci        filter8 = LD_UB(filter48);
1946cabdff1aSopenharmony_ci        tmp0_r = p2_r_in - p3_r_in;
1947cabdff1aSopenharmony_ci        tmp0_r += q4_r_in;
1948cabdff1aSopenharmony_ci        tmp0_r -= p7_r_in;
1949cabdff1aSopenharmony_ci        tmp1_r += tmp0_r;
1950cabdff1aSopenharmony_ci        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1951cabdff1aSopenharmony_ci        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1952cabdff1aSopenharmony_ci        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
1953cabdff1aSopenharmony_ci        ST_D1(filter8, 0, src);
1954cabdff1aSopenharmony_ci        src += 16;
1955cabdff1aSopenharmony_ci
1956cabdff1aSopenharmony_ci        /* p1 */
1957cabdff1aSopenharmony_ci        q5_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q5);
1958cabdff1aSopenharmony_ci        filter8 = LD_UB(filter48 + 16);
1959cabdff1aSopenharmony_ci        tmp0_r = p1_r_in - p2_r_in;
1960cabdff1aSopenharmony_ci        tmp0_r += q5_r_in;
1961cabdff1aSopenharmony_ci        tmp0_r -= p7_r_in;
1962cabdff1aSopenharmony_ci        tmp1_r += tmp0_r;
1963cabdff1aSopenharmony_ci        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1964cabdff1aSopenharmony_ci        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1965cabdff1aSopenharmony_ci        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
1966cabdff1aSopenharmony_ci        ST_D1(filter8, 0, src);
1967cabdff1aSopenharmony_ci        src += 16;
1968cabdff1aSopenharmony_ci
1969cabdff1aSopenharmony_ci        /* p0 */
1970cabdff1aSopenharmony_ci        q6_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q6);
1971cabdff1aSopenharmony_ci        filter8 = LD_UB(filter48 + 32);
1972cabdff1aSopenharmony_ci        tmp0_r = p0_r_in - p1_r_in;
1973cabdff1aSopenharmony_ci        tmp0_r += q6_r_in;
1974cabdff1aSopenharmony_ci        tmp0_r -= p7_r_in;
1975cabdff1aSopenharmony_ci        tmp1_r += tmp0_r;
1976cabdff1aSopenharmony_ci        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1977cabdff1aSopenharmony_ci        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1978cabdff1aSopenharmony_ci        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
1979cabdff1aSopenharmony_ci        ST_D1(filter8, 0, src);
1980cabdff1aSopenharmony_ci        src += 16;
1981cabdff1aSopenharmony_ci
1982cabdff1aSopenharmony_ci        /* q0 */
1983cabdff1aSopenharmony_ci        q7_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q7);
1984cabdff1aSopenharmony_ci        filter8 = LD_UB(filter48 + 48);
1985cabdff1aSopenharmony_ci        tmp0_r = q7_r_in - p0_r_in;
1986cabdff1aSopenharmony_ci        tmp0_r += q0_r_in;
1987cabdff1aSopenharmony_ci        tmp0_r -= p7_r_in;
1988cabdff1aSopenharmony_ci        tmp1_r += tmp0_r;
1989cabdff1aSopenharmony_ci        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1990cabdff1aSopenharmony_ci        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1991cabdff1aSopenharmony_ci        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
1992cabdff1aSopenharmony_ci        ST_D1(filter8, 0, src);
1993cabdff1aSopenharmony_ci        src += 16;
1994cabdff1aSopenharmony_ci
1995cabdff1aSopenharmony_ci        /* q1 */
1996cabdff1aSopenharmony_ci        filter8 = LD_UB(filter48 + 64);
1997cabdff1aSopenharmony_ci        tmp0_r = q7_r_in - q0_r_in;
1998cabdff1aSopenharmony_ci        tmp0_r += q1_r_in;
1999cabdff1aSopenharmony_ci        tmp0_r -= p6_r_in;
2000cabdff1aSopenharmony_ci        tmp1_r += tmp0_r;
2001cabdff1aSopenharmony_ci        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2002cabdff1aSopenharmony_ci        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2003cabdff1aSopenharmony_ci        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2004cabdff1aSopenharmony_ci        ST_D1(filter8, 0, src);
2005cabdff1aSopenharmony_ci        src += 16;
2006cabdff1aSopenharmony_ci
2007cabdff1aSopenharmony_ci        /* q2 */
2008cabdff1aSopenharmony_ci        filter8 = LD_UB(filter48 + 80);
2009cabdff1aSopenharmony_ci        tmp0_r = q7_r_in - q1_r_in;
2010cabdff1aSopenharmony_ci        tmp0_r += q2_r_in;
2011cabdff1aSopenharmony_ci        tmp0_r -= p5_r_in;
2012cabdff1aSopenharmony_ci        tmp1_r += tmp0_r;
2013cabdff1aSopenharmony_ci        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2014cabdff1aSopenharmony_ci        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2015cabdff1aSopenharmony_ci        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2016cabdff1aSopenharmony_ci        ST_D1(filter8, 0, src);
2017cabdff1aSopenharmony_ci        src += 16;
2018cabdff1aSopenharmony_ci
2019cabdff1aSopenharmony_ci        /* q3 */
2020cabdff1aSopenharmony_ci        tmp0_r = q7_r_in - q2_r_in;
2021cabdff1aSopenharmony_ci        tmp0_r += q3_r_in;
2022cabdff1aSopenharmony_ci        tmp0_r -= p4_r_in;
2023cabdff1aSopenharmony_ci        tmp1_r += tmp0_r;
2024cabdff1aSopenharmony_ci        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2025cabdff1aSopenharmony_ci        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2026cabdff1aSopenharmony_ci        q3 = __msa_bmnz_v(q3, (v16u8) r_out, flat2);
2027cabdff1aSopenharmony_ci        ST_D1(q3, 0, src);
2028cabdff1aSopenharmony_ci        src += 16;
2029cabdff1aSopenharmony_ci
2030cabdff1aSopenharmony_ci        /* q4 */
2031cabdff1aSopenharmony_ci        tmp0_r = q7_r_in - q3_r_in;
2032cabdff1aSopenharmony_ci        tmp0_r += q4_r_in;
2033cabdff1aSopenharmony_ci        tmp0_r -= p3_r_in;
2034cabdff1aSopenharmony_ci        tmp1_r += tmp0_r;
2035cabdff1aSopenharmony_ci        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2036cabdff1aSopenharmony_ci        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2037cabdff1aSopenharmony_ci        q4 = __msa_bmnz_v(q4, (v16u8) r_out, flat2);
2038cabdff1aSopenharmony_ci        ST_D1(q4, 0, src);
2039cabdff1aSopenharmony_ci        src += 16;
2040cabdff1aSopenharmony_ci
2041cabdff1aSopenharmony_ci        /* q5 */
2042cabdff1aSopenharmony_ci        tmp0_r = q7_r_in - q4_r_in;
2043cabdff1aSopenharmony_ci        tmp0_r += q5_r_in;
2044cabdff1aSopenharmony_ci        tmp0_r -= p2_r_in;
2045cabdff1aSopenharmony_ci        tmp1_r += tmp0_r;
2046cabdff1aSopenharmony_ci        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2047cabdff1aSopenharmony_ci        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2048cabdff1aSopenharmony_ci        q5 = __msa_bmnz_v(q5, (v16u8) r_out, flat2);
2049cabdff1aSopenharmony_ci        ST_D1(q5, 0, src);
2050cabdff1aSopenharmony_ci        src += 16;
2051cabdff1aSopenharmony_ci
2052cabdff1aSopenharmony_ci        /* q6 */
2053cabdff1aSopenharmony_ci        tmp0_r = q7_r_in - q5_r_in;
2054cabdff1aSopenharmony_ci        tmp0_r += q6_r_in;
2055cabdff1aSopenharmony_ci        tmp0_r -= p1_r_in;
2056cabdff1aSopenharmony_ci        tmp1_r += tmp0_r;
2057cabdff1aSopenharmony_ci        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2058cabdff1aSopenharmony_ci        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2059cabdff1aSopenharmony_ci        q6 = __msa_bmnz_v(q6, (v16u8) r_out, flat2);
2060cabdff1aSopenharmony_ci        ST_D1(q6, 0, src);
2061cabdff1aSopenharmony_ci
2062cabdff1aSopenharmony_ci        return 0;
2063cabdff1aSopenharmony_ci    }
2064cabdff1aSopenharmony_ci}
2065cabdff1aSopenharmony_ci
2066cabdff1aSopenharmony_civoid ff_loop_filter_h_16_8_msa(uint8_t *src, ptrdiff_t pitch,
2067cabdff1aSopenharmony_ci                               int32_t b_limit_ptr,
2068cabdff1aSopenharmony_ci                               int32_t limit_ptr,
2069cabdff1aSopenharmony_ci                               int32_t thresh_ptr)
2070cabdff1aSopenharmony_ci{
2071cabdff1aSopenharmony_ci    uint8_t early_exit = 0;
2072cabdff1aSopenharmony_ci    uint8_t transposed_input[16 * 24] ALLOC_ALIGNED(ALIGNMENT);
2073cabdff1aSopenharmony_ci    uint8_t *filter48 = &transposed_input[16 * 16];
2074cabdff1aSopenharmony_ci
2075cabdff1aSopenharmony_ci    vp9_transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16);
2076cabdff1aSopenharmony_ci
2077cabdff1aSopenharmony_ci    early_exit = vp9_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8),
2078cabdff1aSopenharmony_ci                                         &filter48[0], src, pitch,
2079cabdff1aSopenharmony_ci                                         b_limit_ptr, limit_ptr, thresh_ptr);
2080cabdff1aSopenharmony_ci
2081cabdff1aSopenharmony_ci    if (0 == early_exit) {
2082cabdff1aSopenharmony_ci        early_exit = vp9_vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch,
2083cabdff1aSopenharmony_ci                                       &filter48[0]);
2084cabdff1aSopenharmony_ci
2085cabdff1aSopenharmony_ci        if (0 == early_exit) {
2086cabdff1aSopenharmony_ci            vp9_transpose_8x16_to_16x8(transposed_input, 16, src - 8, pitch);
2087cabdff1aSopenharmony_ci        }
2088cabdff1aSopenharmony_ci    }
2089cabdff1aSopenharmony_ci}
2090cabdff1aSopenharmony_ci
2091cabdff1aSopenharmony_cistatic int32_t vp9_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48,
2092cabdff1aSopenharmony_ci                                        uint8_t *src_org, ptrdiff_t pitch,
2093cabdff1aSopenharmony_ci                                        int32_t b_limit_ptr,
2094cabdff1aSopenharmony_ci                                        int32_t limit_ptr,
2095cabdff1aSopenharmony_ci                                        int32_t thresh_ptr)
2096cabdff1aSopenharmony_ci{
2097cabdff1aSopenharmony_ci    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
2098cabdff1aSopenharmony_ci    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
2099cabdff1aSopenharmony_ci    v16u8 flat, mask, hev, thresh, b_limit, limit;
2100cabdff1aSopenharmony_ci    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
2101cabdff1aSopenharmony_ci    v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
2102cabdff1aSopenharmony_ci    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
2103cabdff1aSopenharmony_ci    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
2104cabdff1aSopenharmony_ci    v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
2105cabdff1aSopenharmony_ci    v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
2106cabdff1aSopenharmony_ci    v16i8 zero = { 0 };
2107cabdff1aSopenharmony_ci    v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
2108cabdff1aSopenharmony_ci
2109cabdff1aSopenharmony_ci    /* load vector elements */
2110cabdff1aSopenharmony_ci    LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);
2111cabdff1aSopenharmony_ci
2112cabdff1aSopenharmony_ci    thresh = (v16u8) __msa_fill_b(thresh_ptr);
2113cabdff1aSopenharmony_ci    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
2114cabdff1aSopenharmony_ci    limit = (v16u8) __msa_fill_b(limit_ptr);
2115cabdff1aSopenharmony_ci
2116cabdff1aSopenharmony_ci    /* mask and hev */
2117cabdff1aSopenharmony_ci    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
2118cabdff1aSopenharmony_ci                 hev, mask, flat);
2119cabdff1aSopenharmony_ci    /* flat4 */
2120cabdff1aSopenharmony_ci    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
2121cabdff1aSopenharmony_ci    /* filter4 */
2122cabdff1aSopenharmony_ci    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
2123cabdff1aSopenharmony_ci                       q1_out);
2124cabdff1aSopenharmony_ci
2125cabdff1aSopenharmony_ci    /* if flat is zero for all pixels, then no need to calculate other filter */
2126cabdff1aSopenharmony_ci    if (__msa_test_bz_v(flat)) {
2127cabdff1aSopenharmony_ci        ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
2128cabdff1aSopenharmony_ci        ILVRL_H2_SH(vec1, vec0, vec2, vec3);
2129cabdff1aSopenharmony_ci        ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
2130cabdff1aSopenharmony_ci        ILVRL_H2_SH(vec1, vec0, vec4, vec5);
2131cabdff1aSopenharmony_ci
2132cabdff1aSopenharmony_ci        src_org -= 2;
2133cabdff1aSopenharmony_ci        ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src_org, pitch);
2134cabdff1aSopenharmony_ci        ST_W8(vec4, vec5, 0, 1, 2, 3, 0, 1, 2, 3, src_org + 8 * pitch, pitch);
2135cabdff1aSopenharmony_ci
2136cabdff1aSopenharmony_ci        return 1;
2137cabdff1aSopenharmony_ci    } else {
2138cabdff1aSopenharmony_ci        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
2139cabdff1aSopenharmony_ci                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
2140cabdff1aSopenharmony_ci                   q3_r);
2141cabdff1aSopenharmony_ci        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
2142cabdff1aSopenharmony_ci                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
2143cabdff1aSopenharmony_ci        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
2144cabdff1aSopenharmony_ci                   p0_l);
2145cabdff1aSopenharmony_ci        ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
2146cabdff1aSopenharmony_ci                   q3_l);
2147cabdff1aSopenharmony_ci        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
2148cabdff1aSopenharmony_ci                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
2149cabdff1aSopenharmony_ci
2150cabdff1aSopenharmony_ci        /* convert 16 bit output data into 8 bit */
2151cabdff1aSopenharmony_ci        PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
2152cabdff1aSopenharmony_ci                    p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
2153cabdff1aSopenharmony_ci                    p0_filt8_r, q0_filt8_r);
2154cabdff1aSopenharmony_ci        PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
2155cabdff1aSopenharmony_ci                    q2_filt8_r);
2156cabdff1aSopenharmony_ci
2157cabdff1aSopenharmony_ci        /* store pixel values */
2158cabdff1aSopenharmony_ci        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
2159cabdff1aSopenharmony_ci        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
2160cabdff1aSopenharmony_ci        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
2161cabdff1aSopenharmony_ci        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
2162cabdff1aSopenharmony_ci        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
2163cabdff1aSopenharmony_ci        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
2164cabdff1aSopenharmony_ci
2165cabdff1aSopenharmony_ci        ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
2166cabdff1aSopenharmony_ci        filter48 += (4 * 16);
2167cabdff1aSopenharmony_ci        ST_UB2(q1_out, q2_out, filter48, 16);
2168cabdff1aSopenharmony_ci        filter48 += (2 * 16);
2169cabdff1aSopenharmony_ci        ST_UB(flat, filter48);
2170cabdff1aSopenharmony_ci
2171cabdff1aSopenharmony_ci        return 0;
2172cabdff1aSopenharmony_ci    }
2173cabdff1aSopenharmony_ci}
2174cabdff1aSopenharmony_ci
2175cabdff1aSopenharmony_cistatic int32_t vp9_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, ptrdiff_t pitch,
2176cabdff1aSopenharmony_ci                                  uint8_t *filter48)
2177cabdff1aSopenharmony_ci{
2178cabdff1aSopenharmony_ci    v16u8 flat, flat2, filter8;
2179cabdff1aSopenharmony_ci    v16i8 zero = { 0 };
2180cabdff1aSopenharmony_ci    v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
2181cabdff1aSopenharmony_ci    v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in;
2182cabdff1aSopenharmony_ci    v8u16 p3_r_in, p2_r_in, p1_r_in, p0_r_in;
2183cabdff1aSopenharmony_ci    v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in;
2184cabdff1aSopenharmony_ci    v8u16 q3_r_in, q2_r_in, q1_r_in, q0_r_in;
2185cabdff1aSopenharmony_ci    v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in;
2186cabdff1aSopenharmony_ci    v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in;
2187cabdff1aSopenharmony_ci    v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in;
2188cabdff1aSopenharmony_ci    v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in;
2189cabdff1aSopenharmony_ci    v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
2190cabdff1aSopenharmony_ci    v8i16 l_out, r_out;
2191cabdff1aSopenharmony_ci
2192cabdff1aSopenharmony_ci    flat = LD_UB(filter48 + 6 * 16);
2193cabdff1aSopenharmony_ci
2194cabdff1aSopenharmony_ci    LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
2195cabdff1aSopenharmony_ci    LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);
2196cabdff1aSopenharmony_ci
2197cabdff1aSopenharmony_ci    VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
2198cabdff1aSopenharmony_ci
2199cabdff1aSopenharmony_ci    /* if flat2 is zero for all pixels, then no need to calculate other filter */
2200cabdff1aSopenharmony_ci    if (__msa_test_bz_v(flat2)) {
2201cabdff1aSopenharmony_ci        v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2202cabdff1aSopenharmony_ci
2203cabdff1aSopenharmony_ci        LD_UB4(filter48, 16, p2, p1, p0, q0);
2204cabdff1aSopenharmony_ci        LD_UB2(filter48 + 4 * 16, 16, q1, q2);
2205cabdff1aSopenharmony_ci
2206cabdff1aSopenharmony_ci        ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
2207cabdff1aSopenharmony_ci        ILVRL_H2_SH(vec1, vec0, vec3, vec4);
2208cabdff1aSopenharmony_ci        ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
2209cabdff1aSopenharmony_ci        ILVRL_H2_SH(vec1, vec0, vec6, vec7);
2210cabdff1aSopenharmony_ci        ILVRL_B2_SH(q2, q1, vec2, vec5);
2211cabdff1aSopenharmony_ci
2212cabdff1aSopenharmony_ci        src_org -= 3;
2213cabdff1aSopenharmony_ci        ST_W4(vec3, 0, 1, 2, 3, src_org, pitch);
2214cabdff1aSopenharmony_ci        ST_H4(vec2, 0, 1, 2, 3, (src_org + 4), pitch);
2215cabdff1aSopenharmony_ci        src_org += (4 * pitch);
2216cabdff1aSopenharmony_ci        ST_W4(vec4, 0, 1, 2, 3, src_org, pitch);
2217cabdff1aSopenharmony_ci        ST_H4(vec2, 4, 5, 6, 7, (src_org + 4), pitch);
2218cabdff1aSopenharmony_ci        src_org += (4 * pitch);
2219cabdff1aSopenharmony_ci        ST_W4(vec6, 0, 1, 2, 3, src_org, pitch);
2220cabdff1aSopenharmony_ci        ST_H4(vec5, 0, 1, 2, 3, (src_org + 4), pitch);
2221cabdff1aSopenharmony_ci        src_org += (4 * pitch);
2222cabdff1aSopenharmony_ci        ST_W4(vec7, 0, 1, 2, 3, src_org, pitch);
2223cabdff1aSopenharmony_ci        ST_H4(vec5, 4, 5, 6, 7, (src_org + 4), pitch);
2224cabdff1aSopenharmony_ci
2225cabdff1aSopenharmony_ci        return 1;
2226cabdff1aSopenharmony_ci    } else {
2227cabdff1aSopenharmony_ci        src -= 7 * 16;
2228cabdff1aSopenharmony_ci
2229cabdff1aSopenharmony_ci        ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
2230cabdff1aSopenharmony_ci                   zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
2231cabdff1aSopenharmony_ci                   p3_r_in, p2_r_in, p1_r_in, p0_r_in);
2232cabdff1aSopenharmony_ci        q0_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q0);
2233cabdff1aSopenharmony_ci
2234cabdff1aSopenharmony_ci        tmp0_r = p7_r_in << 3;
2235cabdff1aSopenharmony_ci        tmp0_r -= p7_r_in;
2236cabdff1aSopenharmony_ci        tmp0_r += p6_r_in;
2237cabdff1aSopenharmony_ci        tmp0_r += q0_r_in;
2238cabdff1aSopenharmony_ci        tmp1_r = p6_r_in + p5_r_in;
2239cabdff1aSopenharmony_ci        tmp1_r += p4_r_in;
2240cabdff1aSopenharmony_ci        tmp1_r += p3_r_in;
2241cabdff1aSopenharmony_ci        tmp1_r += p2_r_in;
2242cabdff1aSopenharmony_ci        tmp1_r += p1_r_in;
2243cabdff1aSopenharmony_ci        tmp1_r += p0_r_in;
2244cabdff1aSopenharmony_ci        tmp1_r += tmp0_r;
2245cabdff1aSopenharmony_ci        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2246cabdff1aSopenharmony_ci
2247cabdff1aSopenharmony_ci        ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
2248cabdff1aSopenharmony_ci                   p5_l_in, p4_l_in);
2249cabdff1aSopenharmony_ci        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
2250cabdff1aSopenharmony_ci                   p1_l_in, p0_l_in);
2251cabdff1aSopenharmony_ci        q0_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q0);
2252cabdff1aSopenharmony_ci
2253cabdff1aSopenharmony_ci        tmp0_l = p7_l_in << 3;
2254cabdff1aSopenharmony_ci        tmp0_l -= p7_l_in;
2255cabdff1aSopenharmony_ci        tmp0_l += p6_l_in;
2256cabdff1aSopenharmony_ci        tmp0_l += q0_l_in;
2257cabdff1aSopenharmony_ci        tmp1_l = p6_l_in + p5_l_in;
2258cabdff1aSopenharmony_ci        tmp1_l += p4_l_in;
2259cabdff1aSopenharmony_ci        tmp1_l += p3_l_in;
2260cabdff1aSopenharmony_ci        tmp1_l += p2_l_in;
2261cabdff1aSopenharmony_ci        tmp1_l += p1_l_in;
2262cabdff1aSopenharmony_ci        tmp1_l += p0_l_in;
2263cabdff1aSopenharmony_ci        tmp1_l += tmp0_l;
2264cabdff1aSopenharmony_ci        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2265cabdff1aSopenharmony_ci
2266cabdff1aSopenharmony_ci        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2267cabdff1aSopenharmony_ci        p6 = __msa_bmnz_v(p6, (v16u8) r_out, flat2);
2268cabdff1aSopenharmony_ci        ST_UB(p6, src);
2269cabdff1aSopenharmony_ci        src += 16;
2270cabdff1aSopenharmony_ci
2271cabdff1aSopenharmony_ci        /* p5 */
2272cabdff1aSopenharmony_ci        q1_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q1);
2273cabdff1aSopenharmony_ci        tmp0_r = p5_r_in - p6_r_in;
2274cabdff1aSopenharmony_ci        tmp0_r += q1_r_in;
2275cabdff1aSopenharmony_ci        tmp0_r -= p7_r_in;
2276cabdff1aSopenharmony_ci        tmp1_r += tmp0_r;
2277cabdff1aSopenharmony_ci        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2278cabdff1aSopenharmony_ci        q1_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q1);
2279cabdff1aSopenharmony_ci        tmp0_l = p5_l_in - p6_l_in;
2280cabdff1aSopenharmony_ci        tmp0_l += q1_l_in;
2281cabdff1aSopenharmony_ci        tmp0_l -= p7_l_in;
2282cabdff1aSopenharmony_ci        tmp1_l += tmp0_l;
2283cabdff1aSopenharmony_ci        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2284cabdff1aSopenharmony_ci        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2285cabdff1aSopenharmony_ci        p5 = __msa_bmnz_v(p5, (v16u8) r_out, flat2);
2286cabdff1aSopenharmony_ci        ST_UB(p5, src);
2287cabdff1aSopenharmony_ci        src += 16;
2288cabdff1aSopenharmony_ci
2289cabdff1aSopenharmony_ci        /* p4 */
2290cabdff1aSopenharmony_ci        q2_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q2);
2291cabdff1aSopenharmony_ci        tmp0_r = p4_r_in - p5_r_in;
2292cabdff1aSopenharmony_ci        tmp0_r += q2_r_in;
2293cabdff1aSopenharmony_ci        tmp0_r -= p7_r_in;
2294cabdff1aSopenharmony_ci        tmp1_r += tmp0_r;
2295cabdff1aSopenharmony_ci        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2296cabdff1aSopenharmony_ci        q2_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q2);
2297cabdff1aSopenharmony_ci        tmp0_l = p4_l_in - p5_l_in;
2298cabdff1aSopenharmony_ci        tmp0_l += q2_l_in;
2299cabdff1aSopenharmony_ci        tmp0_l -= p7_l_in;
2300cabdff1aSopenharmony_ci        tmp1_l += tmp0_l;
2301cabdff1aSopenharmony_ci        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2302cabdff1aSopenharmony_ci        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2303cabdff1aSopenharmony_ci        p4 = __msa_bmnz_v(p4, (v16u8) r_out, flat2);
2304cabdff1aSopenharmony_ci        ST_UB(p4, src);
2305cabdff1aSopenharmony_ci        src += 16;
2306cabdff1aSopenharmony_ci
2307cabdff1aSopenharmony_ci        /* p3 */
2308cabdff1aSopenharmony_ci        q3_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q3);
2309cabdff1aSopenharmony_ci        tmp0_r = p3_r_in - p4_r_in;
2310cabdff1aSopenharmony_ci        tmp0_r += q3_r_in;
2311cabdff1aSopenharmony_ci        tmp0_r -= p7_r_in;
2312cabdff1aSopenharmony_ci        tmp1_r += tmp0_r;
2313cabdff1aSopenharmony_ci        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2314cabdff1aSopenharmony_ci        q3_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q3);
2315cabdff1aSopenharmony_ci        tmp0_l = p3_l_in - p4_l_in;
2316cabdff1aSopenharmony_ci        tmp0_l += q3_l_in;
2317cabdff1aSopenharmony_ci        tmp0_l -= p7_l_in;
2318cabdff1aSopenharmony_ci        tmp1_l += tmp0_l;
2319cabdff1aSopenharmony_ci        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2320cabdff1aSopenharmony_ci        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2321cabdff1aSopenharmony_ci        p3 = __msa_bmnz_v(p3, (v16u8) r_out, flat2);
2322cabdff1aSopenharmony_ci        ST_UB(p3, src);
2323cabdff1aSopenharmony_ci        src += 16;
2324cabdff1aSopenharmony_ci
2325cabdff1aSopenharmony_ci        /* p2 */
2326cabdff1aSopenharmony_ci        q4_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q4);
2327cabdff1aSopenharmony_ci        filter8 = LD_UB(filter48);
2328cabdff1aSopenharmony_ci        tmp0_r = p2_r_in - p3_r_in;
2329cabdff1aSopenharmony_ci        tmp0_r += q4_r_in;
2330cabdff1aSopenharmony_ci        tmp0_r -= p7_r_in;
2331cabdff1aSopenharmony_ci        tmp1_r += tmp0_r;
2332cabdff1aSopenharmony_ci        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2333cabdff1aSopenharmony_ci        q4_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q4);
2334cabdff1aSopenharmony_ci        tmp0_l = p2_l_in - p3_l_in;
2335cabdff1aSopenharmony_ci        tmp0_l += q4_l_in;
2336cabdff1aSopenharmony_ci        tmp0_l -= p7_l_in;
2337cabdff1aSopenharmony_ci        tmp1_l += tmp0_l;
2338cabdff1aSopenharmony_ci        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2339cabdff1aSopenharmony_ci        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2340cabdff1aSopenharmony_ci        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2341cabdff1aSopenharmony_ci        ST_UB(filter8, src);
2342cabdff1aSopenharmony_ci        src += 16;
2343cabdff1aSopenharmony_ci
2344cabdff1aSopenharmony_ci        /* p1 */
2345cabdff1aSopenharmony_ci        q5_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q5);
2346cabdff1aSopenharmony_ci        filter8 = LD_UB(filter48 + 16);
2347cabdff1aSopenharmony_ci        tmp0_r = p1_r_in - p2_r_in;
2348cabdff1aSopenharmony_ci        tmp0_r += q5_r_in;
2349cabdff1aSopenharmony_ci        tmp0_r -= p7_r_in;
2350cabdff1aSopenharmony_ci        tmp1_r += tmp0_r;
2351cabdff1aSopenharmony_ci        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2352cabdff1aSopenharmony_ci        q5_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q5);
2353cabdff1aSopenharmony_ci        tmp0_l = p1_l_in - p2_l_in;
2354cabdff1aSopenharmony_ci        tmp0_l += q5_l_in;
2355cabdff1aSopenharmony_ci        tmp0_l -= p7_l_in;
2356cabdff1aSopenharmony_ci        tmp1_l += tmp0_l;
2357cabdff1aSopenharmony_ci        l_out = __msa_srari_h((v8i16) (tmp1_l), 4);
2358cabdff1aSopenharmony_ci        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2359cabdff1aSopenharmony_ci        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2360cabdff1aSopenharmony_ci        ST_UB(filter8, src);
2361cabdff1aSopenharmony_ci        src += 16;
2362cabdff1aSopenharmony_ci
2363cabdff1aSopenharmony_ci        /* p0 */
2364cabdff1aSopenharmony_ci        q6_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q6);
2365cabdff1aSopenharmony_ci        filter8 = LD_UB(filter48 + 32);
2366cabdff1aSopenharmony_ci        tmp0_r = p0_r_in - p1_r_in;
2367cabdff1aSopenharmony_ci        tmp0_r += q6_r_in;
2368cabdff1aSopenharmony_ci        tmp0_r -= p7_r_in;
2369cabdff1aSopenharmony_ci        tmp1_r += tmp0_r;
2370cabdff1aSopenharmony_ci        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2371cabdff1aSopenharmony_ci        q6_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q6);
2372cabdff1aSopenharmony_ci        tmp0_l = p0_l_in - p1_l_in;
2373cabdff1aSopenharmony_ci        tmp0_l += q6_l_in;
2374cabdff1aSopenharmony_ci        tmp0_l -= p7_l_in;
2375cabdff1aSopenharmony_ci        tmp1_l += tmp0_l;
2376cabdff1aSopenharmony_ci        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2377cabdff1aSopenharmony_ci        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2378cabdff1aSopenharmony_ci        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2379cabdff1aSopenharmony_ci        ST_UB(filter8, src);
2380cabdff1aSopenharmony_ci        src += 16;
2381cabdff1aSopenharmony_ci
2382cabdff1aSopenharmony_ci        /* q0 */
2383cabdff1aSopenharmony_ci        q7_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q7);
2384cabdff1aSopenharmony_ci        filter8 = LD_UB(filter48 + 48);
2385cabdff1aSopenharmony_ci        tmp0_r = q7_r_in - p0_r_in;
2386cabdff1aSopenharmony_ci        tmp0_r += q0_r_in;
2387cabdff1aSopenharmony_ci        tmp0_r -= p7_r_in;
2388cabdff1aSopenharmony_ci        tmp1_r += tmp0_r;
2389cabdff1aSopenharmony_ci        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2390cabdff1aSopenharmony_ci        q7_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q7);
2391cabdff1aSopenharmony_ci        tmp0_l = q7_l_in - p0_l_in;
2392cabdff1aSopenharmony_ci        tmp0_l += q0_l_in;
2393cabdff1aSopenharmony_ci        tmp0_l -= p7_l_in;
2394cabdff1aSopenharmony_ci        tmp1_l += tmp0_l;
2395cabdff1aSopenharmony_ci        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2396cabdff1aSopenharmony_ci        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2397cabdff1aSopenharmony_ci        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2398cabdff1aSopenharmony_ci        ST_UB(filter8, src);
2399cabdff1aSopenharmony_ci        src += 16;
2400cabdff1aSopenharmony_ci
2401cabdff1aSopenharmony_ci        /* q1 */
2402cabdff1aSopenharmony_ci        filter8 = LD_UB(filter48 + 64);
2403cabdff1aSopenharmony_ci        tmp0_r = q7_r_in - q0_r_in;
2404cabdff1aSopenharmony_ci        tmp0_r += q1_r_in;
2405cabdff1aSopenharmony_ci        tmp0_r -= p6_r_in;
2406cabdff1aSopenharmony_ci        tmp1_r += tmp0_r;
2407cabdff1aSopenharmony_ci        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2408cabdff1aSopenharmony_ci        tmp0_l = q7_l_in - q0_l_in;
2409cabdff1aSopenharmony_ci        tmp0_l += q1_l_in;
2410cabdff1aSopenharmony_ci        tmp0_l -= p6_l_in;
2411cabdff1aSopenharmony_ci        tmp1_l += tmp0_l;
2412cabdff1aSopenharmony_ci        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2413cabdff1aSopenharmony_ci        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2414cabdff1aSopenharmony_ci        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2415cabdff1aSopenharmony_ci        ST_UB(filter8, src);
2416cabdff1aSopenharmony_ci        src += 16;
2417cabdff1aSopenharmony_ci
2418cabdff1aSopenharmony_ci        /* q2 */
2419cabdff1aSopenharmony_ci        filter8 = LD_UB(filter48 + 80);
2420cabdff1aSopenharmony_ci        tmp0_r = q7_r_in - q1_r_in;
2421cabdff1aSopenharmony_ci        tmp0_r += q2_r_in;
2422cabdff1aSopenharmony_ci        tmp0_r -= p5_r_in;
2423cabdff1aSopenharmony_ci        tmp1_r += tmp0_r;
2424cabdff1aSopenharmony_ci        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2425cabdff1aSopenharmony_ci        tmp0_l = q7_l_in - q1_l_in;
2426cabdff1aSopenharmony_ci        tmp0_l += q2_l_in;
2427cabdff1aSopenharmony_ci        tmp0_l -= p5_l_in;
2428cabdff1aSopenharmony_ci        tmp1_l += tmp0_l;
2429cabdff1aSopenharmony_ci        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2430cabdff1aSopenharmony_ci        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2431cabdff1aSopenharmony_ci        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2432cabdff1aSopenharmony_ci        ST_UB(filter8, src);
2433cabdff1aSopenharmony_ci        src += 16;
2434cabdff1aSopenharmony_ci
2435cabdff1aSopenharmony_ci        /* q3 */
2436cabdff1aSopenharmony_ci        tmp0_r = q7_r_in - q2_r_in;
2437cabdff1aSopenharmony_ci        tmp0_r += q3_r_in;
2438cabdff1aSopenharmony_ci        tmp0_r -= p4_r_in;
2439cabdff1aSopenharmony_ci        tmp1_r += tmp0_r;
2440cabdff1aSopenharmony_ci        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2441cabdff1aSopenharmony_ci        tmp0_l = q7_l_in - q2_l_in;
2442cabdff1aSopenharmony_ci        tmp0_l += q3_l_in;
2443cabdff1aSopenharmony_ci        tmp0_l -= p4_l_in;
2444cabdff1aSopenharmony_ci        tmp1_l += tmp0_l;
2445cabdff1aSopenharmony_ci        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2446cabdff1aSopenharmony_ci        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2447cabdff1aSopenharmony_ci        q3 = __msa_bmnz_v(q3, (v16u8) r_out, flat2);
2448cabdff1aSopenharmony_ci        ST_UB(q3, src);
2449cabdff1aSopenharmony_ci        src += 16;
2450cabdff1aSopenharmony_ci
2451cabdff1aSopenharmony_ci        /* q4 */
2452cabdff1aSopenharmony_ci        tmp0_r = q7_r_in - q3_r_in;
2453cabdff1aSopenharmony_ci        tmp0_r += q4_r_in;
2454cabdff1aSopenharmony_ci        tmp0_r -= p3_r_in;
2455cabdff1aSopenharmony_ci        tmp1_r += tmp0_r;
2456cabdff1aSopenharmony_ci        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2457cabdff1aSopenharmony_ci        tmp0_l = q7_l_in - q3_l_in;
2458cabdff1aSopenharmony_ci        tmp0_l += q4_l_in;
2459cabdff1aSopenharmony_ci        tmp0_l -= p3_l_in;
2460cabdff1aSopenharmony_ci        tmp1_l += tmp0_l;
2461cabdff1aSopenharmony_ci        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2462cabdff1aSopenharmony_ci        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2463cabdff1aSopenharmony_ci        q4 = __msa_bmnz_v(q4, (v16u8) r_out, flat2);
2464cabdff1aSopenharmony_ci        ST_UB(q4, src);
2465cabdff1aSopenharmony_ci        src += 16;
2466cabdff1aSopenharmony_ci
2467cabdff1aSopenharmony_ci        /* q5 */
2468cabdff1aSopenharmony_ci        tmp0_r = q7_r_in - q4_r_in;
2469cabdff1aSopenharmony_ci        tmp0_r += q5_r_in;
2470cabdff1aSopenharmony_ci        tmp0_r -= p2_r_in;
2471cabdff1aSopenharmony_ci        tmp1_r += tmp0_r;
2472cabdff1aSopenharmony_ci        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2473cabdff1aSopenharmony_ci        tmp0_l = q7_l_in - q4_l_in;
2474cabdff1aSopenharmony_ci        tmp0_l += q5_l_in;
2475cabdff1aSopenharmony_ci        tmp0_l -= p2_l_in;
2476cabdff1aSopenharmony_ci        tmp1_l += tmp0_l;
2477cabdff1aSopenharmony_ci        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2478cabdff1aSopenharmony_ci        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2479cabdff1aSopenharmony_ci        q5 = __msa_bmnz_v(q5, (v16u8) r_out, flat2);
2480cabdff1aSopenharmony_ci        ST_UB(q5, src);
2481cabdff1aSopenharmony_ci        src += 16;
2482cabdff1aSopenharmony_ci
2483cabdff1aSopenharmony_ci        /* q6 */
2484cabdff1aSopenharmony_ci        tmp0_r = q7_r_in - q5_r_in;
2485cabdff1aSopenharmony_ci        tmp0_r += q6_r_in;
2486cabdff1aSopenharmony_ci        tmp0_r -= p1_r_in;
2487cabdff1aSopenharmony_ci        tmp1_r += tmp0_r;
2488cabdff1aSopenharmony_ci        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2489cabdff1aSopenharmony_ci        tmp0_l = q7_l_in - q5_l_in;
2490cabdff1aSopenharmony_ci        tmp0_l += q6_l_in;
2491cabdff1aSopenharmony_ci        tmp0_l -= p1_l_in;
2492cabdff1aSopenharmony_ci        tmp1_l += tmp0_l;
2493cabdff1aSopenharmony_ci        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2494cabdff1aSopenharmony_ci        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2495cabdff1aSopenharmony_ci        q6 = __msa_bmnz_v(q6, (v16u8) r_out, flat2);
2496cabdff1aSopenharmony_ci        ST_UB(q6, src);
2497cabdff1aSopenharmony_ci
2498cabdff1aSopenharmony_ci        return 0;
2499cabdff1aSopenharmony_ci    }
2500cabdff1aSopenharmony_ci}
2501cabdff1aSopenharmony_ci
2502cabdff1aSopenharmony_civoid ff_loop_filter_h_16_16_msa(uint8_t *src, ptrdiff_t pitch,
2503cabdff1aSopenharmony_ci                                int32_t b_limit_ptr,
2504cabdff1aSopenharmony_ci                                int32_t limit_ptr,
2505cabdff1aSopenharmony_ci                                int32_t thresh_ptr)
2506cabdff1aSopenharmony_ci{
2507cabdff1aSopenharmony_ci    uint8_t early_exit = 0;
2508cabdff1aSopenharmony_ci    uint8_t transposed_input[16 * 24] ALLOC_ALIGNED(ALIGNMENT);
2509cabdff1aSopenharmony_ci    uint8_t *filter48 = &transposed_input[16 * 16];
2510cabdff1aSopenharmony_ci
2511cabdff1aSopenharmony_ci    vp9_transpose_16x16((src - 8), pitch, &transposed_input[0], 16);
2512cabdff1aSopenharmony_ci
2513cabdff1aSopenharmony_ci    early_exit = vp9_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8),
2514cabdff1aSopenharmony_ci                                          &filter48[0], src, pitch,
2515cabdff1aSopenharmony_ci                                          b_limit_ptr, limit_ptr, thresh_ptr);
2516cabdff1aSopenharmony_ci
2517cabdff1aSopenharmony_ci    if (0 == early_exit) {
2518cabdff1aSopenharmony_ci        early_exit = vp9_vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch,
2519cabdff1aSopenharmony_ci                                        &filter48[0]);
2520cabdff1aSopenharmony_ci
2521cabdff1aSopenharmony_ci        if (0 == early_exit) {
2522cabdff1aSopenharmony_ci            vp9_transpose_16x16(transposed_input, 16, (src - 8), pitch);
2523cabdff1aSopenharmony_ci        }
2524cabdff1aSopenharmony_ci    }
2525cabdff1aSopenharmony_ci}
2526