/*
 * Copyright (c) 2018 gxw <guxiwei-hf@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
20cabdff1aSopenharmony_ci
21cabdff1aSopenharmony_ci#include "vp3dsp_mips.h"
22cabdff1aSopenharmony_ci#include "libavutil/mips/generic_macros_msa.h"
23cabdff1aSopenharmony_ci#include "libavutil/intreadwrite.h"
24cabdff1aSopenharmony_ci#include "libavcodec/rnd_avg.h"
25cabdff1aSopenharmony_ci
26cabdff1aSopenharmony_cistatic void idct_msa(uint8_t *dst, int stride, int16_t *input, int type)
27cabdff1aSopenharmony_ci{
28cabdff1aSopenharmony_ci    v8i16 r0, r1, r2, r3, r4, r5, r6, r7, sign;
29cabdff1aSopenharmony_ci    v4i32 r0_r, r0_l, r1_r, r1_l, r2_r, r2_l, r3_r, r3_l,
30cabdff1aSopenharmony_ci          r4_r, r4_l, r5_r, r5_l, r6_r, r6_l, r7_r, r7_l;
31cabdff1aSopenharmony_ci    v4i32 A, B, C, D, Ad, Bd, Cd, Dd, E, F, G, H;
32cabdff1aSopenharmony_ci    v4i32 Ed, Gd, Add, Bdd, Fd, Hd;
33cabdff1aSopenharmony_ci    v16u8 sign_l;
34cabdff1aSopenharmony_ci    v16i8 d0, d1, d2, d3, d4, d5, d6, d7;
35cabdff1aSopenharmony_ci    v4i32 c0, c1, c2, c3, c4, c5, c6, c7;
36cabdff1aSopenharmony_ci    v4i32 f0, f1, f2, f3, f4, f5, f6, f7;
37cabdff1aSopenharmony_ci    v4i32 sign_t;
38cabdff1aSopenharmony_ci    v16i8 zero = {0};
39cabdff1aSopenharmony_ci    v16i8 mask = {0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0};
40cabdff1aSopenharmony_ci    v4i32 cnst64277w = {64277, 64277, 64277, 64277};
41cabdff1aSopenharmony_ci    v4i32 cnst60547w = {60547, 60547, 60547, 60547};
42cabdff1aSopenharmony_ci    v4i32 cnst54491w = {54491, 54491, 54491, 54491};
43cabdff1aSopenharmony_ci    v4i32 cnst46341w = {46341, 46341, 46341, 46341};
44cabdff1aSopenharmony_ci    v4i32 cnst36410w = {36410, 36410, 36410, 36410};
45cabdff1aSopenharmony_ci    v4i32 cnst25080w = {25080, 25080, 25080, 25080};
46cabdff1aSopenharmony_ci    v4i32 cnst12785w = {12785, 12785, 12785, 12785};
47cabdff1aSopenharmony_ci    v4i32 cnst8w = {8, 8, 8, 8};
48cabdff1aSopenharmony_ci    v4i32 cnst2048w = {2048, 2048, 2048, 2048};
49cabdff1aSopenharmony_ci    v4i32 cnst128w = {128, 128, 128, 128};
50cabdff1aSopenharmony_ci
51cabdff1aSopenharmony_ci    /* Extended input data */
52cabdff1aSopenharmony_ci    LD_SH8(input, 8, r0, r1, r2, r3, r4, r5, r6, r7);
53cabdff1aSopenharmony_ci    sign = __msa_clti_s_h(r0, 0);
54cabdff1aSopenharmony_ci    r0_r = (v4i32) __msa_ilvr_h(sign, r0);
55cabdff1aSopenharmony_ci    r0_l = (v4i32) __msa_ilvl_h(sign, r0);
56cabdff1aSopenharmony_ci    sign = __msa_clti_s_h(r1, 0);
57cabdff1aSopenharmony_ci    r1_r = (v4i32) __msa_ilvr_h(sign, r1);
58cabdff1aSopenharmony_ci    r1_l = (v4i32) __msa_ilvl_h(sign, r1);
59cabdff1aSopenharmony_ci    sign = __msa_clti_s_h(r2, 0);
60cabdff1aSopenharmony_ci    r2_r = (v4i32) __msa_ilvr_h(sign, r2);
61cabdff1aSopenharmony_ci    r2_l = (v4i32) __msa_ilvl_h(sign, r2);
62cabdff1aSopenharmony_ci    sign = __msa_clti_s_h(r3, 0);
63cabdff1aSopenharmony_ci    r3_r = (v4i32) __msa_ilvr_h(sign, r3);
64cabdff1aSopenharmony_ci    r3_l = (v4i32) __msa_ilvl_h(sign, r3);
65cabdff1aSopenharmony_ci    sign = __msa_clti_s_h(r4, 0);
66cabdff1aSopenharmony_ci    r4_r = (v4i32) __msa_ilvr_h(sign, r4);
67cabdff1aSopenharmony_ci    r4_l = (v4i32) __msa_ilvl_h(sign, r4);
68cabdff1aSopenharmony_ci    sign = __msa_clti_s_h(r5, 0);
69cabdff1aSopenharmony_ci    r5_r = (v4i32) __msa_ilvr_h(sign, r5);
70cabdff1aSopenharmony_ci    r5_l = (v4i32) __msa_ilvl_h(sign, r5);
71cabdff1aSopenharmony_ci    sign = __msa_clti_s_h(r6, 0);
72cabdff1aSopenharmony_ci    r6_r = (v4i32) __msa_ilvr_h(sign, r6);
73cabdff1aSopenharmony_ci    r6_l = (v4i32) __msa_ilvl_h(sign, r6);
74cabdff1aSopenharmony_ci    sign = __msa_clti_s_h(r7, 0);
75cabdff1aSopenharmony_ci    r7_r = (v4i32) __msa_ilvr_h(sign, r7);
76cabdff1aSopenharmony_ci    r7_l = (v4i32) __msa_ilvl_h(sign, r7);
77cabdff1aSopenharmony_ci
78cabdff1aSopenharmony_ci    /* Right part */
79cabdff1aSopenharmony_ci    A = ((r1_r * cnst64277w) >> 16) + ((r7_r * cnst12785w) >> 16);
80cabdff1aSopenharmony_ci    B = ((r1_r * cnst12785w) >> 16) - ((r7_r * cnst64277w) >> 16);
81cabdff1aSopenharmony_ci    C = ((r3_r * cnst54491w) >> 16) + ((r5_r * cnst36410w) >> 16);
82cabdff1aSopenharmony_ci    D = ((r5_r * cnst54491w) >> 16) - ((r3_r * cnst36410w) >> 16);
83cabdff1aSopenharmony_ci    Ad = ((A - C) * cnst46341w) >> 16;
84cabdff1aSopenharmony_ci    Bd = ((B - D) * cnst46341w) >> 16;
85cabdff1aSopenharmony_ci    Cd = A + C;
86cabdff1aSopenharmony_ci    Dd = B + D;
87cabdff1aSopenharmony_ci    E = ((r0_r + r4_r) * cnst46341w) >> 16;
88cabdff1aSopenharmony_ci    F = ((r0_r - r4_r) * cnst46341w) >> 16;
89cabdff1aSopenharmony_ci    G = ((r2_r * cnst60547w) >> 16) + ((r6_r * cnst25080w) >> 16);
90cabdff1aSopenharmony_ci    H = ((r2_r * cnst25080w) >> 16) - ((r6_r * cnst60547w) >> 16);
91cabdff1aSopenharmony_ci    Ed = E - G;
92cabdff1aSopenharmony_ci    Gd = E + G;
93cabdff1aSopenharmony_ci    Add = F + Ad;
94cabdff1aSopenharmony_ci    Bdd = Bd - H;
95cabdff1aSopenharmony_ci    Fd = F - Ad;
96cabdff1aSopenharmony_ci    Hd = Bd + H;
97cabdff1aSopenharmony_ci    r0_r = Gd + Cd;
98cabdff1aSopenharmony_ci    r7_r = Gd - Cd;
99cabdff1aSopenharmony_ci    r1_r = Add + Hd;
100cabdff1aSopenharmony_ci    r2_r = Add - Hd;
101cabdff1aSopenharmony_ci    r3_r = Ed + Dd;
102cabdff1aSopenharmony_ci    r4_r = Ed - Dd;
103cabdff1aSopenharmony_ci    r5_r = Fd + Bdd;
104cabdff1aSopenharmony_ci    r6_r = Fd - Bdd;
105cabdff1aSopenharmony_ci
106cabdff1aSopenharmony_ci    /* Left part */
107cabdff1aSopenharmony_ci    A = ((r1_l * cnst64277w) >> 16) + ((r7_l * cnst12785w) >> 16);
108cabdff1aSopenharmony_ci    B = ((r1_l * cnst12785w) >> 16) - ((r7_l * cnst64277w) >> 16);
109cabdff1aSopenharmony_ci    C = ((r3_l * cnst54491w) >> 16) + ((r5_l * cnst36410w) >> 16);
110cabdff1aSopenharmony_ci    D = ((r5_l * cnst54491w) >> 16) - ((r3_l * cnst36410w) >> 16);
111cabdff1aSopenharmony_ci    Ad = ((A - C) * cnst46341w) >> 16;
112cabdff1aSopenharmony_ci    Bd = ((B - D) * cnst46341w) >> 16;
113cabdff1aSopenharmony_ci    Cd = A + C;
114cabdff1aSopenharmony_ci    Dd = B + D;
115cabdff1aSopenharmony_ci    E = ((r0_l + r4_l) * cnst46341w) >> 16;
116cabdff1aSopenharmony_ci    F = ((r0_l - r4_l) * cnst46341w) >> 16;
117cabdff1aSopenharmony_ci    G = ((r2_l * cnst60547w) >> 16) + ((r6_l * cnst25080w) >> 16);
118cabdff1aSopenharmony_ci    H = ((r2_l * cnst25080w) >> 16) - ((r6_l * cnst60547w) >> 16);
119cabdff1aSopenharmony_ci    Ed = E - G;
120cabdff1aSopenharmony_ci    Gd = E + G;
121cabdff1aSopenharmony_ci    Add = F + Ad;
122cabdff1aSopenharmony_ci    Bdd = Bd - H;
123cabdff1aSopenharmony_ci    Fd = F - Ad;
124cabdff1aSopenharmony_ci    Hd = Bd + H;
125cabdff1aSopenharmony_ci    r0_l = Gd + Cd;
126cabdff1aSopenharmony_ci    r7_l = Gd - Cd;
127cabdff1aSopenharmony_ci    r1_l = Add + Hd;
128cabdff1aSopenharmony_ci    r2_l = Add - Hd;
129cabdff1aSopenharmony_ci    r3_l = Ed + Dd;
130cabdff1aSopenharmony_ci    r4_l = Ed - Dd;
131cabdff1aSopenharmony_ci    r5_l = Fd + Bdd;
132cabdff1aSopenharmony_ci    r6_l = Fd - Bdd;
133cabdff1aSopenharmony_ci
134cabdff1aSopenharmony_ci    /* Row 0 to 3 */
135cabdff1aSopenharmony_ci    TRANSPOSE4x4_SW_SW(r0_r, r1_r, r2_r, r3_r,
136cabdff1aSopenharmony_ci                       r0_r, r1_r, r2_r, r3_r);
137cabdff1aSopenharmony_ci    TRANSPOSE4x4_SW_SW(r0_l, r1_l, r2_l, r3_l,
138cabdff1aSopenharmony_ci                       r0_l, r1_l, r2_l, r3_l);
139cabdff1aSopenharmony_ci    A = ((r1_r * cnst64277w) >> 16) + ((r3_l * cnst12785w) >> 16);
140cabdff1aSopenharmony_ci    B = ((r1_r * cnst12785w) >> 16) - ((r3_l * cnst64277w) >> 16);
141cabdff1aSopenharmony_ci    C = ((r3_r * cnst54491w) >> 16) + ((r1_l * cnst36410w) >> 16);
142cabdff1aSopenharmony_ci    D = ((r1_l * cnst54491w) >> 16) - ((r3_r * cnst36410w) >> 16);
143cabdff1aSopenharmony_ci    Ad = ((A - C) * cnst46341w) >> 16;
144cabdff1aSopenharmony_ci    Bd = ((B - D) * cnst46341w) >> 16;
145cabdff1aSopenharmony_ci    Cd = A + C;
146cabdff1aSopenharmony_ci    Dd = B + D;
147cabdff1aSopenharmony_ci    E = ((r0_r + r0_l) * cnst46341w) >> 16;
148cabdff1aSopenharmony_ci    E += cnst8w;
149cabdff1aSopenharmony_ci    F = ((r0_r - r0_l) * cnst46341w) >> 16;
150cabdff1aSopenharmony_ci    F += cnst8w;
151cabdff1aSopenharmony_ci    if (type == 1) { // HACK
152cabdff1aSopenharmony_ci        E += cnst2048w;
153cabdff1aSopenharmony_ci        F += cnst2048w;
154cabdff1aSopenharmony_ci    }
155cabdff1aSopenharmony_ci    G = ((r2_r * cnst60547w) >> 16) + ((r2_l * cnst25080w) >> 16);
156cabdff1aSopenharmony_ci    H = ((r2_r * cnst25080w) >> 16) - ((r2_l * cnst60547w) >> 16);
157cabdff1aSopenharmony_ci    Ed = E - G;
158cabdff1aSopenharmony_ci    Gd = E + G;
159cabdff1aSopenharmony_ci    Add = F + Ad;
160cabdff1aSopenharmony_ci    Bdd = Bd - H;
161cabdff1aSopenharmony_ci    Fd = F - Ad;
162cabdff1aSopenharmony_ci    Hd = Bd + H;
163cabdff1aSopenharmony_ci    A = (Gd + Cd) >> 4;
164cabdff1aSopenharmony_ci    B = (Gd - Cd) >> 4;
165cabdff1aSopenharmony_ci    C = (Add + Hd) >> 4;
166cabdff1aSopenharmony_ci    D = (Add - Hd) >> 4;
167cabdff1aSopenharmony_ci    E = (Ed + Dd) >> 4;
168cabdff1aSopenharmony_ci    F = (Ed - Dd) >> 4;
169cabdff1aSopenharmony_ci    G = (Fd + Bdd) >> 4;
170cabdff1aSopenharmony_ci    H = (Fd - Bdd) >> 4;
171cabdff1aSopenharmony_ci    if (type != 1) {
172cabdff1aSopenharmony_ci        LD_SB8(dst, stride, d0, d1, d2, d3, d4, d5, d6, d7);
173cabdff1aSopenharmony_ci        ILVR_B4_SW(zero, d0, zero, d1, zero, d2, zero, d3,
174cabdff1aSopenharmony_ci                   f0, f1, f2, f3);
175cabdff1aSopenharmony_ci        ILVR_B4_SW(zero, d4, zero, d5, zero, d6, zero, d7,
176cabdff1aSopenharmony_ci                   f4, f5, f6, f7);
177cabdff1aSopenharmony_ci        ILVR_H4_SW(zero, f0, zero, f1, zero, f2, zero, f3,
178cabdff1aSopenharmony_ci                   c0, c1, c2, c3);
179cabdff1aSopenharmony_ci        ILVR_H4_SW(zero, f4, zero, f5, zero, f6, zero, f7,
180cabdff1aSopenharmony_ci                   c4, c5, c6, c7);
181cabdff1aSopenharmony_ci        A += c0;
182cabdff1aSopenharmony_ci        B += c7;
183cabdff1aSopenharmony_ci        C += c1;
184cabdff1aSopenharmony_ci        D += c2;
185cabdff1aSopenharmony_ci        E += c3;
186cabdff1aSopenharmony_ci        F += c4;
187cabdff1aSopenharmony_ci        G += c5;
188cabdff1aSopenharmony_ci        H += c6;
189cabdff1aSopenharmony_ci    }
190cabdff1aSopenharmony_ci    CLIP_SW8_0_255(A, B, C, D, E, F, G, H);
191cabdff1aSopenharmony_ci    sign_l = __msa_or_v((v16u8)r1_r, (v16u8)r2_r);
192cabdff1aSopenharmony_ci    sign_l = __msa_or_v(sign_l, (v16u8)r3_r);
193cabdff1aSopenharmony_ci    sign_l = __msa_or_v(sign_l, (v16u8)r0_l);
194cabdff1aSopenharmony_ci    sign_l = __msa_or_v(sign_l, (v16u8)r1_l);
195cabdff1aSopenharmony_ci    sign_l = __msa_or_v(sign_l, (v16u8)r2_l);
196cabdff1aSopenharmony_ci    sign_l = __msa_or_v(sign_l, (v16u8)r3_l);
197cabdff1aSopenharmony_ci    sign_t = __msa_ceqi_w((v4i32)sign_l, 0);
198cabdff1aSopenharmony_ci    Add = ((r0_r * cnst46341w) + (8 << 16)) >> 20;
199cabdff1aSopenharmony_ci    if (type == 1) {
200cabdff1aSopenharmony_ci        Bdd = Add + cnst128w;
201cabdff1aSopenharmony_ci        CLIP_SW_0_255(Bdd);
202cabdff1aSopenharmony_ci        Ad = Bdd;
203cabdff1aSopenharmony_ci        Bd = Bdd;
204cabdff1aSopenharmony_ci        Cd = Bdd;
205cabdff1aSopenharmony_ci        Dd = Bdd;
206cabdff1aSopenharmony_ci        Ed = Bdd;
207cabdff1aSopenharmony_ci        Fd = Bdd;
208cabdff1aSopenharmony_ci        Gd = Bdd;
209cabdff1aSopenharmony_ci        Hd = Bdd;
210cabdff1aSopenharmony_ci    } else {
211cabdff1aSopenharmony_ci        Ad = Add + c0;
212cabdff1aSopenharmony_ci        Bd = Add + c1;
213cabdff1aSopenharmony_ci        Cd = Add + c2;
214cabdff1aSopenharmony_ci        Dd = Add + c3;
215cabdff1aSopenharmony_ci        Ed = Add + c4;
216cabdff1aSopenharmony_ci        Fd = Add + c5;
217cabdff1aSopenharmony_ci        Gd = Add + c6;
218cabdff1aSopenharmony_ci        Hd = Add + c7;
219cabdff1aSopenharmony_ci        CLIP_SW8_0_255(Ad, Bd, Cd, Dd, Ed, Fd, Gd, Hd);
220cabdff1aSopenharmony_ci    }
221cabdff1aSopenharmony_ci    Ad = (v4i32)__msa_and_v((v16u8)Ad, (v16u8)sign_t);
222cabdff1aSopenharmony_ci    Bd = (v4i32)__msa_and_v((v16u8)Bd, (v16u8)sign_t);
223cabdff1aSopenharmony_ci    Cd = (v4i32)__msa_and_v((v16u8)Cd, (v16u8)sign_t);
224cabdff1aSopenharmony_ci    Dd = (v4i32)__msa_and_v((v16u8)Dd, (v16u8)sign_t);
225cabdff1aSopenharmony_ci    Ed = (v4i32)__msa_and_v((v16u8)Ed, (v16u8)sign_t);
226cabdff1aSopenharmony_ci    Fd = (v4i32)__msa_and_v((v16u8)Fd, (v16u8)sign_t);
227cabdff1aSopenharmony_ci    Gd = (v4i32)__msa_and_v((v16u8)Gd, (v16u8)sign_t);
228cabdff1aSopenharmony_ci    Hd = (v4i32)__msa_and_v((v16u8)Hd, (v16u8)sign_t);
229cabdff1aSopenharmony_ci    sign_t = __msa_ceqi_w(sign_t, 0);
230cabdff1aSopenharmony_ci    A = (v4i32)__msa_and_v((v16u8)A, (v16u8)sign_t);
231cabdff1aSopenharmony_ci    B = (v4i32)__msa_and_v((v16u8)B, (v16u8)sign_t);
232cabdff1aSopenharmony_ci    C = (v4i32)__msa_and_v((v16u8)C, (v16u8)sign_t);
233cabdff1aSopenharmony_ci    D = (v4i32)__msa_and_v((v16u8)D, (v16u8)sign_t);
234cabdff1aSopenharmony_ci    E = (v4i32)__msa_and_v((v16u8)E, (v16u8)sign_t);
235cabdff1aSopenharmony_ci    F = (v4i32)__msa_and_v((v16u8)F, (v16u8)sign_t);
236cabdff1aSopenharmony_ci    G = (v4i32)__msa_and_v((v16u8)G, (v16u8)sign_t);
237cabdff1aSopenharmony_ci    H = (v4i32)__msa_and_v((v16u8)H, (v16u8)sign_t);
238cabdff1aSopenharmony_ci    r0_r = Ad + A;
239cabdff1aSopenharmony_ci    r1_r = Bd + C;
240cabdff1aSopenharmony_ci    r2_r = Cd + D;
241cabdff1aSopenharmony_ci    r3_r = Dd + E;
242cabdff1aSopenharmony_ci    r0_l = Ed + F;
243cabdff1aSopenharmony_ci    r1_l = Fd + G;
244cabdff1aSopenharmony_ci    r2_l = Gd + H;
245cabdff1aSopenharmony_ci    r3_l = Hd + B;
246cabdff1aSopenharmony_ci
247cabdff1aSopenharmony_ci    /* Row 4 to 7 */
248cabdff1aSopenharmony_ci    TRANSPOSE4x4_SW_SW(r4_r, r5_r, r6_r, r7_r,
249cabdff1aSopenharmony_ci                       r4_r, r5_r, r6_r, r7_r);
250cabdff1aSopenharmony_ci    TRANSPOSE4x4_SW_SW(r4_l, r5_l, r6_l, r7_l,
251cabdff1aSopenharmony_ci                       r4_l, r5_l, r6_l, r7_l);
252cabdff1aSopenharmony_ci    A = ((r5_r * cnst64277w) >> 16) + ((r7_l * cnst12785w) >> 16);
253cabdff1aSopenharmony_ci    B = ((r5_r * cnst12785w) >> 16) - ((r7_l * cnst64277w) >> 16);
254cabdff1aSopenharmony_ci    C = ((r7_r * cnst54491w) >> 16) + ((r5_l * cnst36410w) >> 16);
255cabdff1aSopenharmony_ci    D = ((r5_l * cnst54491w) >> 16) - ((r7_r * cnst36410w) >> 16);
256cabdff1aSopenharmony_ci    Ad = ((A - C) * cnst46341w) >> 16;
257cabdff1aSopenharmony_ci    Bd = ((B - D) * cnst46341w) >> 16;
258cabdff1aSopenharmony_ci    Cd = A + C;
259cabdff1aSopenharmony_ci    Dd = B + D;
260cabdff1aSopenharmony_ci    E = ((r4_r + r4_l) * cnst46341w) >> 16;
261cabdff1aSopenharmony_ci    E += cnst8w;
262cabdff1aSopenharmony_ci    F = ((r4_r - r4_l) * cnst46341w) >> 16;
263cabdff1aSopenharmony_ci    F += cnst8w;
264cabdff1aSopenharmony_ci    if (type == 1) { // HACK
265cabdff1aSopenharmony_ci        E += cnst2048w;
266cabdff1aSopenharmony_ci        F += cnst2048w;
267cabdff1aSopenharmony_ci    }
268cabdff1aSopenharmony_ci    G = ((r6_r * cnst60547w) >> 16) + ((r6_l * cnst25080w) >> 16);
269cabdff1aSopenharmony_ci    H = ((r6_r * cnst25080w) >> 16) - ((r6_l * cnst60547w) >> 16);
270cabdff1aSopenharmony_ci    Ed = E - G;
271cabdff1aSopenharmony_ci    Gd = E + G;
272cabdff1aSopenharmony_ci    Add = F + Ad;
273cabdff1aSopenharmony_ci    Bdd = Bd - H;
274cabdff1aSopenharmony_ci    Fd = F - Ad;
275cabdff1aSopenharmony_ci    Hd = Bd + H;
276cabdff1aSopenharmony_ci    A = (Gd + Cd) >> 4;
277cabdff1aSopenharmony_ci    B = (Gd - Cd) >> 4;
278cabdff1aSopenharmony_ci    C = (Add + Hd) >> 4;
279cabdff1aSopenharmony_ci    D = (Add - Hd) >> 4;
280cabdff1aSopenharmony_ci    E = (Ed + Dd) >> 4;
281cabdff1aSopenharmony_ci    F = (Ed - Dd) >> 4;
282cabdff1aSopenharmony_ci    G = (Fd + Bdd) >> 4;
283cabdff1aSopenharmony_ci    H = (Fd - Bdd) >> 4;
284cabdff1aSopenharmony_ci    if (type != 1) {
285cabdff1aSopenharmony_ci        ILVL_H4_SW(zero, f0, zero, f1, zero, f2, zero, f3,
286cabdff1aSopenharmony_ci                   c0, c1, c2, c3);
287cabdff1aSopenharmony_ci        ILVL_H4_SW(zero, f4, zero, f5, zero, f6, zero, f7,
288cabdff1aSopenharmony_ci                   c4, c5, c6, c7);
289cabdff1aSopenharmony_ci        A += c0;
290cabdff1aSopenharmony_ci        B += c7;
291cabdff1aSopenharmony_ci        C += c1;
292cabdff1aSopenharmony_ci        D += c2;
293cabdff1aSopenharmony_ci        E += c3;
294cabdff1aSopenharmony_ci        F += c4;
295cabdff1aSopenharmony_ci        G += c5;
296cabdff1aSopenharmony_ci        H += c6;
297cabdff1aSopenharmony_ci    }
298cabdff1aSopenharmony_ci    CLIP_SW8_0_255(A, B, C, D, E, F, G, H);
299cabdff1aSopenharmony_ci    sign_l = __msa_or_v((v16u8)r5_r, (v16u8)r6_r);
300cabdff1aSopenharmony_ci    sign_l = __msa_or_v(sign_l, (v16u8)r7_r);
301cabdff1aSopenharmony_ci    sign_l = __msa_or_v(sign_l, (v16u8)r4_l);
302cabdff1aSopenharmony_ci    sign_l = __msa_or_v(sign_l, (v16u8)r5_l);
303cabdff1aSopenharmony_ci    sign_l = __msa_or_v(sign_l, (v16u8)r6_l);
304cabdff1aSopenharmony_ci    sign_l = __msa_or_v(sign_l, (v16u8)r7_l);
305cabdff1aSopenharmony_ci    sign_t = __msa_ceqi_w((v4i32)sign_l, 0);
306cabdff1aSopenharmony_ci    Add = ((r4_r * cnst46341w) + (8 << 16)) >> 20;
307cabdff1aSopenharmony_ci    if (type == 1) {
308cabdff1aSopenharmony_ci        Bdd = Add + cnst128w;
309cabdff1aSopenharmony_ci        CLIP_SW_0_255(Bdd);
310cabdff1aSopenharmony_ci        Ad = Bdd;
311cabdff1aSopenharmony_ci        Bd = Bdd;
312cabdff1aSopenharmony_ci        Cd = Bdd;
313cabdff1aSopenharmony_ci        Dd = Bdd;
314cabdff1aSopenharmony_ci        Ed = Bdd;
315cabdff1aSopenharmony_ci        Fd = Bdd;
316cabdff1aSopenharmony_ci        Gd = Bdd;
317cabdff1aSopenharmony_ci        Hd = Bdd;
318cabdff1aSopenharmony_ci    } else {
319cabdff1aSopenharmony_ci        Ad = Add + c0;
320cabdff1aSopenharmony_ci        Bd = Add + c1;
321cabdff1aSopenharmony_ci        Cd = Add + c2;
322cabdff1aSopenharmony_ci        Dd = Add + c3;
323cabdff1aSopenharmony_ci        Ed = Add + c4;
324cabdff1aSopenharmony_ci        Fd = Add + c5;
325cabdff1aSopenharmony_ci        Gd = Add + c6;
326cabdff1aSopenharmony_ci        Hd = Add + c7;
327cabdff1aSopenharmony_ci        CLIP_SW8_0_255(Ad, Bd, Cd, Dd, Ed, Fd, Gd, Hd);
328cabdff1aSopenharmony_ci    }
329cabdff1aSopenharmony_ci    Ad = (v4i32)__msa_and_v((v16u8)Ad, (v16u8)sign_t);
330cabdff1aSopenharmony_ci    Bd = (v4i32)__msa_and_v((v16u8)Bd, (v16u8)sign_t);
331cabdff1aSopenharmony_ci    Cd = (v4i32)__msa_and_v((v16u8)Cd, (v16u8)sign_t);
332cabdff1aSopenharmony_ci    Dd = (v4i32)__msa_and_v((v16u8)Dd, (v16u8)sign_t);
333cabdff1aSopenharmony_ci    Ed = (v4i32)__msa_and_v((v16u8)Ed, (v16u8)sign_t);
334cabdff1aSopenharmony_ci    Fd = (v4i32)__msa_and_v((v16u8)Fd, (v16u8)sign_t);
335cabdff1aSopenharmony_ci    Gd = (v4i32)__msa_and_v((v16u8)Gd, (v16u8)sign_t);
336cabdff1aSopenharmony_ci    Hd = (v4i32)__msa_and_v((v16u8)Hd, (v16u8)sign_t);
337cabdff1aSopenharmony_ci    sign_t = __msa_ceqi_w(sign_t, 0);
338cabdff1aSopenharmony_ci    A = (v4i32)__msa_and_v((v16u8)A, (v16u8)sign_t);
339cabdff1aSopenharmony_ci    B = (v4i32)__msa_and_v((v16u8)B, (v16u8)sign_t);
340cabdff1aSopenharmony_ci    C = (v4i32)__msa_and_v((v16u8)C, (v16u8)sign_t);
341cabdff1aSopenharmony_ci    D = (v4i32)__msa_and_v((v16u8)D, (v16u8)sign_t);
342cabdff1aSopenharmony_ci    E = (v4i32)__msa_and_v((v16u8)E, (v16u8)sign_t);
343cabdff1aSopenharmony_ci    F = (v4i32)__msa_and_v((v16u8)F, (v16u8)sign_t);
344cabdff1aSopenharmony_ci    G = (v4i32)__msa_and_v((v16u8)G, (v16u8)sign_t);
345cabdff1aSopenharmony_ci    H = (v4i32)__msa_and_v((v16u8)H, (v16u8)sign_t);
346cabdff1aSopenharmony_ci    r4_r = Ad + A;
347cabdff1aSopenharmony_ci    r5_r = Bd + C;
348cabdff1aSopenharmony_ci    r6_r = Cd + D;
349cabdff1aSopenharmony_ci    r7_r = Dd + E;
350cabdff1aSopenharmony_ci    r4_l = Ed + F;
351cabdff1aSopenharmony_ci    r5_l = Fd + G;
352cabdff1aSopenharmony_ci    r6_l = Gd + H;
353cabdff1aSopenharmony_ci    r7_l = Hd + B;
354cabdff1aSopenharmony_ci    VSHF_B2_SB(r0_r, r4_r, r1_r, r5_r, mask, mask, d0, d1);
355cabdff1aSopenharmony_ci    VSHF_B2_SB(r2_r, r6_r, r3_r, r7_r, mask, mask, d2, d3);
356cabdff1aSopenharmony_ci    VSHF_B2_SB(r0_l, r4_l, r1_l, r5_l, mask, mask, d4, d5);
357cabdff1aSopenharmony_ci    VSHF_B2_SB(r2_l, r6_l, r3_l, r7_l, mask, mask, d6, d7);
358cabdff1aSopenharmony_ci
359cabdff1aSopenharmony_ci    /* Final sequence of operations over-write original dst */
360cabdff1aSopenharmony_ci    ST_D1(d0, 0, dst);
361cabdff1aSopenharmony_ci    ST_D1(d1, 0, dst + stride);
362cabdff1aSopenharmony_ci    ST_D1(d2, 0, dst + 2 * stride);
363cabdff1aSopenharmony_ci    ST_D1(d3, 0, dst + 3 * stride);
364cabdff1aSopenharmony_ci    ST_D1(d4, 0, dst + 4 * stride);
365cabdff1aSopenharmony_ci    ST_D1(d5, 0, dst + 5 * stride);
366cabdff1aSopenharmony_ci    ST_D1(d6, 0, dst + 6 * stride);
367cabdff1aSopenharmony_ci    ST_D1(d7, 0, dst + 7 * stride);
368cabdff1aSopenharmony_ci}
369cabdff1aSopenharmony_ci
/**
 * Run the inverse DCT on block in "put" mode (type 1), writing the result
 * to the 8x8 area at dest, then clear all 64 coefficients of block.
 */
void ff_vp3_idct_put_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
{
    idct_msa(dest, line_size, block, 1);
    /* Zero the coefficients once they have been consumed. */
    memset(block, 0, 64 * sizeof(block[0]));
}
375cabdff1aSopenharmony_ci
/**
 * Run the inverse DCT on block in "add" mode (type 2), adding the result
 * to the pixels of the 8x8 area at dest, then clear all 64 coefficients
 * of block.
 */
void ff_vp3_idct_add_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
{
    idct_msa(dest, line_size, block, 2);
    /* Zero the coefficients once they have been consumed. */
    memset(block, 0, 64 * sizeof(block[0]));
}
381cabdff1aSopenharmony_ci
382cabdff1aSopenharmony_civoid ff_vp3_idct_dc_add_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
383cabdff1aSopenharmony_ci{
384cabdff1aSopenharmony_ci    int i = (block[0] + 15) >> 5;
385cabdff1aSopenharmony_ci    v4i32 dc = {i, i, i, i};
386cabdff1aSopenharmony_ci    v16i8 d0, d1, d2, d3, d4, d5, d6, d7;
387cabdff1aSopenharmony_ci    v4i32 c0, c1, c2, c3, c4, c5, c6, c7;
388cabdff1aSopenharmony_ci    v4i32 e0, e1, e2, e3, e4, e5, e6, e7;
389cabdff1aSopenharmony_ci    v4i32 r0, r1, r2, r3, r4, r5, r6, r7;
390cabdff1aSopenharmony_ci    v16i8 mask = {0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0};
391cabdff1aSopenharmony_ci    v16i8 zero = {0};
392cabdff1aSopenharmony_ci
393cabdff1aSopenharmony_ci    LD_SB8(dest, line_size, d0, d1, d2, d3, d4, d5, d6, d7);
394cabdff1aSopenharmony_ci    ILVR_B4_SW(zero, d0, zero, d1, zero, d2, zero, d3,
395cabdff1aSopenharmony_ci               c0, c1, c2, c3);
396cabdff1aSopenharmony_ci    ILVR_B4_SW(zero, d4, zero, d5, zero, d6, zero, d7,
397cabdff1aSopenharmony_ci               c4, c5, c6, c7);
398cabdff1aSopenharmony_ci    /* Right part */
399cabdff1aSopenharmony_ci    ILVR_H4_SW(zero, c0, zero, c1, zero, c2, zero, c3,
400cabdff1aSopenharmony_ci               e0, e1, e2, e3);
401cabdff1aSopenharmony_ci    ILVR_H4_SW(zero, c4, zero, c5, zero, c6, zero, c7,
402cabdff1aSopenharmony_ci               e4, e5, e6, e7);
403cabdff1aSopenharmony_ci    e0 += dc;
404cabdff1aSopenharmony_ci    e1 += dc;
405cabdff1aSopenharmony_ci    e2 += dc;
406cabdff1aSopenharmony_ci    e3 += dc;
407cabdff1aSopenharmony_ci    e4 += dc;
408cabdff1aSopenharmony_ci    e5 += dc;
409cabdff1aSopenharmony_ci    e6 += dc;
410cabdff1aSopenharmony_ci    e7 += dc;
411cabdff1aSopenharmony_ci    CLIP_SW8_0_255(e0, e1, e2, e3, e4, e5, e6, e7);
412cabdff1aSopenharmony_ci
413cabdff1aSopenharmony_ci    /* Left part */
414cabdff1aSopenharmony_ci    ILVL_H4_SW(zero, c0, zero, c1, zero, c2, zero, c3,
415cabdff1aSopenharmony_ci               r0, r1, r2, r3);
416cabdff1aSopenharmony_ci    ILVL_H4_SW(zero, c4, zero, c5, zero, c6, zero, c7,
417cabdff1aSopenharmony_ci               r4, r5, r6, r7);
418cabdff1aSopenharmony_ci    r0 += dc;
419cabdff1aSopenharmony_ci    r1 += dc;
420cabdff1aSopenharmony_ci    r2 += dc;
421cabdff1aSopenharmony_ci    r3 += dc;
422cabdff1aSopenharmony_ci    r4 += dc;
423cabdff1aSopenharmony_ci    r5 += dc;
424cabdff1aSopenharmony_ci    r6 += dc;
425cabdff1aSopenharmony_ci    r7 += dc;
426cabdff1aSopenharmony_ci    CLIP_SW8_0_255(r0, r1, r2, r3, r4, r5, r6, r7);
427cabdff1aSopenharmony_ci    VSHF_B2_SB(e0, r0, e1, r1, mask, mask, d0, d1);
428cabdff1aSopenharmony_ci    VSHF_B2_SB(e2, r2, e3, r3, mask, mask, d2, d3);
429cabdff1aSopenharmony_ci    VSHF_B2_SB(e4, r4, e5, r5, mask, mask, d4, d5);
430cabdff1aSopenharmony_ci    VSHF_B2_SB(e6, r6, e7, r7, mask, mask, d6, d7);
431cabdff1aSopenharmony_ci
432cabdff1aSopenharmony_ci    /* Final sequence of operations over-write original dst */
433cabdff1aSopenharmony_ci    ST_D1(d0, 0, dest);
434cabdff1aSopenharmony_ci    ST_D1(d1, 0, dest + line_size);
435cabdff1aSopenharmony_ci    ST_D1(d2, 0, dest + 2 * line_size);
436cabdff1aSopenharmony_ci    ST_D1(d3, 0, dest + 3 * line_size);
437cabdff1aSopenharmony_ci    ST_D1(d4, 0, dest + 4 * line_size);
438cabdff1aSopenharmony_ci    ST_D1(d5, 0, dest + 5 * line_size);
439cabdff1aSopenharmony_ci    ST_D1(d6, 0, dest + 6 * line_size);
440cabdff1aSopenharmony_ci    ST_D1(d7, 0, dest + 7 * line_size);
441cabdff1aSopenharmony_ci
442cabdff1aSopenharmony_ci    block[0] = 0;
443cabdff1aSopenharmony_ci}
444cabdff1aSopenharmony_ci
445cabdff1aSopenharmony_civoid ff_vp3_v_loop_filter_msa(uint8_t *first_pixel, ptrdiff_t stride,
446cabdff1aSopenharmony_ci                              int *bounding_values)
447cabdff1aSopenharmony_ci{
448cabdff1aSopenharmony_ci    int nstride = -stride;
449cabdff1aSopenharmony_ci    v4i32 e0, e1, f0, f1, g0, g1;
450cabdff1aSopenharmony_ci    v16i8 zero = {0};
451cabdff1aSopenharmony_ci    v16i8 d0, d1, d2, d3;
452cabdff1aSopenharmony_ci    v8i16 c0, c1, c2, c3;
453cabdff1aSopenharmony_ci    v8i16 r0;
454cabdff1aSopenharmony_ci    v8i16 cnst3h = {3, 3, 3, 3, 3, 3, 3, 3},
455cabdff1aSopenharmony_ci          cnst4h = {4, 4, 4, 4, 4, 4, 4, 4};
456cabdff1aSopenharmony_ci    v16i8 mask = {0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0};
457cabdff1aSopenharmony_ci    int16_t temp_16[8];
458cabdff1aSopenharmony_ci    int temp_32[8];
459cabdff1aSopenharmony_ci
460cabdff1aSopenharmony_ci    LD_SB4(first_pixel + nstride * 2, stride, d0, d1, d2, d3);
461cabdff1aSopenharmony_ci    ILVR_B4_SH(zero, d0, zero, d1, zero, d2, zero, d3,
462cabdff1aSopenharmony_ci               c0, c1, c2, c3);
463cabdff1aSopenharmony_ci    r0 = (c0 - c3) + (c2 - c1) * cnst3h;
464cabdff1aSopenharmony_ci    r0 += cnst4h;
465cabdff1aSopenharmony_ci    r0 = r0 >> 3;
466cabdff1aSopenharmony_ci    /* Get filter_value from bounding_values one by one */
467cabdff1aSopenharmony_ci    ST_SH(r0, temp_16);
468cabdff1aSopenharmony_ci    for (int i = 0; i < 8; i++)
469cabdff1aSopenharmony_ci        temp_32[i] = bounding_values[temp_16[i]];
470cabdff1aSopenharmony_ci    LD_SW2(temp_32, 4, e0, e1);
471cabdff1aSopenharmony_ci    ILVR_H2_SW(zero, c1, zero, c2, f0, g0);
472cabdff1aSopenharmony_ci    ILVL_H2_SW(zero, c1, zero, c2, f1, g1);
473cabdff1aSopenharmony_ci    f0 += e0;
474cabdff1aSopenharmony_ci    f1 += e1;
475cabdff1aSopenharmony_ci    g0 -= e0;
476cabdff1aSopenharmony_ci    g1 -= e1;
477cabdff1aSopenharmony_ci    CLIP_SW4_0_255(f0, f1, g0, g1);
478cabdff1aSopenharmony_ci    VSHF_B2_SB(f0, f1, g0, g1, mask, mask, d1, d2);
479cabdff1aSopenharmony_ci
480cabdff1aSopenharmony_ci    /* Final move to first_pixel */
481cabdff1aSopenharmony_ci    ST_D1(d1, 0, first_pixel + nstride);
482cabdff1aSopenharmony_ci    ST_D1(d2, 0, first_pixel);
483cabdff1aSopenharmony_ci}
484cabdff1aSopenharmony_ci
485cabdff1aSopenharmony_civoid ff_vp3_h_loop_filter_msa(uint8_t *first_pixel, ptrdiff_t stride,
486cabdff1aSopenharmony_ci                              int *bounding_values)
487cabdff1aSopenharmony_ci{
488cabdff1aSopenharmony_ci    v16i8 d0, d1, d2, d3, d4, d5, d6, d7;
489cabdff1aSopenharmony_ci    v8i16 c0, c1, c2, c3, c4, c5, c6, c7;
490cabdff1aSopenharmony_ci    v8i16 r0;
491cabdff1aSopenharmony_ci    v4i32 e0, e1, f0, f1, g0, g1;
492cabdff1aSopenharmony_ci    v16i8 zero = {0};
493cabdff1aSopenharmony_ci    v8i16 cnst3h = {3, 3, 3, 3, 3, 3, 3, 3},
494cabdff1aSopenharmony_ci          cnst4h = {4, 4, 4, 4, 4, 4, 4, 4};
495cabdff1aSopenharmony_ci    v16i8 mask = {0, 16, 4, 20, 8, 24, 12, 28, 0, 0, 0, 0, 0, 0, 0, 0};
496cabdff1aSopenharmony_ci    int16_t temp_16[8];
497cabdff1aSopenharmony_ci    int temp_32[8];
498cabdff1aSopenharmony_ci
499cabdff1aSopenharmony_ci    LD_SB8(first_pixel - 2, stride, d0, d1, d2, d3, d4, d5, d6, d7);
500cabdff1aSopenharmony_ci    ILVR_B4_SH(zero, d0, zero, d1, zero, d2, zero, d3,
501cabdff1aSopenharmony_ci               c0, c1, c2, c3);
502cabdff1aSopenharmony_ci    ILVR_B4_SH(zero, d4, zero, d5, zero, d6, zero, d7,
503cabdff1aSopenharmony_ci               c4, c5, c6, c7);
504cabdff1aSopenharmony_ci    TRANSPOSE8x8_SH_SH(c0, c1, c2, c3, c4, c5, c6, c7,
505cabdff1aSopenharmony_ci                       c0, c1, c2, c3, c4, c5, c6, c7);
506cabdff1aSopenharmony_ci    r0 = (c0 - c3) + (c2 - c1) * cnst3h;
507cabdff1aSopenharmony_ci    r0 += cnst4h;
508cabdff1aSopenharmony_ci    r0 = r0 >> 3;
509cabdff1aSopenharmony_ci
510cabdff1aSopenharmony_ci    /* Get filter_value from bounding_values one by one */
511cabdff1aSopenharmony_ci    ST_SH(r0, temp_16);
512cabdff1aSopenharmony_ci    for (int i = 0; i < 8; i++)
513cabdff1aSopenharmony_ci        temp_32[i] = bounding_values[temp_16[i]];
514cabdff1aSopenharmony_ci    LD_SW2(temp_32, 4, e0, e1);
515cabdff1aSopenharmony_ci    ILVR_H2_SW(zero, c1, zero, c2, f0, g0);
516cabdff1aSopenharmony_ci    ILVL_H2_SW(zero, c1, zero, c2, f1, g1);
517cabdff1aSopenharmony_ci    f0 += e0;
518cabdff1aSopenharmony_ci    f1 += e1;
519cabdff1aSopenharmony_ci    g0 -= e0;
520cabdff1aSopenharmony_ci    g1 -= e1;
521cabdff1aSopenharmony_ci    CLIP_SW4_0_255(f0, f1, g0, g1);
522cabdff1aSopenharmony_ci    VSHF_B2_SB(f0, g0, f1, g1, mask, mask, d1, d2);
523cabdff1aSopenharmony_ci    /* Final move to first_pixel */
524cabdff1aSopenharmony_ci    ST_H4(d1, 0, 1, 2, 3, first_pixel - 1, stride);
525cabdff1aSopenharmony_ci    ST_H4(d2, 0, 1, 2, 3, first_pixel - 1 + 4 * stride, stride);
526cabdff1aSopenharmony_ci}
527cabdff1aSopenharmony_ci
528cabdff1aSopenharmony_civoid ff_put_no_rnd_pixels_l2_msa(uint8_t *dst, const uint8_t *src1,
529cabdff1aSopenharmony_ci                                 const uint8_t *src2, ptrdiff_t stride, int h)
530cabdff1aSopenharmony_ci{
531cabdff1aSopenharmony_ci    if (h == 8) {
532cabdff1aSopenharmony_ci        v16i8 d0, d1, d2, d3, d4, d5, d6, d7;
533cabdff1aSopenharmony_ci        v16i8 c0, c1, c2, c3;
534cabdff1aSopenharmony_ci        v4i32 a0, a1, a2, a3, b0, b1, b2, b3;
535cabdff1aSopenharmony_ci        v4i32 e0, e1, e2;
536cabdff1aSopenharmony_ci        v4i32 f0, f1, f2;
537cabdff1aSopenharmony_ci        v4u32 t0, t1, t2, t3;
538cabdff1aSopenharmony_ci        v16i8 mask = {0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23};
539cabdff1aSopenharmony_ci        int32_t value = 0xfefefefe;
540cabdff1aSopenharmony_ci        v4i32 fmask = {value, value, value, value};
541cabdff1aSopenharmony_ci
542cabdff1aSopenharmony_ci        LD_SB8(src1, stride, d0, d1, d2, d3, d4, d5, d6, d7);
543cabdff1aSopenharmony_ci        VSHF_B2_SB(d0, d1, d2, d3, mask, mask, c0, c1);
544cabdff1aSopenharmony_ci        VSHF_B2_SB(d4, d5, d6, d7, mask, mask, c2, c3);
545cabdff1aSopenharmony_ci        a0 = (v4i32) __msa_pckev_d((v2i64)c1, (v2i64)c0);
546cabdff1aSopenharmony_ci        a2 = (v4i32) __msa_pckod_d((v2i64)c1, (v2i64)c0);
547cabdff1aSopenharmony_ci        a1 = (v4i32) __msa_pckev_d((v2i64)c3, (v2i64)c2);
548cabdff1aSopenharmony_ci        a3 = (v4i32) __msa_pckod_d((v2i64)c3, (v2i64)c2);
549cabdff1aSopenharmony_ci
550cabdff1aSopenharmony_ci        LD_SB8(src2, stride, d0, d1, d2, d3, d4, d5, d6, d7);
551cabdff1aSopenharmony_ci        VSHF_B2_SB(d0, d1, d2, d3, mask, mask, c0, c1);
552cabdff1aSopenharmony_ci        VSHF_B2_SB(d4, d5, d6, d7, mask, mask, c2, c3);
553cabdff1aSopenharmony_ci        b0 = (v4i32) __msa_pckev_d((v2i64)c1, (v2i64)c0);
554cabdff1aSopenharmony_ci        b2 = (v4i32) __msa_pckod_d((v2i64)c1, (v2i64)c0);
555cabdff1aSopenharmony_ci        b1 = (v4i32) __msa_pckev_d((v2i64)c3, (v2i64)c2);
556cabdff1aSopenharmony_ci        b3 = (v4i32) __msa_pckod_d((v2i64)c3, (v2i64)c2);
557cabdff1aSopenharmony_ci
558cabdff1aSopenharmony_ci        e0 = (v4i32) __msa_xor_v((v16u8)a0, (v16u8)b0);
559cabdff1aSopenharmony_ci        e0 = (v4i32) __msa_and_v((v16u8)e0, (v16u8)fmask);
560cabdff1aSopenharmony_ci        t0 = ((v4u32)e0) >> 1;
561cabdff1aSopenharmony_ci        e2 = (v4i32) __msa_and_v((v16u8)a0, (v16u8)b0);
562cabdff1aSopenharmony_ci        t0 = t0 + (v4u32)e2;
563cabdff1aSopenharmony_ci
564cabdff1aSopenharmony_ci        e1 = (v4i32) __msa_xor_v((v16u8)a1, (v16u8)b1);
565cabdff1aSopenharmony_ci        e1 = (v4i32) __msa_and_v((v16u8)e1, (v16u8)fmask);
566cabdff1aSopenharmony_ci        t1 = ((v4u32)e1) >> 1;
567cabdff1aSopenharmony_ci        e2 = (v4i32) __msa_and_v((v16u8)a1, (v16u8)b1);
568cabdff1aSopenharmony_ci        t1 = t1 + (v4u32)e2;
569cabdff1aSopenharmony_ci
570cabdff1aSopenharmony_ci        f0 = (v4i32) __msa_xor_v((v16u8)a2, (v16u8)b2);
571cabdff1aSopenharmony_ci        f0 = (v4i32) __msa_and_v((v16u8)f0, (v16u8)fmask);
572cabdff1aSopenharmony_ci        t2 = ((v4u32)f0) >> 1;
573cabdff1aSopenharmony_ci        f2 = (v4i32) __msa_and_v((v16u8)a2, (v16u8)b2);
574cabdff1aSopenharmony_ci        t2 = t2 + (v4u32)f2;
575cabdff1aSopenharmony_ci
576cabdff1aSopenharmony_ci        f1 = (v4i32) __msa_xor_v((v16u8)a3, (v16u8)b3);
577cabdff1aSopenharmony_ci        f1 = (v4i32) __msa_and_v((v16u8)f1, (v16u8)fmask);
578cabdff1aSopenharmony_ci        t3 = ((v4u32)f1) >> 1;
579cabdff1aSopenharmony_ci        f2 = (v4i32) __msa_and_v((v16u8)a3, (v16u8)b3);
580cabdff1aSopenharmony_ci        t3 = t3 + (v4u32)f2;
581cabdff1aSopenharmony_ci
582cabdff1aSopenharmony_ci        ST_W8(t0, t1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
583cabdff1aSopenharmony_ci        ST_W8(t2, t3, 0, 1, 2, 3, 0, 1, 2, 3, dst + 4, stride);
584cabdff1aSopenharmony_ci    } else {
585cabdff1aSopenharmony_ci        int i;
586cabdff1aSopenharmony_ci
587cabdff1aSopenharmony_ci        for (i = 0; i < h; i++) {
588cabdff1aSopenharmony_ci            uint32_t a, b;
589cabdff1aSopenharmony_ci
590cabdff1aSopenharmony_ci            a = AV_RN32(&src1[i * stride]);
591cabdff1aSopenharmony_ci            b = AV_RN32(&src2[i * stride]);
592cabdff1aSopenharmony_ci            AV_WN32A(&dst[i * stride], no_rnd_avg32(a, b));
593cabdff1aSopenharmony_ci            a = AV_RN32(&src1[i * stride + 4]);
594cabdff1aSopenharmony_ci            b = AV_RN32(&src2[i * stride + 4]);
595cabdff1aSopenharmony_ci            AV_WN32A(&dst[i * stride + 4], no_rnd_avg32(a, b));
596cabdff1aSopenharmony_ci        }
597cabdff1aSopenharmony_ci    }
598cabdff1aSopenharmony_ci}
599