1cabdff1aSopenharmony_ci/*
2cabdff1aSopenharmony_ci * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
3cabdff1aSopenharmony_ci *
4cabdff1aSopenharmony_ci * This file is part of FFmpeg.
5cabdff1aSopenharmony_ci *
6cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or
7cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public
8cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either
9cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version.
10cabdff1aSopenharmony_ci *
11cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful,
12cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of
13cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14cabdff1aSopenharmony_ci * Lesser General Public License for more details.
15cabdff1aSopenharmony_ci *
16cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public
17cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software
18cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19cabdff1aSopenharmony_ci */
20cabdff1aSopenharmony_ci
21cabdff1aSopenharmony_ci#include "libavutil/mips/generic_macros_msa.h"
22cabdff1aSopenharmony_ci#include "idctdsp_mips.h"
23cabdff1aSopenharmony_ci
/*
 * simple_idct_msa: in-place 8x8 inverse DCT of `block`, written with MIPS MSA
 * vector intrinsics and the helper macros from generic_macros_msa.h.
 *
 * Shape of the computation, as visible below:
 *   1. load eight v8i16 vectors from `block` and transpose them;
 *   2. first 1-D pass: rounding term 1 << 10, arithmetic shift right by 11,
 *      with a shortcut value (in0 << 3) selected through select_vec;
 *   3. transpose again;
 *   4. second 1-D pass: rounding term const_val1, arithmetic shift right by 20;
 *   5. store the eight packed result vectors back to `block`.
 *
 * block: coefficient buffer, read and then overwritten. LD_SH8(block, 8, ...)
 *        presumably reads 8 rows of 8 int16_t -- TODO confirm the stride unit
 *        and any alignment requirement in generic_macros_msa.h.
 *
 * NOTE(review): every line of this copy starts with a "<n>cabdff1aSopenharmony_ci"
 * token that looks like extraction residue; it is not valid C and presumably
 * has to be stripped before the file can be compiled.
 *
 * NOTE(review): the exact lane/operand semantics of the helper macros
 * (BUTTERFLY_8/16, MULn, ADDn, SUBn, DOTP/DPADD, PCKEV, ...) are not visible
 * here; comments that rely on them are hedged and should be checked against
 * the macro definitions.
 */
24cabdff1aSopenharmony_cistatic void simple_idct_msa(int16_t *block)
25cabdff1aSopenharmony_ci{
26cabdff1aSopenharmony_ci    int32_t const_val;
    /* Lane k holds the weight later broadcast as wk (k = 1..7); lane 0 is
     * never broadcast in this function. */
27cabdff1aSopenharmony_ci    v8i16 weights = { 0, 22725, 21407, 19266, 16383, 12873, 8867, 4520 };
28cabdff1aSopenharmony_ci    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
29cabdff1aSopenharmony_ci    v8i16 w1, w3, w5, w7;
30cabdff1aSopenharmony_ci    v8i16 const0, const1, const2, const3, const4, const5, const6, const7;
31cabdff1aSopenharmony_ci    v4i32 temp0_r, temp1_r, temp2_r, temp3_r;
32cabdff1aSopenharmony_ci    v4i32 temp0_l, temp1_l, temp2_l, temp3_l;
33cabdff1aSopenharmony_ci    v4i32 a0_r, a1_r, a2_r, a3_r, a0_l, a1_l, a2_l, a3_l;
34cabdff1aSopenharmony_ci    v4i32 b0_r, b1_r, b2_r, b3_r, b0_l, b1_l, b2_l, b3_l;
35cabdff1aSopenharmony_ci    v4i32 w2, w4, w6;
36cabdff1aSopenharmony_ci    v8i16 select_vec, temp;
37cabdff1aSopenharmony_ci    v8i16 zero = { 0 };
    /* Both start as a vector of 1s; const_val0 becomes the first-pass rounding
     * term, const_val1 is fully overwritten below before it is used. */
38cabdff1aSopenharmony_ci    v4i32 const_val0 = __msa_ldi_w(1);
39cabdff1aSopenharmony_ci    v4i32 const_val1 = __msa_ldi_w(1);
40cabdff1aSopenharmony_ci
41cabdff1aSopenharmony_ci    LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7);
    /* Every lane = 1 << 10, i.e. half of 1 << 11: rounding for the >> 11 of
     * the first pass. */
42cabdff1aSopenharmony_ci    const_val0 <<= 10;
    /* (1 << 19) / 16383 is 32 in integer arithmetic, so const_val is
     * 16383 * 32 = 524256: the rounding term added before the >> 20 of the
     * second pass, expressed as a multiple of weights[4]. */
43cabdff1aSopenharmony_ci    const_val = 16383 * ((1 << 19) / 16383);
    /* Put const_val into lane 0, then broadcast lane 0 to all four lanes. */
44cabdff1aSopenharmony_ci    const_val1 = __msa_insert_w(const_val0, 0, const_val);
45cabdff1aSopenharmony_ci    const_val1 = __msa_splati_w(const_val1, 0);
    /* In-place 8x8 transpose of the loaded vectors (outputs alias inputs). */
46cabdff1aSopenharmony_ci    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
47cabdff1aSopenharmony_ci                       in0, in1, in2, in3, in4, in5, in6, in7);
    /* Per-lane shortcut mask: OR together in1..in7, then test each unsigned
     * halfword lane for "< 1", i.e. "== 0". Lanes where all of in1..in7 are
     * zero presumably become all-ones (per MSA clti_u semantics). */
48cabdff1aSopenharmony_ci    select_vec = in1 | in2 | in3 | in4 | in5 | in6 | in7;
49cabdff1aSopenharmony_ci    select_vec = __msa_clti_u_h((v8u16) select_vec, 1);
    /* Widen in0 and in2 to 32-bit lanes (right/left halves). */
50cabdff1aSopenharmony_ci    UNPCK_SH_SW(in0, a0_r, a0_l);
51cabdff1aSopenharmony_ci    UNPCK_SH_SW(in2, temp3_r, temp3_l);
    /* Shortcut result used for lanes flagged in select_vec. */
52cabdff1aSopenharmony_ci    temp = in0 << 3;
    /* w2/w4/w6: broadcast weights[2|4|6] to all halfwords, then interleave with
     * zero halfwords so that each 32-bit lane presumably holds the weight
     * zero-extended (all three weights are positive). */
53cabdff1aSopenharmony_ci    w2 = (v4i32) __msa_splati_h(weights, 2);
54cabdff1aSopenharmony_ci    w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2);
55cabdff1aSopenharmony_ci    w4 = (v4i32) __msa_splati_h(weights, 4);
56cabdff1aSopenharmony_ci    w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4);
57cabdff1aSopenharmony_ci    w6 = (v4i32) __msa_splati_h(weights, 6);
58cabdff1aSopenharmony_ci    w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6);
    /* Even part, step 1: temp0 = in0 * w4 + rounding; temp1 = w2 * in2;
     * temp2 = w6 * in2. BUTTERFLY_8 then presumably forms the sums
     * (a0 = temp0 + temp1, a1 = temp0 + temp2) and the differences
     * (a2 = temp0 - temp2, a3 = temp0 - temp1) -- confirm in the macro. */
59cabdff1aSopenharmony_ci    MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
60cabdff1aSopenharmony_ci    ADD2(a0_r, const_val0, a0_l, const_val0, temp0_r, temp0_l);
61cabdff1aSopenharmony_ci    MUL4(w2, temp3_r, w2, temp3_l, w6, temp3_r, w6, temp3_l,
62cabdff1aSopenharmony_ci         temp1_r, temp1_l, temp2_r, temp2_l);
63cabdff1aSopenharmony_ci    BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
64cabdff1aSopenharmony_ci                temp2_l, temp2_r, temp1_l, temp1_r,
65cabdff1aSopenharmony_ci                a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
    /* Even part, step 2: fold in the in4 and in6 terms. Note the swapped
     * outputs compared with step 1: here temp2 = w2 * in6, temp1 = w6 * in6. */
66cabdff1aSopenharmony_ci    UNPCK_SH_SW(in4, temp0_r, temp0_l);
67cabdff1aSopenharmony_ci    UNPCK_SH_SW(in6, temp3_r, temp3_l);
68cabdff1aSopenharmony_ci    MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l);
69cabdff1aSopenharmony_ci    MUL4(w2, temp3_r, w2, temp3_l, w6, temp3_r, w6, temp3_l,
70cabdff1aSopenharmony_ci         temp2_r, temp2_l, temp1_r, temp1_l);
    /* Going by the macro names: a0 += temp0 + temp1, a1 -= temp0 + temp2,
     * a2 += temp2 - temp0, a3 += temp0 - temp1. */
71cabdff1aSopenharmony_ci    ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l);
72cabdff1aSopenharmony_ci    SUB4(a1_r, temp0_r, a1_l, temp0_l, a2_r, temp0_r, a2_l, temp0_l,
73cabdff1aSopenharmony_ci         a1_r, a1_l, a2_r, a2_l);
74cabdff1aSopenharmony_ci    ADD4(a3_r, temp0_r, a3_l, temp0_l, a0_r, temp1_r, a0_l, temp1_l,
75cabdff1aSopenharmony_ci         a3_r, a3_l, a0_r, a0_l);
76cabdff1aSopenharmony_ci    SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l);
77cabdff1aSopenharmony_ci    ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l);
78cabdff1aSopenharmony_ci    SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l);
    /* Odd part: (in1, in3) and (in5, in7) are interleaved as halfword pairs and
     * combined with weight pairs built from +/-w1, +/-w3, +/-w5, +/-w7.
     * const0..const3 are applied to the (in1, in3) pairs, const4..const7 are
     * accumulated from the (in5, in7) pairs. */
79cabdff1aSopenharmony_ci    ILVRL_H2_SW(in1, in3, b3_r, b3_l);
80cabdff1aSopenharmony_ci    SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7);
81cabdff1aSopenharmony_ci    ILVRL_H2_SW(in5, in7, temp0_r, temp0_l);
82cabdff1aSopenharmony_ci    ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5,
83cabdff1aSopenharmony_ci               const0, const1, const2, const3);
84cabdff1aSopenharmony_ci    ILVR_H2_SH(w5, w7, w7, w3, const4, const6);
85cabdff1aSopenharmony_ci    const5 = __msa_ilvod_h(-w1, -w5);
86cabdff1aSopenharmony_ci    const7 = __msa_ilvod_h(w3, -w1);
    /* b3_r / b3_l serve as both the interleaved input and the last output of
     * the DOTP calls, so the argument order here must not be disturbed. */
87cabdff1aSopenharmony_ci    DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
88cabdff1aSopenharmony_ci                b0_r, b1_r, b2_r, b3_r);
89cabdff1aSopenharmony_ci    DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r,
90cabdff1aSopenharmony_ci                 const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
91cabdff1aSopenharmony_ci    DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
92cabdff1aSopenharmony_ci                b0_l, b1_l, b2_l, b3_l);
93cabdff1aSopenharmony_ci    DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l,
94cabdff1aSopenharmony_ci                 const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
    /* Combine even (a) and odd (b) parts: temp0..temp3 presumably receive
     * a_k + b_k and a0..a3 are overwritten with a_k - b_k -- confirm against
     * BUTTERFLY_16. */
95cabdff1aSopenharmony_ci    BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
96cabdff1aSopenharmony_ci                 b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
97cabdff1aSopenharmony_ci                 temp0_r, temp0_l, temp1_r, temp1_l,
98cabdff1aSopenharmony_ci                 temp2_r, temp2_l, temp3_r, temp3_l,
99cabdff1aSopenharmony_ci                 a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
    /* First-pass scaling: arithmetic shift right by 11, then pack the 32-bit
     * r/l halves back into one vector of halfwords per output. */
100cabdff1aSopenharmony_ci    SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 11);
101cabdff1aSopenharmony_ci    SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 11);
102cabdff1aSopenharmony_ci    PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r,
103cabdff1aSopenharmony_ci                temp2_l, temp2_r, temp3_l, temp3_r,
104cabdff1aSopenharmony_ci                temp0_r, temp1_r, temp2_r, temp3_r);
    /* Where select_vec is set, replace the computed value with the shortcut
     * value temp (in0 << 3); elsewhere keep the computed value (bmnz.v). */
105cabdff1aSopenharmony_ci    in0 = (v8i16) __msa_bmnz_v((v16u8) temp0_r, (v16u8) temp,
106cabdff1aSopenharmony_ci                               (v16u8) select_vec);
107cabdff1aSopenharmony_ci    in1 = (v8i16) __msa_bmnz_v((v16u8) temp1_r, (v16u8) temp,
108cabdff1aSopenharmony_ci                               (v16u8) select_vec);
109cabdff1aSopenharmony_ci    in2 = (v8i16) __msa_bmnz_v((v16u8) temp2_r, (v16u8) temp,
110cabdff1aSopenharmony_ci                               (v16u8) select_vec);
111cabdff1aSopenharmony_ci    in3 = (v8i16) __msa_bmnz_v((v16u8) temp3_r, (v16u8) temp,
112cabdff1aSopenharmony_ci                               (v16u8) select_vec);
113cabdff1aSopenharmony_ci    SRA_4V(a3_r, a3_l, a2_r, a2_l, 11);
114cabdff1aSopenharmony_ci    SRA_4V(a1_r, a1_l, a0_r, a0_l, 11);
115cabdff1aSopenharmony_ci    PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
116cabdff1aSopenharmony_ci                a0_r, a1_r, a2_r, a3_r);
    /* Outputs 4..7 take the difference terms in reverse order (a3 .. a0). */
117cabdff1aSopenharmony_ci    in4 = (v8i16) __msa_bmnz_v((v16u8) a3_r, (v16u8) temp, (v16u8) select_vec);
118cabdff1aSopenharmony_ci    in5 = (v8i16) __msa_bmnz_v((v16u8) a2_r, (v16u8) temp, (v16u8) select_vec);
119cabdff1aSopenharmony_ci    in6 = (v8i16) __msa_bmnz_v((v16u8) a1_r, (v16u8) temp, (v16u8) select_vec);
120cabdff1aSopenharmony_ci    in7 = (v8i16) __msa_bmnz_v((v16u8) a0_r, (v16u8) temp, (v16u8) select_vec);
    /* Transpose again so the second pass runs along the other dimension. */
121cabdff1aSopenharmony_ci    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
122cabdff1aSopenharmony_ci                       in0, in1, in2, in3, in4, in5, in6, in7);
123cabdff1aSopenharmony_ci
    /* Second pass: same even/odd structure as the first pass, but with
     * rounding term const_val1, shift by 20, and no shortcut selection.
     * w2/w4/w6 are rebuilt with the same values as above. */
124cabdff1aSopenharmony_ci    UNPCK_SH_SW(in0, a0_r, a0_l);
125cabdff1aSopenharmony_ci    UNPCK_SH_SW(in2, temp3_r, temp3_l);
126cabdff1aSopenharmony_ci    w2 = (v4i32) __msa_splati_h(weights, 2);
127cabdff1aSopenharmony_ci    w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2);
128cabdff1aSopenharmony_ci    w4 = (v4i32) __msa_splati_h(weights, 4);
129cabdff1aSopenharmony_ci    w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4);
130cabdff1aSopenharmony_ci    w6 = (v4i32) __msa_splati_h(weights, 6);
131cabdff1aSopenharmony_ci    w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6);
132cabdff1aSopenharmony_ci    MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
133cabdff1aSopenharmony_ci    ADD2(a0_r, const_val1, a0_l, const_val1, temp0_r, temp0_l);
134cabdff1aSopenharmony_ci    MUL4(w2, temp3_r, w2, temp3_l, w6, temp3_r, w6, temp3_l,
135cabdff1aSopenharmony_ci         temp1_r, temp1_l, temp2_r, temp2_l);
136cabdff1aSopenharmony_ci    BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
137cabdff1aSopenharmony_ci                temp2_l, temp2_r, temp1_l, temp1_r,
138cabdff1aSopenharmony_ci                a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
139cabdff1aSopenharmony_ci    UNPCK_SH_SW(in4, temp0_r, temp0_l);
140cabdff1aSopenharmony_ci    UNPCK_SH_SW(in6, temp3_r, temp3_l);
141cabdff1aSopenharmony_ci    MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l);
142cabdff1aSopenharmony_ci    MUL4(w2, temp3_r, w2, temp3_l, w6, temp3_r, w6, temp3_l,
143cabdff1aSopenharmony_ci         temp2_r, temp2_l, temp1_r, temp1_l);
144cabdff1aSopenharmony_ci    ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l);
145cabdff1aSopenharmony_ci    SUB4(a1_r, temp0_r, a1_l, temp0_l, a2_r, temp0_r, a2_l, temp0_l,
146cabdff1aSopenharmony_ci         a1_r, a1_l, a2_r, a2_l);
147cabdff1aSopenharmony_ci    ADD4(a3_r, temp0_r, a3_l, temp0_l, a0_r, temp1_r, a0_l, temp1_l,
148cabdff1aSopenharmony_ci         a3_r, a3_l, a0_r, a0_l);
149cabdff1aSopenharmony_ci    SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l);
150cabdff1aSopenharmony_ci    ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l);
151cabdff1aSopenharmony_ci    SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l);
    /* Odd part of the second pass; here both DOTP calls run before the
     * (in5, in7) pairs are interleaved into temp0 and accumulated. */
152cabdff1aSopenharmony_ci    ILVRL_H2_SW(in1, in3, b3_r, b3_l);
153cabdff1aSopenharmony_ci    SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7);
154cabdff1aSopenharmony_ci    ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5,
155cabdff1aSopenharmony_ci               const0, const1, const2, const3);
156cabdff1aSopenharmony_ci    DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
157cabdff1aSopenharmony_ci                b0_r, b1_r, b2_r, b3_r);
158cabdff1aSopenharmony_ci    DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
159cabdff1aSopenharmony_ci                b0_l, b1_l, b2_l, b3_l);
160cabdff1aSopenharmony_ci    ILVRL_H2_SW(in5, in7, temp0_r, temp0_l);
161cabdff1aSopenharmony_ci    ILVR_H2_SH(w5, w7, w7, w3, const4, const6);
162cabdff1aSopenharmony_ci    const5 = __msa_ilvod_h(-w1, -w5);
163cabdff1aSopenharmony_ci    const7 = __msa_ilvod_h(w3, -w1);
164cabdff1aSopenharmony_ci    DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r,
165cabdff1aSopenharmony_ci                 const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
166cabdff1aSopenharmony_ci    DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l,
167cabdff1aSopenharmony_ci                 const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
168cabdff1aSopenharmony_ci    BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
169cabdff1aSopenharmony_ci                 b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
170cabdff1aSopenharmony_ci                 temp0_r, temp0_l, temp1_r, temp1_l,
171cabdff1aSopenharmony_ci                 temp2_r, temp2_l, temp3_r, temp3_l,
172cabdff1aSopenharmony_ci                 a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
    /* Second-pass scaling: arithmetic shift right by 20, then pack to
     * halfwords. */
173cabdff1aSopenharmony_ci    SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 20);
174cabdff1aSopenharmony_ci    SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 20);
175cabdff1aSopenharmony_ci    PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r, temp2_l, temp2_r,
176cabdff1aSopenharmony_ci                temp3_l, temp3_r, temp0_r, temp1_r, temp2_r, temp3_r);
177cabdff1aSopenharmony_ci    SRA_4V(a3_r, a3_l, a2_r, a2_l, 20);
178cabdff1aSopenharmony_ci    SRA_4V(a1_r, a1_l, a0_r, a0_l, 20);
179cabdff1aSopenharmony_ci    PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
180cabdff1aSopenharmony_ci                a0_r, a1_r, a2_r, a3_r);
    /* Write back: sum terms first (temp0..temp3), then the difference terms in
     * reverse order (a3 .. a0), with the same stride argument as the load. */
181cabdff1aSopenharmony_ci    ST_SW8(temp0_r, temp1_r, temp2_r, temp3_r, a3_r, a2_r, a1_r, a0_r,
182cabdff1aSopenharmony_ci           block, 8);
183cabdff1aSopenharmony_ci}
184cabdff1aSopenharmony_ci
/*
 * simple_idct_put_msa: 8x8 inverse DCT of `block` followed by clipping and an
 * 8x8 byte store to `dst`, written with MIPS MSA intrinsics and the helper
 * macros from generic_macros_msa.h.
 *
 * The transform itself has the same two-pass structure as simple_idct_msa in
 * this file (first pass: rounding 1 << 10, >> 11, with the in0 << 3 shortcut;
 * second pass: rounding const_val1, >> 20). Instead of writing the result back
 * to `block`, the results are clipped with CLIP_SH4_0_255, packed to bytes and
 * stored as eight 64-bit values, one per `dst` row.
 *
 * dst:        destination pixel buffer; eight rows are written via SD4, the
 *             second group starting at dst + 4 * dst_stride.
 * dst_stride: distance between consecutive destination rows, as passed to SD4
 *             (presumably in bytes -- TODO confirm in the macro).
 * block:      coefficient buffer; only read in the code visible here.
 *
 * NOTE(review): every line of this copy starts with a "<n>cabdff1aSopenharmony_ci"
 * token that looks like extraction residue; it is not valid C and presumably
 * has to be stripped before the file can be compiled.
 *
 * NOTE(review): lane/operand semantics of the helper macros are not visible
 * here; comments relying on them are hedged.
 */
185cabdff1aSopenharmony_cistatic void simple_idct_put_msa(uint8_t *dst, int32_t dst_stride,
186cabdff1aSopenharmony_ci                                int16_t *block)
187cabdff1aSopenharmony_ci{
188cabdff1aSopenharmony_ci    int32_t const_val;
189cabdff1aSopenharmony_ci    uint64_t tmp0, tmp1, tmp2, tmp3;
    /* Lane k holds the weight later broadcast as wk (k = 1..7); lane 0 is
     * never broadcast in this function. */
190cabdff1aSopenharmony_ci    v8i16 weights = { 0, 22725, 21407, 19266, 16383, 12873, 8867, 4520 };
191cabdff1aSopenharmony_ci    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
192cabdff1aSopenharmony_ci    v8i16 w1, w3, w5, w7;
193cabdff1aSopenharmony_ci    v8i16 const0, const1, const2, const3, const4, const5, const6, const7;
194cabdff1aSopenharmony_ci    v4i32 temp0_r, temp1_r, temp2_r, temp3_r;
195cabdff1aSopenharmony_ci    v4i32 temp0_l, temp1_l, temp2_l, temp3_l;
196cabdff1aSopenharmony_ci    v4i32 a0_r, a1_r, a2_r, a3_r, a0_l, a1_l, a2_l, a3_l;
197cabdff1aSopenharmony_ci    v4i32 b0_r, b1_r, b2_r, b3_r, b0_l, b1_l, b2_l, b3_l;
198cabdff1aSopenharmony_ci    v4i32 w2, w4, w6;
199cabdff1aSopenharmony_ci    v8i16 select_vec, temp;
200cabdff1aSopenharmony_ci    v8i16 zero = { 0 };
    /* Both start as a vector of 1s; const_val1 is fully overwritten below
     * before it is used. */
201cabdff1aSopenharmony_ci    v4i32 const_val0 = __msa_ldi_w(1);
202cabdff1aSopenharmony_ci    v4i32 const_val1 = __msa_ldi_w(1);
203cabdff1aSopenharmony_ci
204cabdff1aSopenharmony_ci    LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7);
    /* Every lane = 1 << 10: rounding for the >> 11 of the first pass. */
205cabdff1aSopenharmony_ci    const_val0 <<= 10;
    /* (1 << 19) / 16383 is 32 in integer arithmetic, so const_val is
     * 16383 * 32 = 524256: rounding term for the >> 20 of the second pass. */
206cabdff1aSopenharmony_ci    const_val = 16383 * ((1 << 19) / 16383);
    /* Put const_val into lane 0, then broadcast lane 0 to all four lanes. */
207cabdff1aSopenharmony_ci    const_val1 = __msa_insert_w(const_val0, 0, const_val);
208cabdff1aSopenharmony_ci    const_val1 = __msa_splati_w(const_val1, 0);
209cabdff1aSopenharmony_ci    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
210cabdff1aSopenharmony_ci                       in0, in1, in2, in3, in4, in5, in6, in7);
    /* Per-lane shortcut mask: lanes where in1..in7 are all zero compare as
     * "< 1" and presumably become all-ones (per MSA clti_u semantics). */
211cabdff1aSopenharmony_ci    select_vec = in1 | in2 | in3 | in4 | in5 | in6 | in7;
212cabdff1aSopenharmony_ci    select_vec = __msa_clti_u_h((v8u16) select_vec, 1);
213cabdff1aSopenharmony_ci    UNPCK_SH_SW(in0, a0_r, a0_l);
214cabdff1aSopenharmony_ci    UNPCK_SH_SW(in2, temp3_r, temp3_l);
    /* Shortcut result used for lanes flagged in select_vec. */
215cabdff1aSopenharmony_ci    temp = in0 << 3;
    /* w2/w4/w6: broadcast weights[2|4|6], then interleave with zero halfwords
     * so each 32-bit lane presumably holds the weight zero-extended. */
216cabdff1aSopenharmony_ci    w2 = (v4i32) __msa_splati_h(weights, 2);
217cabdff1aSopenharmony_ci    w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2);
218cabdff1aSopenharmony_ci    w4 = (v4i32) __msa_splati_h(weights, 4);
219cabdff1aSopenharmony_ci    w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4);
220cabdff1aSopenharmony_ci    w6 = (v4i32) __msa_splati_h(weights, 6);
221cabdff1aSopenharmony_ci    w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6);
    /* Even part, step 1: temp0 = in0 * w4 + rounding; temp1 = w2 * in2;
     * temp2 = w6 * in2; BUTTERFLY_8 presumably forms temp0 +/- temp1 and
     * temp0 +/- temp2 into a0..a3 -- confirm in the macro. */
222cabdff1aSopenharmony_ci    MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
223cabdff1aSopenharmony_ci    ADD2(a0_r, const_val0, a0_l, const_val0, temp0_r, temp0_l);
224cabdff1aSopenharmony_ci    MUL2(w2, temp3_r, w2, temp3_l, temp1_r, temp1_l);
225cabdff1aSopenharmony_ci    MUL2(w6, temp3_r, w6, temp3_l, temp2_r, temp2_l);
226cabdff1aSopenharmony_ci    BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
227cabdff1aSopenharmony_ci                temp2_l, temp2_r, temp1_l, temp1_r,
228cabdff1aSopenharmony_ci                a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
    /* Even part, step 2: fold in the in4 and in6 terms. Note the swapped
     * outputs compared with step 1: here temp2 = w2 * in6, temp1 = w6 * in6. */
229cabdff1aSopenharmony_ci    UNPCK_SH_SW(in4, temp0_r, temp0_l);
230cabdff1aSopenharmony_ci    UNPCK_SH_SW(in6, temp3_r, temp3_l);
231cabdff1aSopenharmony_ci    MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l);
232cabdff1aSopenharmony_ci    MUL2(w2, temp3_r, w2, temp3_l, temp2_r, temp2_l);
233cabdff1aSopenharmony_ci    MUL2(w6, temp3_r, w6, temp3_l, temp1_r, temp1_l);
234cabdff1aSopenharmony_ci    ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l);
235cabdff1aSopenharmony_ci    SUB2(a1_r, temp0_r, a1_l, temp0_l, a1_r, a1_l);
236cabdff1aSopenharmony_ci    SUB2(a2_r, temp0_r, a2_l, temp0_l, a2_r, a2_l);
237cabdff1aSopenharmony_ci    ADD2(a3_r, temp0_r, a3_l, temp0_l, a3_r, a3_l);
238cabdff1aSopenharmony_ci    ADD2(a0_r, temp1_r, a0_l, temp1_l, a0_r, a0_l);
239cabdff1aSopenharmony_ci    SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l);
240cabdff1aSopenharmony_ci    ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l);
241cabdff1aSopenharmony_ci    SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l);
    /* Odd part: (in1, in3) and (in5, in7) halfword pairs combined with weight
     * pairs built from +/-w1, +/-w3, +/-w5, +/-w7 into b0..b3. b3_r / b3_l are
     * both the interleaved input and the last DOTP output. */
242cabdff1aSopenharmony_ci    ILVRL_H2_SW(in1, in3, b3_r, b3_l);
243cabdff1aSopenharmony_ci    SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7);
244cabdff1aSopenharmony_ci    ILVRL_H2_SW(in5, in7, temp0_r, temp0_l);
245cabdff1aSopenharmony_ci    ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5,
246cabdff1aSopenharmony_ci               const0, const1, const2, const3);
247cabdff1aSopenharmony_ci    ILVR_H2_SH(w5, w7, w7, w3, const4, const6);
248cabdff1aSopenharmony_ci    const5 = __msa_ilvod_h(-w1, -w5);
249cabdff1aSopenharmony_ci    const7 = __msa_ilvod_h(w3, -w1);
250cabdff1aSopenharmony_ci    DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
251cabdff1aSopenharmony_ci                b0_r, b1_r, b2_r, b3_r);
252cabdff1aSopenharmony_ci    DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r,
253cabdff1aSopenharmony_ci                 const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
254cabdff1aSopenharmony_ci    DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
255cabdff1aSopenharmony_ci                b0_l, b1_l, b2_l, b3_l);
256cabdff1aSopenharmony_ci    DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l,
257cabdff1aSopenharmony_ci                 const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
    /* Combine even (a) and odd (b) parts: temp0..temp3 presumably receive
     * a_k + b_k and a0..a3 are overwritten with a_k - b_k. */
258cabdff1aSopenharmony_ci    BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
259cabdff1aSopenharmony_ci                 b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
260cabdff1aSopenharmony_ci                 temp0_r, temp0_l, temp1_r, temp1_l,
261cabdff1aSopenharmony_ci                 temp2_r, temp2_l, temp3_r, temp3_l,
262cabdff1aSopenharmony_ci                 a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
    /* First-pass scaling (>> 11) and packing back to halfword vectors. */
263cabdff1aSopenharmony_ci    SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 11);
264cabdff1aSopenharmony_ci    SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 11);
265cabdff1aSopenharmony_ci    PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r,
266cabdff1aSopenharmony_ci                temp2_l, temp2_r, temp3_l, temp3_r,
267cabdff1aSopenharmony_ci                temp0_r, temp1_r, temp2_r, temp3_r);
    /* Where select_vec is set, use the shortcut value temp (in0 << 3) instead
     * of the computed value (bmnz.v). */
268cabdff1aSopenharmony_ci    in0 = (v8i16) __msa_bmnz_v((v16u8) temp0_r, (v16u8) temp,
269cabdff1aSopenharmony_ci                               (v16u8) select_vec);
270cabdff1aSopenharmony_ci    in1 = (v8i16) __msa_bmnz_v((v16u8) temp1_r, (v16u8) temp,
271cabdff1aSopenharmony_ci                               (v16u8) select_vec);
272cabdff1aSopenharmony_ci    in2 = (v8i16) __msa_bmnz_v((v16u8) temp2_r, (v16u8) temp,
273cabdff1aSopenharmony_ci                               (v16u8) select_vec);
274cabdff1aSopenharmony_ci    in3 = (v8i16) __msa_bmnz_v((v16u8) temp3_r, (v16u8) temp,
275cabdff1aSopenharmony_ci                               (v16u8) select_vec);
276cabdff1aSopenharmony_ci    SRA_4V(a3_r, a3_l, a2_r, a2_l, 11);
277cabdff1aSopenharmony_ci    SRA_4V(a1_r, a1_l, a0_r, a0_l, 11);
278cabdff1aSopenharmony_ci    PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
279cabdff1aSopenharmony_ci                a0_r, a1_r, a2_r, a3_r);
    /* Outputs 4..7 take the difference terms in reverse order (a3 .. a0). */
280cabdff1aSopenharmony_ci    in4 = (v8i16) __msa_bmnz_v((v16u8) a3_r, (v16u8) temp, (v16u8) select_vec);
281cabdff1aSopenharmony_ci    in5 = (v8i16) __msa_bmnz_v((v16u8) a2_r, (v16u8) temp, (v16u8) select_vec);
282cabdff1aSopenharmony_ci    in6 = (v8i16) __msa_bmnz_v((v16u8) a1_r, (v16u8) temp, (v16u8) select_vec);
283cabdff1aSopenharmony_ci    in7 = (v8i16) __msa_bmnz_v((v16u8) a0_r, (v16u8) temp, (v16u8) select_vec);
    /* Transpose again so the second pass runs along the other dimension. */
284cabdff1aSopenharmony_ci    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
285cabdff1aSopenharmony_ci                       in0, in1, in2, in3, in4, in5, in6, in7);
    /* Second pass: same even/odd structure, rounding term const_val1, shift
     * by 20, no shortcut selection. w2/w4/w6 are rebuilt with the same values. */
286cabdff1aSopenharmony_ci    UNPCK_SH_SW(in0, a0_r, a0_l);
287cabdff1aSopenharmony_ci    UNPCK_SH_SW(in2, temp3_r, temp3_l);
288cabdff1aSopenharmony_ci    w2 = (v4i32) __msa_splati_h(weights, 2);
289cabdff1aSopenharmony_ci    w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2);
290cabdff1aSopenharmony_ci    w4 = (v4i32) __msa_splati_h(weights, 4);
291cabdff1aSopenharmony_ci    w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4);
292cabdff1aSopenharmony_ci    w6 = (v4i32) __msa_splati_h(weights, 6);
293cabdff1aSopenharmony_ci    w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6);
294cabdff1aSopenharmony_ci    MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
295cabdff1aSopenharmony_ci    ADD2(a0_r, const_val1, a0_l, const_val1, temp0_r, temp0_l);
296cabdff1aSopenharmony_ci    MUL2(w2, temp3_r, w2, temp3_l, temp1_r, temp1_l);
297cabdff1aSopenharmony_ci    MUL2(w6, temp3_r, w6, temp3_l, temp2_r, temp2_l);
298cabdff1aSopenharmony_ci    BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
299cabdff1aSopenharmony_ci                temp2_l, temp2_r, temp1_l, temp1_r,
300cabdff1aSopenharmony_ci                a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
301cabdff1aSopenharmony_ci    UNPCK_SH_SW(in4, temp0_r, temp0_l);
302cabdff1aSopenharmony_ci    UNPCK_SH_SW(in6, temp3_r, temp3_l);
303cabdff1aSopenharmony_ci    MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l);
304cabdff1aSopenharmony_ci    MUL2(w2, temp3_r, w2, temp3_l, temp2_r, temp2_l);
305cabdff1aSopenharmony_ci    MUL2(w6, temp3_r, w6, temp3_l, temp1_r, temp1_l);
306cabdff1aSopenharmony_ci    ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l);
307cabdff1aSopenharmony_ci    SUB2(a1_r, temp0_r, a1_l, temp0_l, a1_r, a1_l);
308cabdff1aSopenharmony_ci    SUB2(a2_r, temp0_r, a2_l, temp0_l, a2_r, a2_l);
309cabdff1aSopenharmony_ci    ADD2(a3_r, temp0_r, a3_l, temp0_l, a3_r, a3_l);
310cabdff1aSopenharmony_ci    ADD2(a0_r, temp1_r, a0_l, temp1_l, a0_r, a0_l);
311cabdff1aSopenharmony_ci    SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l);
312cabdff1aSopenharmony_ci    ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l);
313cabdff1aSopenharmony_ci    SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l);
    /* Odd part of the second pass; both DOTP calls run before the (in5, in7)
     * pairs are interleaved into temp0 and accumulated. */
314cabdff1aSopenharmony_ci    ILVRL_H2_SW(in1, in3, b3_r, b3_l);
315cabdff1aSopenharmony_ci    SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7);
316cabdff1aSopenharmony_ci    ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5,
317cabdff1aSopenharmony_ci               const0, const1, const2, const3);
318cabdff1aSopenharmony_ci    DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
319cabdff1aSopenharmony_ci                b0_r, b1_r, b2_r, b3_r);
320cabdff1aSopenharmony_ci    DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
321cabdff1aSopenharmony_ci                b0_l, b1_l, b2_l, b3_l);
322cabdff1aSopenharmony_ci    ILVRL_H2_SW(in5, in7, temp0_r, temp0_l);
323cabdff1aSopenharmony_ci    ILVR_H2_SH(w5, w7, w7, w3, const4, const6);
324cabdff1aSopenharmony_ci    const5 = __msa_ilvod_h(-w1, -w5);
325cabdff1aSopenharmony_ci    const7 = __msa_ilvod_h(w3, -w1);
326cabdff1aSopenharmony_ci    DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r,
327cabdff1aSopenharmony_ci                 const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
328cabdff1aSopenharmony_ci    DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l,
329cabdff1aSopenharmony_ci                 const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
330cabdff1aSopenharmony_ci    BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
331cabdff1aSopenharmony_ci                 b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
332cabdff1aSopenharmony_ci                 temp0_r, temp0_l, temp1_r, temp1_l,
333cabdff1aSopenharmony_ci                 temp2_r, temp2_l, temp3_r, temp3_l,
334cabdff1aSopenharmony_ci                 a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
    /* Second-pass scaling: arithmetic shift right by 20 on all 16 vectors. */
335cabdff1aSopenharmony_ci    SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 20);
336cabdff1aSopenharmony_ci    SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 20);
337cabdff1aSopenharmony_ci    SRA_4V(a3_r, a3_l, a2_r, a2_l, 20);
338cabdff1aSopenharmony_ci    SRA_4V(a1_r, a1_l, a0_r, a0_l, 20);
    /* Pack to halfword vectors: in0..in3 from the sum terms, in4..in7 from
     * the difference terms a0..a3 (so in4 = a0 ... in7 = a3). */
339cabdff1aSopenharmony_ci    PCKEV_H4_SH(temp0_l, temp0_r, temp1_l, temp1_r, temp2_l, temp2_r,
340cabdff1aSopenharmony_ci                temp3_l, temp3_r, in0, in1, in2, in3);
341cabdff1aSopenharmony_ci    PCKEV_H4_SH(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
342cabdff1aSopenharmony_ci                in4, in5, in6, in7);
    /* Rows 0..3: clip (to 0..255, going by the macro name), pack each vector
     * with itself down to bytes, and take 64-bit element 1 of each result.
     * Both PCKEV_B operands are the same vector, so both 64-bit halves
     * presumably hold the same 8 bytes. */
343cabdff1aSopenharmony_ci    CLIP_SH4_0_255(in0, in1, in2, in3);
344cabdff1aSopenharmony_ci    PCKEV_B4_SH(in0, in0, in1, in1, in2, in2, in3, in3,
345cabdff1aSopenharmony_ci                in0, in1, in2, in3);
346cabdff1aSopenharmony_ci    tmp0 = __msa_copy_u_d((v2i64) in0, 1);
347cabdff1aSopenharmony_ci    tmp1 = __msa_copy_u_d((v2i64) in1, 1);
348cabdff1aSopenharmony_ci    tmp2 = __msa_copy_u_d((v2i64) in2, 1);
349cabdff1aSopenharmony_ci    tmp3 = __msa_copy_u_d((v2i64) in3, 1);
350cabdff1aSopenharmony_ci    SD4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
    /* Rows 4..7: same clip/pack, but tmp3..tmp0 are filled from in4..in7 so
     * that SD4 receives in7, in6, in5, in4 (i.e. a3, a2, a1, a0) in that
     * argument order -- the same order simple_idct_msa uses for its store. */
351cabdff1aSopenharmony_ci    CLIP_SH4_0_255(in4, in5, in6, in7);
352cabdff1aSopenharmony_ci    PCKEV_B4_SH(in4, in4, in5, in5, in6, in6, in7, in7,
353cabdff1aSopenharmony_ci                in4, in5, in6, in7);
354cabdff1aSopenharmony_ci    tmp3 = __msa_copy_u_d((v2i64) in4, 1);
355cabdff1aSopenharmony_ci    tmp2 = __msa_copy_u_d((v2i64) in5, 1);
356cabdff1aSopenharmony_ci    tmp1 = __msa_copy_u_d((v2i64) in6, 1);
357cabdff1aSopenharmony_ci    tmp0 = __msa_copy_u_d((v2i64) in7, 1);
358cabdff1aSopenharmony_ci    SD4(tmp0, tmp1, tmp2, tmp3, dst + 4 * dst_stride, dst_stride);
359cabdff1aSopenharmony_ci}
360cabdff1aSopenharmony_ci
361cabdff1aSopenharmony_cistatic void simple_idct_add_msa(uint8_t *dst, int32_t dst_stride,
362cabdff1aSopenharmony_ci                                int16_t *block)
363cabdff1aSopenharmony_ci{
364cabdff1aSopenharmony_ci    int32_t const_val;
365cabdff1aSopenharmony_ci    uint64_t tmp0, tmp1, tmp2, tmp3;
366cabdff1aSopenharmony_ci    v8i16 weights = { 0, 22725, 21407, 19266, 16383, 12873, 8867, 4520 };
367cabdff1aSopenharmony_ci    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
368cabdff1aSopenharmony_ci    v8i16 w1, w3, w5, w7;
369cabdff1aSopenharmony_ci    v8i16 const0, const1, const2, const3, const4, const5, const6, const7;
370cabdff1aSopenharmony_ci    v4i32 temp0_r, temp1_r, temp2_r, temp3_r;
371cabdff1aSopenharmony_ci    v4i32 temp4_r, temp5_r, temp6_r, temp7_r, temp8_r;
372cabdff1aSopenharmony_ci    v4i32 temp0_l, temp1_l, temp2_l, temp3_l;
373cabdff1aSopenharmony_ci    v4i32 temp4_l, temp5_l, temp6_l, temp7_l, temp8_l;
374cabdff1aSopenharmony_ci    v4i32 a0_r, a1_r, a2_r, a3_r, a0_l, a1_l, a2_l, a3_l;
375cabdff1aSopenharmony_ci    v4i32 b0_r, b1_r, b2_r, b3_r, b0_l, b1_l, b2_l, b3_l;
376cabdff1aSopenharmony_ci    v4i32 w2, w4, w6;
377cabdff1aSopenharmony_ci    v8i16 select_vec, temp;
378cabdff1aSopenharmony_ci    v8i16 zero = { 0 };
379cabdff1aSopenharmony_ci    v4i32 const_val0 = __msa_ldi_w(1);
380cabdff1aSopenharmony_ci    v4i32 const_val1 = __msa_ldi_w(1);
381cabdff1aSopenharmony_ci
382cabdff1aSopenharmony_ci    const_val0 <<= 10;
383cabdff1aSopenharmony_ci    const_val = 16383 * ((1 << 19) / 16383);
384cabdff1aSopenharmony_ci    const_val1 = __msa_insert_w(const_val0, 0, const_val);
385cabdff1aSopenharmony_ci    const_val1 = __msa_splati_w(const_val1, 0);
386cabdff1aSopenharmony_ci    LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7);
387cabdff1aSopenharmony_ci    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
388cabdff1aSopenharmony_ci                       in0, in1, in2, in3, in4, in5, in6, in7);
389cabdff1aSopenharmony_ci
390cabdff1aSopenharmony_ci    select_vec = in1 | in2 | in3 | in4 | in5 | in6 | in7;
391cabdff1aSopenharmony_ci    select_vec = __msa_clti_u_h((v8u16) select_vec, 1);
392cabdff1aSopenharmony_ci    UNPCK_SH_SW(in0, a0_r, a0_l);
393cabdff1aSopenharmony_ci    UNPCK_SH_SW(in2, temp3_r, temp3_l);
394cabdff1aSopenharmony_ci    ILVRL_H2_SW(in1, in3, b3_r, b3_l);
395cabdff1aSopenharmony_ci    UNPCK_SH_SW(in4, temp4_r, temp4_l);
396cabdff1aSopenharmony_ci    UNPCK_SH_SW(in6, temp7_r, temp7_l);
397cabdff1aSopenharmony_ci    ILVRL_H2_SW(in5, in7, temp8_r, temp8_l);
398cabdff1aSopenharmony_ci    temp = in0 << 3;
399cabdff1aSopenharmony_ci    SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7);
400cabdff1aSopenharmony_ci    ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5,
401cabdff1aSopenharmony_ci               const0, const1, const2, const3);
402cabdff1aSopenharmony_ci    ILVR_H2_SH(w5, w7, w7, w3, const4, const6);
403cabdff1aSopenharmony_ci    const5 = __msa_ilvod_h(-w1, -w5);
404cabdff1aSopenharmony_ci    const7 = __msa_ilvod_h(w3, -w1);
405cabdff1aSopenharmony_ci    DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
406cabdff1aSopenharmony_ci                b0_r, b1_r, b2_r, b3_r);
407cabdff1aSopenharmony_ci    DPADD_SH4_SW(temp8_r, temp8_r, temp8_r, temp8_r,
408cabdff1aSopenharmony_ci                 const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
409cabdff1aSopenharmony_ci    DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
410cabdff1aSopenharmony_ci                b0_l, b1_l, b2_l, b3_l);
411cabdff1aSopenharmony_ci    DPADD_SH4_SW(temp8_l, temp8_l, temp8_l, temp8_l,
412cabdff1aSopenharmony_ci                 const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
413cabdff1aSopenharmony_ci    w2 = (v4i32) __msa_splati_h(weights, 2);
414cabdff1aSopenharmony_ci    w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2);
415cabdff1aSopenharmony_ci    w4 = (v4i32) __msa_splati_h(weights, 4);
416cabdff1aSopenharmony_ci    w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4);
417cabdff1aSopenharmony_ci    w6 = (v4i32) __msa_splati_h(weights, 6);
418cabdff1aSopenharmony_ci    w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6);
419cabdff1aSopenharmony_ci    MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
420cabdff1aSopenharmony_ci    ADD2(a0_r, const_val0, a0_l, const_val0, temp0_r, temp0_l);
421cabdff1aSopenharmony_ci    MUL2(w2, temp3_r, w2, temp3_l, temp1_r, temp1_l);
422cabdff1aSopenharmony_ci    MUL2(w6, temp3_r, w6, temp3_l, temp2_r, temp2_l);
423cabdff1aSopenharmony_ci    BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
424cabdff1aSopenharmony_ci                temp2_l, temp2_r, temp1_l, temp1_r,
425cabdff1aSopenharmony_ci                a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
426cabdff1aSopenharmony_ci    MUL2(temp4_r, w4, temp4_l, w4, temp4_r, temp4_l);
427cabdff1aSopenharmony_ci    MUL2(temp7_r, w2, temp7_l, w2, temp6_r, temp6_l);
428cabdff1aSopenharmony_ci    MUL2(temp7_r, w6, temp7_l, w6, temp5_r, temp5_l);
429cabdff1aSopenharmony_ci    ADD2(a0_r, temp4_r, a0_l, temp4_l, a0_r, a0_l);
430cabdff1aSopenharmony_ci    SUB2(a1_r, temp4_r, a1_l, temp4_l, a1_r, a1_l);
431cabdff1aSopenharmony_ci    SUB2(a2_r, temp4_r, a2_l, temp4_l, a2_r, a2_l);
432cabdff1aSopenharmony_ci    ADD2(a3_r, temp4_r, a3_l, temp4_l, a3_r, a3_l);
433cabdff1aSopenharmony_ci    ADD2(a0_r, temp5_r, a0_l, temp5_l, a0_r, a0_l);
434cabdff1aSopenharmony_ci    SUB2(a1_r, temp6_r, a1_l, temp6_l, a1_r, a1_l);
435cabdff1aSopenharmony_ci    ADD2(a2_r, temp6_r, a2_l, temp6_l, a2_r, a2_l);
436cabdff1aSopenharmony_ci    SUB2(a3_r, temp5_r, a3_l, temp5_l, a3_r, a3_l);
437cabdff1aSopenharmony_ci    BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
438cabdff1aSopenharmony_ci                 b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
439cabdff1aSopenharmony_ci                 temp0_r, temp0_l, temp1_r, temp1_l,
440cabdff1aSopenharmony_ci                 temp2_r, temp2_l, temp3_r, temp3_l,
441cabdff1aSopenharmony_ci                 a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
442cabdff1aSopenharmony_ci    SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 11);
443cabdff1aSopenharmony_ci    SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 11);
444cabdff1aSopenharmony_ci    PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r,
445cabdff1aSopenharmony_ci                temp2_l, temp2_r, temp3_l, temp3_r,
446cabdff1aSopenharmony_ci                temp0_r, temp1_r, temp2_r, temp3_r);
447cabdff1aSopenharmony_ci    in0 = (v8i16) __msa_bmnz_v((v16u8) temp0_r, (v16u8) temp,
448cabdff1aSopenharmony_ci                               (v16u8) select_vec);
449cabdff1aSopenharmony_ci    in1 = (v8i16) __msa_bmnz_v((v16u8) temp1_r, (v16u8) temp,
450cabdff1aSopenharmony_ci                               (v16u8) select_vec);
451cabdff1aSopenharmony_ci    in2 = (v8i16) __msa_bmnz_v((v16u8) temp2_r, (v16u8) temp,
452cabdff1aSopenharmony_ci                               (v16u8) select_vec);
453cabdff1aSopenharmony_ci    in3 = (v8i16) __msa_bmnz_v((v16u8) temp3_r, (v16u8) temp,
454cabdff1aSopenharmony_ci                               (v16u8) select_vec);
455cabdff1aSopenharmony_ci    SRA_4V(a3_r, a3_l, a2_r, a2_l, 11);
456cabdff1aSopenharmony_ci    SRA_4V(a1_r, a1_l, a0_r, a0_l, 11);
457cabdff1aSopenharmony_ci    PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
458cabdff1aSopenharmony_ci                a0_r, a1_r, a2_r, a3_r);
459cabdff1aSopenharmony_ci    in4 = (v8i16) __msa_bmnz_v((v16u8) a3_r, (v16u8) temp, (v16u8) select_vec);
460cabdff1aSopenharmony_ci    in5 = (v8i16) __msa_bmnz_v((v16u8) a2_r, (v16u8) temp, (v16u8) select_vec);
461cabdff1aSopenharmony_ci    in6 = (v8i16) __msa_bmnz_v((v16u8) a1_r, (v16u8) temp, (v16u8) select_vec);
462cabdff1aSopenharmony_ci    in7 = (v8i16) __msa_bmnz_v((v16u8) a0_r, (v16u8) temp, (v16u8) select_vec);
463cabdff1aSopenharmony_ci    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
464cabdff1aSopenharmony_ci                       in0, in1, in2, in3, in4, in5, in6, in7);
465cabdff1aSopenharmony_ci
466cabdff1aSopenharmony_ci    UNPCK_SH_SW(in0, a0_r, a0_l);
467cabdff1aSopenharmony_ci    UNPCK_SH_SW(in2, temp3_r, temp3_l);
468cabdff1aSopenharmony_ci    MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
469cabdff1aSopenharmony_ci    ADD2(a0_r, const_val1, a0_l, const_val1, temp0_r, temp0_l);
470cabdff1aSopenharmony_ci    MUL2(w2, temp3_r, w2, temp3_l, temp1_r, temp1_l);
471cabdff1aSopenharmony_ci    MUL2(w6, temp3_r, w6, temp3_l, temp2_r, temp2_l);
472cabdff1aSopenharmony_ci    BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
473cabdff1aSopenharmony_ci                temp2_l, temp2_r, temp1_l, temp1_r,
474cabdff1aSopenharmony_ci                a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
475cabdff1aSopenharmony_ci    UNPCK_SH_SW(in4, temp0_r, temp0_l);
476cabdff1aSopenharmony_ci    UNPCK_SH_SW(in6, temp3_r, temp3_l);
477cabdff1aSopenharmony_ci    MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l);
478cabdff1aSopenharmony_ci    MUL2(w2, temp3_r, w2, temp3_l, temp2_r, temp2_l);
479cabdff1aSopenharmony_ci    MUL2(w6, temp3_r, w6, temp3_l, temp1_r, temp1_l);
480cabdff1aSopenharmony_ci    ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l);
481cabdff1aSopenharmony_ci    SUB2(a1_r, temp0_r, a1_l, temp0_l, a1_r, a1_l);
482cabdff1aSopenharmony_ci    SUB2(a2_r, temp0_r, a2_l, temp0_l, a2_r, a2_l);
483cabdff1aSopenharmony_ci    ADD2(a3_r, temp0_r, a3_l, temp0_l, a3_r, a3_l);
484cabdff1aSopenharmony_ci    ADD2(a0_r, temp1_r, a0_l, temp1_l, a0_r, a0_l);
485cabdff1aSopenharmony_ci    SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l);
486cabdff1aSopenharmony_ci    ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l);
487cabdff1aSopenharmony_ci    SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l);
488cabdff1aSopenharmony_ci    ILVRL_H2_SW(in1, in3, b3_r, b3_l);
489cabdff1aSopenharmony_ci    ILVRL_H2_SW(in5, in7, temp0_r, temp0_l);
490cabdff1aSopenharmony_ci    DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
491cabdff1aSopenharmony_ci                b0_r, b1_r, b2_r, b3_r);
492cabdff1aSopenharmony_ci    DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
493cabdff1aSopenharmony_ci                b0_l, b1_l, b2_l, b3_l);
494cabdff1aSopenharmony_ci    DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r,
495cabdff1aSopenharmony_ci                 const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
496cabdff1aSopenharmony_ci    DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l,
497cabdff1aSopenharmony_ci                 const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
498cabdff1aSopenharmony_ci    BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
499cabdff1aSopenharmony_ci                 b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
500cabdff1aSopenharmony_ci                 temp0_r, temp0_l, temp1_r, temp1_l,
501cabdff1aSopenharmony_ci                 temp2_r, temp2_l, temp3_r, temp3_l,
502cabdff1aSopenharmony_ci                 a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
503cabdff1aSopenharmony_ci    SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 20);
504cabdff1aSopenharmony_ci    SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 20);
505cabdff1aSopenharmony_ci    LD_SH4(dst, dst_stride, in0, in1, in2, in3);
506cabdff1aSopenharmony_ci    PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r, temp2_l, temp2_r,
507cabdff1aSopenharmony_ci                temp3_l, temp3_r, temp0_r, temp1_r, temp2_r, temp3_r);
508cabdff1aSopenharmony_ci    ILVR_B4_SW(zero, in0, zero, in1, zero, in2, zero, in3,
509cabdff1aSopenharmony_ci               temp0_l, temp1_l, temp2_l, temp3_l);
510cabdff1aSopenharmony_ci    in0 = (v8i16) (temp0_r) + (v8i16) (temp0_l);
511cabdff1aSopenharmony_ci    in1 = (v8i16) (temp1_r) + (v8i16) (temp1_l);
512cabdff1aSopenharmony_ci    in2 = (v8i16) (temp2_r) + (v8i16) (temp2_l);
513cabdff1aSopenharmony_ci    in3 = (v8i16) (temp3_r) + (v8i16) (temp3_l);
514cabdff1aSopenharmony_ci    CLIP_SH4_0_255(in0, in1, in2, in3);
515cabdff1aSopenharmony_ci    PCKEV_B4_SH(in0, in0, in1, in1, in2, in2, in3, in3,
516cabdff1aSopenharmony_ci                in0, in1, in2, in3);
517cabdff1aSopenharmony_ci    tmp0 = __msa_copy_u_d((v2i64) in0, 1);
518cabdff1aSopenharmony_ci    tmp1 = __msa_copy_u_d((v2i64) in1, 1);
519cabdff1aSopenharmony_ci    tmp2 = __msa_copy_u_d((v2i64) in2, 1);
520cabdff1aSopenharmony_ci    tmp3 = __msa_copy_u_d((v2i64) in3, 1);
521cabdff1aSopenharmony_ci    SD4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
522cabdff1aSopenharmony_ci
523cabdff1aSopenharmony_ci    SRA_4V(a3_r, a3_l, a2_r, a2_l, 20);
524cabdff1aSopenharmony_ci    SRA_4V(a1_r, a1_l, a0_r, a0_l, 20);
525cabdff1aSopenharmony_ci    LD_SH4(dst + 4 * dst_stride, dst_stride, in4, in5, in6, in7);
526cabdff1aSopenharmony_ci    PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
527cabdff1aSopenharmony_ci                a0_r, a1_r, a2_r, a3_r);
528cabdff1aSopenharmony_ci    ILVR_B4_SW(zero, in4, zero, in5, zero, in6, zero, in7,
529cabdff1aSopenharmony_ci               a3_l, a2_l, a1_l, a0_l);
530cabdff1aSopenharmony_ci    in4 = (v8i16) (a3_r) + (v8i16) (a3_l);
531cabdff1aSopenharmony_ci    in5 = (v8i16) (a2_r) + (v8i16) (a2_l);
532cabdff1aSopenharmony_ci    in6 = (v8i16) (a1_r) + (v8i16) (a1_l);
533cabdff1aSopenharmony_ci    in7 = (v8i16) (a0_r) + (v8i16) (a0_l);
534cabdff1aSopenharmony_ci    CLIP_SH4_0_255(in4, in5, in6, in7);
535cabdff1aSopenharmony_ci    PCKEV_B4_SH(in4, in4, in5, in5, in6, in6, in7, in7,
536cabdff1aSopenharmony_ci                in4, in5, in6, in7);
537cabdff1aSopenharmony_ci    tmp0 = __msa_copy_u_d((v2i64) in4, 1);
538cabdff1aSopenharmony_ci    tmp1 = __msa_copy_u_d((v2i64) in5, 1);
539cabdff1aSopenharmony_ci    tmp2 = __msa_copy_u_d((v2i64) in6, 1);
540cabdff1aSopenharmony_ci    tmp3 = __msa_copy_u_d((v2i64) in7, 1);
541cabdff1aSopenharmony_ci    SD4(tmp0, tmp1, tmp2, tmp3, dst + 4 * dst_stride, dst_stride);
542cabdff1aSopenharmony_ci}
543cabdff1aSopenharmony_ci
/*
 * Exported entry point for the MSA simple IDCT.
 *
 * Forwards the call unchanged to the file-local static simple_idct_msa().
 *
 * block: pointer to the int16_t data handed to the helper. The helper takes a
 *        non-const pointer, so the data is presumably transformed in place
 *        (NOTE(review): confirm against simple_idct_msa()).
 */
544cabdff1aSopenharmony_civoid ff_simple_idct_msa(int16_t *block)
545cabdff1aSopenharmony_ci{
546cabdff1aSopenharmony_ci    simple_idct_msa(block);
547cabdff1aSopenharmony_ci}
548cabdff1aSopenharmony_ci
/*
 * Exported entry point for the "put" variant of the MSA simple IDCT.
 *
 * Forwards all three arguments unchanged to simple_idct_put_msa().
 * NOTE(review): that helper is defined outside the portion of the file
 * reviewed here; going by its name it presumably overwrites dst with the
 * clipped transform result rather than adding to it -- confirm.
 *
 * dst:        destination pixel buffer passed through to the helper.
 * dst_stride: distance between consecutive rows of dst, passed through as-is
 *             (presumably in bytes -- TODO confirm).
 * block:      pointer to the int16_t coefficient data passed to the helper.
 */
549cabdff1aSopenharmony_civoid ff_simple_idct_put_msa(uint8_t *dst, ptrdiff_t dst_stride, int16_t *block)
550cabdff1aSopenharmony_ci{
551cabdff1aSopenharmony_ci    simple_idct_put_msa(dst, dst_stride, block);
552cabdff1aSopenharmony_ci}
553cabdff1aSopenharmony_ci
/*
 * Exported entry point for the "add" variant of the MSA simple IDCT.
 *
 * Forwards all three arguments unchanged to simple_idct_add_msa().
 * NOTE(review): the function body that ends immediately before these
 * wrappers loads rows from dst, adds the transformed values to them, clips
 * the sums to 0..255 and stores them back to dst (four rows at dst and four
 * at dst + 4 * dst_stride); that is presumably this helper -- confirm.
 *
 * dst:        pixel buffer that the helper both reads and writes.
 * dst_stride: distance between consecutive rows of dst, passed through as-is.
 * block:      pointer to the int16_t coefficient data passed to the helper.
 */
554cabdff1aSopenharmony_civoid ff_simple_idct_add_msa(uint8_t *dst, ptrdiff_t dst_stride, int16_t *block)
555cabdff1aSopenharmony_ci{
556cabdff1aSopenharmony_ci    simple_idct_add_msa(dst, dst_stride, block);
557cabdff1aSopenharmony_ci}
558