/*
 * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavcodec/vp9dsp.h"
#include "libavutil/mips/generic_macros_msa.h"
#include "vp9dsp_mips.h"

/*
 * In-place unsigned saturating halfword subtraction on two vectors:
 *     out0 = subs_u(out0, in0);  out1 = subs_u(out1, in1);
 * Lanes that would go negative are clamped to zero by __msa_subs_u_h.
 *
 * The body is wrapped in do { } while (0) so that an invocation followed by
 * ';' is exactly one statement and stays safe inside unbraced if/else.
 */
#define IPRED_SUBS_UH2_UH(in0, in1, out0, out1)  \
do {                                             \
    (out0) = __msa_subs_u_h((out0), (in0));      \
    (out1) = __msa_subs_u_h((out1), (in1));      \
} while (0)

31cabdff1aSopenharmony_civoid ff_vert_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *left,
32cabdff1aSopenharmony_ci                       const uint8_t *src)
33cabdff1aSopenharmony_ci{
34cabdff1aSopenharmony_ci    uint32_t row;
35cabdff1aSopenharmony_ci    v16u8 src0;
36cabdff1aSopenharmony_ci
37cabdff1aSopenharmony_ci    src0 = LD_UB(src);
38cabdff1aSopenharmony_ci
39cabdff1aSopenharmony_ci    for (row = 16; row--;) {
40cabdff1aSopenharmony_ci        ST_UB(src0, dst);
41cabdff1aSopenharmony_ci        dst += dst_stride;
42cabdff1aSopenharmony_ci    }
43cabdff1aSopenharmony_ci}
44cabdff1aSopenharmony_ci
45cabdff1aSopenharmony_civoid ff_vert_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *left,
46cabdff1aSopenharmony_ci                       const uint8_t *src)
47cabdff1aSopenharmony_ci{
48cabdff1aSopenharmony_ci    uint32_t row;
49cabdff1aSopenharmony_ci    v16u8 src1, src2;
50cabdff1aSopenharmony_ci
51cabdff1aSopenharmony_ci    src1 = LD_UB(src);
52cabdff1aSopenharmony_ci    src2 = LD_UB(src + 16);
53cabdff1aSopenharmony_ci
54cabdff1aSopenharmony_ci    for (row = 32; row--;) {
55cabdff1aSopenharmony_ci        ST_UB2(src1, src2, dst, 16);
56cabdff1aSopenharmony_ci        dst += dst_stride;
57cabdff1aSopenharmony_ci    }
58cabdff1aSopenharmony_ci}
59cabdff1aSopenharmony_ci
60cabdff1aSopenharmony_civoid ff_hor_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src,
61cabdff1aSopenharmony_ci                      const uint8_t *top)
62cabdff1aSopenharmony_ci{
63cabdff1aSopenharmony_ci    uint32_t row, inp;
64cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3;
65cabdff1aSopenharmony_ci
66cabdff1aSopenharmony_ci    src += 12;
67cabdff1aSopenharmony_ci    for (row = 4; row--;) {
68cabdff1aSopenharmony_ci        inp = LW(src);
69cabdff1aSopenharmony_ci        src -= 4;
70cabdff1aSopenharmony_ci
71cabdff1aSopenharmony_ci        src0 = (v16u8) __msa_fill_b(inp >> 24);
72cabdff1aSopenharmony_ci        src1 = (v16u8) __msa_fill_b(inp >> 16);
73cabdff1aSopenharmony_ci        src2 = (v16u8) __msa_fill_b(inp >> 8);
74cabdff1aSopenharmony_ci        src3 = (v16u8) __msa_fill_b(inp);
75cabdff1aSopenharmony_ci
76cabdff1aSopenharmony_ci        ST_UB4(src0, src1, src2, src3, dst, dst_stride);
77cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
78cabdff1aSopenharmony_ci    }
79cabdff1aSopenharmony_ci}
80cabdff1aSopenharmony_ci
81cabdff1aSopenharmony_civoid ff_hor_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src,
82cabdff1aSopenharmony_ci                      const uint8_t *top)
83cabdff1aSopenharmony_ci{
84cabdff1aSopenharmony_ci    uint32_t row, inp;
85cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3;
86cabdff1aSopenharmony_ci
87cabdff1aSopenharmony_ci    src += 28;
88cabdff1aSopenharmony_ci    for (row = 8; row--;) {
89cabdff1aSopenharmony_ci        inp = LW(src);
90cabdff1aSopenharmony_ci        src -= 4;
91cabdff1aSopenharmony_ci
92cabdff1aSopenharmony_ci        src0 = (v16u8) __msa_fill_b(inp >> 24);
93cabdff1aSopenharmony_ci        src1 = (v16u8) __msa_fill_b(inp >> 16);
94cabdff1aSopenharmony_ci        src2 = (v16u8) __msa_fill_b(inp >> 8);
95cabdff1aSopenharmony_ci        src3 = (v16u8) __msa_fill_b(inp);
96cabdff1aSopenharmony_ci
97cabdff1aSopenharmony_ci        ST_UB2(src0, src0, dst, 16);
98cabdff1aSopenharmony_ci        dst += dst_stride;
99cabdff1aSopenharmony_ci        ST_UB2(src1, src1, dst, 16);
100cabdff1aSopenharmony_ci        dst += dst_stride;
101cabdff1aSopenharmony_ci        ST_UB2(src2, src2, dst, 16);
102cabdff1aSopenharmony_ci        dst += dst_stride;
103cabdff1aSopenharmony_ci        ST_UB2(src3, src3, dst, 16);
104cabdff1aSopenharmony_ci        dst += dst_stride;
105cabdff1aSopenharmony_ci    }
106cabdff1aSopenharmony_ci}
107cabdff1aSopenharmony_ci
108cabdff1aSopenharmony_civoid ff_dc_4x4_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src_left,
109cabdff1aSopenharmony_ci                   const uint8_t *src_top)
110cabdff1aSopenharmony_ci{
111cabdff1aSopenharmony_ci    uint32_t val0, val1;
112cabdff1aSopenharmony_ci    v16i8 store, src = { 0 };
113cabdff1aSopenharmony_ci    v8u16 sum_h;
114cabdff1aSopenharmony_ci    v4u32 sum_w;
115cabdff1aSopenharmony_ci    v2u64 sum_d;
116cabdff1aSopenharmony_ci
117cabdff1aSopenharmony_ci    val0 = LW(src_top);
118cabdff1aSopenharmony_ci    val1 = LW(src_left);
119cabdff1aSopenharmony_ci    INSERT_W2_SB(val0, val1, src);
120cabdff1aSopenharmony_ci    sum_h = __msa_hadd_u_h((v16u8) src, (v16u8) src);
121cabdff1aSopenharmony_ci    sum_w = __msa_hadd_u_w(sum_h, sum_h);
122cabdff1aSopenharmony_ci    sum_d = __msa_hadd_u_d(sum_w, sum_w);
123cabdff1aSopenharmony_ci    sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 3);
124cabdff1aSopenharmony_ci    store = __msa_splati_b((v16i8) sum_w, 0);
125cabdff1aSopenharmony_ci    val0 = __msa_copy_u_w((v4i32) store, 0);
126cabdff1aSopenharmony_ci
127cabdff1aSopenharmony_ci    SW4(val0, val0, val0, val0, dst, dst_stride);
128cabdff1aSopenharmony_ci}
129cabdff1aSopenharmony_ci
130cabdff1aSopenharmony_ci#define INTRA_DC_TL_4x4(dir)                                    \
131cabdff1aSopenharmony_civoid ff_dc_##dir##_4x4_msa(uint8_t *dst, ptrdiff_t dst_stride,  \
132cabdff1aSopenharmony_ci                           const uint8_t *left,                 \
133cabdff1aSopenharmony_ci                           const uint8_t *top)                  \
134cabdff1aSopenharmony_ci{                                                               \
135cabdff1aSopenharmony_ci    uint32_t val0;                                              \
136cabdff1aSopenharmony_ci    v16i8 store, data = { 0 };                                  \
137cabdff1aSopenharmony_ci    v8u16 sum_h;                                                \
138cabdff1aSopenharmony_ci    v4u32 sum_w;                                                \
139cabdff1aSopenharmony_ci                                                                \
140cabdff1aSopenharmony_ci    val0 = LW(dir);                                             \
141cabdff1aSopenharmony_ci    data = (v16i8) __msa_insert_w((v4i32) data, 0, val0);       \
142cabdff1aSopenharmony_ci    sum_h = __msa_hadd_u_h((v16u8) data, (v16u8) data);         \
143cabdff1aSopenharmony_ci    sum_w = __msa_hadd_u_w(sum_h, sum_h);                       \
144cabdff1aSopenharmony_ci    sum_w = (v4u32) __msa_srari_w((v4i32) sum_w, 2);            \
145cabdff1aSopenharmony_ci    store = __msa_splati_b((v16i8) sum_w, 0);                   \
146cabdff1aSopenharmony_ci    val0 = __msa_copy_u_w((v4i32) store, 0);                    \
147cabdff1aSopenharmony_ci                                                                \
148cabdff1aSopenharmony_ci    SW4(val0, val0, val0, val0, dst, dst_stride);               \
149cabdff1aSopenharmony_ci}
150cabdff1aSopenharmony_ciINTRA_DC_TL_4x4(top);
151cabdff1aSopenharmony_ciINTRA_DC_TL_4x4(left);
152cabdff1aSopenharmony_ci
153cabdff1aSopenharmony_civoid ff_dc_8x8_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src_left,
154cabdff1aSopenharmony_ci                   const uint8_t *src_top)
155cabdff1aSopenharmony_ci{
156cabdff1aSopenharmony_ci    uint64_t val0, val1;
157cabdff1aSopenharmony_ci    v16i8 store;
158cabdff1aSopenharmony_ci    v16u8 src = { 0 };
159cabdff1aSopenharmony_ci    v8u16 sum_h;
160cabdff1aSopenharmony_ci    v4u32 sum_w;
161cabdff1aSopenharmony_ci    v2u64 sum_d;
162cabdff1aSopenharmony_ci
163cabdff1aSopenharmony_ci    val0 = LD(src_top);
164cabdff1aSopenharmony_ci    val1 = LD(src_left);
165cabdff1aSopenharmony_ci    INSERT_D2_UB(val0, val1, src);
166cabdff1aSopenharmony_ci    sum_h = __msa_hadd_u_h(src, src);
167cabdff1aSopenharmony_ci    sum_w = __msa_hadd_u_w(sum_h, sum_h);
168cabdff1aSopenharmony_ci    sum_d = __msa_hadd_u_d(sum_w, sum_w);
169cabdff1aSopenharmony_ci    sum_w = (v4u32) __msa_pckev_w((v4i32) sum_d, (v4i32) sum_d);
170cabdff1aSopenharmony_ci    sum_d = __msa_hadd_u_d(sum_w, sum_w);
171cabdff1aSopenharmony_ci    sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 4);
172cabdff1aSopenharmony_ci    store = __msa_splati_b((v16i8) sum_w, 0);
173cabdff1aSopenharmony_ci    val0 = __msa_copy_u_d((v2i64) store, 0);
174cabdff1aSopenharmony_ci
175cabdff1aSopenharmony_ci    SD4(val0, val0, val0, val0, dst, dst_stride);
176cabdff1aSopenharmony_ci    dst += (4 * dst_stride);
177cabdff1aSopenharmony_ci    SD4(val0, val0, val0, val0, dst, dst_stride);
178cabdff1aSopenharmony_ci}
179cabdff1aSopenharmony_ci
180cabdff1aSopenharmony_ci#define INTRA_DC_TL_8x8(dir)                                    \
181cabdff1aSopenharmony_civoid ff_dc_##dir##_8x8_msa(uint8_t *dst, ptrdiff_t dst_stride,  \
182cabdff1aSopenharmony_ci                           const uint8_t *left,                 \
183cabdff1aSopenharmony_ci                           const uint8_t *top)                  \
184cabdff1aSopenharmony_ci{                                                               \
185cabdff1aSopenharmony_ci    uint64_t val0;                                              \
186cabdff1aSopenharmony_ci    v16i8 store;                                                \
187cabdff1aSopenharmony_ci    v16u8 data = { 0 };                                         \
188cabdff1aSopenharmony_ci    v8u16 sum_h;                                                \
189cabdff1aSopenharmony_ci    v4u32 sum_w;                                                \
190cabdff1aSopenharmony_ci    v2u64 sum_d;                                                \
191cabdff1aSopenharmony_ci                                                                \
192cabdff1aSopenharmony_ci    val0 = LD(dir);                                             \
193cabdff1aSopenharmony_ci    data = (v16u8) __msa_insert_d((v2i64) data, 0, val0);       \
194cabdff1aSopenharmony_ci    sum_h = __msa_hadd_u_h(data, data);                         \
195cabdff1aSopenharmony_ci    sum_w = __msa_hadd_u_w(sum_h, sum_h);                       \
196cabdff1aSopenharmony_ci    sum_d = __msa_hadd_u_d(sum_w, sum_w);                       \
197cabdff1aSopenharmony_ci    sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 3);            \
198cabdff1aSopenharmony_ci    store = __msa_splati_b((v16i8) sum_w, 0);                   \
199cabdff1aSopenharmony_ci    val0 = __msa_copy_u_d((v2i64) store, 0);                    \
200cabdff1aSopenharmony_ci                                                                \
201cabdff1aSopenharmony_ci    SD4(val0, val0, val0, val0, dst, dst_stride);               \
202cabdff1aSopenharmony_ci    dst += (4 * dst_stride);                                    \
203cabdff1aSopenharmony_ci    SD4(val0, val0, val0, val0, dst, dst_stride);               \
204cabdff1aSopenharmony_ci}
205cabdff1aSopenharmony_ci
206cabdff1aSopenharmony_ciINTRA_DC_TL_8x8(top);
207cabdff1aSopenharmony_ciINTRA_DC_TL_8x8(left);
208cabdff1aSopenharmony_ci
209cabdff1aSopenharmony_civoid ff_dc_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride,
210cabdff1aSopenharmony_ci                     const uint8_t *src_left, const uint8_t *src_top)
211cabdff1aSopenharmony_ci{
212cabdff1aSopenharmony_ci    v16u8 top, left, out;
213cabdff1aSopenharmony_ci    v8u16 sum_h, sum_top, sum_left;
214cabdff1aSopenharmony_ci    v4u32 sum_w;
215cabdff1aSopenharmony_ci    v2u64 sum_d;
216cabdff1aSopenharmony_ci
217cabdff1aSopenharmony_ci    top = LD_UB(src_top);
218cabdff1aSopenharmony_ci    left = LD_UB(src_left);
219cabdff1aSopenharmony_ci    HADD_UB2_UH(top, left, sum_top, sum_left);
220cabdff1aSopenharmony_ci    sum_h = sum_top + sum_left;
221cabdff1aSopenharmony_ci    sum_w = __msa_hadd_u_w(sum_h, sum_h);
222cabdff1aSopenharmony_ci    sum_d = __msa_hadd_u_d(sum_w, sum_w);
223cabdff1aSopenharmony_ci    sum_w = (v4u32) __msa_pckev_w((v4i32) sum_d, (v4i32) sum_d);
224cabdff1aSopenharmony_ci    sum_d = __msa_hadd_u_d(sum_w, sum_w);
225cabdff1aSopenharmony_ci    sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 5);
226cabdff1aSopenharmony_ci    out = (v16u8) __msa_splati_b((v16i8) sum_w, 0);
227cabdff1aSopenharmony_ci
228cabdff1aSopenharmony_ci    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
229cabdff1aSopenharmony_ci    dst += (8 * dst_stride);
230cabdff1aSopenharmony_ci    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
231cabdff1aSopenharmony_ci}
232cabdff1aSopenharmony_ci
233cabdff1aSopenharmony_ci#define INTRA_DC_TL_16x16(dir)                                        \
234cabdff1aSopenharmony_civoid ff_dc_##dir##_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride,      \
235cabdff1aSopenharmony_ci                             const uint8_t *left,                     \
236cabdff1aSopenharmony_ci                             const uint8_t *top)                      \
237cabdff1aSopenharmony_ci{                                                                     \
238cabdff1aSopenharmony_ci    v16u8 data, out;                                                  \
239cabdff1aSopenharmony_ci    v8u16 sum_h;                                                      \
240cabdff1aSopenharmony_ci    v4u32 sum_w;                                                      \
241cabdff1aSopenharmony_ci    v2u64 sum_d;                                                      \
242cabdff1aSopenharmony_ci                                                                      \
243cabdff1aSopenharmony_ci    data = LD_UB(dir);                                                \
244cabdff1aSopenharmony_ci    sum_h = __msa_hadd_u_h(data, data);                               \
245cabdff1aSopenharmony_ci    sum_w = __msa_hadd_u_w(sum_h, sum_h);                             \
246cabdff1aSopenharmony_ci    sum_d = __msa_hadd_u_d(sum_w, sum_w);                             \
247cabdff1aSopenharmony_ci    sum_w = (v4u32) __msa_pckev_w((v4i32) sum_d, (v4i32) sum_d);      \
248cabdff1aSopenharmony_ci    sum_d = __msa_hadd_u_d(sum_w, sum_w);                             \
249cabdff1aSopenharmony_ci    sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 4);                  \
250cabdff1aSopenharmony_ci    out = (v16u8) __msa_splati_b((v16i8) sum_w, 0);                   \
251cabdff1aSopenharmony_ci                                                                      \
252cabdff1aSopenharmony_ci    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);  \
253cabdff1aSopenharmony_ci    dst += (8 * dst_stride);                                          \
254cabdff1aSopenharmony_ci    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);  \
255cabdff1aSopenharmony_ci}
256cabdff1aSopenharmony_ciINTRA_DC_TL_16x16(top);
257cabdff1aSopenharmony_ciINTRA_DC_TL_16x16(left);
258cabdff1aSopenharmony_ci
259cabdff1aSopenharmony_civoid ff_dc_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride,
260cabdff1aSopenharmony_ci                     const uint8_t *src_left, const uint8_t *src_top)
261cabdff1aSopenharmony_ci{
262cabdff1aSopenharmony_ci    uint32_t row;
263cabdff1aSopenharmony_ci    v16u8 top0, top1, left0, left1, out;
264cabdff1aSopenharmony_ci    v8u16 sum_h, sum_top0, sum_top1, sum_left0, sum_left1;
265cabdff1aSopenharmony_ci    v4u32 sum_w;
266cabdff1aSopenharmony_ci    v2u64 sum_d;
267cabdff1aSopenharmony_ci
268cabdff1aSopenharmony_ci    LD_UB2(src_top, 16, top0, top1);
269cabdff1aSopenharmony_ci    LD_UB2(src_left, 16, left0, left1);
270cabdff1aSopenharmony_ci    HADD_UB2_UH(top0, top1, sum_top0, sum_top1);
271cabdff1aSopenharmony_ci    HADD_UB2_UH(left0, left1, sum_left0, sum_left1);
272cabdff1aSopenharmony_ci    sum_h = sum_top0 + sum_top1;
273cabdff1aSopenharmony_ci    sum_h += sum_left0 + sum_left1;
274cabdff1aSopenharmony_ci    sum_w = __msa_hadd_u_w(sum_h, sum_h);
275cabdff1aSopenharmony_ci    sum_d = __msa_hadd_u_d(sum_w, sum_w);
276cabdff1aSopenharmony_ci    sum_w = (v4u32) __msa_pckev_w((v4i32) sum_d, (v4i32) sum_d);
277cabdff1aSopenharmony_ci    sum_d = __msa_hadd_u_d(sum_w, sum_w);
278cabdff1aSopenharmony_ci    sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 6);
279cabdff1aSopenharmony_ci    out = (v16u8) __msa_splati_b((v16i8) sum_w, 0);
280cabdff1aSopenharmony_ci
281cabdff1aSopenharmony_ci    for (row = 16; row--;)
282cabdff1aSopenharmony_ci    {
283cabdff1aSopenharmony_ci        ST_UB2(out, out, dst, 16);
284cabdff1aSopenharmony_ci        dst += dst_stride;
285cabdff1aSopenharmony_ci        ST_UB2(out, out, dst, 16);
286cabdff1aSopenharmony_ci        dst += dst_stride;
287cabdff1aSopenharmony_ci    }
288cabdff1aSopenharmony_ci}
289cabdff1aSopenharmony_ci
290cabdff1aSopenharmony_ci#define INTRA_DC_TL_32x32(dir)                                    \
291cabdff1aSopenharmony_civoid ff_dc_##dir##_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride,  \
292cabdff1aSopenharmony_ci                             const uint8_t *left,                 \
293cabdff1aSopenharmony_ci                             const uint8_t *top)                  \
294cabdff1aSopenharmony_ci{                                                                 \
295cabdff1aSopenharmony_ci    uint32_t row;                                                 \
296cabdff1aSopenharmony_ci    v16u8 data0, data1, out;                                      \
297cabdff1aSopenharmony_ci    v8u16 sum_h, sum_data0, sum_data1;                            \
298cabdff1aSopenharmony_ci    v4u32 sum_w;                                                  \
299cabdff1aSopenharmony_ci    v2u64 sum_d;                                                  \
300cabdff1aSopenharmony_ci                                                                  \
301cabdff1aSopenharmony_ci    LD_UB2(dir, 16, data0, data1);                                \
302cabdff1aSopenharmony_ci    HADD_UB2_UH(data0, data1, sum_data0, sum_data1);              \
303cabdff1aSopenharmony_ci    sum_h = sum_data0 + sum_data1;                                \
304cabdff1aSopenharmony_ci    sum_w = __msa_hadd_u_w(sum_h, sum_h);                         \
305cabdff1aSopenharmony_ci    sum_d = __msa_hadd_u_d(sum_w, sum_w);                         \
306cabdff1aSopenharmony_ci    sum_w = (v4u32) __msa_pckev_w((v4i32) sum_d, (v4i32) sum_d);  \
307cabdff1aSopenharmony_ci    sum_d = __msa_hadd_u_d(sum_w, sum_w);                         \
308cabdff1aSopenharmony_ci    sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 5);              \
309cabdff1aSopenharmony_ci    out = (v16u8) __msa_splati_b((v16i8) sum_w, 0);               \
310cabdff1aSopenharmony_ci                                                                  \
311cabdff1aSopenharmony_ci    for (row = 16; row--;)                                        \
312cabdff1aSopenharmony_ci    {                                                             \
313cabdff1aSopenharmony_ci        ST_UB2(out, out, dst, 16);                                \
314cabdff1aSopenharmony_ci        dst += dst_stride;                                        \
315cabdff1aSopenharmony_ci        ST_UB2(out, out, dst, 16);                                \
316cabdff1aSopenharmony_ci        dst += dst_stride;                                        \
317cabdff1aSopenharmony_ci    }                                                             \
318cabdff1aSopenharmony_ci}
319cabdff1aSopenharmony_ciINTRA_DC_TL_32x32(top);
320cabdff1aSopenharmony_ciINTRA_DC_TL_32x32(left);
321cabdff1aSopenharmony_ci
322cabdff1aSopenharmony_ci#define INTRA_PREDICT_VALDC_16X16_MSA(val)                             \
323cabdff1aSopenharmony_civoid ff_dc_##val##_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride,       \
324cabdff1aSopenharmony_ci                             const uint8_t *left, const uint8_t *top)  \
325cabdff1aSopenharmony_ci{                                                                      \
326cabdff1aSopenharmony_ci    v16u8 out = (v16u8) __msa_ldi_b(val);                              \
327cabdff1aSopenharmony_ci                                                                       \
328cabdff1aSopenharmony_ci    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);   \
329cabdff1aSopenharmony_ci    dst += (8 * dst_stride);                                           \
330cabdff1aSopenharmony_ci    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);   \
331cabdff1aSopenharmony_ci}
332cabdff1aSopenharmony_ci
333cabdff1aSopenharmony_ciINTRA_PREDICT_VALDC_16X16_MSA(127);
334cabdff1aSopenharmony_ciINTRA_PREDICT_VALDC_16X16_MSA(128);
335cabdff1aSopenharmony_ciINTRA_PREDICT_VALDC_16X16_MSA(129);
336cabdff1aSopenharmony_ci
337cabdff1aSopenharmony_ci#define INTRA_PREDICT_VALDC_32X32_MSA(val)                             \
338cabdff1aSopenharmony_civoid ff_dc_##val##_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride,       \
339cabdff1aSopenharmony_ci                             const uint8_t *left, const uint8_t *top)  \
340cabdff1aSopenharmony_ci{                                                                      \
341cabdff1aSopenharmony_ci    uint32_t row;                                                      \
342cabdff1aSopenharmony_ci    v16u8 out = (v16u8) __msa_ldi_b(val);                              \
343cabdff1aSopenharmony_ci                                                                       \
344cabdff1aSopenharmony_ci    for (row = 16; row--;)                                             \
345cabdff1aSopenharmony_ci    {                                                                  \
346cabdff1aSopenharmony_ci        ST_UB2(out, out, dst, 16);                                     \
347cabdff1aSopenharmony_ci        dst += dst_stride;                                             \
348cabdff1aSopenharmony_ci        ST_UB2(out, out, dst, 16);                                     \
349cabdff1aSopenharmony_ci        dst += dst_stride;                                             \
350cabdff1aSopenharmony_ci    }                                                                  \
351cabdff1aSopenharmony_ci}
352cabdff1aSopenharmony_ci
353cabdff1aSopenharmony_ciINTRA_PREDICT_VALDC_32X32_MSA(127);
354cabdff1aSopenharmony_ciINTRA_PREDICT_VALDC_32X32_MSA(128);
355cabdff1aSopenharmony_ciINTRA_PREDICT_VALDC_32X32_MSA(129);
356cabdff1aSopenharmony_ci
357cabdff1aSopenharmony_civoid ff_tm_4x4_msa(uint8_t *dst, ptrdiff_t dst_stride,
358cabdff1aSopenharmony_ci                   const uint8_t *src_left, const uint8_t *src_top_ptr)
359cabdff1aSopenharmony_ci{
360cabdff1aSopenharmony_ci    uint32_t left;
361cabdff1aSopenharmony_ci    uint8_t top_left = src_top_ptr[-1];
362cabdff1aSopenharmony_ci    v16i8 src_top, src_left0, src_left1, src_left2, src_left3, tmp0, tmp1;
363cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3;
364cabdff1aSopenharmony_ci    v8u16 src_top_left, vec0, vec1, vec2, vec3;
365cabdff1aSopenharmony_ci
366cabdff1aSopenharmony_ci    src_top_left = (v8u16) __msa_fill_h(top_left);
367cabdff1aSopenharmony_ci    src_top = LD_SB(src_top_ptr);
368cabdff1aSopenharmony_ci    left = LW(src_left);
369cabdff1aSopenharmony_ci    src_left0 = __msa_fill_b(left >> 24);
370cabdff1aSopenharmony_ci    src_left1 = __msa_fill_b(left >> 16);
371cabdff1aSopenharmony_ci    src_left2 = __msa_fill_b(left >> 8);
372cabdff1aSopenharmony_ci    src_left3 = __msa_fill_b(left);
373cabdff1aSopenharmony_ci
374cabdff1aSopenharmony_ci    ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top,
375cabdff1aSopenharmony_ci               src_left3, src_top, src0, src1, src2, src3);
376cabdff1aSopenharmony_ci    HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
377cabdff1aSopenharmony_ci    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1);
378cabdff1aSopenharmony_ci    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3);
379cabdff1aSopenharmony_ci    SAT_UH4_UH(vec0, vec1, vec2, vec3, 7);
380cabdff1aSopenharmony_ci    PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1);
381cabdff1aSopenharmony_ci    ST_W2(tmp0, 0, 2, dst, dst_stride);
382cabdff1aSopenharmony_ci    ST_W2(tmp1, 0, 2, dst + 2 * dst_stride, dst_stride);
383cabdff1aSopenharmony_ci}
384cabdff1aSopenharmony_ci
385cabdff1aSopenharmony_civoid ff_tm_8x8_msa(uint8_t *dst, ptrdiff_t dst_stride,
386cabdff1aSopenharmony_ci                   const uint8_t *src_left, const uint8_t *src_top_ptr)
387cabdff1aSopenharmony_ci{
388cabdff1aSopenharmony_ci    uint8_t top_left = src_top_ptr[-1];
389cabdff1aSopenharmony_ci    uint32_t loop_cnt, left;
390cabdff1aSopenharmony_ci    v16i8 src_top, src_left0, src_left1, src_left2, src_left3, tmp0, tmp1;
391cabdff1aSopenharmony_ci    v8u16 src_top_left, vec0, vec1, vec2, vec3;
392cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3;
393cabdff1aSopenharmony_ci
394cabdff1aSopenharmony_ci    src_top = LD_SB(src_top_ptr);
395cabdff1aSopenharmony_ci    src_top_left = (v8u16) __msa_fill_h(top_left);
396cabdff1aSopenharmony_ci
397cabdff1aSopenharmony_ci    src_left += 4;
398cabdff1aSopenharmony_ci    for (loop_cnt = 2; loop_cnt--;) {
399cabdff1aSopenharmony_ci        left = LW(src_left);
400cabdff1aSopenharmony_ci        src_left0 = __msa_fill_b(left >> 24);
401cabdff1aSopenharmony_ci        src_left1 = __msa_fill_b(left >> 16);
402cabdff1aSopenharmony_ci        src_left2 = __msa_fill_b(left >> 8);
403cabdff1aSopenharmony_ci        src_left3 = __msa_fill_b(left);
404cabdff1aSopenharmony_ci        src_left -= 4;
405cabdff1aSopenharmony_ci
406cabdff1aSopenharmony_ci        ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top,
407cabdff1aSopenharmony_ci                   src_left3, src_top, src0, src1, src2, src3);
408cabdff1aSopenharmony_ci        HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
409cabdff1aSopenharmony_ci        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1);
410cabdff1aSopenharmony_ci        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3);
411cabdff1aSopenharmony_ci        SAT_UH4_UH(vec0, vec1, vec2, vec3, 7);
412cabdff1aSopenharmony_ci        PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1);
413cabdff1aSopenharmony_ci        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
414cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
415cabdff1aSopenharmony_ci    }
416cabdff1aSopenharmony_ci}
417cabdff1aSopenharmony_ci
418cabdff1aSopenharmony_civoid ff_tm_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride,
419cabdff1aSopenharmony_ci                     const uint8_t *src_left, const uint8_t *src_top_ptr)
420cabdff1aSopenharmony_ci{
421cabdff1aSopenharmony_ci    uint8_t top_left = src_top_ptr[-1];
422cabdff1aSopenharmony_ci    uint32_t loop_cnt, left;
423cabdff1aSopenharmony_ci    v16i8 src_top, src_left0, src_left1, src_left2, src_left3;
424cabdff1aSopenharmony_ci    v8u16 src_top_left, res_r, res_l;
425cabdff1aSopenharmony_ci
426cabdff1aSopenharmony_ci    src_top = LD_SB(src_top_ptr);
427cabdff1aSopenharmony_ci    src_top_left = (v8u16) __msa_fill_h(top_left);
428cabdff1aSopenharmony_ci
429cabdff1aSopenharmony_ci    src_left += 12;
430cabdff1aSopenharmony_ci    for (loop_cnt = 4; loop_cnt--;) {
431cabdff1aSopenharmony_ci        left = LW(src_left);
432cabdff1aSopenharmony_ci        src_left0 = __msa_fill_b(left >> 24);
433cabdff1aSopenharmony_ci        src_left1 = __msa_fill_b(left >> 16);
434cabdff1aSopenharmony_ci        src_left2 = __msa_fill_b(left >> 8);
435cabdff1aSopenharmony_ci        src_left3 = __msa_fill_b(left);
436cabdff1aSopenharmony_ci        src_left -= 4;
437cabdff1aSopenharmony_ci
438cabdff1aSopenharmony_ci        ILVRL_B2_UH(src_left0, src_top, res_r, res_l);
439cabdff1aSopenharmony_ci        HADD_UB2_UH(res_r, res_l, res_r, res_l);
440cabdff1aSopenharmony_ci        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
441cabdff1aSopenharmony_ci
442cabdff1aSopenharmony_ci        SAT_UH2_UH(res_r, res_l, 7);
443cabdff1aSopenharmony_ci        PCKEV_ST_SB(res_r, res_l, dst);
444cabdff1aSopenharmony_ci        dst += dst_stride;
445cabdff1aSopenharmony_ci
446cabdff1aSopenharmony_ci        ILVRL_B2_UH(src_left1, src_top, res_r, res_l);
447cabdff1aSopenharmony_ci        HADD_UB2_UH(res_r, res_l, res_r, res_l);
448cabdff1aSopenharmony_ci        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
449cabdff1aSopenharmony_ci        SAT_UH2_UH(res_r, res_l, 7);
450cabdff1aSopenharmony_ci        PCKEV_ST_SB(res_r, res_l, dst);
451cabdff1aSopenharmony_ci        dst += dst_stride;
452cabdff1aSopenharmony_ci
453cabdff1aSopenharmony_ci        ILVRL_B2_UH(src_left2, src_top, res_r, res_l);
454cabdff1aSopenharmony_ci        HADD_UB2_UH(res_r, res_l, res_r, res_l);
455cabdff1aSopenharmony_ci        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
456cabdff1aSopenharmony_ci        SAT_UH2_UH(res_r, res_l, 7);
457cabdff1aSopenharmony_ci        PCKEV_ST_SB(res_r, res_l, dst);
458cabdff1aSopenharmony_ci        dst += dst_stride;
459cabdff1aSopenharmony_ci
460cabdff1aSopenharmony_ci        ILVRL_B2_UH(src_left3, src_top, res_r, res_l);
461cabdff1aSopenharmony_ci        HADD_UB2_UH(res_r, res_l, res_r, res_l);
462cabdff1aSopenharmony_ci        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
463cabdff1aSopenharmony_ci        SAT_UH2_UH(res_r, res_l, 7);
464cabdff1aSopenharmony_ci        PCKEV_ST_SB(res_r, res_l, dst);
465cabdff1aSopenharmony_ci        dst += dst_stride;
466cabdff1aSopenharmony_ci    }
467cabdff1aSopenharmony_ci}
468cabdff1aSopenharmony_ci
469cabdff1aSopenharmony_civoid ff_tm_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride,
470cabdff1aSopenharmony_ci                     const uint8_t *src_left, const uint8_t *src_top_ptr)
471cabdff1aSopenharmony_ci{
472cabdff1aSopenharmony_ci    uint8_t top_left = src_top_ptr[-1];
473cabdff1aSopenharmony_ci    uint32_t loop_cnt, left;
474cabdff1aSopenharmony_ci    v16i8 src_top0, src_top1, src_left0, src_left1, src_left2, src_left3;
475cabdff1aSopenharmony_ci    v8u16 src_top_left, res_r0, res_r1, res_l0, res_l1;
476cabdff1aSopenharmony_ci
477cabdff1aSopenharmony_ci    src_top0 = LD_SB(src_top_ptr);
478cabdff1aSopenharmony_ci    src_top1 = LD_SB(src_top_ptr + 16);
479cabdff1aSopenharmony_ci    src_top_left = (v8u16) __msa_fill_h(top_left);
480cabdff1aSopenharmony_ci
481cabdff1aSopenharmony_ci    src_left += 28;
482cabdff1aSopenharmony_ci    for (loop_cnt = 8; loop_cnt--;) {
483cabdff1aSopenharmony_ci        left = LW(src_left);
484cabdff1aSopenharmony_ci        src_left0 = __msa_fill_b(left >> 24);
485cabdff1aSopenharmony_ci        src_left1 = __msa_fill_b(left >> 16);
486cabdff1aSopenharmony_ci        src_left2 = __msa_fill_b(left >> 8);
487cabdff1aSopenharmony_ci        src_left3 = __msa_fill_b(left);
488cabdff1aSopenharmony_ci        src_left -= 4;
489cabdff1aSopenharmony_ci
490cabdff1aSopenharmony_ci        ILVR_B2_UH(src_left0, src_top0, src_left0, src_top1, res_r0, res_r1);
491cabdff1aSopenharmony_ci        ILVL_B2_UH(src_left0, src_top0, src_left0, src_top1, res_l0, res_l1);
492cabdff1aSopenharmony_ci        HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1,
493cabdff1aSopenharmony_ci                    res_l1);
494cabdff1aSopenharmony_ci        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
495cabdff1aSopenharmony_ci        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
496cabdff1aSopenharmony_ci        SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
497cabdff1aSopenharmony_ci        PCKEV_ST_SB(res_r0, res_l0, dst);
498cabdff1aSopenharmony_ci        PCKEV_ST_SB(res_r1, res_l1, dst + 16);
499cabdff1aSopenharmony_ci        dst += dst_stride;
500cabdff1aSopenharmony_ci
501cabdff1aSopenharmony_ci        ILVR_B2_UH(src_left1, src_top0, src_left1, src_top1, res_r0, res_r1);
502cabdff1aSopenharmony_ci        ILVL_B2_UH(src_left1, src_top0, src_left1, src_top1, res_l0, res_l1);
503cabdff1aSopenharmony_ci        HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1,
504cabdff1aSopenharmony_ci                    res_l1);
505cabdff1aSopenharmony_ci        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
506cabdff1aSopenharmony_ci        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
507cabdff1aSopenharmony_ci        SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
508cabdff1aSopenharmony_ci        PCKEV_ST_SB(res_r0, res_l0, dst);
509cabdff1aSopenharmony_ci        PCKEV_ST_SB(res_r1, res_l1, dst + 16);
510cabdff1aSopenharmony_ci        dst += dst_stride;
511cabdff1aSopenharmony_ci
512cabdff1aSopenharmony_ci        ILVR_B2_UH(src_left2, src_top0, src_left2, src_top1, res_r0, res_r1);
513cabdff1aSopenharmony_ci        ILVL_B2_UH(src_left2, src_top0, src_left2, src_top1, res_l0, res_l1);
514cabdff1aSopenharmony_ci        HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1,
515cabdff1aSopenharmony_ci                    res_l1);
516cabdff1aSopenharmony_ci        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
517cabdff1aSopenharmony_ci        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
518cabdff1aSopenharmony_ci        SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
519cabdff1aSopenharmony_ci        PCKEV_ST_SB(res_r0, res_l0, dst);
520cabdff1aSopenharmony_ci        PCKEV_ST_SB(res_r1, res_l1, dst + 16);
521cabdff1aSopenharmony_ci        dst += dst_stride;
522cabdff1aSopenharmony_ci
523cabdff1aSopenharmony_ci        ILVR_B2_UH(src_left3, src_top0, src_left3, src_top1, res_r0, res_r1);
524cabdff1aSopenharmony_ci        ILVL_B2_UH(src_left3, src_top0, src_left3, src_top1, res_l0, res_l1);
525cabdff1aSopenharmony_ci        HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1,
526cabdff1aSopenharmony_ci                    res_l1);
527cabdff1aSopenharmony_ci        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
528cabdff1aSopenharmony_ci        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
529cabdff1aSopenharmony_ci        SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
530cabdff1aSopenharmony_ci        PCKEV_ST_SB(res_r0, res_l0, dst);
531cabdff1aSopenharmony_ci        PCKEV_ST_SB(res_r1, res_l1, dst + 16);
532cabdff1aSopenharmony_ci        dst += dst_stride;
533cabdff1aSopenharmony_ci    }
534cabdff1aSopenharmony_ci}
535