/*
 * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
20cabdff1aSopenharmony_ci
21cabdff1aSopenharmony_ci#include "libavutil/mips/generic_macros_msa.h"
22cabdff1aSopenharmony_ci#include "me_cmp_mips.h"
23cabdff1aSopenharmony_ci
24cabdff1aSopenharmony_cistatic uint32_t sad_8width_msa(uint8_t *src, int32_t src_stride,
25cabdff1aSopenharmony_ci                               uint8_t *ref, int32_t ref_stride,
26cabdff1aSopenharmony_ci                               int32_t height)
27cabdff1aSopenharmony_ci{
28cabdff1aSopenharmony_ci    int32_t ht_cnt;
29cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
30cabdff1aSopenharmony_ci    v8u16 sad = { 0 };
31cabdff1aSopenharmony_ci
32cabdff1aSopenharmony_ci    for (ht_cnt = (height >> 2); ht_cnt--;) {
33cabdff1aSopenharmony_ci        LD_UB4(src, src_stride, src0, src1, src2, src3);
34cabdff1aSopenharmony_ci        src += (4 * src_stride);
35cabdff1aSopenharmony_ci        LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
36cabdff1aSopenharmony_ci        ref += (4 * ref_stride);
37cabdff1aSopenharmony_ci
38cabdff1aSopenharmony_ci        PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2,
39cabdff1aSopenharmony_ci                    src0, src1, ref0, ref1);
40cabdff1aSopenharmony_ci        sad += SAD_UB2_UH(src0, src1, ref0, ref1);
41cabdff1aSopenharmony_ci    }
42cabdff1aSopenharmony_ci
43cabdff1aSopenharmony_ci    return (HADD_UH_U32(sad));
44cabdff1aSopenharmony_ci}
45cabdff1aSopenharmony_ci
46cabdff1aSopenharmony_cistatic uint32_t sad_16width_msa(uint8_t *src, int32_t src_stride,
47cabdff1aSopenharmony_ci                                uint8_t *ref, int32_t ref_stride,
48cabdff1aSopenharmony_ci                                int32_t height)
49cabdff1aSopenharmony_ci{
50cabdff1aSopenharmony_ci    int32_t ht_cnt;
51cabdff1aSopenharmony_ci    v16u8 src0, src1, ref0, ref1;
52cabdff1aSopenharmony_ci    v8u16 sad = { 0 };
53cabdff1aSopenharmony_ci
54cabdff1aSopenharmony_ci    for (ht_cnt = (height >> 2); ht_cnt--;) {
55cabdff1aSopenharmony_ci        LD_UB2(src, src_stride, src0, src1);
56cabdff1aSopenharmony_ci        src += (2 * src_stride);
57cabdff1aSopenharmony_ci        LD_UB2(ref, ref_stride, ref0, ref1);
58cabdff1aSopenharmony_ci        ref += (2 * ref_stride);
59cabdff1aSopenharmony_ci        sad += SAD_UB2_UH(src0, src1, ref0, ref1);
60cabdff1aSopenharmony_ci
61cabdff1aSopenharmony_ci        LD_UB2(src, src_stride, src0, src1);
62cabdff1aSopenharmony_ci        src += (2 * src_stride);
63cabdff1aSopenharmony_ci        LD_UB2(ref, ref_stride, ref0, ref1);
64cabdff1aSopenharmony_ci        ref += (2 * ref_stride);
65cabdff1aSopenharmony_ci        sad += SAD_UB2_UH(src0, src1, ref0, ref1);
66cabdff1aSopenharmony_ci    }
67cabdff1aSopenharmony_ci
68cabdff1aSopenharmony_ci    return (HADD_UH_U32(sad));
69cabdff1aSopenharmony_ci}
70cabdff1aSopenharmony_ci
71cabdff1aSopenharmony_cistatic uint32_t sad_horiz_bilinear_filter_8width_msa(uint8_t *src,
72cabdff1aSopenharmony_ci                                                     int32_t src_stride,
73cabdff1aSopenharmony_ci                                                     uint8_t *ref,
74cabdff1aSopenharmony_ci                                                     int32_t ref_stride,
75cabdff1aSopenharmony_ci                                                     int32_t height)
76cabdff1aSopenharmony_ci{
77cabdff1aSopenharmony_ci    int32_t ht_cnt;
78cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, comp0, comp1;
79cabdff1aSopenharmony_ci    v16u8 ref0, ref1, ref2, ref3, ref4, ref5;
80cabdff1aSopenharmony_ci    v8u16 sad = { 0 };
81cabdff1aSopenharmony_ci
82cabdff1aSopenharmony_ci    for (ht_cnt = (height >> 3); ht_cnt--;) {
83cabdff1aSopenharmony_ci        LD_UB4(src, src_stride, src0, src1, src2, src3);
84cabdff1aSopenharmony_ci        src += (4 * src_stride);
85cabdff1aSopenharmony_ci        LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
86cabdff1aSopenharmony_ci        ref += (4 * ref_stride);
87cabdff1aSopenharmony_ci
88cabdff1aSopenharmony_ci        PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
89cabdff1aSopenharmony_ci        PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref4, ref5);
90cabdff1aSopenharmony_ci        SLDI_B4_UB(ref0, ref0, ref1, ref1, ref2, ref2, ref3, ref3, 1,
91cabdff1aSopenharmony_ci                   ref0, ref1, ref2, ref3);
92cabdff1aSopenharmony_ci        PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
93cabdff1aSopenharmony_ci        AVER_UB2_UB(ref4, ref0, ref5, ref1, comp0, comp1);
94cabdff1aSopenharmony_ci        sad += SAD_UB2_UH(src0, src1, comp0, comp1);
95cabdff1aSopenharmony_ci
96cabdff1aSopenharmony_ci        LD_UB4(src, src_stride, src0, src1, src2, src3);
97cabdff1aSopenharmony_ci        src += (4 * src_stride);
98cabdff1aSopenharmony_ci        LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
99cabdff1aSopenharmony_ci        ref += (4 * ref_stride);
100cabdff1aSopenharmony_ci
101cabdff1aSopenharmony_ci        PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
102cabdff1aSopenharmony_ci        PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref4, ref5);
103cabdff1aSopenharmony_ci        SLDI_B4_UB(ref0, ref0, ref1, ref1, ref2, ref2, ref3, ref3, 1,
104cabdff1aSopenharmony_ci                   ref0, ref1, ref2, ref3);
105cabdff1aSopenharmony_ci        PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
106cabdff1aSopenharmony_ci        AVER_UB2_UB(ref4, ref0, ref5, ref1, comp0, comp1);
107cabdff1aSopenharmony_ci        sad += SAD_UB2_UH(src0, src1, comp0, comp1);
108cabdff1aSopenharmony_ci    }
109cabdff1aSopenharmony_ci
110cabdff1aSopenharmony_ci    return (HADD_UH_U32(sad));
111cabdff1aSopenharmony_ci}
112cabdff1aSopenharmony_ci
113cabdff1aSopenharmony_cistatic uint32_t sad_horiz_bilinear_filter_16width_msa(uint8_t *src,
114cabdff1aSopenharmony_ci                                                      int32_t src_stride,
115cabdff1aSopenharmony_ci                                                      uint8_t *ref,
116cabdff1aSopenharmony_ci                                                      int32_t ref_stride,
117cabdff1aSopenharmony_ci                                                      int32_t height)
118cabdff1aSopenharmony_ci{
119cabdff1aSopenharmony_ci    int32_t ht_cnt;
120cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, comp0, comp1;
121cabdff1aSopenharmony_ci    v16u8 ref00, ref10, ref20, ref30, ref01, ref11, ref21, ref31;
122cabdff1aSopenharmony_ci    v8u16 sad = { 0 };
123cabdff1aSopenharmony_ci
124cabdff1aSopenharmony_ci    for (ht_cnt = (height >> 3); ht_cnt--;) {
125cabdff1aSopenharmony_ci        LD_UB4(src, src_stride, src0, src1, src2, src3);
126cabdff1aSopenharmony_ci        src += (4 * src_stride);
127cabdff1aSopenharmony_ci        LD_UB4(ref, ref_stride, ref00, ref10, ref20, ref30);
128cabdff1aSopenharmony_ci        LD_UB4(ref + 1, ref_stride, ref01, ref11, ref21, ref31);
129cabdff1aSopenharmony_ci        ref += (4 * ref_stride);
130cabdff1aSopenharmony_ci
131cabdff1aSopenharmony_ci        AVER_UB2_UB(ref01, ref00, ref11, ref10, comp0, comp1);
132cabdff1aSopenharmony_ci        sad += SAD_UB2_UH(src0, src1, comp0, comp1);
133cabdff1aSopenharmony_ci        AVER_UB2_UB(ref21, ref20, ref31, ref30, comp0, comp1);
134cabdff1aSopenharmony_ci        sad += SAD_UB2_UH(src2, src3, comp0, comp1);
135cabdff1aSopenharmony_ci
136cabdff1aSopenharmony_ci        LD_UB4(src, src_stride, src0, src1, src2, src3);
137cabdff1aSopenharmony_ci        src += (4 * src_stride);
138cabdff1aSopenharmony_ci        LD_UB4(ref, ref_stride, ref00, ref10, ref20, ref30);
139cabdff1aSopenharmony_ci        LD_UB4(ref + 1, ref_stride, ref01, ref11, ref21, ref31);
140cabdff1aSopenharmony_ci        ref += (4 * ref_stride);
141cabdff1aSopenharmony_ci
142cabdff1aSopenharmony_ci        AVER_UB2_UB(ref01, ref00, ref11, ref10, comp0, comp1);
143cabdff1aSopenharmony_ci        sad += SAD_UB2_UH(src0, src1, comp0, comp1);
144cabdff1aSopenharmony_ci        AVER_UB2_UB(ref21, ref20, ref31, ref30, comp0, comp1);
145cabdff1aSopenharmony_ci        sad += SAD_UB2_UH(src2, src3, comp0, comp1);
146cabdff1aSopenharmony_ci    }
147cabdff1aSopenharmony_ci
148cabdff1aSopenharmony_ci    return (HADD_UH_U32(sad));
149cabdff1aSopenharmony_ci}
150cabdff1aSopenharmony_ci
151cabdff1aSopenharmony_cistatic uint32_t sad_vert_bilinear_filter_8width_msa(uint8_t *src,
152cabdff1aSopenharmony_ci                                                    int32_t src_stride,
153cabdff1aSopenharmony_ci                                                    uint8_t *ref,
154cabdff1aSopenharmony_ci                                                    int32_t ref_stride,
155cabdff1aSopenharmony_ci                                                    int32_t height)
156cabdff1aSopenharmony_ci{
157cabdff1aSopenharmony_ci    int32_t ht_cnt;
158cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, comp0, comp1;
159cabdff1aSopenharmony_ci    v16u8 ref0, ref1, ref2, ref3, ref4;
160cabdff1aSopenharmony_ci    v8u16 sad = { 0 };
161cabdff1aSopenharmony_ci
162cabdff1aSopenharmony_ci    for (ht_cnt = (height >> 3); ht_cnt--;) {
163cabdff1aSopenharmony_ci        LD_UB4(src, src_stride, src0, src1, src2, src3);
164cabdff1aSopenharmony_ci        src += (4 * src_stride);
165cabdff1aSopenharmony_ci        LD_UB5(ref, ref_stride, ref0, ref1, ref2, ref3, ref4);
166cabdff1aSopenharmony_ci        ref += (4 * ref_stride);
167cabdff1aSopenharmony_ci
168cabdff1aSopenharmony_ci        PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
169cabdff1aSopenharmony_ci        PCKEV_D2_UB(ref1, ref0, ref2, ref1, ref0, ref1);
170cabdff1aSopenharmony_ci        PCKEV_D2_UB(ref3, ref2, ref4, ref3, ref2, ref3);
171cabdff1aSopenharmony_ci        AVER_UB2_UB(ref1, ref0, ref3, ref2, comp0, comp1);
172cabdff1aSopenharmony_ci        sad += SAD_UB2_UH(src0, src1, comp0, comp1);
173cabdff1aSopenharmony_ci
174cabdff1aSopenharmony_ci        LD_UB4(src, src_stride, src0, src1, src2, src3);
175cabdff1aSopenharmony_ci        src += (4 * src_stride);
176cabdff1aSopenharmony_ci        LD_UB5(ref, ref_stride, ref0, ref1, ref2, ref3, ref4);
177cabdff1aSopenharmony_ci        ref += (4 * ref_stride);
178cabdff1aSopenharmony_ci
179cabdff1aSopenharmony_ci        PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
180cabdff1aSopenharmony_ci        PCKEV_D2_UB(ref1, ref0, ref2, ref1, ref0, ref1);
181cabdff1aSopenharmony_ci        PCKEV_D2_UB(ref3, ref2, ref4, ref3, ref2, ref3);
182cabdff1aSopenharmony_ci        AVER_UB2_UB(ref1, ref0, ref3, ref2, comp0, comp1);
183cabdff1aSopenharmony_ci        sad += SAD_UB2_UH(src0, src1, comp0, comp1);
184cabdff1aSopenharmony_ci    }
185cabdff1aSopenharmony_ci
186cabdff1aSopenharmony_ci    return (HADD_UH_U32(sad));
187cabdff1aSopenharmony_ci}
188cabdff1aSopenharmony_ci
189cabdff1aSopenharmony_cistatic uint32_t sad_vert_bilinear_filter_16width_msa(uint8_t *src,
190cabdff1aSopenharmony_ci                                                     int32_t src_stride,
191cabdff1aSopenharmony_ci                                                     uint8_t *ref,
192cabdff1aSopenharmony_ci                                                     int32_t ref_stride,
193cabdff1aSopenharmony_ci                                                     int32_t height)
194cabdff1aSopenharmony_ci{
195cabdff1aSopenharmony_ci    int32_t ht_cnt;
196cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, comp0, comp1;
197cabdff1aSopenharmony_ci    v16u8 ref0, ref1, ref2, ref3, ref4;
198cabdff1aSopenharmony_ci    v8u16 sad = { 0 };
199cabdff1aSopenharmony_ci
200cabdff1aSopenharmony_ci    for (ht_cnt = (height >> 3); ht_cnt--;) {
201cabdff1aSopenharmony_ci        LD_UB5(ref, ref_stride, ref4, ref0, ref1, ref2, ref3);
202cabdff1aSopenharmony_ci        ref += (5 * ref_stride);
203cabdff1aSopenharmony_ci        LD_UB4(src, src_stride, src0, src1, src2, src3);
204cabdff1aSopenharmony_ci        src += (4 * src_stride);
205cabdff1aSopenharmony_ci
206cabdff1aSopenharmony_ci        AVER_UB2_UB(ref0, ref4, ref1, ref0, comp0, comp1);
207cabdff1aSopenharmony_ci        sad += SAD_UB2_UH(src0, src1, comp0, comp1);
208cabdff1aSopenharmony_ci        AVER_UB2_UB(ref2, ref1, ref3, ref2, comp0, comp1);
209cabdff1aSopenharmony_ci        sad += SAD_UB2_UH(src2, src3, comp0, comp1);
210cabdff1aSopenharmony_ci
211cabdff1aSopenharmony_ci        ref4 = ref3;
212cabdff1aSopenharmony_ci
213cabdff1aSopenharmony_ci        LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
214cabdff1aSopenharmony_ci        ref += (3 * ref_stride);
215cabdff1aSopenharmony_ci        LD_UB4(src, src_stride, src0, src1, src2, src3);
216cabdff1aSopenharmony_ci        src += (4 * src_stride);
217cabdff1aSopenharmony_ci
218cabdff1aSopenharmony_ci        AVER_UB2_UB(ref0, ref4, ref1, ref0, comp0, comp1);
219cabdff1aSopenharmony_ci        sad += SAD_UB2_UH(src0, src1, comp0, comp1);
220cabdff1aSopenharmony_ci        AVER_UB2_UB(ref2, ref1, ref3, ref2, comp0, comp1);
221cabdff1aSopenharmony_ci        sad += SAD_UB2_UH(src2, src3, comp0, comp1);
222cabdff1aSopenharmony_ci    }
223cabdff1aSopenharmony_ci
224cabdff1aSopenharmony_ci    return (HADD_UH_U32(sad));
225cabdff1aSopenharmony_ci}
226cabdff1aSopenharmony_ci
227cabdff1aSopenharmony_cistatic uint32_t sad_hv_bilinear_filter_8width_msa(uint8_t *src,
228cabdff1aSopenharmony_ci                                                  int32_t src_stride,
229cabdff1aSopenharmony_ci                                                  uint8_t *ref,
230cabdff1aSopenharmony_ci                                                  int32_t ref_stride,
231cabdff1aSopenharmony_ci                                                  int32_t height)
232cabdff1aSopenharmony_ci{
233cabdff1aSopenharmony_ci    int32_t ht_cnt;
234cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, temp0, temp1, diff;
235cabdff1aSopenharmony_ci    v16u8 ref0, ref1, ref2, ref3, ref4;
236cabdff1aSopenharmony_ci    v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
237cabdff1aSopenharmony_ci    v8u16 comp0, comp1, comp2, comp3;
238cabdff1aSopenharmony_ci    v8u16 sad = { 0 };
239cabdff1aSopenharmony_ci
240cabdff1aSopenharmony_ci    for (ht_cnt = (height >> 2); ht_cnt--;) {
241cabdff1aSopenharmony_ci        LD_UB5(ref, ref_stride, ref4, ref0, ref1, ref2, ref3);
242cabdff1aSopenharmony_ci        ref += (4 * ref_stride);
243cabdff1aSopenharmony_ci        LD_UB4(src, src_stride, src0, src1, src2, src3);
244cabdff1aSopenharmony_ci        src += (4 * src_stride);
245cabdff1aSopenharmony_ci
246cabdff1aSopenharmony_ci        PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
247cabdff1aSopenharmony_ci
248cabdff1aSopenharmony_ci        VSHF_B2_UB(ref4, ref4, ref0, ref0, mask, mask, temp0, temp1);
249cabdff1aSopenharmony_ci        comp0 = __msa_hadd_u_h(temp0, temp0);
250cabdff1aSopenharmony_ci        comp1 = __msa_hadd_u_h(temp1, temp1);
251cabdff1aSopenharmony_ci        comp0 += comp1;
252cabdff1aSopenharmony_ci        comp0 = (v8u16) __msa_srari_h((v8i16) comp0, 2);
253cabdff1aSopenharmony_ci        comp0 = (v8u16) __msa_pckev_b((v16i8) comp0, (v16i8) comp0);
254cabdff1aSopenharmony_ci
255cabdff1aSopenharmony_ci        temp0 = (v16u8) __msa_vshf_b(mask, (v16i8) ref1, (v16i8) ref1);
256cabdff1aSopenharmony_ci        comp2 = __msa_hadd_u_h(temp0, temp0);
257cabdff1aSopenharmony_ci        comp1 += comp2;
258cabdff1aSopenharmony_ci        comp1 = (v8u16) __msa_srari_h((v8i16) comp1, 2);
259cabdff1aSopenharmony_ci        comp1 = (v8u16) __msa_pckev_b((v16i8) comp1, (v16i8) comp1);
260cabdff1aSopenharmony_ci        comp1 = (v8u16) __msa_pckev_d((v2i64) comp1, (v2i64) comp0);
261cabdff1aSopenharmony_ci        diff = (v16u8) __msa_asub_u_b(src0, (v16u8) comp1);
262cabdff1aSopenharmony_ci        sad += __msa_hadd_u_h(diff, diff);
263cabdff1aSopenharmony_ci
264cabdff1aSopenharmony_ci        temp1 = (v16u8) __msa_vshf_b(mask, (v16i8) ref2, (v16i8) ref2);
265cabdff1aSopenharmony_ci        comp3 = __msa_hadd_u_h(temp1, temp1);
266cabdff1aSopenharmony_ci        comp2 += comp3;
267cabdff1aSopenharmony_ci        comp2 = (v8u16) __msa_srari_h((v8i16) comp2, 2);
268cabdff1aSopenharmony_ci        comp2 = (v8u16) __msa_pckev_b((v16i8) comp2, (v16i8) comp2);
269cabdff1aSopenharmony_ci
270cabdff1aSopenharmony_ci        temp0 = (v16u8) __msa_vshf_b(mask, (v16i8) ref3, (v16i8) ref3);
271cabdff1aSopenharmony_ci        comp0 = __msa_hadd_u_h(temp0, temp0);
272cabdff1aSopenharmony_ci        comp3 += comp0;
273cabdff1aSopenharmony_ci        comp3 = (v8u16) __msa_srari_h((v8i16) comp3, 2);
274cabdff1aSopenharmony_ci        comp3 = (v8u16) __msa_pckev_b((v16i8) comp3, (v16i8) comp3);
275cabdff1aSopenharmony_ci        comp3 = (v8u16) __msa_pckev_d((v2i64) comp3, (v2i64) comp2);
276cabdff1aSopenharmony_ci        diff = (v16u8) __msa_asub_u_b(src1, (v16u8) comp3);
277cabdff1aSopenharmony_ci        sad += __msa_hadd_u_h(diff, diff);
278cabdff1aSopenharmony_ci    }
279cabdff1aSopenharmony_ci
280cabdff1aSopenharmony_ci    return (HADD_UH_U32(sad));
281cabdff1aSopenharmony_ci}
282cabdff1aSopenharmony_ci
283cabdff1aSopenharmony_cistatic uint32_t sad_hv_bilinear_filter_16width_msa(uint8_t *src,
284cabdff1aSopenharmony_ci                                                   int32_t src_stride,
285cabdff1aSopenharmony_ci                                                   uint8_t *ref,
286cabdff1aSopenharmony_ci                                                   int32_t ref_stride,
287cabdff1aSopenharmony_ci                                                   int32_t height)
288cabdff1aSopenharmony_ci{
289cabdff1aSopenharmony_ci    int32_t ht_cnt;
290cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, comp, diff;
291cabdff1aSopenharmony_ci    v16u8 temp0, temp1, temp2, temp3;
292cabdff1aSopenharmony_ci    v16u8 ref00, ref01, ref02, ref03, ref04, ref10, ref11, ref12, ref13, ref14;
293cabdff1aSopenharmony_ci    v8u16 comp0, comp1, comp2, comp3;
294cabdff1aSopenharmony_ci    v8u16 sad = { 0 };
295cabdff1aSopenharmony_ci
296cabdff1aSopenharmony_ci    for (ht_cnt = (height >> 3); ht_cnt--;) {
297cabdff1aSopenharmony_ci        LD_UB4(src, src_stride, src0, src1, src2, src3);
298cabdff1aSopenharmony_ci        src += (4 * src_stride);
299cabdff1aSopenharmony_ci        LD_UB5(ref, ref_stride, ref04, ref00, ref01, ref02, ref03);
300cabdff1aSopenharmony_ci        LD_UB5(ref + 1, ref_stride, ref14, ref10, ref11, ref12, ref13);
301cabdff1aSopenharmony_ci        ref += (5 * ref_stride);
302cabdff1aSopenharmony_ci
303cabdff1aSopenharmony_ci        ILVRL_B2_UB(ref14, ref04, temp0, temp1);
304cabdff1aSopenharmony_ci        comp0 = __msa_hadd_u_h(temp0, temp0);
305cabdff1aSopenharmony_ci        comp1 = __msa_hadd_u_h(temp1, temp1);
306cabdff1aSopenharmony_ci        ILVRL_B2_UB(ref10, ref00, temp2, temp3);
307cabdff1aSopenharmony_ci        comp2 = __msa_hadd_u_h(temp2, temp2);
308cabdff1aSopenharmony_ci        comp3 = __msa_hadd_u_h(temp3, temp3);
309cabdff1aSopenharmony_ci        comp0 += comp2;
310cabdff1aSopenharmony_ci        comp1 += comp3;
311cabdff1aSopenharmony_ci        SRARI_H2_UH(comp0, comp1, 2);
312cabdff1aSopenharmony_ci        comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
313cabdff1aSopenharmony_ci        diff = __msa_asub_u_b(src0, comp);
314cabdff1aSopenharmony_ci        sad += __msa_hadd_u_h(diff, diff);
315cabdff1aSopenharmony_ci
316cabdff1aSopenharmony_ci        ILVRL_B2_UB(ref11, ref01, temp0, temp1);
317cabdff1aSopenharmony_ci        comp0 = __msa_hadd_u_h(temp0, temp0);
318cabdff1aSopenharmony_ci        comp1 = __msa_hadd_u_h(temp1, temp1);
319cabdff1aSopenharmony_ci        comp2 += comp0;
320cabdff1aSopenharmony_ci        comp3 += comp1;
321cabdff1aSopenharmony_ci        SRARI_H2_UH(comp2, comp3, 2);
322cabdff1aSopenharmony_ci        comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
323cabdff1aSopenharmony_ci        diff = __msa_asub_u_b(src1, comp);
324cabdff1aSopenharmony_ci        sad += __msa_hadd_u_h(diff, diff);
325cabdff1aSopenharmony_ci
326cabdff1aSopenharmony_ci        ILVRL_B2_UB(ref12, ref02, temp2, temp3);
327cabdff1aSopenharmony_ci        comp2 = __msa_hadd_u_h(temp2, temp2);
328cabdff1aSopenharmony_ci        comp3 = __msa_hadd_u_h(temp3, temp3);
329cabdff1aSopenharmony_ci        comp0 += comp2;
330cabdff1aSopenharmony_ci        comp1 += comp3;
331cabdff1aSopenharmony_ci        SRARI_H2_UH(comp0, comp1, 2);
332cabdff1aSopenharmony_ci        comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
333cabdff1aSopenharmony_ci        diff = __msa_asub_u_b(src2, comp);
334cabdff1aSopenharmony_ci        sad += __msa_hadd_u_h(diff, diff);
335cabdff1aSopenharmony_ci
336cabdff1aSopenharmony_ci        ILVRL_B2_UB(ref13, ref03, temp0, temp1);
337cabdff1aSopenharmony_ci        comp0 = __msa_hadd_u_h(temp0, temp0);
338cabdff1aSopenharmony_ci        comp1 = __msa_hadd_u_h(temp1, temp1);
339cabdff1aSopenharmony_ci        comp2 += comp0;
340cabdff1aSopenharmony_ci        comp3 += comp1;
341cabdff1aSopenharmony_ci        SRARI_H2_UH(comp2, comp3, 2);
342cabdff1aSopenharmony_ci        comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
343cabdff1aSopenharmony_ci        diff = __msa_asub_u_b(src3, comp);
344cabdff1aSopenharmony_ci        sad += __msa_hadd_u_h(diff, diff);
345cabdff1aSopenharmony_ci
346cabdff1aSopenharmony_ci        LD_UB4(src, src_stride, src0, src1, src2, src3);
347cabdff1aSopenharmony_ci        src += (4 * src_stride);
348cabdff1aSopenharmony_ci        LD_UB4(ref, ref_stride, ref00, ref01, ref02, ref03);
349cabdff1aSopenharmony_ci        LD_UB4(ref + 1, ref_stride, ref10, ref11, ref12, ref13);
350cabdff1aSopenharmony_ci        ref += (3 * ref_stride);
351cabdff1aSopenharmony_ci
352cabdff1aSopenharmony_ci        ILVRL_B2_UB(ref10, ref00, temp2, temp3);
353cabdff1aSopenharmony_ci        comp2 = __msa_hadd_u_h(temp2, temp2);
354cabdff1aSopenharmony_ci        comp3 = __msa_hadd_u_h(temp3, temp3);
355cabdff1aSopenharmony_ci        comp0 += comp2;
356cabdff1aSopenharmony_ci        comp1 += comp3;
357cabdff1aSopenharmony_ci        SRARI_H2_UH(comp0, comp1, 2);
358cabdff1aSopenharmony_ci        comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
359cabdff1aSopenharmony_ci        diff = __msa_asub_u_b(src0, comp);
360cabdff1aSopenharmony_ci        sad += __msa_hadd_u_h(diff, diff);
361cabdff1aSopenharmony_ci
362cabdff1aSopenharmony_ci        ILVRL_B2_UB(ref11, ref01, temp0, temp1);
363cabdff1aSopenharmony_ci        comp0 = __msa_hadd_u_h(temp0, temp0);
364cabdff1aSopenharmony_ci        comp1 = __msa_hadd_u_h(temp1, temp1);
365cabdff1aSopenharmony_ci        comp2 += comp0;
366cabdff1aSopenharmony_ci        comp3 += comp1;
367cabdff1aSopenharmony_ci        SRARI_H2_UH(comp2, comp3, 2);
368cabdff1aSopenharmony_ci        comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
369cabdff1aSopenharmony_ci        diff = __msa_asub_u_b(src1, comp);
370cabdff1aSopenharmony_ci        sad += __msa_hadd_u_h(diff, diff);
371cabdff1aSopenharmony_ci
372cabdff1aSopenharmony_ci        ILVRL_B2_UB(ref12, ref02, temp2, temp3);
373cabdff1aSopenharmony_ci        comp2 = __msa_hadd_u_h(temp2, temp2);
374cabdff1aSopenharmony_ci        comp3 = __msa_hadd_u_h(temp3, temp3);
375cabdff1aSopenharmony_ci        comp0 += comp2;
376cabdff1aSopenharmony_ci        comp1 += comp3;
377cabdff1aSopenharmony_ci        SRARI_H2_UH(comp0, comp1, 2);
378cabdff1aSopenharmony_ci        comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
379cabdff1aSopenharmony_ci        diff = __msa_asub_u_b(src2, comp);
380cabdff1aSopenharmony_ci        sad += __msa_hadd_u_h(diff, diff);
381cabdff1aSopenharmony_ci
382cabdff1aSopenharmony_ci        ILVRL_B2_UB(ref13, ref03, temp0, temp1);
383cabdff1aSopenharmony_ci        comp0 = __msa_hadd_u_h(temp0, temp0);
384cabdff1aSopenharmony_ci        comp1 = __msa_hadd_u_h(temp1, temp1);
385cabdff1aSopenharmony_ci        comp2 += comp0;
386cabdff1aSopenharmony_ci        comp3 += comp1;
387cabdff1aSopenharmony_ci        SRARI_H2_UH(comp2, comp3, 2);
388cabdff1aSopenharmony_ci        comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
389cabdff1aSopenharmony_ci        diff = __msa_asub_u_b(src3, comp);
390cabdff1aSopenharmony_ci        sad += __msa_hadd_u_h(diff, diff);
391cabdff1aSopenharmony_ci    }
392cabdff1aSopenharmony_ci
393cabdff1aSopenharmony_ci    return (HADD_UH_U32(sad));
394cabdff1aSopenharmony_ci}
395cabdff1aSopenharmony_ci
/*
 * Accumulate into var a squared-difference term for the byte vectors src
 * and ref.
 *
 * The bytes of src and ref are interleaved (ILVRL_B2_UB), turned into signed
 * 16-bit values by HSUB_UB2_SH, and those values are then multiplied by
 * themselves and added to var by DPADD_SH2_SW.
 * NOTE(review): the exact lane behaviour of the three helpers is defined in
 * generic_macros_msa.h - confirm there.
 *
 * var is read and written; src and ref are only read. The temporaries keep
 * the _m suffix so they cannot clash with the caller's argument names.
 */
#define CALC_MSE_B(src, ref, var)                                      \
{                                                                      \
    v16u8 ilv_r_m, ilv_l_m;                                            \
    v8i16 dif_r_m, dif_l_m;                                            \
                                                                       \
    ILVRL_B2_UB(src, ref, ilv_r_m, ilv_l_m);                           \
    HSUB_UB2_SH(ilv_r_m, ilv_l_m, dif_r_m, dif_l_m);                   \
    DPADD_SH2_SW(dif_r_m, dif_l_m, dif_r_m, dif_l_m, var, var);        \
}
405cabdff1aSopenharmony_ci
406cabdff1aSopenharmony_cistatic uint32_t sse_4width_msa(uint8_t *src_ptr, int32_t src_stride,
407cabdff1aSopenharmony_ci                               uint8_t *ref_ptr, int32_t ref_stride,
408cabdff1aSopenharmony_ci                               int32_t height)
409cabdff1aSopenharmony_ci{
410cabdff1aSopenharmony_ci    int32_t ht_cnt;
411cabdff1aSopenharmony_ci    uint32_t sse;
412cabdff1aSopenharmony_ci    uint32_t src0, src1, src2, src3;
413cabdff1aSopenharmony_ci    uint32_t ref0, ref1, ref2, ref3;
414cabdff1aSopenharmony_ci    v16u8 src = { 0 };
415cabdff1aSopenharmony_ci    v16u8 ref = { 0 };
416cabdff1aSopenharmony_ci    v4i32 var = { 0 };
417cabdff1aSopenharmony_ci
418cabdff1aSopenharmony_ci    for (ht_cnt = (height >> 2); ht_cnt--;) {
419cabdff1aSopenharmony_ci        LW4(src_ptr, src_stride, src0, src1, src2, src3);
420cabdff1aSopenharmony_ci        src_ptr += (4 * src_stride);
421cabdff1aSopenharmony_ci        LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
422cabdff1aSopenharmony_ci        ref_ptr += (4 * ref_stride);
423cabdff1aSopenharmony_ci
424cabdff1aSopenharmony_ci        INSERT_W4_UB(src0, src1, src2, src3, src);
425cabdff1aSopenharmony_ci        INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
426cabdff1aSopenharmony_ci        CALC_MSE_B(src, ref, var);
427cabdff1aSopenharmony_ci    }
428cabdff1aSopenharmony_ci
429cabdff1aSopenharmony_ci    sse = HADD_SW_S32(var);
430cabdff1aSopenharmony_ci
431cabdff1aSopenharmony_ci    return sse;
432cabdff1aSopenharmony_ci}
433cabdff1aSopenharmony_ci
434cabdff1aSopenharmony_cistatic uint32_t sse_8width_msa(uint8_t *src_ptr, int32_t src_stride,
435cabdff1aSopenharmony_ci                               uint8_t *ref_ptr, int32_t ref_stride,
436cabdff1aSopenharmony_ci                               int32_t height)
437cabdff1aSopenharmony_ci{
438cabdff1aSopenharmony_ci    int32_t ht_cnt;
439cabdff1aSopenharmony_ci    uint32_t sse;
440cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3;
441cabdff1aSopenharmony_ci    v16u8 ref0, ref1, ref2, ref3;
442cabdff1aSopenharmony_ci    v4i32 var = { 0 };
443cabdff1aSopenharmony_ci
444cabdff1aSopenharmony_ci    for (ht_cnt = (height >> 2); ht_cnt--;) {
445cabdff1aSopenharmony_ci        LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
446cabdff1aSopenharmony_ci        src_ptr += (4 * src_stride);
447cabdff1aSopenharmony_ci        LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
448cabdff1aSopenharmony_ci        ref_ptr += (4 * ref_stride);
449cabdff1aSopenharmony_ci
450cabdff1aSopenharmony_ci        PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2,
451cabdff1aSopenharmony_ci                    src0, src1, ref0, ref1);
452cabdff1aSopenharmony_ci        CALC_MSE_B(src0, ref0, var);
453cabdff1aSopenharmony_ci        CALC_MSE_B(src1, ref1, var);
454cabdff1aSopenharmony_ci    }
455cabdff1aSopenharmony_ci
456cabdff1aSopenharmony_ci    sse = HADD_SW_S32(var);
457cabdff1aSopenharmony_ci
458cabdff1aSopenharmony_ci    return sse;
459cabdff1aSopenharmony_ci}
460cabdff1aSopenharmony_ci
461cabdff1aSopenharmony_cistatic uint32_t sse_16width_msa(uint8_t *src_ptr, int32_t src_stride,
462cabdff1aSopenharmony_ci                                uint8_t *ref_ptr, int32_t ref_stride,
463cabdff1aSopenharmony_ci                                int32_t height)
464cabdff1aSopenharmony_ci{
465cabdff1aSopenharmony_ci    int32_t ht_cnt;
466cabdff1aSopenharmony_ci    uint32_t sse;
467cabdff1aSopenharmony_ci    v16u8 src, ref;
468cabdff1aSopenharmony_ci    v4i32 var = { 0 };
469cabdff1aSopenharmony_ci
470cabdff1aSopenharmony_ci    for (ht_cnt = (height >> 2); ht_cnt--;) {
471cabdff1aSopenharmony_ci        src = LD_UB(src_ptr);
472cabdff1aSopenharmony_ci        src_ptr += src_stride;
473cabdff1aSopenharmony_ci        ref = LD_UB(ref_ptr);
474cabdff1aSopenharmony_ci        ref_ptr += ref_stride;
475cabdff1aSopenharmony_ci        CALC_MSE_B(src, ref, var);
476cabdff1aSopenharmony_ci
477cabdff1aSopenharmony_ci        src = LD_UB(src_ptr);
478cabdff1aSopenharmony_ci        src_ptr += src_stride;
479cabdff1aSopenharmony_ci        ref = LD_UB(ref_ptr);
480cabdff1aSopenharmony_ci        ref_ptr += ref_stride;
481cabdff1aSopenharmony_ci        CALC_MSE_B(src, ref, var);
482cabdff1aSopenharmony_ci
483cabdff1aSopenharmony_ci        src = LD_UB(src_ptr);
484cabdff1aSopenharmony_ci        src_ptr += src_stride;
485cabdff1aSopenharmony_ci        ref = LD_UB(ref_ptr);
486cabdff1aSopenharmony_ci        ref_ptr += ref_stride;
487cabdff1aSopenharmony_ci        CALC_MSE_B(src, ref, var);
488cabdff1aSopenharmony_ci
489cabdff1aSopenharmony_ci        src = LD_UB(src_ptr);
490cabdff1aSopenharmony_ci        src_ptr += src_stride;
491cabdff1aSopenharmony_ci        ref = LD_UB(ref_ptr);
492cabdff1aSopenharmony_ci        ref_ptr += ref_stride;
493cabdff1aSopenharmony_ci        CALC_MSE_B(src, ref, var);
494cabdff1aSopenharmony_ci    }
495cabdff1aSopenharmony_ci
496cabdff1aSopenharmony_ci    sse = HADD_SW_S32(var);
497cabdff1aSopenharmony_ci
498cabdff1aSopenharmony_ci    return sse;
499cabdff1aSopenharmony_ci}
500cabdff1aSopenharmony_ci
501cabdff1aSopenharmony_cistatic int32_t hadamard_diff_8x8_msa(uint8_t *src, int32_t src_stride,
502cabdff1aSopenharmony_ci                                     uint8_t *ref, int32_t ref_stride)
503cabdff1aSopenharmony_ci{
504cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
505cabdff1aSopenharmony_ci    v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
506cabdff1aSopenharmony_ci    v8u16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
507cabdff1aSopenharmony_ci    v8u16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
508cabdff1aSopenharmony_ci    v8i16 sum = { 0 };
509cabdff1aSopenharmony_ci    v8i16 zero = { 0 };
510cabdff1aSopenharmony_ci
511cabdff1aSopenharmony_ci    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
512cabdff1aSopenharmony_ci    LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
513cabdff1aSopenharmony_ci    ILVR_B8_UH(src0, ref0, src1, ref1, src2, ref2, src3, ref3,
514cabdff1aSopenharmony_ci               src4, ref4, src5, ref5, src6, ref6, src7, ref7,
515cabdff1aSopenharmony_ci               diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7);
516cabdff1aSopenharmony_ci    HSUB_UB4_UH(diff0, diff1, diff2, diff3, diff0, diff1, diff2, diff3);
517cabdff1aSopenharmony_ci    HSUB_UB4_UH(diff4, diff5, diff6, diff7, diff4, diff5, diff6, diff7);
518cabdff1aSopenharmony_ci    TRANSPOSE8x8_UH_UH(diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7,
519cabdff1aSopenharmony_ci                       diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7);
520cabdff1aSopenharmony_ci    BUTTERFLY_8(diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1,
521cabdff1aSopenharmony_ci                temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1);
522cabdff1aSopenharmony_ci    BUTTERFLY_8(temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2,
523cabdff1aSopenharmony_ci                diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2);
524cabdff1aSopenharmony_ci    BUTTERFLY_8(diff0, diff1, diff2, diff3, diff7, diff6, diff5, diff4,
525cabdff1aSopenharmony_ci                temp0, temp1, temp2, temp3, temp7, temp6, temp5, temp4);
526cabdff1aSopenharmony_ci    TRANSPOSE8x8_UH_UH(temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7,
527cabdff1aSopenharmony_ci                       temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7);
528cabdff1aSopenharmony_ci    BUTTERFLY_8(temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1,
529cabdff1aSopenharmony_ci                diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1);
530cabdff1aSopenharmony_ci    BUTTERFLY_8(diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2,
531cabdff1aSopenharmony_ci                temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2);
532cabdff1aSopenharmony_ci    ADD4(temp0, temp4, temp1, temp5, temp2, temp6, temp3, temp7,
533cabdff1aSopenharmony_ci         diff0, diff1, diff2, diff3);
534cabdff1aSopenharmony_ci    sum = __msa_asub_s_h((v8i16) temp3, (v8i16) temp7);
535cabdff1aSopenharmony_ci    sum += __msa_asub_s_h((v8i16) temp2, (v8i16) temp6);
536cabdff1aSopenharmony_ci    sum += __msa_asub_s_h((v8i16) temp1, (v8i16) temp5);
537cabdff1aSopenharmony_ci    sum += __msa_asub_s_h((v8i16) temp0, (v8i16) temp4);
538cabdff1aSopenharmony_ci    sum += __msa_add_a_h((v8i16) diff0, zero);
539cabdff1aSopenharmony_ci    sum += __msa_add_a_h((v8i16) diff1, zero);
540cabdff1aSopenharmony_ci    sum += __msa_add_a_h((v8i16) diff2, zero);
541cabdff1aSopenharmony_ci    sum += __msa_add_a_h((v8i16) diff3, zero);
542cabdff1aSopenharmony_ci
543cabdff1aSopenharmony_ci    return (HADD_UH_U32(sum));
544cabdff1aSopenharmony_ci}
545cabdff1aSopenharmony_ci
546cabdff1aSopenharmony_cistatic int32_t hadamard_intra_8x8_msa(uint8_t *src, int32_t src_stride,
547cabdff1aSopenharmony_ci                                      uint8_t *ref, int32_t ref_stride)
548cabdff1aSopenharmony_ci{
549cabdff1aSopenharmony_ci    int32_t sum_res = 0;
550cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
551cabdff1aSopenharmony_ci    v8u16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
552cabdff1aSopenharmony_ci    v8u16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
553cabdff1aSopenharmony_ci    v8i16 sum = { 0 };
554cabdff1aSopenharmony_ci    v16i8 zero = { 0 };
555cabdff1aSopenharmony_ci
556cabdff1aSopenharmony_ci    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
557cabdff1aSopenharmony_ci    TRANSPOSE8x8_UB_UB(src0, src1, src2, src3, src4, src5, src6, src7,
558cabdff1aSopenharmony_ci                       src0, src1, src2, src3, src4, src5, src6, src7);
559cabdff1aSopenharmony_ci    ILVR_B8_UH(zero, src0, zero, src1, zero, src2, zero, src3,
560cabdff1aSopenharmony_ci               zero, src4, zero, src5, zero, src6, zero, src7,
561cabdff1aSopenharmony_ci               diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7);
562cabdff1aSopenharmony_ci    BUTTERFLY_8(diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1,
563cabdff1aSopenharmony_ci                temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1);
564cabdff1aSopenharmony_ci    BUTTERFLY_8(temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2,
565cabdff1aSopenharmony_ci                diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2);
566cabdff1aSopenharmony_ci    BUTTERFLY_8(diff0, diff1, diff2, diff3, diff7, diff6, diff5, diff4,
567cabdff1aSopenharmony_ci                temp0, temp1, temp2, temp3, temp7, temp6, temp5, temp4);
568cabdff1aSopenharmony_ci    TRANSPOSE8x8_UH_UH(temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7,
569cabdff1aSopenharmony_ci                       temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7);
570cabdff1aSopenharmony_ci    BUTTERFLY_8(temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1,
571cabdff1aSopenharmony_ci                diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1);
572cabdff1aSopenharmony_ci    BUTTERFLY_8(diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2,
573cabdff1aSopenharmony_ci                temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2);
574cabdff1aSopenharmony_ci    ADD4(temp0, temp4, temp1, temp5, temp2, temp6, temp3, temp7,
575cabdff1aSopenharmony_ci         diff0, diff1, diff2, diff3);
576cabdff1aSopenharmony_ci    sum = __msa_asub_s_h((v8i16) temp3, (v8i16) temp7);
577cabdff1aSopenharmony_ci    sum += __msa_asub_s_h((v8i16) temp2, (v8i16) temp6);
578cabdff1aSopenharmony_ci    sum += __msa_asub_s_h((v8i16) temp1, (v8i16) temp5);
579cabdff1aSopenharmony_ci    sum += __msa_asub_s_h((v8i16) temp0, (v8i16) temp4);
580cabdff1aSopenharmony_ci    sum += __msa_add_a_h((v8i16) diff0, (v8i16) zero);
581cabdff1aSopenharmony_ci    sum += __msa_add_a_h((v8i16) diff1, (v8i16) zero);
582cabdff1aSopenharmony_ci    sum += __msa_add_a_h((v8i16) diff2, (v8i16) zero);
583cabdff1aSopenharmony_ci    sum += __msa_add_a_h((v8i16) diff3, (v8i16) zero);
584cabdff1aSopenharmony_ci    sum_res = (HADD_UH_U32(sum));
585cabdff1aSopenharmony_ci    sum_res -= abs(temp0[0] + temp4[0]);
586cabdff1aSopenharmony_ci
587cabdff1aSopenharmony_ci    return sum_res;
588cabdff1aSopenharmony_ci}
589cabdff1aSopenharmony_ci
590cabdff1aSopenharmony_ciint ff_pix_abs16_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
591cabdff1aSopenharmony_ci                     ptrdiff_t stride, int height)
592cabdff1aSopenharmony_ci{
593cabdff1aSopenharmony_ci    return sad_16width_msa(src, stride, ref, stride, height);
594cabdff1aSopenharmony_ci}
595cabdff1aSopenharmony_ci
596cabdff1aSopenharmony_ciint ff_pix_abs8_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
597cabdff1aSopenharmony_ci                    ptrdiff_t stride, int height)
598cabdff1aSopenharmony_ci{
599cabdff1aSopenharmony_ci    return sad_8width_msa(src, stride, ref, stride, height);
600cabdff1aSopenharmony_ci}
601cabdff1aSopenharmony_ci
602cabdff1aSopenharmony_ciint ff_pix_abs16_x2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
603cabdff1aSopenharmony_ci                        ptrdiff_t stride, int h)
604cabdff1aSopenharmony_ci{
605cabdff1aSopenharmony_ci    return sad_horiz_bilinear_filter_16width_msa(pix1, stride, pix2, stride, h);
606cabdff1aSopenharmony_ci}
607cabdff1aSopenharmony_ci
608cabdff1aSopenharmony_ciint ff_pix_abs16_y2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
609cabdff1aSopenharmony_ci                        ptrdiff_t stride, int h)
610cabdff1aSopenharmony_ci{
611cabdff1aSopenharmony_ci    return sad_vert_bilinear_filter_16width_msa(pix1, stride, pix2, stride, h);
612cabdff1aSopenharmony_ci}
613cabdff1aSopenharmony_ci
614cabdff1aSopenharmony_ciint ff_pix_abs16_xy2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
615cabdff1aSopenharmony_ci                         ptrdiff_t stride, int h)
616cabdff1aSopenharmony_ci{
617cabdff1aSopenharmony_ci    return sad_hv_bilinear_filter_16width_msa(pix1, stride, pix2, stride, h);
618cabdff1aSopenharmony_ci}
619cabdff1aSopenharmony_ci
620cabdff1aSopenharmony_ciint ff_pix_abs8_x2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
621cabdff1aSopenharmony_ci                       ptrdiff_t stride, int h)
622cabdff1aSopenharmony_ci{
623cabdff1aSopenharmony_ci    return sad_horiz_bilinear_filter_8width_msa(pix1, stride, pix2, stride, h);
624cabdff1aSopenharmony_ci}
625cabdff1aSopenharmony_ci
626cabdff1aSopenharmony_ciint ff_pix_abs8_y2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
627cabdff1aSopenharmony_ci                       ptrdiff_t stride, int h)
628cabdff1aSopenharmony_ci{
629cabdff1aSopenharmony_ci    return sad_vert_bilinear_filter_8width_msa(pix1, stride, pix2, stride, h);
630cabdff1aSopenharmony_ci}
631cabdff1aSopenharmony_ci
632cabdff1aSopenharmony_ciint ff_pix_abs8_xy2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
633cabdff1aSopenharmony_ci                        ptrdiff_t stride, int h)
634cabdff1aSopenharmony_ci{
635cabdff1aSopenharmony_ci    return sad_hv_bilinear_filter_8width_msa(pix1, stride, pix2, stride, h);
636cabdff1aSopenharmony_ci}
637cabdff1aSopenharmony_ci
638cabdff1aSopenharmony_ciint ff_sse16_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
639cabdff1aSopenharmony_ci                 ptrdiff_t stride, int height)
640cabdff1aSopenharmony_ci{
641cabdff1aSopenharmony_ci    return sse_16width_msa(src, stride, ref, stride, height);
642cabdff1aSopenharmony_ci}
643cabdff1aSopenharmony_ci
644cabdff1aSopenharmony_ciint ff_sse8_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
645cabdff1aSopenharmony_ci                ptrdiff_t stride, int height)
646cabdff1aSopenharmony_ci{
647cabdff1aSopenharmony_ci    return sse_8width_msa(src, stride, ref, stride, height);
648cabdff1aSopenharmony_ci}
649cabdff1aSopenharmony_ci
650cabdff1aSopenharmony_ciint ff_sse4_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
651cabdff1aSopenharmony_ci                ptrdiff_t stride, int height)
652cabdff1aSopenharmony_ci{
653cabdff1aSopenharmony_ci    return sse_4width_msa(src, stride, ref, stride, height);
654cabdff1aSopenharmony_ci}
655cabdff1aSopenharmony_ci
656cabdff1aSopenharmony_ciint ff_hadamard8_diff8x8_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src,
657cabdff1aSopenharmony_ci                             ptrdiff_t stride, int h)
658cabdff1aSopenharmony_ci{
659cabdff1aSopenharmony_ci    return hadamard_diff_8x8_msa(src, stride, dst, stride);
660cabdff1aSopenharmony_ci}
661cabdff1aSopenharmony_ci
662cabdff1aSopenharmony_ciint ff_hadamard8_intra8x8_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src,
663cabdff1aSopenharmony_ci                              ptrdiff_t stride, int h)
664cabdff1aSopenharmony_ci{
665cabdff1aSopenharmony_ci    return hadamard_intra_8x8_msa(src, stride, dst, stride);
666cabdff1aSopenharmony_ci}
667cabdff1aSopenharmony_ci
668cabdff1aSopenharmony_ci/* Hadamard Transform functions */
669cabdff1aSopenharmony_ci#define WRAPPER8_16_SQ(name8, name16)                      \
670cabdff1aSopenharmony_ciint name16(MpegEncContext *s, uint8_t *dst, uint8_t *src,  \
671cabdff1aSopenharmony_ci           ptrdiff_t stride, int h)                        \
672cabdff1aSopenharmony_ci{                                                          \
673cabdff1aSopenharmony_ci    int score = 0;                                         \
674cabdff1aSopenharmony_ci    score += name8(s, dst, src, stride, 8);                \
675cabdff1aSopenharmony_ci    score += name8(s, dst + 8, src + 8, stride, 8);        \
676cabdff1aSopenharmony_ci    if(h == 16) {                                          \
677cabdff1aSopenharmony_ci        dst += 8 * stride;                                 \
678cabdff1aSopenharmony_ci        src += 8 * stride;                                 \
679cabdff1aSopenharmony_ci        score +=name8(s, dst, src, stride, 8);             \
680cabdff1aSopenharmony_ci        score +=name8(s, dst + 8, src + 8, stride, 8);     \
681cabdff1aSopenharmony_ci    }                                                      \
682cabdff1aSopenharmony_ci    return score;                                          \
683cabdff1aSopenharmony_ci}
684cabdff1aSopenharmony_ci
685cabdff1aSopenharmony_ciWRAPPER8_16_SQ(ff_hadamard8_diff8x8_msa, ff_hadamard8_diff16_msa);
686cabdff1aSopenharmony_ciWRAPPER8_16_SQ(ff_hadamard8_intra8x8_msa, ff_hadamard8_intra16_msa);
687