/*
 * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mips/generic_macros_msa.h"
#include "me_cmp_mips.h"

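/* SAD for an 8-pixel-wide block: four rows are loaded per iteration and two
 * 8-byte rows are packed into each 128-bit vector before the absolute
 * differences are accumulated. */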
static uint32_t sad_8width_msa(uint8_t *src, int32_t src_stride,
                               uint8_t *ref, int32_t ref_stride,
                               int32_t height)
{
    int32_t ht_cnt;
    v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
    v8u16 sad = { 0 };

    for (ht_cnt = (height >> 2); ht_cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
        ref += (4 * ref_stride);

        PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2,
                    src0, src1, ref0, ref1);
        sad += SAD_UB2_UH(src0, src1, ref0, ref1);
    }

    return (HADD_UH_U32(sad));
}

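/* SAD for a 16-pixel-wide block: two full-width rows are compared per
 * SAD_UB2_UH call, four rows per loop iteration. */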
static uint32_t sad_16width_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *ref, int32_t ref_stride,
                                int32_t height)
{
    int32_t ht_cnt;
    v16u8 src0, src1, ref0, ref1;
    v8u16 sad = { 0 };

    for (ht_cnt = (height >> 2); ht_cnt--;) {
        LD_UB2(src, src_stride, src0, src1);
        src += (2 * src_stride);
        LD_UB2(ref, ref_stride, ref0, ref1);
        ref += (2 * ref_stride);
        sad += SAD_UB2_UH(src0, src1, ref0, ref1);

        LD_UB2(src, src_stride, src0, src1);
        src += (2 * src_stride);
        LD_UB2(ref, ref_stride, ref0, ref1);
        ref += (2 * ref_stride);
        sad += SAD_UB2_UH(src0, src1, ref0, ref1);
    }

    return (HADD_UH_U32(sad));
}

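/* SAD for an 8-pixel-wide block against the horizontal half-pel position:
 * each reference row is averaged (with rounding) with the same row advanced
 * by one pixel before the absolute differences are accumulated. */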
static uint32_t sad_horiz_bilinear_filter_8width_msa(uint8_t *src,
                                                     int32_t src_stride,
                                                     uint8_t *ref,
                                                     int32_t ref_stride,
                                                     int32_t height)
{
    int32_t ht_cnt;
    v16u8 src0, src1, src2, src3, comp0, comp1;
    v16u8 ref0, ref1, ref2, ref3, ref4, ref5;
    v8u16 sad = { 0 };

    for (ht_cnt = (height >> 3); ht_cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
        ref += (4 * ref_stride);

        PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
        PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref4, ref5);
        SLDI_B4_UB(ref0, ref0, ref1, ref1, ref2, ref2, ref3, ref3, 1,
                   ref0, ref1, ref2, ref3);
        PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
        AVER_UB2_UB(ref4, ref0, ref5, ref1, comp0, comp1);
        sad += SAD_UB2_UH(src0, src1, comp0, comp1);

        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
        ref += (4 * ref_stride);

        PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
        PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref4, ref5);
        SLDI_B4_UB(ref0, ref0, ref1, ref1, ref2, ref2, ref3, ref3, 1,
                   ref0, ref1, ref2, ref3);
        PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
        AVER_UB2_UB(ref4, ref0, ref5, ref1, comp0, comp1);
        sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    }

    return (HADD_UH_U32(sad));
}

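/* 16-pixel-wide horizontal half-pel SAD: the interpolated reference is the
 * rounded average of the rows loaded at ref and at ref + 1. */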
static uint32_t sad_horiz_bilinear_filter_16width_msa(uint8_t *src,
                                                      int32_t src_stride,
                                                      uint8_t *ref,
                                                      int32_t ref_stride,
                                                      int32_t height)
{
    int32_t ht_cnt;
    v16u8 src0, src1, src2, src3, comp0, comp1;
    v16u8 ref00, ref10, ref20, ref30, ref01, ref11, ref21, ref31;
    v8u16 sad = { 0 };

    for (ht_cnt = (height >> 3); ht_cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        LD_UB4(ref, ref_stride, ref00, ref10, ref20, ref30);
        LD_UB4(ref + 1, ref_stride, ref01, ref11, ref21, ref31);
        ref += (4 * ref_stride);

        AVER_UB2_UB(ref01, ref00, ref11, ref10, comp0, comp1);
        sad += SAD_UB2_UH(src0, src1, comp0, comp1);
        AVER_UB2_UB(ref21, ref20, ref31, ref30, comp0, comp1);
        sad += SAD_UB2_UH(src2, src3, comp0, comp1);

        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        LD_UB4(ref, ref_stride, ref00, ref10, ref20, ref30);
        LD_UB4(ref + 1, ref_stride, ref01, ref11, ref21, ref31);
        ref += (4 * ref_stride);

        AVER_UB2_UB(ref01, ref00, ref11, ref10, comp0, comp1);
        sad += SAD_UB2_UH(src0, src1, comp0, comp1);
        AVER_UB2_UB(ref21, ref20, ref31, ref30, comp0, comp1);
        sad += SAD_UB2_UH(src2, src3, comp0, comp1);
    }

    return (HADD_UH_U32(sad));
}

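/* 8-pixel-wide vertical half-pel SAD: vertically adjacent reference rows are
 * averaged, so five reference rows are loaded for every four source rows. */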
static uint32_t sad_vert_bilinear_filter_8width_msa(uint8_t *src,
                                                    int32_t src_stride,
                                                    uint8_t *ref,
                                                    int32_t ref_stride,
                                                    int32_t height)
{
    int32_t ht_cnt;
    v16u8 src0, src1, src2, src3, comp0, comp1;
    v16u8 ref0, ref1, ref2, ref3, ref4;
    v8u16 sad = { 0 };

    for (ht_cnt = (height >> 3); ht_cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        LD_UB5(ref, ref_stride, ref0, ref1, ref2, ref3, ref4);
        ref += (4 * ref_stride);

        PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
        PCKEV_D2_UB(ref1, ref0, ref2, ref1, ref0, ref1);
        PCKEV_D2_UB(ref3, ref2, ref4, ref3, ref2, ref3);
        AVER_UB2_UB(ref1, ref0, ref3, ref2, comp0, comp1);
        sad += SAD_UB2_UH(src0, src1, comp0, comp1);

        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        LD_UB5(ref, ref_stride, ref0, ref1, ref2, ref3, ref4);
        ref += (4 * ref_stride);

        PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
        PCKEV_D2_UB(ref1, ref0, ref2, ref1, ref0, ref1);
        PCKEV_D2_UB(ref3, ref2, ref4, ref3, ref2, ref3);
        AVER_UB2_UB(ref1, ref0, ref3, ref2, comp0, comp1);
        sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    }

    return (HADD_UH_U32(sad));
}

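/* 16-pixel-wide vertical half-pel SAD; ref4 carries the last loaded reference
 * row of one 4-row group into the next so the averaging continues across the
 * group boundary. */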
static uint32_t sad_vert_bilinear_filter_16width_msa(uint8_t *src,
                                                     int32_t src_stride,
                                                     uint8_t *ref,
                                                     int32_t ref_stride,
                                                     int32_t height)
{
    int32_t ht_cnt;
    v16u8 src0, src1, src2, src3, comp0, comp1;
    v16u8 ref0, ref1, ref2, ref3, ref4;
    v8u16 sad = { 0 };

    for (ht_cnt = (height >> 3); ht_cnt--;) {
        LD_UB5(ref, ref_stride, ref4, ref0, ref1, ref2, ref3);
        ref += (5 * ref_stride);
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        AVER_UB2_UB(ref0, ref4, ref1, ref0, comp0, comp1);
        sad += SAD_UB2_UH(src0, src1, comp0, comp1);
        AVER_UB2_UB(ref2, ref1, ref3, ref2, comp0, comp1);
        sad += SAD_UB2_UH(src2, src3, comp0, comp1);

        ref4 = ref3;

        LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
        ref += (3 * ref_stride);
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        AVER_UB2_UB(ref0, ref4, ref1, ref0, comp0, comp1);
        sad += SAD_UB2_UH(src0, src1, comp0, comp1);
        AVER_UB2_UB(ref2, ref1, ref3, ref2, comp0, comp1);
        sad += SAD_UB2_UH(src2, src3, comp0, comp1);
    }

    return (HADD_UH_U32(sad));
}

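/* 8-pixel-wide SAD against the two-dimensional half-pel position: the shuffle
 * mask pairs each reference pixel with its right neighbour, the pair sums of
 * vertically adjacent rows are added, and the result is rounded with a 2-bit
 * shift to form the four-tap bilinear average. */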
static uint32_t sad_hv_bilinear_filter_8width_msa(uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *ref,
                                                  int32_t ref_stride,
                                                  int32_t height)
{
    int32_t ht_cnt;
    v16u8 src0, src1, src2, src3, temp0, temp1, diff;
    v16u8 ref0, ref1, ref2, ref3, ref4;
    v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v8u16 comp0, comp1, comp2, comp3;
    v8u16 sad = { 0 };

    for (ht_cnt = (height >> 2); ht_cnt--;) {
        LD_UB5(ref, ref_stride, ref4, ref0, ref1, ref2, ref3);
        ref += (4 * ref_stride);
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);

        VSHF_B2_UB(ref4, ref4, ref0, ref0, mask, mask, temp0, temp1);
        comp0 = __msa_hadd_u_h(temp0, temp0);
        comp1 = __msa_hadd_u_h(temp1, temp1);
        comp0 += comp1;
        comp0 = (v8u16) __msa_srari_h((v8i16) comp0, 2);
        comp0 = (v8u16) __msa_pckev_b((v16i8) comp0, (v16i8) comp0);

        temp0 = (v16u8) __msa_vshf_b(mask, (v16i8) ref1, (v16i8) ref1);
        comp2 = __msa_hadd_u_h(temp0, temp0);
        comp1 += comp2;
        comp1 = (v8u16) __msa_srari_h((v8i16) comp1, 2);
        comp1 = (v8u16) __msa_pckev_b((v16i8) comp1, (v16i8) comp1);
        comp1 = (v8u16) __msa_pckev_d((v2i64) comp1, (v2i64) comp0);
        diff = (v16u8) __msa_asub_u_b(src0, (v16u8) comp1);
        sad += __msa_hadd_u_h(diff, diff);

        temp1 = (v16u8) __msa_vshf_b(mask, (v16i8) ref2, (v16i8) ref2);
        comp3 = __msa_hadd_u_h(temp1, temp1);
        comp2 += comp3;
        comp2 = (v8u16) __msa_srari_h((v8i16) comp2, 2);
        comp2 = (v8u16) __msa_pckev_b((v16i8) comp2, (v16i8) comp2);

        temp0 = (v16u8) __msa_vshf_b(mask, (v16i8) ref3, (v16i8) ref3);
        comp0 = __msa_hadd_u_h(temp0, temp0);
        comp3 += comp0;
        comp3 = (v8u16) __msa_srari_h((v8i16) comp3, 2);
        comp3 = (v8u16) __msa_pckev_b((v16i8) comp3, (v16i8) comp3);
        comp3 = (v8u16) __msa_pckev_d((v2i64) comp3, (v2i64) comp2);
        diff = (v16u8) __msa_asub_u_b(src1, (v16u8) comp3);
        sad += __msa_hadd_u_h(diff, diff);
    }

    return (HADD_UH_U32(sad));
}

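/* 16-pixel-wide variant of the two-dimensional half-pel SAD: the rows at ref
 * and ref + 1 are interleaved and pair-wise summed, the sums of vertically
 * adjacent rows are added, and the result is rounded with a 2-bit shift. */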
static uint32_t sad_hv_bilinear_filter_16width_msa(uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *ref,
                                                   int32_t ref_stride,
                                                   int32_t height)
{
    int32_t ht_cnt;
    v16u8 src0, src1, src2, src3, comp, diff;
    v16u8 temp0, temp1, temp2, temp3;
    v16u8 ref00, ref01, ref02, ref03, ref04, ref10, ref11, ref12, ref13, ref14;
    v8u16 comp0, comp1, comp2, comp3;
    v8u16 sad = { 0 };

    for (ht_cnt = (height >> 3); ht_cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        LD_UB5(ref, ref_stride, ref04, ref00, ref01, ref02, ref03);
        LD_UB5(ref + 1, ref_stride, ref14, ref10, ref11, ref12, ref13);
        ref += (5 * ref_stride);

        ILVRL_B2_UB(ref14, ref04, temp0, temp1);
        comp0 = __msa_hadd_u_h(temp0, temp0);
        comp1 = __msa_hadd_u_h(temp1, temp1);
        ILVRL_B2_UB(ref10, ref00, temp2, temp3);
        comp2 = __msa_hadd_u_h(temp2, temp2);
        comp3 = __msa_hadd_u_h(temp3, temp3);
        comp0 += comp2;
        comp1 += comp3;
        SRARI_H2_UH(comp0, comp1, 2);
        comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
        diff = __msa_asub_u_b(src0, comp);
        sad += __msa_hadd_u_h(diff, diff);

        ILVRL_B2_UB(ref11, ref01, temp0, temp1);
        comp0 = __msa_hadd_u_h(temp0, temp0);
        comp1 = __msa_hadd_u_h(temp1, temp1);
        comp2 += comp0;
        comp3 += comp1;
        SRARI_H2_UH(comp2, comp3, 2);
        comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
        diff = __msa_asub_u_b(src1, comp);
        sad += __msa_hadd_u_h(diff, diff);

        ILVRL_B2_UB(ref12, ref02, temp2, temp3);
        comp2 = __msa_hadd_u_h(temp2, temp2);
        comp3 = __msa_hadd_u_h(temp3, temp3);
        comp0 += comp2;
        comp1 += comp3;
        SRARI_H2_UH(comp0, comp1, 2);
        comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
        diff = __msa_asub_u_b(src2, comp);
        sad += __msa_hadd_u_h(diff, diff);

        ILVRL_B2_UB(ref13, ref03, temp0, temp1);
        comp0 = __msa_hadd_u_h(temp0, temp0);
        comp1 = __msa_hadd_u_h(temp1, temp1);
        comp2 += comp0;
        comp3 += comp1;
        SRARI_H2_UH(comp2, comp3, 2);
        comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
        diff = __msa_asub_u_b(src3, comp);
        sad += __msa_hadd_u_h(diff, diff);

        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        LD_UB4(ref, ref_stride, ref00, ref01, ref02, ref03);
        LD_UB4(ref + 1, ref_stride, ref10, ref11, ref12, ref13);
        ref += (3 * ref_stride);

        ILVRL_B2_UB(ref10, ref00, temp2, temp3);
        comp2 = __msa_hadd_u_h(temp2, temp2);
        comp3 = __msa_hadd_u_h(temp3, temp3);
        comp0 += comp2;
        comp1 += comp3;
        SRARI_H2_UH(comp0, comp1, 2);
        comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
        diff = __msa_asub_u_b(src0, comp);
        sad += __msa_hadd_u_h(diff, diff);

        ILVRL_B2_UB(ref11, ref01, temp0, temp1);
        comp0 = __msa_hadd_u_h(temp0, temp0);
        comp1 = __msa_hadd_u_h(temp1, temp1);
        comp2 += comp0;
        comp3 += comp1;
        SRARI_H2_UH(comp2, comp3, 2);
        comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
        diff = __msa_asub_u_b(src1, comp);
        sad += __msa_hadd_u_h(diff, diff);

        ILVRL_B2_UB(ref12, ref02, temp2, temp3);
        comp2 = __msa_hadd_u_h(temp2, temp2);
        comp3 = __msa_hadd_u_h(temp3, temp3);
        comp0 += comp2;
        comp1 += comp3;
        SRARI_H2_UH(comp0, comp1, 2);
        comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
        diff = __msa_asub_u_b(src2, comp);
        sad += __msa_hadd_u_h(diff, diff);

        ILVRL_B2_UB(ref13, ref03, temp0, temp1);
        comp0 = __msa_hadd_u_h(temp0, temp0);
        comp1 = __msa_hadd_u_h(temp1, temp1);
        comp2 += comp0;
        comp3 += comp1;
        SRARI_H2_UH(comp2, comp3, 2);
        comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
        diff = __msa_asub_u_b(src3, comp);
        sad += __msa_hadd_u_h(diff, diff);
    }

    return (HADD_UH_U32(sad));
}

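/* Accumulate the squared byte differences of src and ref into var: the
 * vectors are interleaved, horizontal byte subtraction yields signed halfword
 * differences, and a dot product adds their squares to the accumulator. */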
#define CALC_MSE_B(src, ref, var)                                    \
{                                                                    \
    v16u8 src_l0_m, src_l1_m;                                        \
    v8i16 res_l0_m, res_l1_m;                                        \
                                                                     \
    ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m);                       \
    HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m);             \
    DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var);  \
}

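/* Sum of squared errors for a 4-pixel-wide block: four 32-bit rows of source
 * and reference are gathered into one vector each before CALC_MSE_B. */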
static uint32_t sse_4width_msa(uint8_t *src_ptr, int32_t src_stride,
                               uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height)
{
    int32_t ht_cnt;
    uint32_t sse;
    uint32_t src0, src1, src2, src3;
    uint32_t ref0, ref1, ref2, ref3;
    v16u8 src = { 0 };
    v16u8 ref = { 0 };
    v4i32 var = { 0 };

    for (ht_cnt = (height >> 2); ht_cnt--;) {
        LW4(src_ptr, src_stride, src0, src1, src2, src3);
        src_ptr += (4 * src_stride);
        LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
        ref_ptr += (4 * ref_stride);

        INSERT_W4_UB(src0, src1, src2, src3, src);
        INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
        CALC_MSE_B(src, ref, var);
    }

    sse = HADD_SW_S32(var);

    return sse;
}

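/* Sum of squared errors for an 8-pixel-wide block, packing two 8-byte rows
 * into each vector. */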
static uint32_t sse_8width_msa(uint8_t *src_ptr, int32_t src_stride,
                               uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height)
{
    int32_t ht_cnt;
    uint32_t sse;
    v16u8 src0, src1, src2, src3;
    v16u8 ref0, ref1, ref2, ref3;
    v4i32 var = { 0 };

    for (ht_cnt = (height >> 2); ht_cnt--;) {
        LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
        src_ptr += (4 * src_stride);
        LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
        ref_ptr += (4 * ref_stride);

        PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2,
                    src0, src1, ref0, ref1);
        CALC_MSE_B(src0, ref0, var);
        CALC_MSE_B(src1, ref1, var);
    }

    sse = HADD_SW_S32(var);

    return sse;
}

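/* Sum of squared errors for a 16-pixel-wide block, one full row per
 * CALC_MSE_B call, four rows per loop iteration. */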
static uint32_t sse_16width_msa(uint8_t *src_ptr, int32_t src_stride,
                                uint8_t *ref_ptr, int32_t ref_stride,
                                int32_t height)
{
    int32_t ht_cnt;
    uint32_t sse;
    v16u8 src, ref;
    v4i32 var = { 0 };

    for (ht_cnt = (height >> 2); ht_cnt--;) {
        src = LD_UB(src_ptr);
        src_ptr += src_stride;
        ref = LD_UB(ref_ptr);
        ref_ptr += ref_stride;
        CALC_MSE_B(src, ref, var);

        src = LD_UB(src_ptr);
        src_ptr += src_stride;
        ref = LD_UB(ref_ptr);
        ref_ptr += ref_stride;
        CALC_MSE_B(src, ref, var);

        src = LD_UB(src_ptr);
        src_ptr += src_stride;
        ref = LD_UB(ref_ptr);
        ref_ptr += ref_stride;
        CALC_MSE_B(src, ref, var);

        src = LD_UB(src_ptr);
        src_ptr += src_stride;
        ref = LD_UB(ref_ptr);
        ref_ptr += ref_stride;
        CALC_MSE_B(src, ref, var);
    }

    sse = HADD_SW_S32(var);

    return sse;
}

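/* Sum of absolute Hadamard-transformed differences (SATD) of an 8x8 block:
 * the src/ref byte differences are transformed along both dimensions with
 * butterfly passes and transposes, and the absolute values of the resulting
 * coefficients are summed. */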
static int32_t hadamard_diff_8x8_msa(uint8_t *src, int32_t src_stride,
                                     uint8_t *ref, int32_t ref_stride)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
    v8u16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8u16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
    v8i16 sum = { 0 };
    v8i16 zero = { 0 };

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
    ILVR_B8_UH(src0, ref0, src1, ref1, src2, ref2, src3, ref3,
               src4, ref4, src5, ref5, src6, ref6, src7, ref7,
               diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7);
    HSUB_UB4_UH(diff0, diff1, diff2, diff3, diff0, diff1, diff2, diff3);
    HSUB_UB4_UH(diff4, diff5, diff6, diff7, diff4, diff5, diff6, diff7);
    TRANSPOSE8x8_UH_UH(diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7,
                       diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7);
    BUTTERFLY_8(diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1,
                temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1);
    BUTTERFLY_8(temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2,
                diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2);
    BUTTERFLY_8(diff0, diff1, diff2, diff3, diff7, diff6, diff5, diff4,
                temp0, temp1, temp2, temp3, temp7, temp6, temp5, temp4);
    TRANSPOSE8x8_UH_UH(temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7,
                       temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7);
    BUTTERFLY_8(temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1,
                diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1);
    BUTTERFLY_8(diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2,
                temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2);
    ADD4(temp0, temp4, temp1, temp5, temp2, temp6, temp3, temp7,
         diff0, diff1, diff2, diff3);
    sum = __msa_asub_s_h((v8i16) temp3, (v8i16) temp7);
    sum += __msa_asub_s_h((v8i16) temp2, (v8i16) temp6);
    sum += __msa_asub_s_h((v8i16) temp1, (v8i16) temp5);
    sum += __msa_asub_s_h((v8i16) temp0, (v8i16) temp4);
    sum += __msa_add_a_h((v8i16) diff0, zero);
    sum += __msa_add_a_h((v8i16) diff1, zero);
    sum += __msa_add_a_h((v8i16) diff2, zero);
    sum += __msa_add_a_h((v8i16) diff3, zero);

    return (HADD_UH_U32(sum));
}

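/* Intra variant of the 8x8 Hadamard SATD: the transform is applied to the
 * source block itself (ref and ref_stride are unused) and the absolute value
 * of the DC term, temp0[0] + temp4[0], is subtracted from the total. */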
static int32_t hadamard_intra_8x8_msa(uint8_t *src, int32_t src_stride,
                                      uint8_t *ref, int32_t ref_stride)
{
    int32_t sum_res = 0;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8u16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8u16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
    v8i16 sum = { 0 };
    v16i8 zero = { 0 };

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    TRANSPOSE8x8_UB_UB(src0, src1, src2, src3, src4, src5, src6, src7,
                       src0, src1, src2, src3, src4, src5, src6, src7);
    ILVR_B8_UH(zero, src0, zero, src1, zero, src2, zero, src3,
               zero, src4, zero, src5, zero, src6, zero, src7,
               diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7);
    BUTTERFLY_8(diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1,
                temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1);
    BUTTERFLY_8(temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2,
                diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2);
    BUTTERFLY_8(diff0, diff1, diff2, diff3, diff7, diff6, diff5, diff4,
                temp0, temp1, temp2, temp3, temp7, temp6, temp5, temp4);
    TRANSPOSE8x8_UH_UH(temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7,
                       temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7);
    BUTTERFLY_8(temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1,
                diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1);
    BUTTERFLY_8(diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2,
                temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2);
    ADD4(temp0, temp4, temp1, temp5, temp2, temp6, temp3, temp7,
         diff0, diff1, diff2, diff3);
    sum = __msa_asub_s_h((v8i16) temp3, (v8i16) temp7);
    sum += __msa_asub_s_h((v8i16) temp2, (v8i16) temp6);
    sum += __msa_asub_s_h((v8i16) temp1, (v8i16) temp5);
    sum += __msa_asub_s_h((v8i16) temp0, (v8i16) temp4);
    sum += __msa_add_a_h((v8i16) diff0, (v8i16) zero);
    sum += __msa_add_a_h((v8i16) diff1, (v8i16) zero);
    sum += __msa_add_a_h((v8i16) diff2, (v8i16) zero);
    sum += __msa_add_a_h((v8i16) diff3, (v8i16) zero);
    sum_res = (HADD_UH_U32(sum));
    sum_res -= abs(temp0[0] + temp4[0]);

    return sum_res;
}

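/* Exported wrappers with the me_cmp function signatures; the *_x2, *_y2 and
 * *_xy2 variants compare against the horizontal, vertical and diagonal
 * half-pel positions of the reference. */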
int ff_pix_abs16_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
                     ptrdiff_t stride, int height)
{
    return sad_16width_msa(src, stride, ref, stride, height);
}

int ff_pix_abs8_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
                    ptrdiff_t stride, int height)
{
    return sad_8width_msa(src, stride, ref, stride, height);
}

int ff_pix_abs16_x2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                        ptrdiff_t stride, int h)
{
    return sad_horiz_bilinear_filter_16width_msa(pix1, stride, pix2, stride, h);
}

int ff_pix_abs16_y2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                        ptrdiff_t stride, int h)
{
    return sad_vert_bilinear_filter_16width_msa(pix1, stride, pix2, stride, h);
}

int ff_pix_abs16_xy2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                         ptrdiff_t stride, int h)
{
    return sad_hv_bilinear_filter_16width_msa(pix1, stride, pix2, stride, h);
}

int ff_pix_abs8_x2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                       ptrdiff_t stride, int h)
{
    return sad_horiz_bilinear_filter_8width_msa(pix1, stride, pix2, stride, h);
}

int ff_pix_abs8_y2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                       ptrdiff_t stride, int h)
{
    return sad_vert_bilinear_filter_8width_msa(pix1, stride, pix2, stride, h);
}

int ff_pix_abs8_xy2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                        ptrdiff_t stride, int h)
{
    return sad_hv_bilinear_filter_8width_msa(pix1, stride, pix2, stride, h);
}

int ff_sse16_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
                 ptrdiff_t stride, int height)
{
    return sse_16width_msa(src, stride, ref, stride, height);
}

int ff_sse8_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
                ptrdiff_t stride, int height)
{
    return sse_8width_msa(src, stride, ref, stride, height);
}

int ff_sse4_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
                ptrdiff_t stride, int height)
{
    return sse_4width_msa(src, stride, ref, stride, height);
}

int ff_hadamard8_diff8x8_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src,
                             ptrdiff_t stride, int h)
{
    return hadamard_diff_8x8_msa(src, stride, dst, stride);
}

int ff_hadamard8_intra8x8_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src,
                              ptrdiff_t stride, int h)
{
    return hadamard_intra_8x8_msa(src, stride, dst, stride);
}

/* Build the 16-pixel-wide Hadamard transform functions from the 8x8 kernels */
#define WRAPPER8_16_SQ(name8, name16)                      \
int name16(MpegEncContext *s, uint8_t *dst, uint8_t *src,  \
           ptrdiff_t stride, int h)                        \
{                                                          \
    int score = 0;                                         \
    score += name8(s, dst, src, stride, 8);                \
    score += name8(s, dst + 8, src + 8, stride, 8);        \
    if (h == 16) {                                         \
        dst += 8 * stride;                                 \
        src += 8 * stride;                                 \
        score += name8(s, dst, src, stride, 8);            \
        score += name8(s, dst + 8, src + 8, stride, 8);    \
    }                                                      \
    return score;                                          \
}

WRAPPER8_16_SQ(ff_hadamard8_diff8x8_msa, ff_hadamard8_diff16_msa);
WRAPPER8_16_SQ(ff_hadamard8_intra8x8_msa, ff_hadamard8_intra16_msa);