/*
 * Copyright (c) 2015 - 2017 Shivraj Patil (Shivraj.Patil@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mips/generic_macros_msa.h"
#include "h264dsp_mips.h"

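/* Vertical prediction: replicate the eight bytes above the block (at 'src')
 * into all eight rows of 'dst'. */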
static void intra_predict_vert_8x8_msa(uint8_t *src, uint8_t *dst,
                                       int32_t dst_stride)
{
    uint64_t out = LD(src);

    SD4(out, out, out, out, dst, dst_stride);
    dst += (4 * dst_stride);
    SD4(out, out, out, out, dst, dst_stride);
}

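/* Vertical prediction for a 16x16 block: one vector load of the row above,
 * stored into all 16 rows. */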
static void intra_predict_vert_16x16_msa(uint8_t *src, uint8_t *dst,
                                         int32_t dst_stride)
{
    v16u8 out = LD_UB(src);

    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
    dst += (8 * dst_stride);
    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
}

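/* Horizontal prediction: broadcast each left-neighbour byte across its row. */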
static void intra_predict_horiz_8x8_msa(uint8_t *src, int32_t src_stride,
                                        uint8_t *dst, int32_t dst_stride)
{
    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;

    out0 = src[0 * src_stride] * 0x0101010101010101;
    out1 = src[1 * src_stride] * 0x0101010101010101;
    out2 = src[2 * src_stride] * 0x0101010101010101;
    out3 = src[3 * src_stride] * 0x0101010101010101;
    out4 = src[4 * src_stride] * 0x0101010101010101;
    out5 = src[5 * src_stride] * 0x0101010101010101;
    out6 = src[6 * src_stride] * 0x0101010101010101;
    out7 = src[7 * src_stride] * 0x0101010101010101;

    SD4(out0, out1, out2, out3, dst, dst_stride);
    dst += (4 * dst_stride);
    SD4(out4, out5, out6, out7, dst, dst_stride);
}

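/* Horizontal prediction for a 16x16 block: splat each of the 16 left-neighbour
 * bytes over a full 16-byte row. */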
static void intra_predict_horiz_16x16_msa(uint8_t *src, int32_t src_stride,
                                          uint8_t *dst, int32_t dst_stride)
{
    uint8_t inp0, inp1, inp2, inp3;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 src8, src9, src10, src11, src12, src13, src14, src15;

    inp0 = src[0 * src_stride];
    inp1 = src[1 * src_stride];
    inp2 = src[2 * src_stride];
    inp3 = src[3 * src_stride];
    src0 = (v16u8) __msa_fill_b(inp0);
    src1 = (v16u8) __msa_fill_b(inp1);
    src2 = (v16u8) __msa_fill_b(inp2);
    src3 = (v16u8) __msa_fill_b(inp3);
    inp0 = src[4 * src_stride];
    inp1 = src[5 * src_stride];
    inp2 = src[6 * src_stride];
    inp3 = src[7 * src_stride];
    src4 = (v16u8) __msa_fill_b(inp0);
    src5 = (v16u8) __msa_fill_b(inp1);
    src6 = (v16u8) __msa_fill_b(inp2);
    src7 = (v16u8) __msa_fill_b(inp3);
    inp0 = src[ 8 * src_stride];
    inp1 = src[ 9 * src_stride];
    inp2 = src[10 * src_stride];
    inp3 = src[11 * src_stride];
    src8 = (v16u8) __msa_fill_b(inp0);
    src9 = (v16u8) __msa_fill_b(inp1);
    src10 = (v16u8) __msa_fill_b(inp2);
    src11 = (v16u8) __msa_fill_b(inp3);
    inp0 = src[12 * src_stride];
    inp1 = src[13 * src_stride];
    inp2 = src[14 * src_stride];
    inp3 = src[15 * src_stride];
    src12 = (v16u8) __msa_fill_b(inp0);
    src13 = (v16u8) __msa_fill_b(inp1);
    src14 = (v16u8) __msa_fill_b(inp2);
    src15 = (v16u8) __msa_fill_b(inp3);

    ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
    dst += (8 * dst_stride);
    ST_UB8(src8, src9, src10, src11, src12, src13, src14, src15,
           dst, dst_stride);
}

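/* Fill an 8x8 block with the constant 'val'; instantiated for the 127 and 129
 * constant-DC cases used by the VP8 wrappers at the end of this file. */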
#define INTRA_PREDICT_VALDC_8X8_MSA(val)                                       \
static void intra_predict_##val##dc_8x8_msa(uint8_t *dst, int32_t dst_stride)  \
{                                                                              \
    v16i8 store = __msa_fill_b(val);                                           \
    uint64_t out = __msa_copy_u_d((v2i64) store, 0);                           \
                                                                               \
    SD4(out, out, out, out, dst, dst_stride);                                  \
    dst += (4 * dst_stride);                                                   \
    SD4(out, out, out, out, dst, dst_stride);                                  \
}

INTRA_PREDICT_VALDC_8X8_MSA(127);
INTRA_PREDICT_VALDC_8X8_MSA(129);

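/* Same as above, for 16x16 blocks. */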
#define INTRA_PREDICT_VALDC_16X16_MSA(val)                            \
static void intra_predict_##val##dc_16x16_msa(uint8_t *dst,           \
                                              int32_t dst_stride)     \
{                                                                     \
    v16u8 out = (v16u8) __msa_fill_b(val);                            \
                                                                      \
    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);  \
    dst += (8 * dst_stride);                                          \
    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);  \
}

INTRA_PREDICT_VALDC_16X16_MSA(127);
INTRA_PREDICT_VALDC_16X16_MSA(129);

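/* 8x8 plane prediction: the horizontal gradient is computed from the row above
 * (shuffled into difference pairs and weighted), the vertical gradient from the
 * left column, and the clipped plane is then written two rows per iteration. */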
static void intra_predict_plane_8x8_msa(uint8_t *src, int32_t stride)
{
    uint8_t lpcnt;
    int32_t res, res0, res1, res2, res3;
    uint64_t out0, out1;
    v16i8 shf_mask = { 3, 5, 2, 6, 1, 7, 0, 8, 3, 5, 2, 6, 1, 7, 0, 8 };
    v8i16 short_multiplier = { 1, 2, 3, 4, 1, 2, 3, 4 };
    v4i32 int_multiplier = { 0, 1, 2, 3 };
    v16u8 src_top;
    v8i16 vec9, vec10, vec11;
    v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8;
    v2i64 sum;

    src_top = LD_UB(src - (stride + 1));
    src_top = (v16u8) __msa_vshf_b(shf_mask, (v16i8) src_top, (v16i8) src_top);

    vec9 = __msa_hsub_u_h(src_top, src_top);
    vec9 *= short_multiplier;
    vec8 = __msa_hadd_s_w(vec9, vec9);
    sum = __msa_hadd_s_d(vec8, vec8);

    res0 = __msa_copy_s_w((v4i32) sum, 0);

    res1 = (src[4 * stride - 1] - src[2 * stride - 1]) +
        2 * (src[5 * stride - 1] - src[stride - 1]) +
        3 * (src[6 * stride - 1] - src[-1]) +
        4 * (src[7 * stride - 1] - src[-stride - 1]);

    res0 *= 17;
    res1 *= 17;
    res0 = (res0 + 16) >> 5;
    res1 = (res1 + 16) >> 5;

    res3 = 3 * (res0 + res1);
    res2 = 16 * (src[7 * stride - 1] + src[-stride + 7] + 1);
    res = res2 - res3;

    vec8 = __msa_fill_w(res0);
    vec4 = __msa_fill_w(res);
    vec2 = __msa_fill_w(res1);
    vec5 = vec8 * int_multiplier;
    vec3 = vec8 * 4;

    for (lpcnt = 4; lpcnt--;) {
        vec0 = vec5;
        vec0 += vec4;
        vec1 = vec0 + vec3;
        vec6 = vec5;
        vec4 += vec2;
        vec6 += vec4;
        vec7 = vec6 + vec3;

        SRA_4V(vec0, vec1, vec6, vec7, 5);
        PCKEV_H2_SH(vec1, vec0, vec7, vec6, vec10, vec11);
        CLIP_SH2_0_255(vec10, vec11);
        PCKEV_B2_SH(vec10, vec10, vec11, vec11, vec10, vec11);

        out0 = __msa_copy_s_d((v2i64) vec10, 0);
        out1 = __msa_copy_s_d((v2i64) vec11, 0);
        SD(out0, src);
        src += stride;
        SD(out1, src);
        src += stride;

        vec4 += vec2;
    }
}

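/* 16x16 plane prediction: same scheme as the 8x8 version, with eight weighted
 * difference pairs per gradient and two full rows stored per iteration. */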
static void intra_predict_plane_16x16_msa(uint8_t *src, int32_t stride)
{
    uint8_t lpcnt;
    int32_t res0, res1, res2, res3;
    uint64_t load0, load1;
    v16i8 shf_mask = { 7, 8, 6, 9, 5, 10, 4, 11, 3, 12, 2, 13, 1, 14, 0, 15 };
    v8i16 short_multiplier = { 1, 2, 3, 4, 5, 6, 7, 8 };
    v4i32 int_multiplier = { 0, 1, 2, 3 };
    v16u8 src_top = { 0 };
    v16u8 store0, store1;
    v8i16 vec9, vec10, vec11, vec12;
    v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, res_add;
    v4i32 reg0, reg1, reg2, reg3;

    load0 = LD(src - (stride + 1));
    load1 = LD(src - (stride + 1) + 9);

    INSERT_D2_UB(load0, load1, src_top);

    src_top = (v16u8) __msa_vshf_b(shf_mask, (v16i8) src_top, (v16i8) src_top);

    vec9 = __msa_hsub_u_h(src_top, src_top);
    vec9 *= short_multiplier;
    vec8 = __msa_hadd_s_w(vec9, vec9);
    res_add = (v4i32) __msa_hadd_s_d(vec8, vec8);

    res0 = __msa_copy_s_w(res_add, 0) + __msa_copy_s_w(res_add, 2);

    res1 = (src[8 * stride - 1] - src[6 * stride - 1]) +
        2 * (src[9 * stride - 1] - src[5 * stride - 1]) +
        3 * (src[10 * stride - 1] - src[4 * stride - 1]) +
        4 * (src[11 * stride - 1] - src[3 * stride - 1]) +
        5 * (src[12 * stride - 1] - src[2 * stride - 1]) +
        6 * (src[13 * stride - 1] - src[stride - 1]) +
        7 * (src[14 * stride - 1] - src[-1]) +
        8 * (src[15 * stride - 1] - src[-1 * stride - 1]);

    res0 *= 5;
    res1 *= 5;
    res0 = (res0 + 32) >> 6;
    res1 = (res1 + 32) >> 6;

    res3 = 7 * (res0 + res1);
    res2 = 16 * (src[15 * stride - 1] + src[-stride + 15] + 1);
    res2 -= res3;

    vec8 = __msa_fill_w(res0);
    vec4 = __msa_fill_w(res2);
    vec5 = __msa_fill_w(res1);
    vec6 = vec8 * 4;
    vec7 = vec8 * int_multiplier;

    for (lpcnt = 8; lpcnt--;) {
        vec0 = vec7;
        reg0 = vec7;
        vec0 += vec4;
        vec4 += vec5;
        reg0 += vec4;
        vec1 = vec0 + vec6;
        reg1 = reg0 + vec6;
        vec2 = vec1 + vec6;
        reg2 = reg1 + vec6;
        vec3 = vec2 + vec6;
        reg3 = reg2 + vec6;

        SRA_4V(vec0, vec1, vec2, vec3, 5);
        SRA_4V(reg0, reg1, reg2, reg3, 5);
        PCKEV_H2_SH(vec1, vec0, vec3, vec2, vec9, vec10);
        PCKEV_H2_SH(reg1, reg0, reg3, reg2, vec11, vec12);
        CLIP_SH2_0_255(vec9, vec10);
        CLIP_SH2_0_255(vec11, vec12);
        PCKEV_B2_UB(vec10, vec9, vec12, vec11, store0, store1);
        ST_UB2(store0, store1, src, stride);
        src += 2 * stride;

        vec4 += vec5;
    }
}

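/* DC prediction computed per 4x4 quadrant of an 8x8 block: the top-left
 * quadrant averages its top and left neighbours, the top-right and bottom-left
 * quadrants use only their own top/left neighbours, and the bottom-right
 * quadrant combines the two remaining sums. */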
static void intra_predict_dc_4blk_8x8_msa(uint8_t *src, int32_t stride)
{
    uint32_t src0, src1, src3, src2;
    uint32_t out0, out1, out2, out3;
    uint64_t store0, store1;
    v16u8 src_top;
    v8u16 add;
    v4u32 sum;

    src_top = LD_UB(src - stride);
    add = __msa_hadd_u_h((v16u8) src_top, (v16u8) src_top);
    sum = __msa_hadd_u_w(add, add);
    src0 = __msa_copy_u_w((v4i32) sum, 0);
    src1 = __msa_copy_u_w((v4i32) sum, 1);
    src0 += src[0 * stride - 1];
    src0 += src[1 * stride - 1];
    src0 += src[2 * stride - 1];
    src0 += src[3 * stride - 1];
    src2  = src[4 * stride - 1];
    src2 += src[5 * stride - 1];
    src2 += src[6 * stride - 1];
    src2 += src[7 * stride - 1];
    src0 = (src0 + 4) >> 3;
    src3 = (src1 + src2 + 4) >> 3;
    src1 = (src1 + 2) >> 2;
    src2 = (src2 + 2) >> 2;
    out0 = src0 * 0x01010101;
    out1 = src1 * 0x01010101;
    out2 = src2 * 0x01010101;
    out3 = src3 * 0x01010101;
    store0 = ((uint64_t) out1 << 32) | out0;
    store1 = ((uint64_t) out3 << 32) | out2;

    SD4(store0, store0, store0, store0, src, stride);
    src += (4 * stride);
    SD4(store1, store1, store1, store1, src, stride);
}

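/* DC prediction from the left neighbours only: one DC value for rows 0-3 and
 * one for rows 4-7. */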
static void intra_predict_hor_dc_8x8_msa(uint8_t *src, int32_t stride)
{
    uint32_t src0, src1;
    uint64_t out0, out1;

    src0  = src[0 * stride - 1];
    src0 += src[1 * stride - 1];
    src0 += src[2 * stride - 1];
    src0 += src[3 * stride - 1];
    src1  = src[4 * stride - 1];
    src1 += src[5 * stride - 1];
    src1 += src[6 * stride - 1];
    src1 += src[7 * stride - 1];
    src0 = (src0 + 2) >> 2;
    src1 = (src1 + 2) >> 2;
    out0 = src0 * 0x0101010101010101;
    out1 = src1 * 0x0101010101010101;

    SD4(out0, out0, out0, out0, src, stride);
    src += (4 * stride);
    SD4(out1, out1, out1, out1, src, stride);
}

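/* DC prediction from the top neighbours only: one DC value for columns 0-3 and
 * one for columns 4-7. */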
static void intra_predict_vert_dc_8x8_msa(uint8_t *src, int32_t stride)
{
    uint64_t out0;
    v16i8 mask = { 0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 };
    v16u8 src_top, res0;
    v8u16 add;
    v4u32 sum;

    src_top = LD_UB(src - stride);
    add = __msa_hadd_u_h(src_top, src_top);
    sum = __msa_hadd_u_w(add, add);
    sum = (v4u32) __msa_srari_w((v4i32) sum, 2);
    res0 = (v16u8) __msa_vshf_b(mask, (v16i8) sum, (v16i8) sum);
    out0 = __msa_copy_u_d((v2i64) res0, 0);

    SD4(out0, out0, out0, out0, src, stride);
    src += (4 * stride);
    SD4(out0, out0, out0, out0, src, stride);
}

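/* The four "mad cow" chroma DC variants: 8x8 DC prediction when only part of
 * the neighbouring samples is available.  Each 4x4 quadrant takes the DC of
 * whatever top/left samples its variant uses; quadrants with no usable
 * neighbours are filled with 128. */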
static void intra_predict_mad_cow_dc_l0t_8x8_msa(uint8_t *src, int32_t stride)
{
    uint32_t src0, src1, src2;
    uint32_t out0, out1, out2;
    uint64_t store0, store1;
    v16u8 src_top;
    v8u16 add;
    v4u32 sum;

    src_top = LD_UB(src - stride);
    add = __msa_hadd_u_h(src_top, src_top);
    sum = __msa_hadd_u_w(add, add);
    src0 = __msa_copy_u_w((v4i32) sum, 0);
    src1 = __msa_copy_u_w((v4i32) sum, 1);

    src2  = src[0 * stride - 1];
    src2 += src[1 * stride - 1];
    src2 += src[2 * stride - 1];
    src2 += src[3 * stride - 1];
    src2 = (src0 + src2 + 4) >> 3;
    src0 = (src0 + 2) >> 2;
    src1 = (src1 + 2) >> 2;
    out0 = src0 * 0x01010101;
    out1 = src1 * 0x01010101;
    out2 = src2 * 0x01010101;
    store1 = ((uint64_t) out1 << 32);
    store0 = store1 | ((uint64_t) out2);
    store1 = store1 | ((uint64_t) out0);

    SD4(store0, store0, store0, store0, src, stride);
    src += (4 * stride);
    SD4(store1, store1, store1, store1, src, stride);
}

static void intra_predict_mad_cow_dc_0lt_8x8_msa(uint8_t *src, int32_t stride)
{
    uint32_t src0, src1, src2, src3;
    uint32_t out0, out1, out2, out3;
    uint64_t store0, store1;
    v16u8 src_top;
    v8u16 add;
    v4u32 sum;

    src_top = LD_UB(src - stride);
    add = __msa_hadd_u_h(src_top, src_top);
    sum = __msa_hadd_u_w(add, add);
    src0 = __msa_copy_u_w((v4i32) sum, 0);
    src1 = __msa_copy_u_w((v4i32) sum, 1);

    src2  = src[4 * stride - 1];
    src2 += src[5 * stride - 1];
    src2 += src[6 * stride - 1];
    src2 += src[7 * stride - 1];
    src0 = (src0 + 2) >> 2;
    src3 = (src1 + src2 + 4) >> 3;
    src1 = (src1 + 2) >> 2;
    src2 = (src2 + 2) >> 2;

    out0 = src0 * 0x01010101;
    out1 = src1 * 0x01010101;
    out2 = src2 * 0x01010101;
    out3 = src3 * 0x01010101;
    store0 = ((uint64_t) out1 << 32) | out0;
    store1 = ((uint64_t) out3 << 32) | out2;

    SD4(store0, store0, store0, store0, src, stride);
    src += (4 * stride);
    SD4(store1, store1, store1, store1, src, stride);
}

static void intra_predict_mad_cow_dc_l00_8x8_msa(uint8_t *src, int32_t stride)
{
    uint32_t src0;
    uint64_t out0, out1;

    src0  = src[0 * stride - 1];
    src0 += src[1 * stride - 1];
    src0 += src[2 * stride - 1];
    src0 += src[3 * stride - 1];
    src0 = (src0 + 2) >> 2;
    out0 = src0 * 0x0101010101010101;
    out1 = 0x8080808080808080;

    SD4(out0, out0, out0, out0, src, stride);
    src += (4 * stride);
    SD4(out1, out1, out1, out1, src, stride);
}

static void intra_predict_mad_cow_dc_0l0_8x8_msa(uint8_t *src, int32_t stride)
{
    uint32_t src0;
    uint64_t out0, out1;

    src0  = src[4 * stride - 1];
    src0 += src[5 * stride - 1];
    src0 += src[6 * stride - 1];
    src0 += src[7 * stride - 1];
    src0 = (src0 + 2) >> 2;

    out0 = 0x8080808080808080;
    out1 = src0 * 0x0101010101010101;

    SD4(out0, out0, out0, out0, src, stride);
    src += (4 * stride);
    SD4(out1, out1, out1, out1, src, stride);
}

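/* Exported wrappers: entry points with the ptrdiff_t stride signature expected
 * by the prediction function tables. */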
void ff_h264_intra_predict_plane_8x8_msa(uint8_t *src, ptrdiff_t stride)
{
    intra_predict_plane_8x8_msa(src, stride);
}

void ff_h264_intra_predict_dc_4blk_8x8_msa(uint8_t *src, ptrdiff_t stride)
{
    intra_predict_dc_4blk_8x8_msa(src, stride);
}

void ff_h264_intra_predict_hor_dc_8x8_msa(uint8_t *src, ptrdiff_t stride)
{
    intra_predict_hor_dc_8x8_msa(src, stride);
}

void ff_h264_intra_predict_vert_dc_8x8_msa(uint8_t *src, ptrdiff_t stride)
{
    intra_predict_vert_dc_8x8_msa(src, stride);
}

void ff_h264_intra_predict_mad_cow_dc_l0t_8x8_msa(uint8_t *src,
                                                  ptrdiff_t stride)
{
    intra_predict_mad_cow_dc_l0t_8x8_msa(src, stride);
}

void ff_h264_intra_predict_mad_cow_dc_0lt_8x8_msa(uint8_t *src,
                                                  ptrdiff_t stride)
{
    intra_predict_mad_cow_dc_0lt_8x8_msa(src, stride);
}

void ff_h264_intra_predict_mad_cow_dc_l00_8x8_msa(uint8_t *src,
                                                  ptrdiff_t stride)
{
    intra_predict_mad_cow_dc_l00_8x8_msa(src, stride);
}

void ff_h264_intra_predict_mad_cow_dc_0l0_8x8_msa(uint8_t *src,
                                                  ptrdiff_t stride)
{
    intra_predict_mad_cow_dc_0l0_8x8_msa(src, stride);
}

void ff_h264_intra_predict_plane_16x16_msa(uint8_t *src, ptrdiff_t stride)
{
    intra_predict_plane_16x16_msa(src, stride);
}

void ff_h264_intra_pred_vert_8x8_msa(uint8_t *src, ptrdiff_t stride)
{
    uint8_t *dst = src;

    intra_predict_vert_8x8_msa(src - stride, dst, stride);
}

void ff_h264_intra_pred_horiz_8x8_msa(uint8_t *src, ptrdiff_t stride)
{
    uint8_t *dst = src;

    intra_predict_horiz_8x8_msa(src - 1, stride, dst, stride);
}

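/* Full 16x16 DC prediction: sum the 16 top and 16 left neighbours, round and
 * shift by 5, then fill the block with the result. */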
void ff_h264_intra_pred_dc_16x16_msa(uint8_t *src, ptrdiff_t stride)
{
    uint8_t *src_top = src - stride;
    uint8_t *src_left = src - 1;
    uint8_t *dst = src;
    uint32_t addition = 0;
    v16u8 src_above, out;
    v8u16 sum_above;
    v4u32 sum_top;
    v2u64 sum;

    src_above = LD_UB(src_top);

    sum_above = __msa_hadd_u_h(src_above, src_above);
    sum_top = __msa_hadd_u_w(sum_above, sum_above);
    sum = __msa_hadd_u_d(sum_top, sum_top);
    sum_top = (v4u32) __msa_pckev_w((v4i32) sum, (v4i32) sum);
    sum = __msa_hadd_u_d(sum_top, sum_top);
    addition = __msa_copy_u_w((v4i32) sum, 0);
    addition += src_left[ 0 * stride];
    addition += src_left[ 1 * stride];
    addition += src_left[ 2 * stride];
    addition += src_left[ 3 * stride];
    addition += src_left[ 4 * stride];
    addition += src_left[ 5 * stride];
    addition += src_left[ 6 * stride];
    addition += src_left[ 7 * stride];
    addition += src_left[ 8 * stride];
    addition += src_left[ 9 * stride];
    addition += src_left[10 * stride];
    addition += src_left[11 * stride];
    addition += src_left[12 * stride];
    addition += src_left[13 * stride];
    addition += src_left[14 * stride];
    addition += src_left[15 * stride];
    addition = (addition + 16) >> 5;
    out = (v16u8) __msa_fill_b(addition);

    ST_UB8(out, out, out, out, out, out, out, out, dst, stride);
    dst += (8 * stride);
    ST_UB8(out, out, out, out, out, out, out, out, dst, stride);
}

void ff_h264_intra_pred_vert_16x16_msa(uint8_t *src, ptrdiff_t stride)
{
    uint8_t *dst = src;

    intra_predict_vert_16x16_msa(src - stride, dst, stride);
}

void ff_h264_intra_pred_horiz_16x16_msa(uint8_t *src, ptrdiff_t stride)
{
    uint8_t *dst = src;

    intra_predict_horiz_16x16_msa(src - 1, stride, dst, stride);
}

void ff_h264_intra_pred_dc_left_16x16_msa(uint8_t *src, ptrdiff_t stride)
{
    uint8_t *src_left = src - 1;
    uint8_t *dst = src;
    uint32_t addition;
    v16u8 out;

    addition  = src_left[ 0 * stride];
    addition += src_left[ 1 * stride];
    addition += src_left[ 2 * stride];
    addition += src_left[ 3 * stride];
    addition += src_left[ 4 * stride];
    addition += src_left[ 5 * stride];
    addition += src_left[ 6 * stride];
    addition += src_left[ 7 * stride];
    addition += src_left[ 8 * stride];
    addition += src_left[ 9 * stride];
    addition += src_left[10 * stride];
    addition += src_left[11 * stride];
    addition += src_left[12 * stride];
    addition += src_left[13 * stride];
    addition += src_left[14 * stride];
    addition += src_left[15 * stride];

    addition = (addition + 8) >> 4;
    out = (v16u8) __msa_fill_b(addition);

    ST_UB8(out, out, out, out, out, out, out, out, dst, stride);
    dst += (8 * stride);
    ST_UB8(out, out, out, out, out, out, out, out, dst, stride);
}

void ff_h264_intra_pred_dc_top_16x16_msa(uint8_t *src, ptrdiff_t stride)
{
    uint8_t *src_top = src - stride;
    uint8_t *dst = src;
    v16u8 src_above, out;
    v8u16 sum_above;
    v4u32 sum_top;
    v2u64 sum;

    src_above = LD_UB(src_top);

    sum_above = __msa_hadd_u_h(src_above, src_above);
    sum_top = __msa_hadd_u_w(sum_above, sum_above);
    sum = __msa_hadd_u_d(sum_top, sum_top);
    sum_top = (v4u32) __msa_pckev_w((v4i32) sum, (v4i32) sum);
    sum = __msa_hadd_u_d(sum_top, sum_top);
    sum = (v2u64) __msa_srari_d((v2i64) sum, 4);
    out = (v16u8) __msa_splati_b((v16i8) sum, 0);

    ST_UB8(out, out, out, out, out, out, out, out, dst, stride);
    dst += (8 * stride);
    ST_UB8(out, out, out, out, out, out, out, out, dst, stride);
}

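/* DC prediction with no neighbours available: fill the block with 128. */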
void ff_h264_intra_pred_dc_128_8x8_msa(uint8_t *src, ptrdiff_t stride)
{
    uint64_t out;
    v16u8 store;

    store = (v16u8) __msa_fill_b(128);
    out = __msa_copy_u_d((v2i64) store, 0);

    SD4(out, out, out, out, src, stride);
    src += (4 * stride);
    SD4(out, out, out, out, src, stride);
}

void ff_h264_intra_pred_dc_128_16x16_msa(uint8_t *src, ptrdiff_t stride)
{
    v16u8 out;

    out = (v16u8) __msa_fill_b(128);

    ST_UB8(out, out, out, out, out, out, out, out, src, stride);
    src += (8 * stride);
    ST_UB8(out, out, out, out, out, out, out, out, src, stride);
}

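/* VP8 entry points for the constant-DC (127/129) fills defined above. */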
void ff_vp8_pred8x8_127_dc_8_msa(uint8_t *src, ptrdiff_t stride)
{
    intra_predict_127dc_8x8_msa(src, stride);
}

void ff_vp8_pred8x8_129_dc_8_msa(uint8_t *src, ptrdiff_t stride)
{
    intra_predict_129dc_8x8_msa(src, stride);
}

void ff_vp8_pred16x16_127_dc_8_msa(uint8_t *src, ptrdiff_t stride)
{
    intra_predict_127dc_16x16_msa(src, stride);
}

void ff_vp8_pred16x16_129_dc_8_msa(uint8_t *src, ptrdiff_t stride)
{
    intra_predict_129dc_16x16_msa(src, stride);
}