1/*
2 * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21#include "libavcodec/hevcdec.h"
22#include "libavutil/mips/generic_macros_msa.h"
23#include "hevcpred_mips.h"
24
/* Angle step tables for HEVC angular intra prediction.
 * NOTE(review): presumably indexed by (mode - offset) for the vertical
 * ("up") and horizontal ("low") mode ranges by angular-prediction code
 * elsewhere in this file — confirm against the callers. */
static const int8_t intra_pred_angle_up[17] = {
    -32, -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32
};

static const int8_t intra_pred_angle_low[16] = {
    32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26
};
32
/* Compute two consecutive 16-pixel rows of HEVC planar prediction.
 *
 * src0_r/src0_l : top neighbour row, unpacked to 16-bit (right/left halves)
 * tmp0          : splatted top-right reference sample
 * tmp1          : splatted bottom-left reference sample
 * vec0/vec1     : splatted left neighbour sample for row 0 / row 1
 * mul_val_h0..h3: per-column horizontal weights (and their +8 counterparts
 *                 for the upper 8 columns)
 * mul_val_b0/b1 : scalar vertical weights for the first of the two rows;
 *                 the second row uses (b0 - 1) and (b1 + 1)
 * round         : final rounding shift (log2(size) + 1)
 * res0/res1     : output — the two rows, rounded and packed to bytes
 */
#define HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,          \
                              mul_val_h0, mul_val_h1, mul_val_h2, mul_val_h3,  \
                              res0, res1, mul_val_b0, mul_val_b1, round)       \
{                                                                              \
    v8i16 res0_m, res1_m, res2_m, res3_m;                                      \
                                                                               \
    MUL4(mul_val_h0, vec0, mul_val_h2, vec0, mul_val_h0, vec1,                 \
         mul_val_h2, vec1, res0_m, res1_m, res2_m, res3_m);                    \
                                                                               \
    res0_m += mul_val_h1 * tmp0;                                               \
    res1_m += mul_val_h3 * tmp0;                                               \
    res2_m += mul_val_h1 * tmp0;                                               \
    res3_m += mul_val_h3 * tmp0;                                               \
                                                                               \
    res0_m += mul_val_b0 * src0_r;                                             \
    res1_m += mul_val_b0 * src0_l;                                             \
    res2_m += (mul_val_b0 - 1) * src0_r;                                       \
    res3_m += (mul_val_b0 - 1) * src0_l;                                       \
                                                                               \
    res0_m += mul_val_b1 * tmp1;                                               \
    res1_m += mul_val_b1 * tmp1;                                               \
    res2_m += (mul_val_b1 + 1) * tmp1;                                         \
    res3_m += (mul_val_b1 + 1) * tmp1;                                         \
                                                                               \
    SRARI_H4_SH(res0_m, res1_m, res2_m, res3_m, round);                        \
    PCKEV_B2_SH(res1_m, res0_m, res3_m, res2_m, res0, res1);                   \
}
60
61static void hevc_intra_pred_vert_4x4_msa(const uint8_t *src_top,
62                                         const uint8_t *src_left,
63                                         uint8_t *dst, int32_t stride,
64                                         int32_t flag)
65{
66    uint32_t col;
67    uint32_t src_data;
68    v8i16 vec0, vec1, vec2;
69    v16i8 zero = { 0 };
70
71    src_data = LW(src_top);
72    SW4(src_data, src_data, src_data, src_data, dst, stride);
73
74    if (0 == flag) {
75        src_data = LW(src_left);
76
77        vec2 = (v8i16) __msa_insert_w((v4i32) vec2, 0, src_data);
78
79        vec0 = __msa_fill_h(src_left[-1]);
80        vec1 = __msa_fill_h(src_top[0]);
81
82        vec2 = (v8i16) __msa_ilvr_b(zero, (v16i8) vec2);
83        vec2 -= vec0;
84        vec2 >>= 1;
85        vec2 += vec1;
86        CLIP_SH_0_255(vec2);
87
88        for (col = 0; col < 4; col++) {
89            dst[stride * col] = (uint8_t) vec2[col];
90        }
91    }
92}
93
/* 8x8 vertical intra prediction: replicate the 8 top neighbour pixels
 * into every row of dst.  When flag == 0 the first column is additionally
 * edge-filtered: dst[y][0] = clip(top[0] + ((left[y] - left[-1]) >> 1)). */
static void hevc_intra_pred_vert_8x8_msa(const uint8_t *src_top,
                                         const uint8_t *src_left,
                                         uint8_t *dst, int32_t stride,
                                         int32_t flag)
{
    uint8_t *tmp_dst = dst;
    uint32_t row;
    uint16_t val0, val1, val2, val3;
    uint64_t src_data1;
    v8i16 vec0, vec1, vec2;
    v16i8 zero = { 0 };

    /* replicate the 8 top pixels into all 8 rows */
    src_data1 = LD(src_top);

    for (row = 8; row--;) {
        SD(src_data1, tmp_dst);
        tmp_dst += stride;
    }

    if (0 == flag) {
        src_data1 = LD(src_left);

        vec2 = (v8i16) __msa_insert_d((v2i64) zero, 0, src_data1);

        vec0 = __msa_fill_h(src_left[-1]);
        vec1 = __msa_fill_h(src_top[0]);

        /* widen bytes to halfwords, filter, and clip to [0, 255];
         * >> on v8i16 is an arithmetic shift */
        vec2 = (v8i16) __msa_ilvr_b(zero, (v16i8) vec2);
        vec2 -= vec0;
        vec2 >>= 1;
        vec2 += vec1;
        CLIP_SH_0_255(vec2);

        /* store the filtered column, 4 lanes at a time */
        val0 = vec2[0];
        val1 = vec2[1];
        val2 = vec2[2];
        val3 = vec2[3];

        dst[0] = val0;
        dst[stride] = val1;
        dst[2 * stride] = val2;
        dst[3 * stride] = val3;

        val0 = vec2[4];
        val1 = vec2[5];
        val2 = vec2[6];
        val3 = vec2[7];

        dst[4 * stride] = val0;
        dst[5 * stride] = val1;
        dst[6 * stride] = val2;
        dst[7 * stride] = val3;
    }
}
148
/* 16x16 vertical intra prediction: replicate the 16 top neighbour pixels
 * into every row of dst.  When flag == 0 the first column is additionally
 * edge-filtered: dst[y][0] = clip(top[0] + ((left[y] - left[-1]) >> 1)). */
static void hevc_intra_pred_vert_16x16_msa(const uint8_t *src_top,
                                           const uint8_t *src_left,
                                           uint8_t *dst, int32_t stride,
                                           int32_t flag)
{
    int32_t col;
    uint8_t *tmp_dst = dst;
    uint32_t row;
    v16u8 src;
    v8i16 vec0, vec1, vec2, vec3;

    /* replicate the 16 top pixels into all 16 rows */
    src = LD_UB(src_top);

    for (row = 16; row--;) {
        ST_UB(src, tmp_dst);
        tmp_dst += stride;
    }

    if (0 == flag) {
        src = LD_UB(src_left);

        vec0 = __msa_fill_h(src_left[-1]);
        vec1 = __msa_fill_h(src_top[0]);

        /* widen the 16 left pixels to two halfword vectors and filter
         * both halves; >> on v8i16 is an arithmetic shift */
        UNPCK_UB_SH(src, vec2, vec3);
        SUB2(vec2, vec0, vec3, vec0, vec2, vec3);

        vec2 >>= 1;
        vec3 >>= 1;

        ADD2(vec2, vec1, vec3, vec1, vec2, vec3);
        CLIP_SH2_0_255(vec2, vec3);

        /* repack to bytes and scatter down the first column */
        src = (v16u8) __msa_pckev_b((v16i8) vec3, (v16i8) vec2);

        for (col = 0; col < 16; col++) {
            dst[stride * col] = src[col];
        }
    }
}
189
190static void hevc_intra_pred_horiz_4x4_msa(const uint8_t *src_top,
191                                          const uint8_t *src_left,
192                                          uint8_t *dst, int32_t stride,
193                                          int32_t flag)
194{
195    uint32_t val0, val1, val2, val3;
196    v16i8 src0;
197    v8i16 src0_r, src_top_val, src_left_val;
198    v16i8 zero = { 0 };
199
200    val0 = src_left[0] * 0x01010101;
201    val1 = src_left[1] * 0x01010101;
202    val2 = src_left[2] * 0x01010101;
203    val3 = src_left[3] * 0x01010101;
204    SW4(val0, val1, val2, val3, dst, stride);
205
206    if (0 == flag) {
207        val0 = LW(src_top);
208        src0 = (v16i8) __msa_insert_w((v4i32) src0, 0, val0);
209        src_top_val = __msa_fill_h(src_top[-1]);
210        src_left_val = __msa_fill_h(src_left[0]);
211
212        src0_r = (v8i16) __msa_ilvr_b(zero, src0);
213
214        src0_r -= src_top_val;
215        src0_r >>= 1;
216        src0_r += src_left_val;
217        CLIP_SH_0_255(src0_r);
218        src0 = __msa_pckev_b((v16i8) src0_r, (v16i8) src0_r);
219        val0 = __msa_copy_s_w((v4i32) src0, 0);
220        SW(val0, dst);
221    }
222}
223
224static void hevc_intra_pred_horiz_8x8_msa(const uint8_t *src_top,
225                                          const uint8_t *src_left,
226                                          uint8_t *dst, int32_t stride,
227                                          int32_t flag)
228{
229    uint64_t val0, val1, val2, val3;
230    v16i8 src0;
231    v8i16 src0_r, src_top_val, src_left_val;
232    v16i8 zero = { 0 };
233
234    val0 = src_left[0] * 0x0101010101010101;
235    val1 = src_left[1] * 0x0101010101010101;
236    val2 = src_left[2] * 0x0101010101010101;
237    val3 = src_left[3] * 0x0101010101010101;
238    SD4(val0, val1, val2, val3, dst, stride);
239
240    val0 = src_left[4] * 0x0101010101010101;
241    val1 = src_left[5] * 0x0101010101010101;
242    val2 = src_left[6] * 0x0101010101010101;
243    val3 = src_left[7] * 0x0101010101010101;
244    SD4(val0, val1, val2, val3, dst + 4 * stride, stride);
245
246    if (0 == flag) {
247        val0 = LD(src_top);
248        src0 = (v16i8) __msa_insert_d((v2i64) src0, 0, val0);
249        src_top_val = __msa_fill_h(src_top[-1]);
250        src_left_val = __msa_fill_h(src_left[0]);
251
252        src0_r = (v8i16) __msa_ilvr_b(zero, src0);
253
254        src0_r -= src_top_val;
255        src0_r >>= 1;
256        src0_r += src_left_val;
257        CLIP_SH_0_255(src0_r);
258        src0 = __msa_pckev_b((v16i8) src0_r, (v16i8) src0_r);
259        val0 = __msa_copy_s_d((v2i64) src0, 0);
260        SD(val0, dst);
261    }
262}
263
/* 16x16 horizontal intra prediction: replicate each left neighbour pixel
 * across its row.  When flag == 0 the first row is additionally
 * edge-filtered: dst[0][x] = clip(left[0] + ((top[x] - top[-1]) >> 1)). */
static void hevc_intra_pred_horiz_16x16_msa(const uint8_t *src_top,
                                            const uint8_t *src_left,
                                            uint8_t *dst, int32_t stride,
                                            int32_t flag)
{
    uint8_t *tmp_dst = dst;
    uint32_t row;
    uint8_t inp0, inp1, inp2, inp3;
    v16i8 src0, src1, src2, src3;
    v8i16 src0_r, src0_l, src_left_val, src_top_val;

    /* capture left[0] before src_left is advanced by the loop below */
    src_left_val = __msa_fill_h(src_left[0]);

    /* 4 rows per iteration: splat each left pixel across a full row */
    for (row = 4; row--;) {
        inp0 = src_left[0];
        inp1 = src_left[1];
        inp2 = src_left[2];
        inp3 = src_left[3];
        src_left += 4;

        src0 = __msa_fill_b(inp0);
        src1 = __msa_fill_b(inp1);
        src2 = __msa_fill_b(inp2);
        src3 = __msa_fill_b(inp3);

        ST_SB4(src0, src1, src2, src3, tmp_dst, stride);
        tmp_dst += (4 * stride);
    }

    if (0 == flag) {
        src0 = LD_SB(src_top);
        src_top_val = __msa_fill_h(src_top[-1]);

        /* widen the 16 top pixels to two halfword vectors and filter
         * both halves; >> on v8i16 is an arithmetic shift */
        UNPCK_UB_SH(src0, src0_r, src0_l);
        SUB2(src0_r, src_top_val, src0_l, src_top_val, src0_r, src0_l);

        src0_r >>= 1;
        src0_l >>= 1;

        ADD2(src0_r, src_left_val, src0_l, src_left_val, src0_r, src0_l);
        CLIP_SH2_0_255(src0_r, src0_l);
        src0 = __msa_pckev_b((v16i8) src0_l, (v16i8) src0_r);
        ST_SB(src0, dst);
    }
}
309
310static void hevc_intra_pred_horiz_32x32_msa(const uint8_t *src_top,
311                                            const uint8_t *src_left,
312                                            uint8_t *dst, int32_t stride)
313{
314    uint32_t row;
315    uint8_t inp0, inp1, inp2, inp3;
316    v16i8 src0, src1, src2, src3;
317
318    for (row = 0; row < 8; row++) {
319        inp0 = src_left[row * 4];
320        inp1 = src_left[row * 4 + 1];
321        inp2 = src_left[row * 4 + 2];
322        inp3 = src_left[row * 4 + 3];
323
324        src0 = __msa_fill_b(inp0);
325        src1 = __msa_fill_b(inp1);
326        src2 = __msa_fill_b(inp2);
327        src3 = __msa_fill_b(inp3);
328
329        ST_SB2(src0, src0, dst, 16);
330        dst += stride;
331        ST_SB2(src1, src1, dst, 16);
332        dst += stride;
333        ST_SB2(src2, src2, dst, 16);
334        dst += stride;
335        ST_SB2(src3, src3, dst, 16);
336        dst += stride;
337    }
338}
339
340static void hevc_intra_pred_dc_4x4_msa(const uint8_t *src_top,
341                                       const uint8_t *src_left,
342                                       uint8_t *dst, int32_t stride,
343                                       int32_t flag)
344{
345    uint8_t *tmp_dst = dst;
346    uint32_t addition = 0;
347    uint32_t val0, val1, val2;
348    v16i8 src = { 0 };
349    v16u8 store;
350    v16i8 zero = { 0 };
351    v8u16 sum, vec0, vec1;
352
353    val0 = LW(src_top);
354    val1 = LW(src_left);
355    INSERT_W2_SB(val0, val1, src);
356    sum = __msa_hadd_u_h((v16u8) src, (v16u8) src);
357    sum = (v8u16) __msa_hadd_u_w(sum, sum);
358    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
359    sum = (v8u16) __msa_srari_w((v4i32) sum, 3);
360    addition = __msa_copy_u_w((v4i32) sum, 0);
361    store = (v16u8) __msa_fill_b(addition);
362    val0 = __msa_copy_u_w((v4i32) store, 0);
363    SW4(val0, val0, val0, val0, dst, stride)
364
365        if (0 == flag) {
366        ILVR_B2_UH(zero, store, zero, src, vec0, vec1);
367
368        vec1 += vec0;
369        vec0 += vec0;
370        vec1 += vec0;
371
372        vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2);
373        store = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec1);
374        val1 = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2;
375        store = (v16u8) __msa_insert_b((v16i8) store, 0, val1);
376        val0 = __msa_copy_u_w((v4i32) store, 0);
377        SW(val0, tmp_dst);
378
379        val0 = src_left[1];
380        val1 = src_left[2];
381        val2 = src_left[3];
382
383        addition *= 3;
384
385        ADD2(val0, addition, val1, addition, val0, val1);
386        val2 += addition;
387
388        val0 += 2;
389        val1 += 2;
390        val2 += 2;
391        val0 >>= 2;
392        val1 >>= 2;
393        val2 >>= 2;
394
395        tmp_dst[stride * 1] = val0;
396        tmp_dst[stride * 2] = val1;
397        tmp_dst[stride * 3] = val2;
398    }
399}
400
/* 8x8 DC intra prediction: fill the block with the rounded mean of the
 * 8 top and 8 left neighbour pixels.  When flag == 0 the boundary is
 * additionally DC-filtered:
 *   dst[0][0] = (left[0] + 2*dc + top[0] + 2) >> 2
 *   dst[0][x] = (top[x]  + 3*dc + 2) >> 2
 *   dst[y][0] = (left[y] + 3*dc + 2) >> 2 */
static void hevc_intra_pred_dc_8x8_msa(const uint8_t *src_top,
                                       const uint8_t *src_left,
                                       uint8_t *dst, int32_t stride,
                                       int32_t flag)
{
    uint8_t *tmp_dst = dst;
    uint32_t row, col, val;
    uint32_t addition = 0;
    uint64_t val0, val1;
    v16u8 src = { 0 };
    v16u8 store;
    v8u16 sum, vec0, vec1;
    v16i8 zero = { 0 };

    /* sum the 16 neighbour pixels (8 top + 8 left) via a cascade of
     * horizontal adds, round-shift by 4, and splat the DC value */
    val0 = LD(src_top);
    val1 = LD(src_left);
    INSERT_D2_UB(val0, val1, src);
    sum = __msa_hadd_u_h((v16u8) src, (v16u8) src);
    sum = (v8u16) __msa_hadd_u_w(sum, sum);
    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
    sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum);
    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
    sum = (v8u16) __msa_srari_w((v4i32) sum, 4);
    addition = __msa_copy_u_w((v4i32) sum, 0);
    store = (v16u8) __msa_fill_b(addition);
    val0 = __msa_copy_u_d((v2i64) store, 0);

    for (row = 8; row--;) {
        SD(val0, dst);
        dst += stride;
    }

    if (0 == flag) {
        /* top-row filter: vec1 = top + 3*dc, then (… + 2) >> 2 */
        ILVR_B2_UH(zero, store, zero, src, vec0, vec1);

        vec1 += vec0;
        vec0 += vec0;
        vec1 += vec0;
        vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2);
        store = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec1);
        /* corner pixel uses both top[0] and left[0] */
        val = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2;
        store = (v16u8) __msa_insert_b((v16i8) store, 0, val);
        val0 = __msa_copy_u_d((v2i64) store, 0);
        SD(val0, tmp_dst);

        /* left-column filter: (left[y] + 3*dc + 2) >> 2 for rows 1..7 */
        val0 = LD(src_left);
        src = (v16u8) __msa_insert_d((v2i64) src, 0, val0);
        vec1 = (v8u16) __msa_ilvr_b(zero, (v16i8) src);
        vec0 = (v8u16) __msa_fill_h(addition);
        vec0 *= 3;
        vec1 += vec0;
        vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2);

        for (col = 1; col < 8; col++) {
            tmp_dst[stride * col] = vec1[col];
        }
    }
}
459
/* 16x16 DC intra prediction: fill the block with the rounded mean of the
 * 16 top and 16 left neighbour pixels.  When flag == 0 the boundary is
 * additionally DC-filtered:
 *   dst[0][0] = (left[0] + 2*dc + top[0] + 2) >> 2
 *   dst[0][x] = (top[x]  + 3*dc + 2) >> 2
 *   dst[y][0] = (left[y] + 3*dc + 2) >> 2 */
static void hevc_intra_pred_dc_16x16_msa(const uint8_t *src_top,
                                         const uint8_t *src_left,
                                         uint8_t *dst, int32_t stride,
                                         int32_t flag)
{
    uint8_t *tmp_dst = dst;
    uint32_t row, col, val;
    uint32_t addition = 0;
    v16u8 src_above1, store, src_left1;
    v8u16 sum, sum_above, sum_left;
    v8u16 vec0, vec1, vec2;
    v16i8 zero = { 0 };

    src_above1 = LD_UB(src_top);
    src_left1 = LD_UB(src_left);

    /* sum the 32 neighbour pixels, round-shift by 5, splat the DC value */
    HADD_UB2_UH(src_above1, src_left1, sum_above, sum_left);
    sum = sum_above + sum_left;
    sum = (v8u16) __msa_hadd_u_w(sum, sum);
    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
    sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum);
    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
    sum = (v8u16) __msa_srari_w((v4i32) sum, 5);
    addition = __msa_copy_u_w((v4i32) sum, 0);
    store = (v16u8) __msa_fill_b(addition);

    for (row = 16; row--;) {
        ST_UB(store, dst);
        dst += stride;
    }

    if (0 == flag) {
        /* top-row filter: (top[x] + 3*dc + 2) >> 2 across both halves */
        vec0 = (v8u16) __msa_ilvr_b(zero, (v16i8) store);
        ILVRL_B2_UH(zero, src_above1, vec1, vec2);
        ADD2(vec1, vec0, vec2, vec0, vec1, vec2);
        vec0 += vec0;
        ADD2(vec1, vec0, vec2, vec0, vec1, vec2);
        SRARI_H2_UH(vec1, vec2, 2);
        store = (v16u8) __msa_pckev_b((v16i8) vec2, (v16i8) vec1);
        /* corner pixel uses both top[0] and left[0] */
        val = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2;
        store = (v16u8) __msa_insert_b((v16i8) store, 0, val);
        ST_UB(store, tmp_dst);

        /* left-column filter: (left[y] + 3*dc + 2) >> 2 for rows 1..15 */
        ILVRL_B2_UH(zero, src_left1, vec1, vec2);
        vec0 = (v8u16) __msa_fill_h(addition);
        vec0 *= 3;
        ADD2(vec1, vec0, vec2, vec0, vec1, vec2);
        SRARI_H2_UH(vec1, vec2, 2);
        store = (v16u8) __msa_pckev_b((v16i8) vec2, (v16i8) vec1);

        for (col = 1; col < 16; col++) {
            tmp_dst[stride * col] = store[col];
        }
    }
}
515
/* 32x32 DC intra prediction: fill the block with the rounded mean of the
 * 32 top and 32 left neighbour pixels.  No boundary filtering is done at
 * this block size. */
static void hevc_intra_pred_dc_32x32_msa(const uint8_t *src_top,
                                         const uint8_t *src_left,
                                         uint8_t *dst, int32_t stride)
{
    uint32_t row;
    v16u8 src_above1, src_above2, store, src_left1, src_left2;
    v8u16 sum_above1, sum_above2;
    v8u16 sum_left1, sum_left2;
    v8u16 sum, sum_above, sum_left;

    LD_UB2(src_top, 16, src_above1, src_above2);
    LD_UB2(src_left, 16, src_left1, src_left2);
    /* sum all 64 neighbour pixels, round-shift by 6 for the DC value */
    HADD_UB2_UH(src_above1, src_above2, sum_above1, sum_above2);
    HADD_UB2_UH(src_left1, src_left2, sum_left1, sum_left2);
    sum_above = sum_above1 + sum_above2;
    sum_left = sum_left1 + sum_left2;
    sum = sum_above + sum_left;
    sum = (v8u16) __msa_hadd_u_w(sum, sum);
    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
    sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum);
    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
    sum = (v8u16) __msa_srari_w((v4i32) sum, 6);
    /* broadcast the low byte of the DC value to all 16 lanes */
    store = (v16u8) __msa_splati_b((v16i8) sum, 0);

    /* two 32-pixel rows per iteration */
    for (row = 16; row--;) {
        ST_UB2(store, store, dst, 16);
        dst += stride;
        ST_UB2(store, store, dst, 16);
        dst += stride;
    }
}
547
/* 4x4 planar intra prediction:
 *   dst[y][x] = ((3-x)*left[y] + (x+1)*top[4] +
 *                (3-y)*top[x]  + (y+1)*left[4] + 4) >> 3
 * Two rows are packed per halfword vector (mul_val0 holds the (3-x)
 * weights in its low half and the (x+1) weights in its high half). */
static void hevc_intra_pred_plane_4x4_msa(const uint8_t *src_top,
                                          const uint8_t *src_left,
                                          uint8_t *dst, int32_t stride)
{
    uint32_t src0, src1;
    v16i8 src_vec0, src_vec1;
    v8i16 src_vec0_r, src1_r, tmp0, tmp1, mul_val1;
    v8i16 vec0, vec1, vec2, vec3, res0, res1, res2, res3;
    v8i16 mul_val0 = { 3, 2, 1, 0, 1, 2, 3, 4 };
    v16i8 zero = { 0 };

    src0 = LW(src_top);
    src1 = LW(src_left);

    /* mul_val1 = { 1, 2, 3, 4, 1, 2, 3, 4 }: the (x+1) weights */
    mul_val1 = (v8i16) __msa_pckod_d((v2i64) mul_val0, (v2i64) mul_val0);

    src_vec0 = (v16i8) __msa_insert_w((v4i32) zero, 0, src0);
    src_vec1 = (v16i8) __msa_insert_w((v4i32) zero, 0, src1);

    /* widen neighbours to halfwords; splat each left pixel per row */
    ILVR_B2_SH(zero, src_vec0, zero, src_vec1, src_vec0_r, src1_r);
    SPLATI_H4_SH(src1_r, 0, 1, 2, 3, vec0, vec1, vec2, vec3);

    /* top-right and bottom-left reference samples */
    tmp0 = __msa_fill_h(src_top[4]);
    tmp1 = __msa_fill_h(src_left[4]);

    MUL4(mul_val0, vec0, mul_val0, vec1, mul_val0, vec2, mul_val0, vec3,
         res0, res1, res2, res3);

    res0 += mul_val1 * tmp0;
    res1 += mul_val1 * tmp0;
    res2 += mul_val1 * tmp0;
    res3 += mul_val1 * tmp0;

    /* vertical weights: (3-y)*top row + (y+1)*left[4] per row y */
    res0 += 3 * src_vec0_r;
    res1 += 2 * src_vec0_r;
    res2 += src_vec0_r;
    res0 += tmp1;
    res1 += 2 * tmp1;
    res2 += 3 * tmp1;
    res3 += 4 * tmp1;

    /* pack two rows per vector, round-shift by 3, pack to bytes */
    PCKEV_D2_SH(res1, res0, res3, res2, res0, res1);
    SRARI_H2_SH(res0, res1, 3);
    src_vec0 = __msa_pckev_b((v16i8) res1, (v16i8) res0);
    ST_W4(src_vec0, 0, 1, 2, 3, dst, stride);
}
594
/* 8x8 planar intra prediction:
 *   dst[y][x] = ((7-x)*left[y] + (x+1)*top[8] +
 *                (7-y)*top[x]  + (y+1)*left[8] + 8) >> 4 */
static void hevc_intra_pred_plane_8x8_msa(const uint8_t *src_top,
                                          const uint8_t *src_left,
                                          uint8_t *dst, int32_t stride)
{
    uint64_t src0, src1;
    v16i8 src_vec0, src_vec1, src_vec2, src_vec3;
    v8i16 src_vec0_r, src_vec1_r;
    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v8i16 tmp0, tmp1, tmp2;
    v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };   /* (x+1) weights */
    v8i16 mul_val0 = { 7, 6, 5, 4, 3, 2, 1, 0 };   /* (7-x) weights */
    v16i8 zero = { 0 };

    src0 = LD(src_top);
    src1 = LD(src_left);

    src_vec0 = (v16i8) __msa_insert_d((v2i64) zero, 0, src0);
    src_vec1 = (v16i8) __msa_insert_d((v2i64) zero, 0, src1);

    /* widen neighbours to halfwords; splat each left pixel per row */
    ILVR_B2_SH(zero, src_vec0, zero, src_vec1, src_vec0_r, src_vec1_r);
    SPLATI_H4_SH(src_vec1_r, 0, 1, 2, 3, vec0, vec1, vec2, vec3);
    SPLATI_H4_SH(src_vec1_r, 4, 5, 6, 7, vec4, vec5, vec6, vec7);

    /* top-right and bottom-left reference samples */
    tmp0 = __msa_fill_h(src_top[8]);
    tmp1 = __msa_fill_h(src_left[8]);

    MUL4(mul_val0, vec0, mul_val0, vec1, mul_val0, vec2, mul_val0, vec3,
         res0, res1, res2, res3);
    MUL4(mul_val0, vec4, mul_val0, vec5, mul_val0, vec6, mul_val0, vec7,
         res4, res5, res6, res7);

    /* (x+1)*top[8] is identical for every row — hoisted into tmp2 */
    tmp2 = mul_val1 * tmp0;
    res0 += tmp2;
    res1 += tmp2;
    res2 += tmp2;
    res3 += tmp2;
    res4 += tmp2;
    res5 += tmp2;
    res6 += tmp2;
    res7 += tmp2;

    /* vertical weights: (7-y)*top row per row y (row 7 weight is 0) */
    res0 += 7 * src_vec0_r;
    res1 += 6 * src_vec0_r;
    res2 += 5 * src_vec0_r;
    res3 += 4 * src_vec0_r;
    res4 += 3 * src_vec0_r;
    res5 += 2 * src_vec0_r;
    res6 += src_vec0_r;

    /* (y+1)*left[8] per row y */
    res0 += tmp1;
    res1 += 2 * tmp1;
    res2 += 3 * tmp1;
    res3 += 4 * tmp1;
    res4 += 5 * tmp1;
    res5 += 6 * tmp1;
    res6 += 7 * tmp1;
    res7 += 8 * tmp1;

    /* round-shift by 4, pack two rows per vector, store 8 rows */
    SRARI_H4_SH(res0, res1, res2, res3, 4);
    SRARI_H4_SH(res4, res5, res6, res7, 4);
    PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6,
                src_vec0, src_vec1, src_vec2, src_vec3);

    ST_D8(src_vec0, src_vec1, src_vec2, src_vec3, 0, 1, 0, 1,
          0, 1, 0, 1, dst, stride);
}
662
/* 16x16 planar intra prediction:
 *   dst[y][x] = ((15-x)*left[y] + (x+1)*top[16] +
 *                (15-y)*top[x]  + (y+1)*left[16] + 16) >> 5
 * Each HEVC_PRED_PLANAR_16x2 invocation produces two rows; the last two
 * macro arguments before the shift are the (15-y) and (y+1) vertical
 * weights for the first of the two rows. */
static void hevc_intra_pred_plane_16x16_msa(const uint8_t *src_top,
                                            const uint8_t *src_left,
                                            uint8_t *dst, int32_t stride)
{
    v16u8 src0, src1;
    v8i16 src0_r, src1_r, src0_l, src1_l;
    v8i16 vec0, vec1;
    v8i16 res0, res1, tmp0, tmp1;
    v8i16 mul_val2, mul_val3;
    v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };       /* (x+1), x = 0..7  */
    v8i16 mul_val0 = { 15, 14, 13, 12, 11, 10, 9, 8 }; /* (15-x), x = 0..7 */

    src0 = LD_UB(src_top);
    src1 = LD_UB(src_left);

    /* widen 16 top / 16 left neighbours to halfword pairs */
    UNPCK_UB_SH(src0, src0_r, src0_l);
    UNPCK_UB_SH(src1, src1_r, src1_l);

    /* weights for columns 8..15 */
    mul_val2 = mul_val0 - 8;
    mul_val3 = mul_val1 + 8;

    /* top-right and bottom-left reference samples */
    tmp0 = __msa_fill_h(src_top[16]);
    tmp1 = __msa_fill_h(src_left[16]);

    /* rows 0-1 */
    SPLATI_H2_SH(src1_r, 0, 1, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 15, 1, 5);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    /* rows 2-3 */
    SPLATI_H2_SH(src1_r, 2, 3, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 13, 3, 5);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    /* rows 4-5 */
    SPLATI_H2_SH(src1_r, 4, 5, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 11, 5, 5);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    /* rows 6-7 */
    SPLATI_H2_SH(src1_r, 6, 7, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 9, 7, 5);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    /* rows 8-9 (left pixels 8..15 come from the high unpack src1_l) */
    SPLATI_H2_SH(src1_l, 0, 1, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 7, 9, 5);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    /* rows 10-11 */
    SPLATI_H2_SH(src1_l, 2, 3, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 5, 11, 5);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    /* rows 12-13 */
    SPLATI_H2_SH(src1_l, 4, 5, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 3, 13, 5);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    /* rows 14-15 */
    SPLATI_H2_SH(src1_l, 6, 7, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 1, 15, 5);
    ST_SH2(res0, res1, dst, stride);
}
742
/* Helper for 32x32 planar prediction: computes a 16x16 quadrant covering
 * rows 0..15.  'offset' is the horizontal column offset of the quadrant
 * (0 for the left half, 16 for the right half); the column weights and
 * the top-right reference src_top[32 - offset] are adjusted accordingly.
 * Final rounding shift is 6 (= log2(32) + 1). */
static void process_intra_upper_16x16_msa(const uint8_t *src_top,
                                          const uint8_t *src_left,
                                          uint8_t *dst, int32_t stride,
                                          uint8_t offset)
{
    v16i8 src0, src1;
    v8i16 src0_r, src1_r, src0_l, src1_l;
    v8i16 vec0, vec1, res0, res1;
    v8i16 tmp0, tmp1;
    v8i16 mul_val2, mul_val3;
    v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };        /* (x+1) weights  */
    v8i16 mul_val0 = { 31, 30, 29, 28, 27, 26, 25, 24 };/* (31-x) weights */

    /* top-right and bottom-left reference samples */
    tmp0 = __msa_fill_h(src_top[32 - offset]);
    tmp1 = __msa_fill_h(src_left[32]);

    src0 = LD_SB(src_top);
    src1 = LD_SB(src_left);

    UNPCK_UB_SH(src0, src0_r, src0_l);
    UNPCK_UB_SH(src1, src1_r, src1_l);

    /* shift the column weights by the quadrant offset */
    mul_val1 += offset;
    mul_val0 -= offset;
    mul_val2 = mul_val0 - 8;
    mul_val3 = mul_val1 + 8;

    /* rows 0-1 */
    SPLATI_H2_SH(src1_r, 0, 1, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 31, 1, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    /* rows 2-3 */
    SPLATI_H2_SH(src1_r, 2, 3, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 29, 3, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    /* rows 4-5 */
    SPLATI_H2_SH(src1_r, 4, 5, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 27, 5, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    /* rows 6-7 */
    SPLATI_H2_SH(src1_r, 6, 7, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 25, 7, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    /* rows 8-9 */
    SPLATI_H2_SH(src1_l, 0, 1, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 23, 9, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    /* rows 10-11 */
    SPLATI_H2_SH(src1_l, 2, 3, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 21, 11, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    /* rows 12-13 */
    SPLATI_H2_SH(src1_l, 4, 5, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 19, 13, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    /* rows 14-15 */
    SPLATI_H2_SH(src1_l, 6, 7, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 17, 15, 6);
    ST_SH2(res0, res1, dst, stride);
}
825
/* Helper for 32x32 planar prediction: computes a 16x16 quadrant covering
 * rows 16..31 (the vertical weight pairs start at 15/17 rather than
 * 31/1, and the bottom-left reference is src_left[16] relative to the
 * caller-advanced src_left).  'offset' is the horizontal column offset
 * of the quadrant (0 for the left half, 16 for the right half). */
static void process_intra_lower_16x16_msa(const uint8_t *src_top,
                                          const uint8_t *src_left,
                                          uint8_t *dst, int32_t stride,
                                          uint8_t offset)
{
    v16i8 src0, src1;
    v8i16 src0_r, src1_r, src0_l, src1_l;
    v8i16 vec0, vec1, res0, res1, tmp0, tmp1;
    v8i16 mul_val2, mul_val3;
    v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };        /* (x+1) weights  */
    v8i16 mul_val0 = { 31, 30, 29, 28, 27, 26, 25, 24 };/* (31-x) weights */

    /* top-right and bottom-left reference samples */
    tmp0 = __msa_fill_h(src_top[32 - offset]);
    tmp1 = __msa_fill_h(src_left[16]);

    src0 = LD_SB(src_top);
    src1 = LD_SB(src_left);

    UNPCK_UB_SH(src0, src0_r, src0_l);
    UNPCK_UB_SH(src1, src1_r, src1_l);

    /* shift the column weights by the quadrant offset */
    mul_val1 += offset;
    mul_val0 -= offset;
    mul_val2 = mul_val0 - 8;
    mul_val3 = mul_val1 + 8;

    /* rows 16-17 */
    SPLATI_H2_SH(src1_r, 0, 1, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 15, 17, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    /* rows 18-19 */
    SPLATI_H2_SH(src1_r, 2, 3, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 13, 19, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    /* rows 20-21 */
    SPLATI_H2_SH(src1_r, 4, 5, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 11, 21, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    /* rows 22-23 */
    SPLATI_H2_SH(src1_r, 6, 7, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 9, 23, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    /* rows 24-25 */
    SPLATI_H2_SH(src1_l, 0, 1, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 7, 25, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    /* rows 26-27 */
    SPLATI_H2_SH(src1_l, 2, 3, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 5, 27, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    /* rows 28-29 */
    SPLATI_H2_SH(src1_l, 4, 5, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 3, 29, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    /* rows 30-31 */
    SPLATI_H2_SH(src1_l, 6, 7, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 1, 31, 6);
    ST_SH2(res0, res1, dst, stride);
}
907
908static void hevc_intra_pred_plane_32x32_msa(const uint8_t *src_top,
909                                            const uint8_t *src_left,
910                                            uint8_t *dst, int32_t stride)
911{
912    process_intra_upper_16x16_msa(src_top, src_left, dst, stride, 0);
913    process_intra_upper_16x16_msa((src_top + 16), src_left,
914                                  (dst + 16), stride, 16);
915    dst += (16 * stride);
916    src_left += 16;
917
918    process_intra_lower_16x16_msa(src_top, src_left, dst, stride, 0);
919    process_intra_lower_16x16_msa((src_top + 16), src_left,
920                                  (dst + 16), stride, 16);
921}
922
923static void hevc_intra_pred_angular_upper_4width_msa(const uint8_t *src_top,
924                                                     const uint8_t *src_left,
925                                                     uint8_t *dst,
926                                                     int32_t stride,
927                                                     int32_t mode)
928{
929    int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
930    uint8_t ref_array[3 * 32 + 4];
931    uint8_t *ref_tmp = ref_array + 4;
932    const uint8_t *ref;
933    int32_t last;
934    int32_t h_cnt, idx0, fact_val0, idx1, fact_val1;
935    int32_t idx2, fact_val2, idx3, fact_val3;
936    int32_t angle, angle_loop;
937    int32_t inv_angle_val, offset;
938    uint64_t tmp0;
939    v16i8 top0, top1, top2, top3;
940    v16i8 dst_val0;
941    v16i8 zero = { 0 };
942    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
943    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
944
945    angle = intra_pred_angle_up[mode - 18];
946    inv_angle_val = inv_angle[mode - 18];
947    last = (angle) >> 3;
948    angle_loop = angle;
949
950    ref = src_top - 1;
951    if (angle < 0 && last < -1) {
952        inv_angle_val = inv_angle[mode - 18];
953
954        tmp0 = LD(ref);
955        SD(tmp0, ref_tmp);
956
957        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
958            offset = -1 + ((h_cnt * inv_angle_val + 128) >> 8);
959            ref_tmp[h_cnt] = src_left[offset];
960        }
961
962        ref = ref_tmp;
963    }
964
965    idx0 = angle_loop >> 5;
966    fact_val0 = angle_loop & 31;
967    angle_loop += angle;
968
969    idx1 = angle_loop >> 5;
970    fact_val1 = angle_loop & 31;
971    angle_loop += angle;
972
973    idx2 = angle_loop >> 5;
974    fact_val2 = angle_loop & 31;
975    angle_loop += angle;
976
977    idx3 = angle_loop >> 5;
978    fact_val3 = angle_loop & 31;
979
980    top0 = LD_SB(ref + idx0 + 1);
981    top1 = LD_SB(ref + idx1 + 1);
982    top2 = LD_SB(ref + idx2 + 1);
983    top3 = LD_SB(ref + idx3 + 1);
984
985    fact0 = __msa_fill_h(fact_val0);
986    fact1 = __msa_fill_h(32 - fact_val0);
987
988    fact2 = __msa_fill_h(fact_val1);
989    fact3 = __msa_fill_h(32 - fact_val1);
990
991    fact4 = __msa_fill_h(fact_val2);
992    fact5 = __msa_fill_h(32 - fact_val2);
993
994    fact6 = __msa_fill_h(fact_val3);
995    fact7 = __msa_fill_h(32 - fact_val3);
996
997    ILVR_D2_SH(fact2, fact0, fact6, fact4, fact0, fact2);
998    ILVR_D2_SH(fact3, fact1, fact7, fact5, fact1, fact3);
999    ILVR_B4_SH(zero, top0, zero, top1, zero, top2, zero, top3,
1000               diff0, diff2, diff4, diff6);
1001    SLDI_B4_SH(zero, diff0, zero, diff2, zero, diff4, zero, diff6, 2,
1002               diff1, diff3, diff5, diff7);
1003    ILVR_D2_SH(diff2, diff0, diff6, diff4, diff0, diff2);
1004    ILVR_D2_SH(diff3, diff1, diff7, diff5, diff1, diff3);
1005    MUL2(diff1, fact0, diff3, fact2, diff1, diff3);
1006
1007    diff1 += diff0 * fact1;
1008    diff3 += diff2 * fact3;
1009
1010    SRARI_H2_SH(diff1, diff3, 5);
1011    dst_val0 = __msa_pckev_b((v16i8) diff3, (v16i8) diff1);
1012    ST_W4(dst_val0, 0, 1, 2, 3, dst, stride);
1013}
1014
1015static void hevc_intra_pred_angular_upper_8width_msa(const uint8_t *src_top,
1016                                                     const uint8_t *src_left,
1017                                                     uint8_t *dst,
1018                                                     int32_t stride,
1019                                                     int32_t mode)
1020{
1021    int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
1022    uint8_t ref_array[3 * 32 + 4];
1023    uint8_t *ref_tmp = ref_array + 8;
1024    const uint8_t *ref;
1025    const uint8_t *src_left_tmp = src_left - 1;
1026    int32_t last, offset;
1027    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
1028    int32_t idx2, fact_val2, idx3, fact_val3;
1029    int32_t angle, angle_loop;
1030    int32_t inv_angle_val, inv_angle_val_loop;
1031    int32_t tmp0, tmp1, tmp2;
1032    v16i8 top0, top1, top2, top3;
1033    v16u8 dst_val0, dst_val1;
1034    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
1035    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
1036
1037    angle = intra_pred_angle_up[mode - 18];
1038    inv_angle_val = inv_angle[mode - 18];
1039    last = (angle) >> 2;
1040    angle_loop = angle;
1041
1042    ref = src_top - 1;
1043    if (last < -1) {
1044        inv_angle_val_loop = inv_angle_val * last;
1045
1046        tmp0 = LW(ref);
1047        tmp1 = LW(ref + 4);
1048        tmp2 = LW(ref + 8);
1049        SW(tmp0, ref_tmp);
1050        SW(tmp1, ref_tmp + 4);
1051        SW(tmp2, ref_tmp + 8);
1052
1053        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
1054            offset = (inv_angle_val_loop + 128) >> 8;
1055            ref_tmp[h_cnt] = src_left_tmp[offset];
1056            inv_angle_val_loop += inv_angle_val;
1057        }
1058        ref = ref_tmp;
1059    }
1060
1061    for (v_cnt = 0; v_cnt < 2; v_cnt++) {
1062        idx0 = (angle_loop) >> 5;
1063        fact_val0 = (angle_loop) & 31;
1064        angle_loop += angle;
1065
1066        idx1 = (angle_loop) >> 5;
1067        fact_val1 = (angle_loop) & 31;
1068        angle_loop += angle;
1069
1070        idx2 = (angle_loop) >> 5;
1071        fact_val2 = (angle_loop) & 31;
1072        angle_loop += angle;
1073
1074        idx3 = (angle_loop) >> 5;
1075        fact_val3 = (angle_loop) & 31;
1076        angle_loop += angle;
1077
1078        top0 = LD_SB(ref + idx0 + 1);
1079        top1 = LD_SB(ref + idx1 + 1);
1080        top2 = LD_SB(ref + idx2 + 1);
1081        top3 = LD_SB(ref + idx3 + 1);
1082
1083        fact0 = __msa_fill_h(fact_val0);
1084        fact1 = __msa_fill_h(32 - fact_val0);
1085        fact2 = __msa_fill_h(fact_val1);
1086        fact3 = __msa_fill_h(32 - fact_val1);
1087        fact4 = __msa_fill_h(fact_val2);
1088        fact5 = __msa_fill_h(32 - fact_val2);
1089        fact6 = __msa_fill_h(fact_val3);
1090        fact7 = __msa_fill_h(32 - fact_val3);
1091
1092        UNPCK_UB_SH(top0, diff0, diff1);
1093        UNPCK_UB_SH(top1, diff2, diff3);
1094        UNPCK_UB_SH(top2, diff4, diff5);
1095        UNPCK_UB_SH(top3, diff6, diff7);
1096
1097        SLDI_B4_SH(diff1, diff0, diff3, diff2, diff5, diff4, diff7, diff6, 2,
1098                   diff1, diff3, diff5, diff7);
1099        MUL4(diff1, fact0, diff3, fact2, diff5, fact4, diff7, fact6,
1100             diff1, diff3, diff5, diff7);
1101
1102        diff1 += diff0 * fact1;
1103        diff3 += diff2 * fact3;
1104        diff5 += diff4 * fact5;
1105        diff7 += diff6 * fact7;
1106
1107        SRARI_H4_SH(diff1, diff3, diff5, diff7, 5);
1108        PCKEV_B2_UB(diff3, diff1, diff7, diff5, dst_val0, dst_val1);
1109        ST_D4(dst_val0, dst_val1, 0, 1, 0, 1, dst, stride);
1110        dst += (4 * stride);
1111    }
1112}
1113
1114static void hevc_intra_pred_angular_upper_16width_msa(const uint8_t *src_top,
1115                                                      const uint8_t *src_left,
1116                                                      uint8_t *dst,
1117                                                      int32_t stride,
1118                                                      int32_t mode)
1119{
1120    int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
1121    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
1122    int32_t idx2, fact_val2, idx3, fact_val3;
1123    int32_t tmp0;
1124    int32_t angle, angle_loop, offset;
1125    int32_t inv_angle_val, inv_angle_val_loop;
1126    uint8_t ref_array[3 * 32 + 4];
1127    uint8_t *ref_tmp = ref_array + 16;
1128    const uint8_t *ref;
1129    const uint8_t *src_left_tmp = src_left - 1;
1130    int32_t last;
1131    v16u8 top0, top1, top2, top3, top4, top5, top6, top7;
1132    v16i8 dst0, dst1, dst2, dst3;
1133    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
1134    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
1135    v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
1136
1137    angle = intra_pred_angle_up[mode - 18];
1138    inv_angle_val = inv_angle[mode - 18];
1139    last = angle >> 1;
1140    angle_loop = angle;
1141
1142    ref = src_top - 1;
1143    if (last < -1) {
1144        inv_angle_val_loop = inv_angle_val * last;
1145
1146        top0 = LD_UB(ref);
1147        tmp0 = LW(ref + 16);
1148        ST_UB(top0, ref_tmp);
1149        SW(tmp0, ref_tmp + 16);
1150
1151        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
1152            offset = (inv_angle_val_loop + 128) >> 8;
1153            ref_tmp[h_cnt] = src_left_tmp[offset];
1154            inv_angle_val_loop += inv_angle_val;
1155        }
1156        ref = ref_tmp;
1157    }
1158
1159    for (v_cnt = 4; v_cnt--;) {
1160        idx0 = (angle_loop) >> 5;
1161        fact_val0 = (angle_loop) & 31;
1162        angle_loop += angle;
1163
1164        idx1 = (angle_loop) >> 5;
1165        fact_val1 = (angle_loop) & 31;
1166        angle_loop += angle;
1167
1168        idx2 = (angle_loop) >> 5;
1169        fact_val2 = (angle_loop) & 31;
1170        angle_loop += angle;
1171
1172        idx3 = (angle_loop) >> 5;
1173        fact_val3 = (angle_loop) & 31;
1174        angle_loop += angle;
1175
1176        LD_UB2(ref + idx0 + 1, 16, top0, top1);
1177        LD_UB2(ref + idx1 + 1, 16, top2, top3);
1178        LD_UB2(ref + idx2 + 1, 16, top4, top5);
1179        LD_UB2(ref + idx3 + 1, 16, top6, top7);
1180
1181        fact0 = __msa_fill_h(fact_val0);
1182        fact1 = __msa_fill_h(32 - fact_val0);
1183        fact2 = __msa_fill_h(fact_val1);
1184        fact3 = __msa_fill_h(32 - fact_val1);
1185        fact4 = __msa_fill_h(fact_val2);
1186        fact5 = __msa_fill_h(32 - fact_val2);
1187        fact6 = __msa_fill_h(fact_val3);
1188        fact7 = __msa_fill_h(32 - fact_val3);
1189
1190        SLDI_B4_UB(top1, top0, top3, top2, top5, top4, top7, top6, 1,
1191                   top1, top3, top5, top7);
1192        UNPCK_UB_SH(top0, diff0, diff1);
1193        UNPCK_UB_SH(top1, diff2, diff3);
1194        UNPCK_UB_SH(top2, diff4, diff5);
1195        UNPCK_UB_SH(top3, diff6, diff7);
1196        UNPCK_UB_SH(top4, diff8, diff9);
1197        UNPCK_UB_SH(top5, diff10, diff11);
1198        UNPCK_UB_SH(top6, diff12, diff13);
1199        UNPCK_UB_SH(top7, diff14, diff15);
1200
1201        MUL4(diff2, fact0, diff3, fact0, diff6, fact2, diff7, fact2,
1202             diff2, diff3, diff6, diff7);
1203        MUL4(diff10, fact4, diff11, fact4, diff14, fact6, diff15, fact6,
1204             diff10, diff11, diff14, diff15);
1205
1206        diff2 += diff0 * fact1;
1207        diff3 += diff1 * fact1;
1208        diff6 += diff4 * fact3;
1209        diff7 += diff5 * fact3;
1210        diff10 += diff8 * fact5;
1211        diff11 += diff9 * fact5;
1212        diff14 += diff12 * fact7;
1213        diff15 += diff13 * fact7;
1214
1215        SRARI_H4_SH(diff2, diff3, diff6, diff7, 5);
1216        SRARI_H4_SH(diff10, diff11, diff14, diff15, 5);
1217        PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
1218                    dst0, dst1, dst2, dst3);
1219        ST_SB4(dst0, dst1, dst2, dst3, dst, stride);
1220        dst += (4 * stride);
1221    }
1222}
1223
1224static void hevc_intra_pred_angular_upper_32width_msa(const uint8_t *src_top,
1225                                                      const uint8_t *src_left,
1226                                                      uint8_t *dst,
1227                                                      int32_t stride,
1228                                                      int32_t mode)
1229{
1230    int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
1231    uint8_t ref_array[3 * 32 + 4];
1232    uint8_t *ref_tmp;
1233    const uint8_t *ref;
1234    const uint8_t *src_left_tmp = src_left - 1;
1235    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
1236    int32_t tmp0, tmp1, tmp2, tmp3;
1237    int32_t angle, angle_loop;
1238    int32_t inv_angle_val, inv_angle_val_loop;
1239    int32_t last, offset;
1240    v16u8 top0, top1, top2, top3, top4, top5, top6, top7;
1241    v16i8 dst0, dst1, dst2, dst3;
1242    v8i16 fact0, fact1, fact2, fact3;
1243    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
1244    v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
1245
1246    ref_tmp = ref_array + 32;
1247
1248    angle = intra_pred_angle_up[mode - 18];
1249    inv_angle_val = inv_angle[mode - 18];
1250    last = angle;
1251    angle_loop = angle;
1252
1253    ref = src_top - 1;
1254    if (last < -1) {
1255        inv_angle_val_loop = inv_angle_val * last;
1256        LD_UB2(ref, 16, top0, top1);
1257        tmp0 = ref[32];
1258        tmp1 = ref[33];
1259        tmp2 = ref[34];
1260        tmp3 = ref[35];
1261
1262        ST_UB2(top0, top1, ref_tmp, 16);
1263        ref_tmp[32] = tmp0;
1264        ref_tmp[33] = tmp1;
1265        ref_tmp[34] = tmp2;
1266        ref_tmp[35] = tmp3;
1267
1268        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
1269            offset = (inv_angle_val_loop + 128) >> 8;
1270            ref_tmp[h_cnt] = src_left_tmp[offset];
1271            inv_angle_val_loop += inv_angle_val;
1272        }
1273
1274        ref = ref_tmp;
1275    }
1276
1277    for (v_cnt = 16; v_cnt--;) {
1278        idx0 = (angle_loop) >> 5;
1279        fact_val0 = (angle_loop) & 31;
1280        angle_loop += angle;
1281
1282        idx1 = (angle_loop) >> 5;
1283        fact_val1 = (angle_loop) & 31;
1284        angle_loop += angle;
1285
1286        top0 = LD_UB(ref + idx0 + 1);
1287        top4 = LD_UB(ref + idx1 + 1);
1288        top1 = LD_UB(ref + idx0 + 17);
1289        top5 = LD_UB(ref + idx1 + 17);
1290        top3 = LD_UB(ref + idx0 + 33);
1291        top7 = LD_UB(ref + idx1 + 33);
1292
1293        fact0 = __msa_fill_h(fact_val0);
1294        fact1 = __msa_fill_h(32 - fact_val0);
1295        fact2 = __msa_fill_h(fact_val1);
1296        fact3 = __msa_fill_h(32 - fact_val1);
1297
1298        top2 = top1;
1299        top6 = top5;
1300
1301        SLDI_B4_UB(top1, top0, top3, top2, top5, top4, top7, top6, 1,
1302                   top1, top3, top5, top7);
1303        UNPCK_UB_SH(top0, diff0, diff1);
1304        UNPCK_UB_SH(top1, diff2, diff3);
1305        UNPCK_UB_SH(top2, diff4, diff5);
1306        UNPCK_UB_SH(top3, diff6, diff7);
1307        UNPCK_UB_SH(top4, diff8, diff9);
1308        UNPCK_UB_SH(top5, diff10, diff11);
1309        UNPCK_UB_SH(top6, diff12, diff13);
1310        UNPCK_UB_SH(top7, diff14, diff15);
1311
1312        MUL4(diff2, fact0, diff3, fact0, diff6, fact0, diff7, fact0,
1313             diff2, diff3, diff6, diff7);
1314        MUL4(diff10, fact2, diff11, fact2, diff14, fact2, diff15, fact2,
1315             diff10, diff11, diff14, diff15);
1316
1317        diff2 += diff0 * fact1;
1318        diff3 += diff1 * fact1;
1319        diff6 += diff4 * fact1;
1320        diff7 += diff5 * fact1;
1321        diff10 += diff8 * fact3;
1322        diff11 += diff9 * fact3;
1323        diff14 += diff12 * fact3;
1324        diff15 += diff13 * fact3;
1325
1326        SRARI_H4_SH(diff2, diff3, diff6, diff7, 5);
1327        SRARI_H4_SH(diff10, diff11, diff14, diff15, 5);
1328        PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
1329                    dst0, dst1, dst2, dst3);
1330
1331        ST_SB2(dst0, dst1, dst, 16);
1332        dst += stride;
1333        ST_SB2(dst2, dst3, dst, 16);
1334        dst += stride;
1335    }
1336}
1337
1338static void hevc_intra_pred_angular_lower_4width_msa(const uint8_t *src_top,
1339                                                     const uint8_t *src_left,
1340                                                     uint8_t *dst,
1341                                                     int32_t stride,
1342                                                     int32_t mode)
1343{
1344    int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
1345    uint8_t ref_array[3 * 32 + 4];
1346    uint8_t *ref_tmp = ref_array + 4;
1347    const uint8_t *ref;
1348    int32_t last, offset;
1349    int32_t h_cnt, idx0, fact_val0, idx1, fact_val1;
1350    int32_t idx2, fact_val2, idx3, fact_val3;
1351    int32_t angle, angle_loop, inv_angle_val;
1352    uint64_t tmp0;
1353    v16i8 dst_val0, dst_val1;
1354    v16u8 top0, top1, top2, top3;
1355    v16u8 zero = { 0 };
1356    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
1357    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
1358
1359    angle = intra_pred_angle_low[mode - 2];
1360    last = angle >> 3;
1361    angle_loop = angle;
1362
1363    ref = src_left - 1;
1364    if (last < -1) {
1365        inv_angle_val = inv_angle[mode - 11];
1366
1367        tmp0 = LD(ref);
1368        SD(tmp0, ref_tmp);
1369
1370        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
1371            offset = -1 + ((h_cnt * inv_angle_val + 128) >> 8);
1372            ref_tmp[h_cnt] = src_top[offset];
1373        }
1374
1375        ref = ref_tmp;
1376    }
1377
1378    idx0 = angle_loop >> 5;
1379    fact_val0 = angle_loop & 31;
1380    angle_loop += angle;
1381
1382    idx1 = angle_loop >> 5;
1383    fact_val1 = angle_loop & 31;
1384    angle_loop += angle;
1385
1386    idx2 = angle_loop >> 5;
1387    fact_val2 = angle_loop & 31;
1388    angle_loop += angle;
1389
1390    idx3 = angle_loop >> 5;
1391    fact_val3 = angle_loop & 31;
1392
1393    top0 = LD_UB(ref + idx0 + 1);
1394    top1 = LD_UB(ref + idx1 + 1);
1395    top2 = LD_UB(ref + idx2 + 1);
1396    top3 = LD_UB(ref + idx3 + 1);
1397
1398    fact0 = __msa_fill_h(fact_val0);
1399    fact1 = __msa_fill_h(32 - fact_val0);
1400    fact2 = __msa_fill_h(fact_val1);
1401    fact3 = __msa_fill_h(32 - fact_val1);
1402    fact4 = __msa_fill_h(fact_val2);
1403    fact5 = __msa_fill_h(32 - fact_val2);
1404    fact6 = __msa_fill_h(fact_val3);
1405    fact7 = __msa_fill_h(32 - fact_val3);
1406
1407    ILVR_D2_SH(fact2, fact0, fact6, fact4, fact0, fact2);
1408    ILVR_D2_SH(fact3, fact1, fact7, fact5, fact1, fact3);
1409    ILVR_B4_SH(zero, top0, zero, top1, zero, top2, zero, top3,
1410               diff0, diff2, diff4, diff6);
1411    SLDI_B4_SH(zero, diff0, zero, diff2, zero, diff4, zero, diff6, 2,
1412               diff1, diff3, diff5, diff7);
1413    ILVR_D2_SH(diff2, diff0, diff6, diff4, diff0, diff2);
1414    ILVR_D2_SH(diff3, diff1, diff7, diff5, diff1, diff3);
1415    MUL2(diff1, fact0, diff3, fact2, diff1, diff3);
1416
1417    diff1 += diff0 * fact1;
1418    diff3 += diff2 * fact3;
1419
1420    SRARI_H2_SH(diff1, diff3, 5);
1421    PCKEV_B2_SB(diff1, diff1, diff3, diff3, dst_val0, dst_val1);
1422
1423    diff0 = (v8i16) __msa_pckev_b(dst_val1, dst_val0);
1424    diff1 = (v8i16) __msa_pckod_b(dst_val1, dst_val0);
1425
1426    diff2 = (v8i16) __msa_pckev_w((v4i32) diff1, (v4i32) diff0);
1427
1428    dst_val0 = __msa_pckev_b((v16i8) diff2, (v16i8) diff2);
1429    dst_val1 = __msa_pckod_b((v16i8) diff2, (v16i8) diff2);
1430
1431    ST_W2(dst_val0, 0, 1, dst, stride);
1432    ST_W2(dst_val1, 0, 1, dst + 2 * stride, stride);
1433}
1434
1435static void hevc_intra_pred_angular_lower_8width_msa(const uint8_t *src_top,
1436                                                     const uint8_t *src_left,
1437                                                     uint8_t *dst,
1438                                                     int32_t stride,
1439                                                     int32_t mode)
1440{
1441    int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
1442    uint8_t ref_array[3 * 32 + 4];
1443    uint8_t *ref_tmp = ref_array + 8;
1444    const uint8_t *ref;
1445    const uint8_t *src_top_tmp = src_top - 1;
1446    uint8_t *dst_org;
1447    int32_t last, offset, tmp0, tmp1, tmp2;
1448    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
1449    int32_t idx2, fact_val2, idx3, fact_val3;
1450    int32_t angle, angle_loop, inv_angle_val;
1451    v16i8 top0, top1, top2, top3;
1452    v16i8 dst_val0, dst_val1, dst_val2, dst_val3;
1453    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
1454    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
1455
1456    angle = intra_pred_angle_low[mode - 2];
1457    last = (angle) >> 2;
1458    angle_loop = angle;
1459
1460    ref = src_left - 1;
1461    if (last < -1) {
1462        inv_angle_val = inv_angle[mode - 11];
1463
1464        tmp0 = LW(ref);
1465        tmp1 = LW(ref + 4);
1466        tmp2 = LW(ref + 8);
1467        SW(tmp0, ref_tmp);
1468        SW(tmp1, ref_tmp + 4);
1469        SW(tmp2, ref_tmp + 8);
1470
1471        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
1472            offset = (h_cnt * inv_angle_val + 128) >> 8;
1473            ref_tmp[h_cnt] = src_top_tmp[offset];
1474        }
1475
1476        ref = ref_tmp;
1477    }
1478
1479    for (v_cnt = 0; v_cnt < 2; v_cnt++) {
1480        dst_org = dst;
1481
1482        idx0 = angle_loop >> 5;
1483        fact_val0 = angle_loop & 31;
1484        angle_loop += angle;
1485
1486        idx1 = angle_loop >> 5;
1487        fact_val1 = angle_loop & 31;
1488        angle_loop += angle;
1489
1490        idx2 = angle_loop >> 5;
1491        fact_val2 = angle_loop & 31;
1492        angle_loop += angle;
1493
1494        idx3 = angle_loop >> 5;
1495        fact_val3 = angle_loop & 31;
1496        angle_loop += angle;
1497
1498        top0 = LD_SB(ref + idx0 + 1);
1499        top1 = LD_SB(ref + idx1 + 1);
1500        top2 = LD_SB(ref + idx2 + 1);
1501        top3 = LD_SB(ref + idx3 + 1);
1502
1503        fact0 = __msa_fill_h(fact_val0);
1504        fact1 = __msa_fill_h(32 - fact_val0);
1505        fact2 = __msa_fill_h(fact_val1);
1506        fact3 = __msa_fill_h(32 - fact_val1);
1507        fact4 = __msa_fill_h(fact_val2);
1508        fact5 = __msa_fill_h(32 - fact_val2);
1509        fact6 = __msa_fill_h(fact_val3);
1510        fact7 = __msa_fill_h(32 - fact_val3);
1511
1512        UNPCK_UB_SH(top0, diff0, diff1);
1513        UNPCK_UB_SH(top1, diff2, diff3);
1514        UNPCK_UB_SH(top2, diff4, diff5);
1515        UNPCK_UB_SH(top3, diff6, diff7);
1516        SLDI_B4_SH(diff1, diff0, diff3, diff2, diff5, diff4, diff7, diff6, 2,
1517                   diff1, diff3, diff5, diff7);
1518        MUL4(diff1, fact0, diff3, fact2, diff5, fact4, diff7, fact6,
1519             diff1, diff3, diff5, diff7);
1520
1521        diff1 += diff0 * fact1;
1522        diff3 += diff2 * fact3;
1523        diff5 += diff4 * fact5;
1524        diff7 += diff6 * fact7;
1525
1526        SRARI_H4_SH(diff1, diff3, diff5, diff7, 5);
1527        PCKEV_B4_SB(diff1, diff1, diff3, diff3, diff5, diff5, diff7, diff7,
1528                    dst_val0, dst_val1, dst_val2, dst_val3);
1529        ILVR_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff0, diff1);
1530        ILVRL_H2_SH(diff1, diff0, diff3, diff4);
1531        ST_W8(diff3, diff4, 0, 1, 2, 3, 0, 1, 2, 3, dst_org, stride);
1532        dst += 4;
1533    }
1534}
1535
1536static void hevc_intra_pred_angular_lower_16width_msa(const uint8_t *src_top,
1537                                                      const uint8_t *src_left,
1538                                                      uint8_t *dst,
1539                                                      int32_t stride,
1540                                                      int32_t mode)
1541{
1542    int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
1543    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
1544    int32_t idx2, fact_val2, idx3, fact_val3, tmp0;
1545    v16i8 top0, top1, dst_val0, top2, top3, dst_val1;
1546    v16i8 top4, top5, dst_val2, top6, top7, dst_val3;
1547    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
1548    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
1549    v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
1550    int32_t angle, angle_loop, inv_angle_val, offset;
1551    uint8_t ref_array[3 * 32 + 4];
1552    uint8_t *ref_tmp = ref_array + 16;
1553    const uint8_t *ref, *src_top_tmp = src_top - 1;
1554    uint8_t *dst_org;
1555    int32_t last;
1556
1557    angle = intra_pred_angle_low[mode - 2];
1558    last = (angle) >> 1;
1559    angle_loop = angle;
1560
1561    ref = src_left - 1;
1562    if (last < -1) {
1563        inv_angle_val = inv_angle[mode - 11];
1564
1565        top0 = LD_SB(ref);
1566        tmp0 = LW(ref + 16);
1567        ST_SB(top0, ref_tmp);
1568        SW(tmp0, ref_tmp + 16);
1569
1570        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
1571            offset = (h_cnt * inv_angle_val + 128) >> 8;
1572            ref_tmp[h_cnt] = src_top_tmp[offset];
1573        }
1574
1575        ref = ref_tmp;
1576    }
1577
1578    for (v_cnt = 0; v_cnt < 4; v_cnt++) {
1579        dst_org = dst;
1580
1581        idx0 = angle_loop >> 5;
1582        fact_val0 = angle_loop & 31;
1583        angle_loop += angle;
1584
1585        idx1 = angle_loop >> 5;
1586        fact_val1 = angle_loop & 31;
1587        angle_loop += angle;
1588
1589        idx2 = angle_loop >> 5;
1590        fact_val2 = angle_loop & 31;
1591        angle_loop += angle;
1592
1593        idx3 = angle_loop >> 5;
1594        fact_val3 = angle_loop & 31;
1595        angle_loop += angle;
1596
1597        LD_SB2(ref + idx0 + 1, 16, top0, top1);
1598        LD_SB2(ref + idx1 + 1, 16, top2, top3);
1599        LD_SB2(ref + idx2 + 1, 16, top4, top5);
1600        LD_SB2(ref + idx3 + 1, 16, top6, top7);
1601
1602        fact0 = __msa_fill_h(fact_val0);
1603        fact1 = __msa_fill_h(32 - fact_val0);
1604        fact2 = __msa_fill_h(fact_val1);
1605        fact3 = __msa_fill_h(32 - fact_val1);
1606        fact4 = __msa_fill_h(fact_val2);
1607        fact5 = __msa_fill_h(32 - fact_val2);
1608        fact6 = __msa_fill_h(fact_val3);
1609        fact7 = __msa_fill_h(32 - fact_val3);
1610
1611        SLDI_B4_SB(top1, top0, top3, top2, top5, top4, top7, top6, 1,
1612                   top1, top3, top5, top7);
1613
1614        UNPCK_UB_SH(top0, diff0, diff1);
1615        UNPCK_UB_SH(top1, diff2, diff3);
1616        UNPCK_UB_SH(top2, diff4, diff5);
1617        UNPCK_UB_SH(top3, diff6, diff7);
1618        UNPCK_UB_SH(top4, diff8, diff9);
1619        UNPCK_UB_SH(top5, diff10, diff11);
1620        UNPCK_UB_SH(top6, diff12, diff13);
1621        UNPCK_UB_SH(top7, diff14, diff15);
1622
1623        MUL4(diff2, fact0, diff3, fact0, diff6, fact2, diff7, fact2,
1624             diff2, diff3, diff6, diff7);
1625        MUL4(diff10, fact4, diff11, fact4, diff14, fact6, diff15, fact6,
1626             diff10, diff11, diff14, diff15);
1627
1628        diff2 += diff0 * fact1;
1629        diff3 += diff1 * fact1;
1630        diff6 += diff4 * fact3;
1631        diff7 += diff5 * fact3;
1632        diff10 += diff8 * fact5;
1633        diff11 += diff9 * fact5;
1634        diff14 += diff12 * fact7;
1635        diff15 += diff13 * fact7;
1636
1637        SRARI_H4_SH(diff2, diff3, diff6, diff7, 5);
1638        SRARI_H4_SH(diff10, diff11, diff14, diff15, 5);
1639        PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
1640                    dst_val0, dst_val1, dst_val2, dst_val3);
1641        ILVR_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff0, diff1);
1642        ILVL_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff2, diff3);
1643        ILVRL_H2_SH(diff1, diff0, diff4, diff5);
1644        ILVRL_H2_SH(diff3, diff2, diff6, diff7);
1645        ST_W8(diff4, diff5, 0, 1, 2, 3, 0, 1, 2, 3, dst_org, stride);
1646        dst_org += (8 * stride);
1647        ST_W8(diff6, diff7, 0, 1, 2, 3, 0, 1, 2, 3, dst_org, stride);
1648        dst += 4;
1649    }
1650}
1651
/* Angular (directional) intra prediction for a 32x32 block, "lower" modes
 * (2..17): the prediction direction points mainly along the left edge, so
 * the block is predicted column-wise from src_left and written to dst
 * transposed, two columns per loop iteration, via halfword stores. */
static void hevc_intra_pred_angular_lower_32width_msa(const uint8_t *src_top,
                                                      const uint8_t *src_left,
                                                      uint8_t *dst,
                                                      int32_t stride,
                                                      int32_t mode)
{
    /* Inverse angles for the negative-angle modes 11..17; used to project
     * top samples onto the extended left reference (offset = h*inv/256). */
    int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1, tmp0;
    v16i8 top0, top1, dst_val0, top2, top3, dst_val1;
    v16i8 top4, top5, dst_val2, top6, top7, dst_val3;
    v8i16 fact0, fact1, fact2, fact3;
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
    int32_t angle, angle_loop, inv_angle_val, offset;
    /* Scratch reference line: 32 projected samples before the origin,
     * 2*32 regular samples plus a 4-byte tail after it. */
    uint8_t ref_array[3 * 32 + 4];
    uint8_t *ref_tmp = ref_array + 32;
    const uint8_t *ref, *src_top_tmp = src_top - 1;
    uint8_t *dst_org;
    int32_t last;

    angle = intra_pred_angle_low[mode - 2];
    last = angle;
    angle_loop = angle;

    ref = src_left - 1;
    /* Negative angle: copy the left reference into the scratch buffer and
     * extend it backwards with top samples projected via the inverse angle. */
    if (last < -1) {
        inv_angle_val = inv_angle[mode - 11];

        LD_SB2(ref, 16, top0, top1);
        tmp0 = LW(ref + 32);
        ST_SB2(top0, top1, ref_tmp, 16);
        SW(tmp0, ref_tmp + 32);

        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
            offset = (h_cnt * inv_angle_val + 128) >> 8;
            ref_tmp[h_cnt] = src_top_tmp[offset];
        }

        ref = ref_tmp;
    }

    /* 16 iterations, two predicted columns (idx0/idx1) per iteration. */
    for (v_cnt = 0; v_cnt < 16; v_cnt++) {
        dst_org = dst;
        /* Integer sample offset and 5-bit interpolation fraction. */
        idx0 = angle_loop >> 5;
        fact_val0 = angle_loop & 31;
        angle_loop += angle;

        idx1 = angle_loop >> 5;
        fact_val1 = angle_loop & 31;
        angle_loop += angle;

        /* Load the 33 reference samples needed for each of the two columns. */
        top0 = LD_SB(ref + idx0 + 1);
        top4 = LD_SB(ref + idx1 + 1);
        top1 = LD_SB(ref + idx0 + 17);
        top5 = LD_SB(ref + idx1 + 17);
        top3 = LD_SB(ref + idx0 + 33);
        top7 = LD_SB(ref + idx1 + 33);

        /* Per-column weights: fact*ref[i+1] + (32-fact)*ref[i]. */
        fact0 = __msa_fill_h(fact_val0);
        fact1 = __msa_fill_h(32 - fact_val0);
        fact2 = __msa_fill_h(fact_val1);
        fact3 = __msa_fill_h(32 - fact_val1);

        top2 = top1;
        top6 = top5;

        /* Build the "+1" shifted neighbours by sliding each pair by 1 byte. */
        SLDI_B4_SB(top1, top0, top3, top2, top5, top4, top7, top6, 1,
                   top1, top3, top5, top7);

        /* Widen to 16 bits for the weighted blend. */
        UNPCK_UB_SH(top0, diff0, diff1);
        UNPCK_UB_SH(top1, diff2, diff3);
        UNPCK_UB_SH(top2, diff4, diff5);
        UNPCK_UB_SH(top3, diff6, diff7);
        UNPCK_UB_SH(top4, diff8, diff9);
        UNPCK_UB_SH(top5, diff10, diff11);
        UNPCK_UB_SH(top6, diff12, diff13);
        UNPCK_UB_SH(top7, diff14, diff15);

        MUL4(diff2, fact0, diff3, fact0, diff6, fact0, diff7, fact0,
             diff2, diff3, diff6, diff7);
        MUL4(diff10, fact2, diff11, fact2, diff14, fact2, diff15, fact2,
             diff10, diff11, diff14, diff15);

        diff2 += diff0 * fact1;
        diff3 += diff1 * fact1;
        diff6 += diff4 * fact1;
        diff7 += diff5 * fact1;
        diff10 += diff8 * fact3;
        diff11 += diff9 * fact3;
        diff14 += diff12 * fact3;
        diff15 += diff13 * fact3;

        /* Round-shift by 5 (sum of weights is 32), then narrow to bytes. */
        SRARI_H4_SH(diff2, diff3, diff6, diff7, 5);
        SRARI_H4_SH(diff10, diff11, diff14, diff15, 5);
        PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
                    dst_val0, dst_val1, dst_val2, dst_val3);
        /* Interleave the two columns into halfword pairs for the
         * transposed store below. */
        ILVRL_B2_SH(dst_val2, dst_val0, diff0, diff1);
        ILVRL_B2_SH(dst_val3, dst_val1, diff2, diff3);

        /* Store 32 rows x 2 columns as 2-byte units. */
        ST_H8(diff0, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride)
        dst_org += (8 * stride);
        ST_H8(diff1, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride)
        dst_org += (8 * stride);
        ST_H8(diff2, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride)
        dst_org += (8 * stride);
        ST_H8(diff3, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride)
        dst_org += (8 * stride);

        dst += 2;
    }
}
1763
1764static void intra_predict_vert_32x32_msa(const uint8_t *src, uint8_t *dst,
1765                                         int32_t dst_stride)
1766{
1767    uint32_t row;
1768    v16u8 src1, src2;
1769
1770    src1 = LD_UB(src);
1771    src2 = LD_UB(src + 16);
1772
1773    for (row = 32; row--;) {
1774        ST_UB2(src1, src2, dst, 16);
1775        dst += dst_stride;
1776    }
1777}
1778
/* Planar prediction entry point for 4x4 blocks (log2 size index 0);
 * thin wrapper matching the HEVCPredContext.pred_planar[] signature. */
void ff_hevc_intra_pred_planar_0_msa(uint8_t *dst,
                                     const uint8_t *src_top,
                                     const uint8_t *src_left,
                                     ptrdiff_t stride)
{
    hevc_intra_pred_plane_4x4_msa(src_top, src_left, dst, stride);
}
1786
/* Planar prediction entry point for 8x8 blocks (log2 size index 1);
 * thin wrapper matching the HEVCPredContext.pred_planar[] signature. */
void ff_hevc_intra_pred_planar_1_msa(uint8_t *dst,
                                     const uint8_t *src_top,
                                     const uint8_t *src_left,
                                     ptrdiff_t stride)
{
    hevc_intra_pred_plane_8x8_msa(src_top, src_left, dst, stride);
}
1794
/* Planar prediction entry point for 16x16 blocks (log2 size index 2);
 * thin wrapper matching the HEVCPredContext.pred_planar[] signature. */
void ff_hevc_intra_pred_planar_2_msa(uint8_t *dst,
                                     const uint8_t *src_top,
                                     const uint8_t *src_left,
                                     ptrdiff_t stride)
{
    hevc_intra_pred_plane_16x16_msa(src_top, src_left, dst, stride);
}
1802
/* Planar prediction entry point for 32x32 blocks (log2 size index 3);
 * thin wrapper matching the HEVCPredContext.pred_planar[] signature. */
void ff_hevc_intra_pred_planar_3_msa(uint8_t *dst,
                                     const uint8_t *src_top,
                                     const uint8_t *src_left,
                                     ptrdiff_t stride)
{
    hevc_intra_pred_plane_32x32_msa(src_top, src_left, dst, stride);
}
1810
/* DC prediction dispatcher: route to the block-size specific MSA kernel
 * based on the transform block's log2 size (2..5, i.e. 4x4 .. 32x32).
 * Other sizes do not occur and are left untouched, as before. */
void ff_hevc_intra_pred_dc_msa(uint8_t *dst, const uint8_t *src_top,
                               const uint8_t *src_left,
                               ptrdiff_t stride, int log2, int c_idx)
{
    if (log2 == 2) {
        hevc_intra_pred_dc_4x4_msa(src_top, src_left, dst, stride, c_idx);
    } else if (log2 == 3) {
        hevc_intra_pred_dc_8x8_msa(src_top, src_left, dst, stride, c_idx);
    } else if (log2 == 4) {
        hevc_intra_pred_dc_16x16_msa(src_top, src_left, dst, stride, c_idx);
    } else if (log2 == 5) {
        /* 32x32 DC never applies the chroma-dependent filtering, so the
         * kernel takes no c_idx argument. */
        hevc_intra_pred_dc_32x32_msa(src_top, src_left, dst, stride);
    }
}
1833
/* Angular prediction dispatcher for 4x4 blocks: the pure-horizontal (10)
 * and pure-vertical (26) modes use dedicated kernels; the remaining modes
 * split into "upper" (18..34, top-dominated) and "lower" (2..17,
 * left-dominated) angular kernels. */
void ff_pred_intra_pred_angular_0_msa(uint8_t *dst,
                                      const uint8_t *src_top,
                                      const uint8_t *src_left,
                                      ptrdiff_t stride, int c_idx, int mode)
{
    switch (mode) {
    case 10:
        hevc_intra_pred_horiz_4x4_msa(src_top, src_left, dst, stride, c_idx);
        break;
    case 26:
        hevc_intra_pred_vert_4x4_msa(src_top, src_left, dst, stride, c_idx);
        break;
    default:
        if (mode >= 18)
            hevc_intra_pred_angular_upper_4width_msa(src_top, src_left,
                                                     dst, stride, mode);
        else
            hevc_intra_pred_angular_lower_4width_msa(src_top, src_left,
                                                     dst, stride, mode);
        break;
    }
}
1851
/* Angular prediction dispatcher for 8x8 blocks: horizontal (10) and
 * vertical (26) modes use dedicated kernels; other modes go to the
 * "upper" (>= 18) or "lower" (< 18) angular kernel. */
void ff_pred_intra_pred_angular_1_msa(uint8_t *dst,
                                      const uint8_t *src_top,
                                      const uint8_t *src_left,
                                      ptrdiff_t stride, int c_idx, int mode)
{
    switch (mode) {
    case 10:
        hevc_intra_pred_horiz_8x8_msa(src_top, src_left, dst, stride, c_idx);
        break;
    case 26:
        hevc_intra_pred_vert_8x8_msa(src_top, src_left, dst, stride, c_idx);
        break;
    default:
        if (mode >= 18)
            hevc_intra_pred_angular_upper_8width_msa(src_top, src_left,
                                                     dst, stride, mode);
        else
            hevc_intra_pred_angular_lower_8width_msa(src_top, src_left,
                                                     dst, stride, mode);
        break;
    }
}
1869
/* Angular prediction dispatcher for 16x16 blocks: horizontal (10) and
 * vertical (26) modes use dedicated kernels; other modes go to the
 * "upper" (>= 18) or "lower" (< 18) angular kernel. */
void ff_pred_intra_pred_angular_2_msa(uint8_t *dst,
                                      const uint8_t *src_top,
                                      const uint8_t *src_left,
                                      ptrdiff_t stride, int c_idx, int mode)
{
    switch (mode) {
    case 10:
        hevc_intra_pred_horiz_16x16_msa(src_top, src_left, dst, stride, c_idx);
        break;
    case 26:
        hevc_intra_pred_vert_16x16_msa(src_top, src_left, dst, stride, c_idx);
        break;
    default:
        if (mode >= 18)
            hevc_intra_pred_angular_upper_16width_msa(src_top, src_left,
                                                      dst, stride, mode);
        else
            hevc_intra_pred_angular_lower_16width_msa(src_top, src_left,
                                                      dst, stride, mode);
        break;
    }
}
1887
/* Angular prediction dispatcher for 32x32 blocks: horizontal (10) and
 * vertical (26) modes use dedicated kernels; other modes go to the
 * "upper" (>= 18) or "lower" (< 18) angular kernel. Note the 32x32
 * horizontal/vertical kernels take no c_idx (no chroma filtering at
 * this size), and the vertical one only needs the top reference row. */
void ff_pred_intra_pred_angular_3_msa(uint8_t *dst,
                                      const uint8_t *src_top,
                                      const uint8_t *src_left,
                                      ptrdiff_t stride, int c_idx, int mode)
{
    switch (mode) {
    case 10:
        hevc_intra_pred_horiz_32x32_msa(src_top, src_left, dst, stride);
        break;
    case 26:
        intra_predict_vert_32x32_msa(src_top, dst, stride);
        break;
    default:
        if (mode >= 18)
            hevc_intra_pred_angular_upper_32width_msa(src_top, src_left,
                                                      dst, stride, mode);
        else
            hevc_intra_pred_angular_lower_32width_msa(src_top, src_left,
                                                      dst, stride, mode);
        break;
    }
}
1905
/* Full 8-bit intra prediction of one 16x16 transform block.
 * Appears to be a macro-expanded MSA port of the generic intra_pred()
 * template (hevcpred_template.c) — TODO confirm against that file.
 * Steps: (1) determine neighbour availability, (2) gather the left/top
 * reference samples, (3) under constrained intra prediction substitute
 * samples from non-intra PUs, (4) extend missing edges, (5) optionally
 * apply the [1 2 1]/4 smoothing filter, (6) dispatch to the planar/DC/
 * angular predictor for log2 size 4. */
void ff_intra_pred_8_16x16_msa(HEVCContext *s, int x0, int y0, int c_idx)
{
    v16u8 vec0;
    HEVCLocalContext *lc = s->HEVClc;
    int i;
    /* Chroma subsampling shifts; x0/y0 are luma coordinates. */
    int hshift = s->ps.sps->hshift[c_idx];
    int vshift = s->ps.sps->vshift[c_idx];
    int size_in_luma_h = 16 << hshift;
    int size_in_tbs_h = size_in_luma_h >> s->ps.sps->log2_min_tb_size;
    int size_in_luma_v = 16 << vshift;
    int size_in_tbs_v = size_in_luma_v >> s->ps.sps->log2_min_tb_size;
    int x = x0 >> hshift;
    int y = y0 >> vshift;
    int x_tb = (x0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
    int y_tb = (y0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;

    /* z-scan address of this TB, compared below against neighbours to
     * decide whether they have been decoded yet. */
    int cur_tb_addr =
        s->ps.pps->min_tb_addr_zs[(y_tb) * (s->ps.sps->tb_mask + 2) + (x_tb)];

    ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(uint8_t);
    uint8_t *src = (uint8_t *) s->frame->data[c_idx] + x + y * stride;

    int min_pu_width = s->ps.sps->min_pu_width;

    enum IntraPredMode mode = c_idx ? lc->tu.intra_pred_mode_c :
        lc->tu.intra_pred_mode;
    uint32_t a;
    /* Reference buffers: index -1 is the corner, 0..15 the block edge,
     * 16..31 the bottom-left / top-right extensions. */
    uint8_t left_array[2 * 32 + 1];
    uint8_t filtered_left_array[2 * 32 + 1];
    uint8_t top_array[2 * 32 + 1];
    uint8_t filtered_top_array[2 * 32 + 1];

    uint8_t *left = left_array + 1;
    uint8_t *top = top_array + 1;
    uint8_t *filtered_left = filtered_left_array + 1;
    uint8_t *filtered_top = filtered_top_array + 1;
    /* Bottom-left / top-right additionally require the neighbour TB to
     * precede this one in z-scan order. */
    int cand_bottom_left = lc->na.cand_bottom_left
        && cur_tb_addr >
        s->ps.pps->min_tb_addr_zs[((y_tb + size_in_tbs_v) & s->ps.sps->tb_mask) *
                               (s->ps.sps->tb_mask + 2) + (x_tb - 1)];
    int cand_left = lc->na.cand_left;
    int cand_up_left = lc->na.cand_up_left;
    int cand_up = lc->na.cand_up;
    int cand_up_right = lc->na.cand_up_right
        && cur_tb_addr >
        s->ps.pps->min_tb_addr_zs[(y_tb - 1) * (s->ps.sps->tb_mask + 2) +
                               ((x_tb + size_in_tbs_h) & s->ps.sps->tb_mask)];

    /* Number of valid bottom-left / top-right samples, clipped at the
     * picture borders. */
    int bottom_left_size =
        (((y0 + 2 * size_in_luma_v) >
          (s->ps.sps->height) ? (s->ps.sps->height) : (y0 +
                                                 2 * size_in_luma_v)) -
         (y0 + size_in_luma_v)) >> vshift;
    int top_right_size =
        (((x0 + 2 * size_in_luma_h) >
          (s->ps.sps->width) ? (s->ps.sps->width) : (x0 + 2 * size_in_luma_h)) -
         (x0 + size_in_luma_h)) >> hshift;

    /* Constrained intra prediction: a neighbour edge is only usable if
     * every PU covering it was intra coded; scan the PUs along each edge
     * (step 2 because the minimum PU size spans two min-PU rows/cols). */
    if (s->ps.pps->constrained_intra_pred_flag == 1) {
        int size_in_luma_pu_v = ((size_in_luma_v) >> s->ps.sps->log2_min_pu_size);
        int size_in_luma_pu_h = ((size_in_luma_h) >> s->ps.sps->log2_min_pu_size);
        int on_pu_edge_x = !(x0 & ((1 << s->ps.sps->log2_min_pu_size) - 1));
        int on_pu_edge_y = !(y0 & ((1 << s->ps.sps->log2_min_pu_size) - 1));
        if (!size_in_luma_pu_h)
            size_in_luma_pu_h++;
        if (cand_bottom_left == 1 && on_pu_edge_x) {
            int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
            int y_bottom_pu =
                ((y0 + size_in_luma_v) >> s->ps.sps->log2_min_pu_size);
            int max =
                ((size_in_luma_pu_v) >
                 (s->ps.sps->min_pu_height -
                  y_bottom_pu) ? (s->ps.sps->min_pu_height -
                                  y_bottom_pu) : (size_in_luma_pu_v));
            cand_bottom_left = 0;
            for (i = 0; i < max; i += 2)
                cand_bottom_left |=
                    ((s->ref->tab_mvf[(x_left_pu) +
                                      (y_bottom_pu +
                                       i) * min_pu_width]).pred_flag ==
                     PF_INTRA);
        }
        if (cand_left == 1 && on_pu_edge_x) {
            int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
            int y_left_pu = ((y0) >> s->ps.sps->log2_min_pu_size);
            int max =
                ((size_in_luma_pu_v) >
                 (s->ps.sps->min_pu_height -
                  y_left_pu) ? (s->ps.sps->min_pu_height -
                                y_left_pu) : (size_in_luma_pu_v));
            cand_left = 0;
            for (i = 0; i < max; i += 2)
                cand_left |=
                    ((s->ref->tab_mvf[(x_left_pu) +
                                      (y_left_pu +
                                       i) * min_pu_width]).pred_flag ==
                     PF_INTRA);
        }
        if (cand_up_left == 1) {
            int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
            int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
            cand_up_left =
                (s->ref->tab_mvf[(x_left_pu) +
                                 (y_top_pu) * min_pu_width]).pred_flag ==
                PF_INTRA;
        }
        if (cand_up == 1 && on_pu_edge_y) {
            int x_top_pu = ((x0) >> s->ps.sps->log2_min_pu_size);
            int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
            int max =
                ((size_in_luma_pu_h) >
                 (s->ps.sps->min_pu_width -
                  x_top_pu) ? (s->ps.sps->min_pu_width -
                               x_top_pu) : (size_in_luma_pu_h));
            cand_up = 0;
            for (i = 0; i < max; i += 2)
                cand_up |=
                    ((s->ref->tab_mvf[(x_top_pu + i) +
                                      (y_top_pu) *
                                      min_pu_width]).pred_flag == PF_INTRA);
        }
        if (cand_up_right == 1 && on_pu_edge_y) {
            int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
            int x_right_pu =
                ((x0 + size_in_luma_h) >> s->ps.sps->log2_min_pu_size);
            int max =
                ((size_in_luma_pu_h) >
                 (s->ps.sps->min_pu_width -
                  x_right_pu) ? (s->ps.sps->min_pu_width -
                                 x_right_pu) : (size_in_luma_pu_h));
            cand_up_right = 0;
            for (i = 0; i < max; i += 2)
                cand_up_right |=
                    ((s->ref->tab_mvf[(x_right_pu + i) +
                                      (y_top_pu) *
                                      min_pu_width]).pred_flag == PF_INTRA);
        }

        /* Pre-fill the reference arrays with the 8-bit mid value (128)
         * so that samples never overwritten below stay at the default. */
        vec0 = (v16u8) __msa_ldi_b(128);

        ST_UB4(vec0, vec0, vec0, vec0, left, 16);

        ST_UB4(vec0, vec0, vec0, vec0, top, 16);

        top[-1] = 128;
    }
    /* Copy the available reconstructed neighbours into left[]/top[]. */
    if (cand_up_left) {
        left[-1] = src[(-1) + stride * (-1)];
        top[-1] = left[-1];
    }
    if (cand_up) {
        vec0 = LD_UB(src - stride);
        ST_UB(vec0, top);
    }
    if (cand_up_right) {
        vec0 = LD_UB(src - stride + 16);
        ST_UB(vec0, (top + 16));

        /* Pad past the picture's right border by repeating the last
         * valid top-right sample, 4 bytes at a time. */
        do {
            uint32_t pix =
                ((src[(16 + top_right_size - 1) + stride * (-1)]) *
                 0x01010101U);
            for (i = 0; i < (16 - top_right_size); i += 4)
                ((((union unaligned_32 *) (top + 16 + top_right_size +
                                           i))->l) = (pix));
        } while (0);
    }
    if (cand_left)
        for (i = 0; i < 16; i++)
            left[i] = src[(-1) + stride * (i)];
    if (cand_bottom_left) {
        for (i = 16; i < 16 + bottom_left_size; i++)
            left[i] = src[(-1) + stride * (i)];
        /* Pad past the picture's bottom border likewise. */
        do {
            uint32_t pix =
                ((src[(-1) + stride * (16 + bottom_left_size - 1)]) *
                 0x01010101U);
            for (i = 0; i < (16 - bottom_left_size); i += 4)
                ((((union unaligned_32 *) (left + 16 + bottom_left_size +
                                           i))->l) = (pix));
        } while (0);
    }

    /* Constrained intra prediction, pass 2: replace reference samples
     * taken from non-intra PUs by propagating the nearest intra-coded
     * sample (spec 8.4.4.2.2 substitution, restricted to intra PUs). */
    if (s->ps.pps->constrained_intra_pred_flag == 1) {
        if (cand_bottom_left || cand_left || cand_up_left || cand_up
            || cand_up_right) {
            int size_max_x =
                x0 + ((2 * 16) << hshift) <
                s->ps.sps->width ? 2 * 16 : (s->ps.sps->width - x0) >> hshift;
            int size_max_y =
                y0 + ((2 * 16) << vshift) <
                s->ps.sps->height ? 2 * 16 : (s->ps.sps->height - y0) >> vshift;
            int j = 16 + (cand_bottom_left ? bottom_left_size : 0) - 1;
            if (!cand_up_right) {
                size_max_x = x0 + ((16) << hshift) < s->ps.sps->width ?
                    16 : (s->ps.sps->width - x0) >> hshift;
            }
            if (!cand_bottom_left) {
                size_max_y = y0 + ((16) << vshift) < s->ps.sps->height ?
                    16 : (s->ps.sps->height - y0) >> vshift;
            }
            if (cand_bottom_left || cand_left || cand_up_left) {
                /* Walk up the left column to the first intra PU sample. */
                while (j > -1
                       &&
                       !((s->ref->tab_mvf[(((x0 +
                                             ((-1) << hshift)) >> s->ps.sps->
                                            log2_min_pu_size)) + (((y0 +
                                                                    ((j) <<
                                                                     vshift))
                                                                   >> s->ps.sps->
                                                                   log2_min_pu_size))
                                          * min_pu_width]).pred_flag ==
                         PF_INTRA))
                    j--;
                if (!
                    ((s->ref->tab_mvf[(((x0 +
                                         ((-1) << hshift)) >> s->ps.sps->
                                        log2_min_pu_size)) + (((y0 + ((j)
                                                                      <<
                                                                      vshift))
                                                               >> s->ps.sps->
                                                               log2_min_pu_size))
                                      * min_pu_width]).pred_flag == PF_INTRA)) {
                    /* No intra sample on the left: search the top row,
                     * then propagate it leftwards to the corner. */
                    j = 0;
                    while (j < size_max_x
                           &&
                           !((s->ref->tab_mvf[(((x0 +
                                                 ((j) << hshift)) >> s->ps.sps->
                                                log2_min_pu_size)) + (((y0 +
                                                                        ((-1) <<
                                                                         vshift))
                                                                       >> s->
                                                                       ps.sps->
                                                                       log2_min_pu_size))
                                              * min_pu_width]).pred_flag ==
                             PF_INTRA))
                        j++;
                    for (i = j; i > (j) - (j + 1); i--)
                        if (!
                            ((s->ref->tab_mvf[(((x0 +
                                                 ((i -
                                                   1) << hshift)) >> s->ps.sps->
                                                log2_min_pu_size)) + (((y0 +
                                                                        ((-1) <<
                                                                         vshift))
                                                                       >> s->
                                                                       ps.sps->
                                                                       log2_min_pu_size))
                                              * min_pu_width]).pred_flag ==
                             PF_INTRA))
                            top[i - 1] = top[i];
                    left[-1] = top[-1];
                }
            } else {
                /* Only the top neighbours are available. */
                j = 0;
                while (j < size_max_x
                       &&
                       !((s->ref->tab_mvf[(((x0 +
                                             ((j) << hshift)) >> s->ps.sps->
                                            log2_min_pu_size)) + (((y0 + ((-1)
                                                                          <<
                                                                          vshift))
                                                                   >> s->ps.sps->
                                                                   log2_min_pu_size))
                                          * min_pu_width]).pred_flag ==
                         PF_INTRA))
                    j++;
                if (j > 0)
                    if (x0 > 0) {
                        for (i = j; i > (j) - (j + 1); i--)
                            if (!
                                ((s->ref->tab_mvf[(((x0 +
                                                     ((i -
                                                       1) << hshift)) >>
                                                    s->ps.sps->log2_min_pu_size))
                                                  + (((y0 + ((-1)
                                                             << vshift))
                                                      >>
                                                      s->ps.sps->log2_min_pu_size))
                                                  *
                                                  min_pu_width]).pred_flag ==
                                 PF_INTRA))
                                top[i - 1] = top[i];
                    } else {
                        for (i = j; i > (j) - (j); i--)
                            if (!
                                ((s->ref->tab_mvf[(((x0 +
                                                     ((i -
                                                       1) << hshift)) >>
                                                    s->ps.sps->log2_min_pu_size))
                                                  + (((y0 + ((-1)
                                                             << vshift))
                                                      >>
                                                      s->ps.sps->log2_min_pu_size))
                                                  *
                                                  min_pu_width]).pred_flag ==
                                 PF_INTRA))
                                top[i - 1] = top[i];
                        top[-1] = top[0];
                    }
                left[-1] = top[-1];
            }
            left[-1] = top[-1];
            /* Fill the left column top-to-bottom, 4 samples at a time,
             * replacing non-intra stretches with the last intra value. */
            if (cand_bottom_left || cand_left) {
                a = ((left[-1]) * 0x01010101U);
                for (i = 0; i < (0) + (size_max_y); i += 4)
                    if (!
                        ((s->ref->tab_mvf[(((x0 +
                                             ((-1) << hshift)) >> s->ps.sps->
                                            log2_min_pu_size)) + (((y0 +
                                                                    ((i) <<
                                                                     vshift))
                                                                   >> s->ps.sps->
                                                                   log2_min_pu_size))
                                          * min_pu_width]).pred_flag ==
                         PF_INTRA))
                        ((((union unaligned_32 *) (&left[i]))->l) = (a));
                    else
                        a = ((left[i + 3]) * 0x01010101U);
            }
            if (!cand_left) {
                vec0 = (v16u8) __msa_fill_b(left[-1]);

                ST_UB(vec0, left);
            }
            if (!cand_bottom_left) {

                vec0 = (v16u8) __msa_fill_b(left[15]);

                ST_UB(vec0, (left + 16));
            }
            /* Then the bottom-to-top pass over the left column. */
            if (x0 != 0 && y0 != 0) {
                a = ((left[size_max_y - 1]) * 0x01010101U);
                for (i = (size_max_y - 1);
                     i > (size_max_y - 1) - (size_max_y); i -= 4)
                    if (!
                        ((s->ref->tab_mvf[(((x0 +
                                             ((-1) << hshift)) >> s->ps.sps->
                                            log2_min_pu_size)) + (((y0 +
                                                                    ((i -
                                                                      3) <<
                                                                     vshift))
                                                                   >> s->ps.sps->
                                                                   log2_min_pu_size))
                                          * min_pu_width]).pred_flag ==
                         PF_INTRA))
                        ((((union unaligned_32 *) (&left[i - 3]))->l) = (a));
                    else
                        a = ((left[i - 3]) * 0x01010101U);
                if (!
                    ((s->ref->tab_mvf[(((x0 +
                                         ((-1) << hshift)) >> s->ps.sps->
                                        log2_min_pu_size)) + (((y0 + ((-1)
                                                                      <<
                                                                      vshift))
                                                               >> s->ps.sps->
                                                               log2_min_pu_size))
                                      * min_pu_width]).pred_flag == PF_INTRA))
                    left[-1] = left[0];
            } else if (x0 == 0) {
                /* At the left picture border there is no left PU at all;
                 * zero the column. */
                do {
                    uint32_t pix = ((0) * 0x01010101U);
                    for (i = 0; i < (size_max_y); i += 4)
                        ((((union unaligned_32 *) (left + i))->l) = (pix));
                } while (0);
            } else {
                a = ((left[size_max_y - 1]) * 0x01010101U);
                for (i = (size_max_y - 1);
                     i > (size_max_y - 1) - (size_max_y); i -= 4)
                    if (!
                        ((s->ref->tab_mvf[(((x0 +
                                             ((-1) << hshift)) >> s->ps.sps->
                                            log2_min_pu_size)) + (((y0 +
                                                                    ((i -
                                                                      3) <<
                                                                     vshift))
                                                                   >> s->ps.sps->
                                                                   log2_min_pu_size))
                                          * min_pu_width]).pred_flag ==
                         PF_INTRA))
                        ((((union unaligned_32 *) (&left[i - 3]))->l) = (a));
                    else
                        a = ((left[i - 3]) * 0x01010101U);
            }
            top[-1] = left[-1];
            /* Finally the left-to-right pass over the top row. */
            if (y0 != 0) {
                a = ((left[-1]) * 0x01010101U);
                for (i = 0; i < (0) + (size_max_x); i += 4)
                    if (!
                        ((s->ref->tab_mvf[(((x0 +
                                             ((i) << hshift)) >> s->ps.sps->
                                            log2_min_pu_size)) + (((y0 + ((-1)
                                                                          <<
                                                                          vshift))
                                                                   >> s->ps.sps->
                                                                   log2_min_pu_size))
                                          * min_pu_width]).pred_flag ==
                         PF_INTRA))
                        ((((union unaligned_32 *) (&top[i]))->l) = (a));
                    else
                        a = ((top[i + 3]) * 0x01010101U);
            }
        }
    }

    /* Standard substitution when neighbours are missing: extend from the
     * first available reference in the order bottom-left -> left ->
     * up-left -> up -> up-right; all-missing falls back to 128. */
    if (!cand_bottom_left) {
        if (cand_left) {
            vec0 = (v16u8) __msa_fill_b(left[15]);

            ST_UB(vec0, (left + 16));

        } else if (cand_up_left) {
            vec0 = (v16u8) __msa_fill_b(left[-1]);

            ST_UB2(vec0, vec0, left, 16);

            cand_left = 1;
        } else if (cand_up) {
            left[-1] = top[0];

            vec0 = (v16u8) __msa_fill_b(left[-1]);

            ST_UB2(vec0, vec0, left, 16);

            cand_up_left = 1;
            cand_left = 1;
        } else if (cand_up_right) {
            vec0 = (v16u8) __msa_fill_b(top[16]);

            ST_UB(vec0, top);

            left[-1] = top[16];

            ST_UB2(vec0, vec0, left, 16);

            cand_up = 1;
            cand_up_left = 1;
            cand_left = 1;
        } else {
            left[-1] = 128;
            vec0 = (v16u8) __msa_ldi_b(128);

            ST_UB2(vec0, vec0, top, 16);
            ST_UB2(vec0, vec0, left, 16);
        }
    }

    if (!cand_left) {
        vec0 = (v16u8) __msa_fill_b(left[16]);
        ST_UB(vec0, left);
    }
    if (!cand_up_left) {
        left[-1] = left[0];
    }
    if (!cand_up) {
        vec0 = (v16u8) __msa_fill_b(left[-1]);
        ST_UB(vec0, top);
    }
    if (!cand_up_right) {
        vec0 = (v16u8) __msa_fill_b(top[15]);
        ST_UB(vec0, (top + 16));
    }

    top[-1] = left[-1];


    /* Mode-dependent reference smoothing ([1 2 1]/4 filter), luma only
     * unless 4:4:4; skipped for DC and when the mode is close enough to
     * pure horizontal/vertical (threshold index 1 for 16x16 blocks). */
    if (!s->ps.sps->intra_smoothing_disabled_flag
        && (c_idx == 0 || s->ps.sps->chroma_format_idc == 3)) {
        if (mode != INTRA_DC && 16 != 4) {
            int intra_hor_ver_dist_thresh[] = { 7, 1, 0 };
            /* min_dist_vert_hor = FFMIN(abs(mode - 26), abs(mode - 10)),
             * written out by the expanded template macros. */
            int min_dist_vert_hor =
                (((((int) (mode - 26U)) >=
                   0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))) >
                 ((((int) (mode - 10U)) >=
                   0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U)))))
                 ? ((((int) (mode - 10U)) >=
                     0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U)))))
                 : ((((int) (mode - 26U)) >=
                     0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))));
            if (min_dist_vert_hor > intra_hor_ver_dist_thresh[4 - 3]) {
                filtered_left[2 * 16 - 1] = left[2 * 16 - 1];
                filtered_top[2 * 16 - 1] = top[2 * 16 - 1];
                for (i = 2 * 16 - 2; i >= 0; i--)
                    filtered_left[i] = (left[i + 1] + 2 * left[i] +
                                        left[i - 1] + 2) >> 2;
                filtered_top[-1] =
                    filtered_left[-1] =
                    (left[0] + 2 * left[-1] + top[0] + 2) >> 2;
                for (i = 2 * 16 - 2; i >= 0; i--)
                    filtered_top[i] = (top[i + 1] + 2 * top[i] +
                                       top[i - 1] + 2) >> 2;
                left = filtered_left;
                top = filtered_top;
            }
        }
    }

    /* Dispatch to the actual predictor; index 4-2 selects the 16x16
     * variant in the pred_planar/pred_angular tables. */
    switch (mode) {
    case INTRA_PLANAR:
        s->hpc.pred_planar[4 - 2] ((uint8_t *) src, (uint8_t *) top,
                                   (uint8_t *) left, stride);
        break;
    case INTRA_DC:
        s->hpc.pred_dc((uint8_t *) src, (uint8_t *) top,
                       (uint8_t *) left, stride, 4, c_idx);
        break;
    default:
        s->hpc.pred_angular[4 - 2] ((uint8_t *) src, (uint8_t *) top,
                                    (uint8_t *) left, stride, c_idx, mode);
        break;
    }
}
2418
/* 32x32 8-bit HEVC intra prediction (MIPS MSA).
 *
 * MSA-optimised counterpart of the generic intra_pred() template for
 * log2 block size 5: it builds the top[] / left[] reference-sample lines
 * (neighbour availability checks, constrained-intra substitution, padding
 * by replication, optional reference-sample smoothing) and then dispatches
 * to the planar / DC / angular predictor for the block.
 *
 * s      decoder context (SPS/PPS, reconstructed frame, local context)
 * x0,y0  luma position of the block's top-left corner
 * c_idx  colour component index (0 = luma)
 */
void ff_intra_pred_8_32x32_msa(HEVCContext *s, int x0, int y0, int c_idx)
{
    v16u8 vec0, vec1;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v8i16 res0, res1, res2, res3;
    /* per-lane weights for the strong-smoothing linear interpolation:
     * (64 - (i + 1)) and (i + 1) for the first 8 sample positions */
    v8i16 mul_val0 = { 63, 62, 61, 60, 59, 58, 57, 56 };
    v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
    HEVCLocalContext *lc = s->HEVClc;
    int i;
    /* chroma subsampling shifts for this component */
    int hshift = s->ps.sps->hshift[c_idx];
    int vshift = s->ps.sps->vshift[c_idx];
    int size_in_luma_h = 32 << hshift;
    int size_in_tbs_h = size_in_luma_h >> s->ps.sps->log2_min_tb_size;
    int size_in_luma_v = 32 << vshift;
    int size_in_tbs_v = size_in_luma_v >> s->ps.sps->log2_min_tb_size;
    /* position in component-resolution samples */
    int x = x0 >> hshift;
    int y = y0 >> vshift;
    /* position in min-TB units, wrapped by tb_mask */
    int x_tb = (x0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
    int y_tb = (y0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;

    /* z-scan address of the current TB, used to decide whether the
     * bottom-left / top-right neighbours are already decoded */
    int cur_tb_addr =
        s->ps.pps->min_tb_addr_zs[(y_tb) * (s->ps.sps->tb_mask + 2) + (x_tb)];

    ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(uint8_t);
    uint8_t *src = (uint8_t *) s->frame->data[c_idx] + x + y * stride;

    int min_pu_width = s->ps.sps->min_pu_width;

    enum IntraPredMode mode = c_idx ? lc->tu.intra_pred_mode_c :
        lc->tu.intra_pred_mode;
    uint32_t a;
    /* reference-sample lines; index -1 (via the +1 base offset below)
     * holds the top-left corner sample */
    uint8_t left_array[2 * 32 + 1];
    uint8_t filtered_left_array[2 * 32 + 1];
    uint8_t top_array[2 * 32 + 1];
    uint8_t filtered_top_array[2 * 32 + 1];

    uint8_t *left = left_array + 1;
    uint8_t *top = top_array + 1;
    uint8_t *filtered_left = filtered_left_array + 1;
    uint8_t *filtered_top = filtered_top_array + 1;
    /* neighbour availability: the bottom-left / top-right flags
     * additionally require the neighbouring TB to precede the current
     * one in z-scan order (i.e. to be decoded already) */
    int cand_bottom_left = lc->na.cand_bottom_left
        && cur_tb_addr >
        s->ps.pps->min_tb_addr_zs[((y_tb + size_in_tbs_v) & s->ps.sps->tb_mask) *
                               (s->ps.sps->tb_mask + 2) + (x_tb - 1)];
    int cand_left = lc->na.cand_left;
    int cand_up_left = lc->na.cand_up_left;
    int cand_up = lc->na.cand_up;
    int cand_up_right = lc->na.cand_up_right
        && cur_tb_addr >
        s->ps.pps->min_tb_addr_zs[(y_tb - 1) * (s->ps.sps->tb_mask + 2) +
                               ((x_tb + size_in_tbs_h) & s->ps.sps->tb_mask)];

    /* number of usable bottom-left / top-right samples, clipped so the
     * extended reference line does not read past the picture border */
    int bottom_left_size =
        (((y0 + 2 * size_in_luma_v) >
          (s->ps.sps->height) ? (s->ps.sps->height) : (y0 +
                                                 2 * size_in_luma_v)) -
         (y0 + size_in_luma_v)) >> vshift;
    int top_right_size =
        (((x0 + 2 * size_in_luma_h) >
          (s->ps.sps->width) ? (s->ps.sps->width) : (x0 + 2 * size_in_luma_h)) -
         (x0 + size_in_luma_h)) >> hshift;

    /* Constrained intra prediction, first pass: a neighbouring edge is
     * only usable when every prediction unit covering it was intra coded,
     * so scan the PU pred_flag table along each candidate edge. */
    if (s->ps.pps->constrained_intra_pred_flag == 1) {
        int size_in_luma_pu_v = ((size_in_luma_v) >> s->ps.sps->log2_min_pu_size);
        int size_in_luma_pu_h = ((size_in_luma_h) >> s->ps.sps->log2_min_pu_size);
        int on_pu_edge_x = !(x0 & ((1 << s->ps.sps->log2_min_pu_size) - 1));
        int on_pu_edge_y = !(y0 & ((1 << s->ps.sps->log2_min_pu_size) - 1));
        if (!size_in_luma_pu_h)
            size_in_luma_pu_h++;
        if (cand_bottom_left == 1 && on_pu_edge_x) {
            int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
            int y_bottom_pu =
                ((y0 + size_in_luma_v) >> s->ps.sps->log2_min_pu_size);
            int max =
                ((size_in_luma_pu_v) >
                 (s->ps.sps->min_pu_height -
                  y_bottom_pu) ? (s->ps.sps->min_pu_height -
                                  y_bottom_pu) : (size_in_luma_pu_v));
            cand_bottom_left = 0;
            for (i = 0; i < max; i += 2)
                cand_bottom_left |=
                    ((s->ref->tab_mvf[(x_left_pu) +
                                      (y_bottom_pu +
                                       i) * min_pu_width]).pred_flag ==
                     PF_INTRA);
        }
        if (cand_left == 1 && on_pu_edge_x) {
            int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
            int y_left_pu = ((y0) >> s->ps.sps->log2_min_pu_size);
            int max =
                ((size_in_luma_pu_v) >
                 (s->ps.sps->min_pu_height -
                  y_left_pu) ? (s->ps.sps->min_pu_height -
                                y_left_pu) : (size_in_luma_pu_v));
            cand_left = 0;
            for (i = 0; i < max; i += 2)
                cand_left |=
                    ((s->ref->tab_mvf[(x_left_pu) +
                                      (y_left_pu +
                                       i) * min_pu_width]).pred_flag ==
                     PF_INTRA);
        }
        if (cand_up_left == 1) {
            int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
            int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
            cand_up_left =
                (s->ref->tab_mvf[(x_left_pu) +
                                 (y_top_pu) * min_pu_width]).pred_flag ==
                PF_INTRA;
        }
        if (cand_up == 1 && on_pu_edge_y) {
            int x_top_pu = ((x0) >> s->ps.sps->log2_min_pu_size);
            int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
            int max =
                ((size_in_luma_pu_h) >
                 (s->ps.sps->min_pu_width -
                  x_top_pu) ? (s->ps.sps->min_pu_width -
                               x_top_pu) : (size_in_luma_pu_h));
            cand_up = 0;
            for (i = 0; i < max; i += 2)
                cand_up |=
                    ((s->ref->tab_mvf[(x_top_pu + i) +
                                      (y_top_pu) *
                                      min_pu_width]).pred_flag == PF_INTRA);
        }
        if (cand_up_right == 1 && on_pu_edge_y) {
            int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
            int x_right_pu =
                ((x0 + size_in_luma_h) >> s->ps.sps->log2_min_pu_size);
            int max =
                ((size_in_luma_pu_h) >
                 (s->ps.sps->min_pu_width -
                  x_right_pu) ? (s->ps.sps->min_pu_width -
                                 x_right_pu) : (size_in_luma_pu_h));
            cand_up_right = 0;
            for (i = 0; i < max; i += 2)
                cand_up_right |=
                    ((s->ref->tab_mvf[(x_right_pu + i) +
                                      (y_top_pu) *
                                      min_pu_width]).pred_flag == PF_INTRA);
        }
        /* pre-fill all 64 reference samples per edge (and the corner)
         * with the 8-bit mid-grey value 128 */
        vec0 = (v16u8) __msa_ldi_b(128);

        ST_UB4(vec0, vec0, vec0, vec0, left, 16);
        ST_UB4(vec0, vec0, vec0, vec0, top, 16);

        top[-1] = 128;
    }
    /* Copy the available decoded neighbour samples into top[] / left[]. */
    if (cand_up_left) {
        left[-1] = src[(-1) + stride * (-1)];
        top[-1] = left[-1];
    }
    if (cand_up) {
        LD_UB2(src - stride, 16, vec0, vec1);
        ST_UB2(vec0, vec1, top, 16);
    }

    if (cand_up_right) {
        LD_UB2(src - stride + 32, 16, vec0, vec1);
        ST_UB2(vec0, vec1, (top + 32), 16);
        /* replicate the last valid top-right sample across the part of
         * the extended line that lies outside the picture, 4 bytes at a
         * time (pix is the byte splatted into a 32-bit word) */
        do {
            uint32_t pix =
                ((src[(32 + top_right_size - 1) + stride * (-1)]) *
                 0x01010101U);
            for (i = 0; i < (32 - top_right_size); i += 4)
                ((((union unaligned_32 *) (top + 32 + top_right_size +
                                           i))->l) = (pix));
        } while (0);
    }
    if (cand_left)
        for (i = 0; i < 32; i++)
            left[i] = src[(-1) + stride * (i)];
    if (cand_bottom_left) {
        for (i = 32; i < 32 + bottom_left_size; i++)
            left[i] = src[(-1) + stride * (i)];
        /* same border replication for the bottom-left extension */
        do {
            uint32_t pix =
                ((src[(-1) + stride * (32 + bottom_left_size - 1)]) *
                 0x01010101U);
            for (i = 0; i < (32 - bottom_left_size); i += 4)
                ((((union unaligned_32 *) (left + 32 + bottom_left_size +
                                           i))->l) = (pix));
        } while (0);
    }

    /* Constrained intra prediction, second pass: samples that came from
     * inter-coded PUs are replaced by propagating the nearest intra
     * sample along the reference lines (mirrors the generic template's
     * substitution loop, 4 bytes at a time via unaligned_32 stores). */
    if (s->ps.pps->constrained_intra_pred_flag == 1) {
        if (cand_bottom_left || cand_left || cand_up_left || cand_up
            || cand_up_right) {
            /* clip the scan lengths to the picture border */
            int size_max_x =
                x0 + ((2 * 32) << hshift) <
                s->ps.sps->width ? 2 * 32 : (s->ps.sps->width - x0) >> hshift;
            int size_max_y =
                y0 + ((2 * 32) << vshift) <
                s->ps.sps->height ? 2 * 32 : (s->ps.sps->height - y0) >> vshift;
            int j = 32 + (cand_bottom_left ? bottom_left_size : 0) - 1;
            if (!cand_up_right) {
                size_max_x = x0 + ((32) << hshift) < s->ps.sps->width ?
                    32 : (s->ps.sps->width - x0) >> hshift;
            }
            if (!cand_bottom_left) {
                size_max_y = y0 + ((32) << vshift) < s->ps.sps->height ?
                    32 : (s->ps.sps->height - y0) >> vshift;
            }
            if (cand_bottom_left || cand_left || cand_up_left) {
                /* walk up the left column until an intra PU is found */
                while (j > -1
                       &&
                       !((s->ref->tab_mvf[(((x0 +
                                             ((-1) << hshift)) >> s->ps.sps->
                                            log2_min_pu_size)) + (((y0 +
                                                                    ((j) <<
                                                                     vshift))
                                                                   >> s->ps.sps->
                                                                   log2_min_pu_size))
                                          * min_pu_width]).pred_flag ==
                         PF_INTRA))
                    j--;
                if (!
                    ((s->ref->tab_mvf[(((x0 +
                                         ((-1) << hshift)) >> s->ps.sps->
                                        log2_min_pu_size)) + (((y0 + ((j)
                                                                      <<
                                                                      vshift))
                                                               >> s->ps.sps->
                                                               log2_min_pu_size))
                                      * min_pu_width]).pred_flag == PF_INTRA)) {
                    /* no intra sample on the left: take it from the top
                     * row instead, then back-propagate it leftwards */
                    j = 0;
                    while (j < size_max_x
                           &&
                           !((s->ref->tab_mvf[(((x0 +
                                                 ((j) << hshift)) >> s->ps.sps->
                                                log2_min_pu_size)) + (((y0 +
                                                                        ((-1) <<
                                                                         vshift))
                                                                       >> s->
                                                                       ps.sps->
                                                                       log2_min_pu_size))
                                              * min_pu_width]).pred_flag ==
                             PF_INTRA))
                        j++;
                    for (i = j; i > (j) - (j + 1); i--)
                        if (!
                            ((s->ref->tab_mvf[(((x0 +
                                                 ((i -
                                                   1) << hshift)) >> s->ps.sps->
                                                log2_min_pu_size)) + (((y0 +
                                                                        ((-1) <<
                                                                         vshift))
                                                                       >> s->
                                                                       ps.sps->
                                                                       log2_min_pu_size))
                                              * min_pu_width]).pred_flag ==
                             PF_INTRA))
                            top[i - 1] = top[i];
                    left[-1] = top[-1];
                }
            } else {
                /* only the top edge has intra samples */
                j = 0;
                while (j < size_max_x
                       &&
                       !((s->ref->tab_mvf[(((x0 +
                                             ((j) << hshift)) >> s->ps.sps->
                                            log2_min_pu_size)) + (((y0 + ((-1)
                                                                          <<
                                                                          vshift))
                                                                   >> s->ps.sps->
                                                                   log2_min_pu_size))
                                          * min_pu_width]).pred_flag ==
                         PF_INTRA))
                    j++;
                if (j > 0)
                    if (x0 > 0) {
                        for (i = j; i > (j) - (j + 1); i--)
                            if (!
                                ((s->ref->tab_mvf[(((x0 +
                                                     ((i -
                                                       1) << hshift)) >>
                                                    s->ps.sps->log2_min_pu_size))
                                                  + (((y0 + ((-1)
                                                             << vshift))
                                                      >>
                                                      s->ps.sps->log2_min_pu_size))
                                                  *
                                                  min_pu_width]).pred_flag ==
                                 PF_INTRA))
                                top[i - 1] = top[i];
                    } else {
                        /* x0 == 0: the loop bound (j) - (j) makes this a
                         * no-op scan; only the corner is patched */
                        for (i = j; i > (j) - (j); i--)
                            if (!
                                ((s->ref->tab_mvf[(((x0 +
                                                     ((i -
                                                       1) << hshift)) >>
                                                    s->ps.sps->log2_min_pu_size))
                                                  + (((y0 + ((-1)
                                                             << vshift))
                                                      >>
                                                      s->ps.sps->log2_min_pu_size))
                                                  *
                                                  min_pu_width]).pred_flag ==
                                 PF_INTRA))
                                top[i - 1] = top[i];
                        top[-1] = top[0];
                    }
                left[-1] = top[-1];
            }
            left[-1] = top[-1];
            /* forward-propagate intra samples down the left column */
            if (cand_bottom_left || cand_left) {
                a = ((left[-1]) * 0x01010101U);
                for (i = 0; i < (0) + (size_max_y); i += 4)
                    if (!
                        ((s->ref->tab_mvf[(((x0 +
                                             ((-1) << hshift)) >> s->ps.sps->
                                            log2_min_pu_size)) + (((y0 +
                                                                    ((i) <<
                                                                     vshift))
                                                                   >> s->ps.sps->
                                                                   log2_min_pu_size))
                                          * min_pu_width]).pred_flag ==
                         PF_INTRA))
                        ((((union unaligned_32 *) (&left[i]))->l) = (a));
                    else
                        a = ((left[i + 3]) * 0x01010101U);
            }
            if (!cand_left) {
                vec0 = (v16u8) __msa_fill_b(left[-1]);

                ST_UB2(vec0, vec0, left, 16);
            }
            if (!cand_bottom_left) {
                vec0 = (v16u8) __msa_fill_b(left[31]);

                ST_UB2(vec0, vec0, (left + 32), 16);
            }
            if (x0 != 0 && y0 != 0) {
                /* backward pass up the left column */
                a = ((left[size_max_y - 1]) * 0x01010101U);
                for (i = (size_max_y - 1);
                     i > (size_max_y - 1) - (size_max_y); i -= 4)
                    if (!
                        ((s->ref->tab_mvf[(((x0 +
                                             ((-1) << hshift)) >> s->ps.sps->
                                            log2_min_pu_size)) + (((y0 +
                                                                    ((i -
                                                                      3) <<
                                                                     vshift))
                                                                   >> s->ps.sps->
                                                                   log2_min_pu_size))
                                          * min_pu_width]).pred_flag ==
                         PF_INTRA))
                        ((((union unaligned_32 *) (&left[i - 3]))->l) = (a));
                    else
                        a = ((left[i - 3]) * 0x01010101U);
                if (!
                    ((s->ref->tab_mvf[(((x0 +
                                         ((-1) << hshift)) >> s->ps.sps->
                                        log2_min_pu_size)) + (((y0 + ((-1)
                                                                      <<
                                                                      vshift))
                                                               >> s->ps.sps->
                                                               log2_min_pu_size))
                                      * min_pu_width]).pred_flag == PF_INTRA))
                    left[-1] = left[0];
            } else if (x0 == 0) {
                /* picture left edge: zero the left column */
                do {
                    uint32_t pix = ((0) * 0x01010101U);
                    for (i = 0; i < (size_max_y); i += 4)
                        ((((union unaligned_32 *) (left + i))->l) = (pix));
                } while (0);
            } else {
                a = ((left[size_max_y - 1]) * 0x01010101U);
                for (i = (size_max_y - 1);
                     i > (size_max_y - 1) - (size_max_y); i -= 4)
                    if (!
                        ((s->ref->tab_mvf[(((x0 +
                                             ((-1) << hshift)) >> s->ps.sps->
                                            log2_min_pu_size)) + (((y0 +
                                                                    ((i -
                                                                      3) <<
                                                                     vshift))
                                                                   >> s->ps.sps->
                                                                   log2_min_pu_size))
                                          * min_pu_width]).pred_flag ==
                         PF_INTRA))
                        ((((union unaligned_32 *) (&left[i - 3]))->l) = (a));
                    else
                        a = ((left[i - 3]) * 0x01010101U);
            }
            top[-1] = left[-1];
            /* forward pass along the top row */
            if (y0 != 0) {
                a = ((left[-1]) * 0x01010101U);
                for (i = 0; i < (0) + (size_max_x); i += 4)
                    if (!
                        ((s->ref->tab_mvf[(((x0 +
                                             ((i) << hshift)) >> s->ps.sps->
                                            log2_min_pu_size)) + (((y0 + ((-1)
                                                                          <<
                                                                          vshift))
                                                                   >> s->ps.sps->
                                                                   log2_min_pu_size))
                                          * min_pu_width]).pred_flag ==
                         PF_INTRA))
                        ((((union unaligned_32 *) (&top[i]))->l) = (a));
                    else
                        a = ((top[i + 3]) * 0x01010101U);
            }
        }
    }

    /* No bottom-left reference: synthesise the missing edges from the
     * best available neighbour (left, top-left, top, top-right), or fall
     * back to mid-grey when nothing is available at all. Each branch
     * also promotes the availability flags it has satisfied. */
    if (!cand_bottom_left) {
        if (cand_left) {
            vec0 = (v16u8) __msa_fill_b(left[31]);

            ST_UB2(vec0, vec0, (left + 32), 16);
        } else if (cand_up_left) {
            vec0 = (v16u8) __msa_fill_b(left[-1]);

            ST_UB4(vec0, vec0, vec0, vec0, left, 16);

            cand_left = 1;
        } else if (cand_up) {
            left[-1] = top[0];

            vec0 = (v16u8) __msa_fill_b(left[-1]);

            ST_UB4(vec0, vec0, vec0, vec0, left, 16);

            cand_up_left = 1;
            cand_left = 1;
        } else if (cand_up_right) {
            vec0 = (v16u8) __msa_fill_b(top[32]);

            ST_UB2(vec0, vec0, top, 16);

            left[-1] = top[32];

            ST_UB4(vec0, vec0, vec0, vec0, left, 16);

            cand_up = 1;
            cand_up_left = 1;
            cand_left = 1;
        } else {
            left[-1] = 128;

            vec0 = (v16u8) __msa_ldi_b(128);

            ST_UB4(vec0, vec0, vec0, vec0, top, 16);
            ST_UB4(vec0, vec0, vec0, vec0, left, 16);
        }
    }

    /* Fill any remaining unavailable edge by replicating the nearest
     * available sample. */
    if (!cand_left) {
        vec0 = (v16u8) __msa_fill_b(left[32]);

        ST_UB2(vec0, vec0, left, 16);
    }
    if (!cand_up_left) {
        left[-1] = left[0];
    }
    if (!cand_up) {
        vec0 = (v16u8) __msa_fill_b(left[-1]);

        ST_UB2(vec0, vec0, top, 16);
    }
    if (!cand_up_right) {
        vec0 = (v16u8) __msa_fill_b(top[31]);

        ST_UB2(vec0, vec0, (top + 32), 16);
    }

    top[-1] = left[-1];


    /* Reference-sample smoothing: for non-DC angular modes far enough
     * from horizontal/vertical (threshold index 5 - 3 selects the 32x32
     * entry), either apply strong smoothing (linear interpolation between
     * the corner samples when both edges are near-linear within
     * 1 << (8 - 5)) or the regular [1 2 1]/4 filter. */
    if (!s->ps.sps->intra_smoothing_disabled_flag
        && (c_idx == 0 || s->ps.sps->chroma_format_idc == 3)) {
        if (mode != INTRA_DC && 32 != 4) {
            int intra_hor_ver_dist_thresh[] = { 7, 1, 0 };
            /* min(|mode - 26|, |mode - 10|): distance to the nearest of
             * the pure vertical (26) / horizontal (10) modes */
            int min_dist_vert_hor =
                (((((int) (mode - 26U)) >=
                   0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))) >
                 ((((int) (mode - 10U)) >=
                   0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U)))))
                 ? ((((int) (mode - 10U)) >=
                     0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U)))))
                 : ((((int) (mode - 26U)) >=
                     0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))));
            if (min_dist_vert_hor > intra_hor_ver_dist_thresh[5 - 3]) {
                int threshold = 1 << (8 - 5);
                if (s->ps.sps->sps_strong_intra_smoothing_enable_flag
                    && c_idx == 0
                    && ((top[-1] + top[63] - 2 * top[31]) >=
                        0 ? (top[-1] + top[63] -
                             2 * top[31]) : (-(top[-1] + top[63] -
                                               2 * top[31]))) < threshold
                    && ((left[-1] + left[63] - 2 * left[31]) >=
                        0 ? (left[-1] + left[63] -
                             2 * left[31]) : (-(left[-1] + left[63] -
                                                2 * left[31]))) < threshold) {

                    /* strong smoothing: filtered[i] =
                     * ((63 - i) * corner + (i + 1) * end + 32) >> 6 */
                    filtered_top[-1] = top[-1];
                    filtered_top[63] = top[63];

                    /* NOTE(review): this scalar pass appears to be fully
                     * overwritten by the MSA stores below (plus the
                     * explicit [-1]/[63] assignments) — looks redundant,
                     * confirm before removing. */
                    for (i = 0; i < 63; i++) {
                        filtered_top[i] =
                            ((63 - i) * top[-1] + (i + 1) * top[63] + 32) >> 6;
                    }

                    /* vectorised interpolation for filtered_top[0..31]:
                     * weights (63-i) in mul_val0-derived vectors, (i+1)
                     * in mul_val1-derived vectors, rounded shift by 6 */
                    tmp0 = __msa_fill_h(top[-1]);
                    tmp1 = __msa_fill_h(top[63]);

                    tmp2 = mul_val0 - 8;
                    tmp3 = mul_val0 - 16;
                    tmp4 = mul_val0 - 24;
                    tmp5 = mul_val1 + 8;
                    tmp6 = mul_val1 + 16;
                    tmp7 = mul_val1 + 24;

                    res0 = mul_val0 * tmp0;
                    res1 = tmp2 * tmp0;
                    res2 = tmp3 * tmp0;
                    res3 = tmp4 * tmp0;
                    res0 += mul_val1 * tmp1;
                    res1 += tmp5 * tmp1;
                    res2 += tmp6 * tmp1;
                    res3 += tmp7 * tmp1;

                    res0 = __msa_srari_h(res0, 6);
                    res1 = __msa_srari_h(res1, 6);
                    res2 = __msa_srari_h(res2, 6);
                    res3 = __msa_srari_h(res3, 6);

                    vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
                    vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);

                    ST_UB2(vec0, vec1, filtered_top, 16);

                    /* second half: filtered_top[32..63] */
                    res0 = mul_val0 - 32;
                    tmp2 = mul_val0 - 40;
                    tmp3 = mul_val0 - 48;
                    tmp4 = mul_val0 - 56;
                    res3 = mul_val1 + 32;
                    tmp5 = mul_val1 + 40;
                    tmp6 = mul_val1 + 48;
                    tmp7 = mul_val1 + 56;

                    res0 = res0 * tmp0;
                    res1 = tmp2 * tmp0;
                    res2 = tmp3 * tmp0;
                    res0 += res3 * tmp1;
                    res3 = tmp4 * tmp0;
                    res1 += tmp5 * tmp1;
                    res2 += tmp6 * tmp1;
                    res3 += tmp7 * tmp1;

                    res0 = __msa_srari_h(res0, 6);
                    res1 = __msa_srari_h(res1, 6);
                    res2 = __msa_srari_h(res2, 6);
                    res3 = __msa_srari_h(res3, 6);

                    vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
                    vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);

                    ST_UB2(vec0, vec1, (filtered_top + 32), 16);

                    /* restore the exact end sample clobbered by the
                     * vector store */
                    filtered_top[63] = top[63];

                    /* same interpolation for the left column, written
                     * in place into left[] */
                    tmp0 = __msa_fill_h(left[-1]);
                    tmp1 = __msa_fill_h(left[63]);

                    tmp2 = mul_val0 - 8;
                    tmp3 = mul_val0 - 16;
                    tmp4 = mul_val0 - 24;
                    tmp5 = mul_val1 + 8;
                    tmp6 = mul_val1 + 16;
                    tmp7 = mul_val1 + 24;

                    res0 = mul_val0 * tmp0;
                    res1 = tmp2 * tmp0;
                    res2 = tmp3 * tmp0;
                    res3 = tmp4 * tmp0;
                    res0 += mul_val1 * tmp1;
                    res1 += tmp5 * tmp1;
                    res2 += tmp6 * tmp1;
                    res3 += tmp7 * tmp1;

                    res0 = __msa_srari_h(res0, 6);
                    res1 = __msa_srari_h(res1, 6);
                    res2 = __msa_srari_h(res2, 6);
                    res3 = __msa_srari_h(res3, 6);

                    vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
                    vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);

                    ST_UB2(vec0, vec1, left, 16);

                    res0 = mul_val0 - 32;
                    tmp2 = mul_val0 - 40;
                    tmp3 = mul_val0 - 48;
                    tmp4 = mul_val0 - 56;
                    res3 = mul_val1 + 32;
                    tmp5 = mul_val1 + 40;
                    tmp6 = mul_val1 + 48;
                    tmp7 = mul_val1 + 56;

                    res0 = res0 * tmp0;
                    res1 = tmp2 * tmp0;
                    res2 = tmp3 * tmp0;
                    res0 += res3 * tmp1;
                    res3 = tmp4 * tmp0;
                    res1 += tmp5 * tmp1;
                    res2 += tmp6 * tmp1;
                    res3 += tmp7 * tmp1;

                    res0 = __msa_srari_h(res0, 6);
                    res1 = __msa_srari_h(res1, 6);
                    res2 = __msa_srari_h(res2, 6);
                    res3 = __msa_srari_h(res3, 6);

                    vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
                    vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);

                    ST_UB2(vec0, vec1, (left + 32), 16);

                    /* restore left[63] from the splatted original value */
                    left[63] = tmp1[0];

                    top = filtered_top;
                } else {
                    /* regular [1 2 1]/4 smoothing over both edges */
                    filtered_left[2 * 32 - 1] = left[2 * 32 - 1];
                    filtered_top[2 * 32 - 1] = top[2 * 32 - 1];
                    for (i = 2 * 32 - 2; i >= 0; i--)
                        filtered_left[i] = (left[i + 1] + 2 * left[i] +
                                            left[i - 1] + 2) >> 2;
                    filtered_top[-1] =
                        filtered_left[-1] =
                        (left[0] + 2 * left[-1] + top[0] + 2) >> 2;
                    for (i = 2 * 32 - 2; i >= 0; i--)
                        filtered_top[i] = (top[i + 1] + 2 * top[i] +
                                           top[i - 1] + 2) >> 2;
                    left = filtered_left;
                    top = filtered_top;
                }
            }
        }
    }

    /* Dispatch to the actual predictor; index 3 == log2(32) - 2. */
    switch (mode) {
    case INTRA_PLANAR:
        s->hpc.pred_planar[3] ((uint8_t *) src, (uint8_t *) top,
                               (uint8_t *) left, stride);
        break;
    case INTRA_DC:
        s->hpc.pred_dc((uint8_t *) src, (uint8_t *) top,
                       (uint8_t *) left, stride, 5, c_idx);
        break;
    default:
        s->hpc.pred_angular[3] ((uint8_t *) src, (uint8_t *) top,
                                (uint8_t *) left, stride, c_idx, mode);
        break;
    }
}
3078