1/*
2 * Copyright (c) 2015 - 2017 Parag Salasakar (Parag.Salasakar@imgtec.com)
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21#include "libavutil/mips/generic_macros_msa.h"
22#include "h264dsp_mips.h"
23
24static void avc_wgt_4x2_msa(uint8_t *data, ptrdiff_t stride,
25                            int32_t log2_denom, int32_t src_weight,
26                            int32_t offset_in)
27{
28    uint32_t tp0, tp1, offset_val;
29    v16u8 zero = { 0 };
30    v16u8 src0 = { 0 };
31    v8i16 src0_r, tmp0, wgt, denom, offset;
32
33    offset_val = (unsigned) offset_in << log2_denom;
34
35    wgt = __msa_fill_h(src_weight);
36    offset = __msa_fill_h(offset_val);
37    denom = __msa_fill_h(log2_denom);
38
39    LW2(data, stride, tp0, tp1);
40    INSERT_W2_UB(tp0, tp1, src0);
41    src0_r = (v8i16) __msa_ilvr_b((v16i8) zero, (v16i8) src0);
42    tmp0 = wgt * src0_r;
43    tmp0 = __msa_adds_s_h(tmp0, offset);
44    tmp0 = __msa_maxi_s_h(tmp0, 0);
45    tmp0 = __msa_srlr_h(tmp0, denom);
46    tmp0 = (v8i16) __msa_sat_u_h((v8u16) tmp0, 7);
47    src0 = (v16u8) __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
48    ST_W2(src0, 0, 1, data, stride);
49}
50
51static void avc_wgt_4x4_msa(uint8_t *data, ptrdiff_t stride,
52                            int32_t log2_denom, int32_t src_weight,
53                            int32_t offset_in)
54{
55    uint32_t tp0, tp1, tp2, tp3, offset_val;
56    v16u8 src0 = { 0 };
57    v8i16 src0_r, src1_r, tmp0, tmp1, wgt, denom, offset;
58
59    offset_val = (unsigned) offset_in << log2_denom;
60
61    wgt = __msa_fill_h(src_weight);
62    offset = __msa_fill_h(offset_val);
63    denom = __msa_fill_h(log2_denom);
64
65    LW4(data, stride, tp0, tp1, tp2, tp3);
66    INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
67    UNPCK_UB_SH(src0, src0_r, src1_r);
68    MUL2(wgt, src0_r, wgt, src1_r, tmp0, tmp1);
69    ADDS_SH2_SH(tmp0, offset, tmp1, offset, tmp0, tmp1);
70    MAXI_SH2_SH(tmp0, tmp1, 0);
71    tmp0 = __msa_srlr_h(tmp0, denom);
72    tmp1 = __msa_srlr_h(tmp1, denom);
73    SAT_UH2_SH(tmp0, tmp1, 7);
74    src0 = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
75    ST_W4(src0, 0, 1, 2, 3, data, stride);
76}
77
78static void avc_wgt_4x8_msa(uint8_t *data, ptrdiff_t stride,
79                            int32_t log2_denom, int32_t src_weight,
80                            int32_t offset_in)
81{
82    uint32_t tp0, tp1, tp2, tp3, offset_val;
83    v16u8 src0 = { 0 }, src1 = { 0 };
84    v8i16 src0_r, src1_r, src2_r, src3_r, tmp0, tmp1, tmp2, tmp3;
85    v8i16 wgt, denom, offset;
86
87    offset_val = (unsigned) offset_in << log2_denom;
88
89    wgt = __msa_fill_h(src_weight);
90    offset = __msa_fill_h(offset_val);
91    denom = __msa_fill_h(log2_denom);
92
93    LW4(data, stride, tp0, tp1, tp2, tp3);
94    INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
95    LW4(data + 4 * stride, stride, tp0, tp1, tp2, tp3);
96    INSERT_W4_UB(tp0, tp1, tp2, tp3, src1);
97    UNPCK_UB_SH(src0, src0_r, src1_r);
98    UNPCK_UB_SH(src1, src2_r, src3_r);
99    MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1, tmp2,
100         tmp3);
101    ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0,
102                tmp1, tmp2, tmp3);
103    MAXI_SH4_SH(tmp0, tmp1, tmp2, tmp3, 0);
104    SRLR_H4_SH(tmp0, tmp1, tmp2, tmp3, denom);
105    SAT_UH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
106    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
107    ST_W8(src0, src1, 0, 1, 2, 3, 0, 1, 2, 3, data, stride);
108}
109
110static void avc_wgt_8x4_msa(uint8_t *data, ptrdiff_t stride,
111                            int32_t log2_denom, int32_t src_weight,
112                            int32_t offset_in)
113{
114    uint32_t offset_val;
115    uint64_t tp0, tp1, tp2, tp3;
116    v16u8 src0 = { 0 }, src1 = { 0 };
117    v8i16 src0_r, src1_r, src2_r, src3_r, tmp0, tmp1, tmp2, tmp3;
118    v8i16 wgt, denom, offset;
119
120    offset_val = (unsigned) offset_in << log2_denom;
121
122    wgt = __msa_fill_h(src_weight);
123    offset = __msa_fill_h(offset_val);
124    denom = __msa_fill_h(log2_denom);
125
126    LD4(data, stride, tp0, tp1, tp2, tp3);
127    INSERT_D2_UB(tp0, tp1, src0);
128    INSERT_D2_UB(tp2, tp3, src1);
129    UNPCK_UB_SH(src0, src0_r, src1_r);
130    UNPCK_UB_SH(src1, src2_r, src3_r);
131    MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1, tmp2,
132         tmp3);
133    ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0,
134                tmp1, tmp2, tmp3);
135    MAXI_SH4_SH(tmp0, tmp1, tmp2, tmp3, 0);
136    SRLR_H4_SH(tmp0, tmp1, tmp2, tmp3, denom);
137    SAT_UH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
138    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
139    ST_D4(src0, src1, 0, 1, 0, 1, data, stride);
140}
141
142static void avc_wgt_8x8_msa(uint8_t *data, ptrdiff_t stride, int32_t log2_denom,
143                            int32_t src_weight, int32_t offset_in)
144{
145    uint32_t offset_val;
146    uint64_t tp0, tp1, tp2, tp3;
147    v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
148    v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
149    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
150    v8i16 wgt, denom, offset;
151
152    offset_val = (unsigned) offset_in << log2_denom;
153
154    wgt = __msa_fill_h(src_weight);
155    offset = __msa_fill_h(offset_val);
156    denom = __msa_fill_h(log2_denom);
157
158    LD4(data, stride, tp0, tp1, tp2, tp3);
159    INSERT_D2_UB(tp0, tp1, src0);
160    INSERT_D2_UB(tp2, tp3, src1);
161    LD4(data + 4 * stride, stride, tp0, tp1, tp2, tp3);
162    INSERT_D2_UB(tp0, tp1, src2);
163    INSERT_D2_UB(tp2, tp3, src3);
164    UNPCK_UB_SH(src0, src0_r, src1_r);
165    UNPCK_UB_SH(src1, src2_r, src3_r);
166    UNPCK_UB_SH(src2, src4_r, src5_r);
167    UNPCK_UB_SH(src3, src6_r, src7_r);
168    MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1, tmp2,
169         tmp3);
170    MUL4(wgt, src4_r, wgt, src5_r, wgt, src6_r, wgt, src7_r, tmp4, tmp5, tmp6,
171         tmp7);
172    ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0,
173                tmp1, tmp2, tmp3);
174    ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset, tmp4,
175                tmp5, tmp6, tmp7);
176    MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
177    SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
178    SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
179    PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, src0, src1,
180                src2, src3);
181    ST_D8(src0, src1, src2, src3, 0, 1, 0, 1, 0, 1, 0, 1, data, stride);
182}
183
184static void avc_wgt_8x16_msa(uint8_t *data, ptrdiff_t stride,
185                             int32_t log2_denom, int32_t src_weight,
186                             int32_t offset_in)
187{
188    uint32_t offset_val, cnt;
189    uint64_t tp0, tp1, tp2, tp3;
190    v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
191    v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
192    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
193    v8i16 wgt, denom, offset;
194
195    offset_val = (unsigned) offset_in << log2_denom;
196
197    wgt = __msa_fill_h(src_weight);
198    offset = __msa_fill_h(offset_val);
199    denom = __msa_fill_h(log2_denom);
200
201    for (cnt = 2; cnt--;) {
202        LD4(data, stride, tp0, tp1, tp2, tp3);
203        INSERT_D2_UB(tp0, tp1, src0);
204        INSERT_D2_UB(tp2, tp3, src1);
205        LD4(data + 4 * stride, stride, tp0, tp1, tp2, tp3);
206        INSERT_D2_UB(tp0, tp1, src2);
207        INSERT_D2_UB(tp2, tp3, src3);
208        UNPCK_UB_SH(src0, src0_r, src1_r);
209        UNPCK_UB_SH(src1, src2_r, src3_r);
210        UNPCK_UB_SH(src2, src4_r, src5_r);
211        UNPCK_UB_SH(src3, src6_r, src7_r);
212        MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1,
213             tmp2, tmp3);
214        MUL4(wgt, src4_r, wgt, src5_r, wgt, src6_r, wgt, src7_r, tmp4, tmp5,
215             tmp6, tmp7);
216        ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset,
217                    tmp0, tmp1, tmp2, tmp3);
218        ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset,
219                    tmp4, tmp5, tmp6, tmp7);
220        MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
221        SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
222        SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
223        PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, src0, src1,
224                    src2, src3);
225        ST_D8(src0, src1, src2, src3, 0, 1, 0, 1, 0, 1, 0, 1, data, stride);
226        data += 8 * stride;
227    }
228}
229
230static void avc_biwgt_4x2_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
231                              int32_t log2_denom, int32_t src_weight,
232                              int32_t dst_weight, int32_t offset_in)
233{
234    uint32_t tp0, tp1;
235    v16i8 src_wgt, dst_wgt, wgt, vec0;
236    v16u8 src0 = { 0 }, dst0 = { 0 };
237    v8i16 tmp0, denom, offset, max255 = __msa_ldi_h(255);
238
239    offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
240    offset_in += (128 * (src_weight + dst_weight));
241
242    src_wgt = __msa_fill_b(src_weight);
243    dst_wgt = __msa_fill_b(dst_weight);
244    offset = __msa_fill_h(offset_in);
245    denom = __msa_fill_h(log2_denom + 1);
246
247    wgt = __msa_ilvev_b(dst_wgt, src_wgt);
248
249    LW2(src, stride, tp0, tp1);
250    INSERT_W2_UB(tp0, tp1, src0);
251    LW2(dst, stride, tp0, tp1);
252    INSERT_W2_UB(tp0, tp1, dst0);
253    XORI_B2_128_UB(src0, dst0);
254    vec0 = (v16i8) __msa_ilvr_b((v16i8) dst0, (v16i8) src0);
255    tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
256    tmp0 >>= denom;
257    tmp0 = __msa_maxi_s_h(tmp0, 0);
258    tmp0 = __msa_min_s_h(max255, tmp0);
259    dst0 = (v16u8) __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
260    ST_W2(dst0, 0, 1, dst, stride);
261}
262
263static void avc_biwgt_4x4_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
264                              int32_t log2_denom, int32_t src_weight,
265                              int32_t dst_weight, int32_t offset_in)
266{
267    uint32_t tp0, tp1, tp2, tp3;
268    v16i8 src_wgt, dst_wgt, wgt, vec0, vec1;
269    v16u8 src0, dst0;
270    v8i16 tmp0, tmp1, denom, offset;
271
272    offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
273    offset_in += (128 * (src_weight + dst_weight));
274
275    src_wgt = __msa_fill_b(src_weight);
276    dst_wgt = __msa_fill_b(dst_weight);
277    offset = __msa_fill_h(offset_in);
278    denom = __msa_fill_h(log2_denom + 1);
279
280    wgt = __msa_ilvev_b(dst_wgt, src_wgt);
281
282    LW4(src, stride, tp0, tp1, tp2, tp3);
283    INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
284    LW4(dst, stride, tp0, tp1, tp2, tp3);
285    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
286    XORI_B2_128_UB(src0, dst0);
287    ILVRL_B2_SB(dst0, src0, vec0, vec1);
288    tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
289    tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
290    tmp0 >>= denom;
291    tmp1 >>= denom;
292    CLIP_SH2_0_255(tmp0, tmp1);
293    dst0 = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
294    ST_W4(dst0, 0, 1, 2, 3, dst, stride);
295}
296
297static void avc_biwgt_4x8_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
298                              int32_t log2_denom, int32_t src_weight,
299                              int32_t dst_weight, int32_t offset_in)
300{
301    uint32_t tp0, tp1, tp2, tp3;
302    v16i8 src_wgt, dst_wgt, wgt, vec0, vec1, vec2, vec3;
303    v16u8 src0, src1, dst0, dst1;
304    v8i16 tmp0, tmp1, tmp2, tmp3, denom, offset;
305
306    offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
307    offset_in += (128 * (src_weight + dst_weight));
308
309    src_wgt = __msa_fill_b(src_weight);
310    dst_wgt = __msa_fill_b(dst_weight);
311    offset = __msa_fill_h(offset_in);
312    denom = __msa_fill_h(log2_denom + 1);
313    wgt = __msa_ilvev_b(dst_wgt, src_wgt);
314
315    LW4(src, stride, tp0, tp1, tp2, tp3);
316    src += 4 * stride;
317    INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
318    LW4(src, stride, tp0, tp1, tp2, tp3);
319    INSERT_W4_UB(tp0, tp1, tp2, tp3, src1);
320    LW4(dst, stride, tp0, tp1, tp2, tp3);
321    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
322    LW4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
323    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
324    XORI_B4_128_UB(src0, src1, dst0, dst1);
325    ILVRL_B2_SB(dst0, src0, vec0, vec1);
326    ILVRL_B2_SB(dst1, src1, vec2, vec3);
327    tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
328    tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
329    tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
330    tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
331    SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
332    CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
333    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
334    ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
335}
336
337static void avc_biwgt_8x4_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
338                              int32_t log2_denom, int32_t src_weight,
339                              int32_t dst_weight, int32_t offset_in)
340{
341    uint64_t tp0, tp1, tp2, tp3;
342    v16i8 src_wgt, dst_wgt, wgt, vec0, vec1, vec2, vec3;
343    v16u8 src0, src1, dst0, dst1;
344    v8i16 tmp0, tmp1, tmp2, tmp3, denom, offset;
345
346    offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
347    offset_in += (128 * (src_weight + dst_weight));
348
349    src_wgt = __msa_fill_b(src_weight);
350    dst_wgt = __msa_fill_b(dst_weight);
351    offset = __msa_fill_h(offset_in);
352    denom = __msa_fill_h(log2_denom + 1);
353
354    wgt = __msa_ilvev_b(dst_wgt, src_wgt);
355
356    LD4(src, stride, tp0, tp1, tp2, tp3);
357    INSERT_D2_UB(tp0, tp1, src0);
358    INSERT_D2_UB(tp2, tp3, src1);
359    LD4(dst, stride, tp0, tp1, tp2, tp3);
360    INSERT_D2_UB(tp0, tp1, dst0);
361    INSERT_D2_UB(tp2, tp3, dst1);
362    XORI_B4_128_UB(src0, src1, dst0, dst1);
363    ILVRL_B2_SB(dst0, src0, vec0, vec1);
364    ILVRL_B2_SB(dst1, src1, vec2, vec3);
365    tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
366    tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
367    tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
368    tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
369    SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
370    CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
371    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
372    ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
373}
374
375static void avc_biwgt_8x8_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
376                              int32_t log2_denom, int32_t src_weight,
377                              int32_t dst_weight, int32_t offset_in)
378{
379    uint64_t tp0, tp1, tp2, tp3;
380    v16i8 src_wgt, dst_wgt, wgt, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
381    v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
382    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom, offset;
383
384    offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
385    offset_in += (128 * (src_weight + dst_weight));
386
387    src_wgt = __msa_fill_b(src_weight);
388    dst_wgt = __msa_fill_b(dst_weight);
389    offset = __msa_fill_h(offset_in);
390    denom = __msa_fill_h(log2_denom + 1);
391    wgt = __msa_ilvev_b(dst_wgt, src_wgt);
392
393    LD4(src, stride, tp0, tp1, tp2, tp3);
394    INSERT_D2_UB(tp0, tp1, src0);
395    INSERT_D2_UB(tp2, tp3, src1);
396    LD4(src + 4 * stride, stride, tp0, tp1, tp2, tp3);
397    INSERT_D2_UB(tp0, tp1, src2);
398    INSERT_D2_UB(tp2, tp3, src3);
399    LD4(dst, stride, tp0, tp1, tp2, tp3);
400    INSERT_D2_UB(tp0, tp1, dst0);
401    INSERT_D2_UB(tp2, tp3, dst1);
402    LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
403    INSERT_D2_UB(tp0, tp1, dst2);
404    INSERT_D2_UB(tp2, tp3, dst3);
405    XORI_B8_128_UB(src0, src1, src2, src3, dst0, dst1, dst2, dst3);
406    ILVRL_B2_SB(dst0, src0, vec0, vec1);
407    ILVRL_B2_SB(dst1, src1, vec2, vec3);
408    ILVRL_B2_SB(dst2, src2, vec4, vec5);
409    ILVRL_B2_SB(dst3, src3, vec6, vec7);
410    tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
411    tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
412    tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
413    tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
414    tmp4 = __msa_dpadd_s_h(offset, wgt, vec4);
415    tmp5 = __msa_dpadd_s_h(offset, wgt, vec5);
416    tmp6 = __msa_dpadd_s_h(offset, wgt, vec6);
417    tmp7 = __msa_dpadd_s_h(offset, wgt, vec7);
418    SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
419    SRA_4V(tmp4, tmp5, tmp6, tmp7, denom);
420    CLIP_SH8_0_255(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
421    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
422    PCKEV_B2_UB(tmp5, tmp4, tmp7, tmp6, dst2, dst3);
423    ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
424}
425
426static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
427                               int32_t log2_denom, int32_t src_weight,
428                               int32_t dst_weight, int32_t offset_in)
429{
430    uint8_t cnt;
431    uint64_t tp0, tp1, tp2, tp3;
432    v16i8 src_wgt, dst_wgt, wgt;
433    v16u8 src0, src1, src2, src3;
434    v16u8 dst0, dst1, dst2, dst3;
435    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
436    v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
437    v8i16 denom, offset;
438
439    offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
440    offset_in += (128 * (src_weight + dst_weight));
441
442    src_wgt = __msa_fill_b(src_weight);
443    dst_wgt = __msa_fill_b(dst_weight);
444    offset = __msa_fill_h(offset_in);
445    denom = __msa_fill_h(log2_denom + 1);
446    wgt = __msa_ilvev_b(dst_wgt, src_wgt);
447
448    for (cnt = 2; cnt--;) {
449        LD4(src, stride, tp0, tp1, tp2, tp3);
450        src += 4 * stride;
451        INSERT_D2_UB(tp0, tp1, src0);
452        INSERT_D2_UB(tp2, tp3, src1);
453        LD4(src, stride, tp0, tp1, tp2, tp3);
454        src += 4 * stride;
455        INSERT_D2_UB(tp0, tp1, src2);
456        INSERT_D2_UB(tp2, tp3, src3);
457        LD4(dst, stride, tp0, tp1, tp2, tp3);
458        INSERT_D2_UB(tp0, tp1, dst0);
459        INSERT_D2_UB(tp2, tp3, dst1);
460        LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
461        INSERT_D2_UB(tp0, tp1, dst2);
462        INSERT_D2_UB(tp2, tp3, dst3);
463        XORI_B4_128_UB(src0, src1, src2, src3);
464        XORI_B4_128_UB(dst0, dst1, dst2, dst3);
465        ILVR_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3,
466                   vec0, vec2, vec4, vec6);
467        ILVL_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3,
468                   vec1, vec3, vec5, vec7);
469
470        temp0 = __msa_dpadd_s_h(offset, wgt, vec0);
471        temp1 = __msa_dpadd_s_h(offset, wgt, vec1);
472        temp2 = __msa_dpadd_s_h(offset, wgt, vec2);
473        temp3 = __msa_dpadd_s_h(offset, wgt, vec3);
474        temp4 = __msa_dpadd_s_h(offset, wgt, vec4);
475        temp5 = __msa_dpadd_s_h(offset, wgt, vec5);
476        temp6 = __msa_dpadd_s_h(offset, wgt, vec6);
477        temp7 = __msa_dpadd_s_h(offset, wgt, vec7);
478
479        SRA_4V(temp0, temp1, temp2, temp3, denom);
480        SRA_4V(temp4, temp5, temp6, temp7, denom);
481        CLIP_SH8_0_255(temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7);
482        PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6,
483                    dst0, dst1, dst2, dst3);
484        ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
485        dst += 8 * stride;
486    }
487}
488
489#define AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_or_q3_org_in, p0_or_q0_org_in,          \
490                                 q3_or_p3_org_in, p1_or_q1_org_in,          \
491                                 p2_or_q2_org_in, q1_or_p1_org_in,          \
492                                 p0_or_q0_out, p1_or_q1_out, p2_or_q2_out)  \
493{                                                                           \
494    v8i16 threshold;                                                        \
495    v8i16 const3 = __msa_ldi_h(3);                                          \
496                                                                            \
497    threshold = (p0_or_q0_org_in) + (q3_or_p3_org_in);                      \
498    threshold += (p1_or_q1_org_in);                                         \
499                                                                            \
500    (p0_or_q0_out) = threshold << 1;                                        \
501    (p0_or_q0_out) += (p2_or_q2_org_in);                                    \
502    (p0_or_q0_out) += (q1_or_p1_org_in);                                    \
503    (p0_or_q0_out) = __msa_srari_h((p0_or_q0_out), 3);                      \
504                                                                            \
505    (p1_or_q1_out) = (p2_or_q2_org_in) + threshold;                         \
506    (p1_or_q1_out) = __msa_srari_h((p1_or_q1_out), 2);                      \
507                                                                            \
508    (p2_or_q2_out) = (p2_or_q2_org_in) * const3;                            \
509    (p2_or_q2_out) += (p3_or_q3_org_in);                                    \
510    (p2_or_q2_out) += (p3_or_q3_org_in);                                    \
511    (p2_or_q2_out) += threshold;                                            \
512    (p2_or_q2_out) = __msa_srari_h((p2_or_q2_out), 3);                      \
513}
514
515/* data[-u32_img_width] = (uint8_t)((2 * p1 + p0 + q1 + 2) >> 2); */
516#define AVC_LPF_P0_OR_Q0(p0_or_q0_org_in, q1_or_p1_org_in,   \
517                         p1_or_q1_org_in, p0_or_q0_out)      \
518{                                                            \
519    (p0_or_q0_out) = (p0_or_q0_org_in) + (q1_or_p1_org_in);  \
520    (p0_or_q0_out) += (p1_or_q1_org_in);                     \
521    (p0_or_q0_out) += (p1_or_q1_org_in);                     \
522    (p0_or_q0_out) = __msa_srari_h((p0_or_q0_out), 2);       \
523}
524
525#define AVC_LPF_P1_OR_Q1(p0_or_q0_org_in, q0_or_p0_org_in,    \
526                         p1_or_q1_org_in, p2_or_q2_org_in,    \
527                         negate_tc_in, tc_in, p1_or_q1_out)   \
528{                                                             \
529    v8i16 clip3, temp;                                        \
530                                                              \
531    clip3 = (v8i16) __msa_aver_u_h((v8u16) p0_or_q0_org_in,   \
532                                   (v8u16) q0_or_p0_org_in);  \
533    temp = p1_or_q1_org_in << 1;                              \
534    clip3 = clip3 - temp;                                     \
535    clip3 = __msa_ave_s_h(p2_or_q2_org_in, clip3);            \
536    CLIP_SH(clip3, negate_tc_in, tc_in);                      \
537    p1_or_q1_out = p1_or_q1_org_in + clip3;                   \
538}
539
540#define AVC_LPF_P0Q0(q0_or_p0_org_in, p0_or_q0_org_in,          \
541                     p1_or_q1_org_in, q1_or_p1_org_in,          \
542                     negate_threshold_in, threshold_in,         \
543                     p0_or_q0_out, q0_or_p0_out)                \
544{                                                               \
545    v8i16 q0_sub_p0, p1_sub_q1, delta;                          \
546                                                                \
547    q0_sub_p0 = q0_or_p0_org_in - p0_or_q0_org_in;              \
548    p1_sub_q1 = p1_or_q1_org_in - q1_or_p1_org_in;              \
549    q0_sub_p0 <<= 2;                                            \
550    p1_sub_q1 += 4;                                             \
551    delta = q0_sub_p0 + p1_sub_q1;                              \
552    delta >>= 3;                                                \
553                                                                \
554    CLIP_SH(delta, negate_threshold_in, threshold_in);          \
555                                                                \
556    p0_or_q0_out = p0_or_q0_org_in + delta;                     \
557    q0_or_p0_out = q0_or_p0_org_in - delta;                     \
558                                                                \
559    CLIP_SH2_0_255(p0_or_q0_out, q0_or_p0_out);                 \
560}
561
562#define AVC_LPF_H_CHROMA_422(src, stride, tc_val, alpha, beta, res)      \
563{                                                                        \
564    uint32_t load0, load1, load2, load3;                                 \
565    v16u8 src0 = { 0 };                                                  \
566    v16u8 src1 = { 0 };                                                  \
567    v16u8 src2 = { 0 };                                                  \
568    v16u8 src3 = { 0 };                                                  \
569    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;                            \
570    v16u8 is_less_than, is_less_than_alpha, is_less_than_beta;           \
571    v8i16 tc, q0_sub_p0, p1_sub_q1, delta;                               \
572    v8i16 res0_r, res1_r;                                                \
573    v16i8 zeros = { 0 };                                                 \
574    v16u8 res0, res1;                                                    \
575                                                                         \
576    LW4((src - 2), stride, load0, load1, load2, load3);                  \
577    src0 = (v16u8) __msa_insert_w((v4i32) src0, 0, load0);               \
578    src1 = (v16u8) __msa_insert_w((v4i32) src1, 0, load1);               \
579    src2 = (v16u8) __msa_insert_w((v4i32) src2, 0, load2);               \
580    src3 = (v16u8) __msa_insert_w((v4i32) src3, 0, load3);               \
581                                                                         \
582    TRANSPOSE4x4_UB_UB(src0, src1, src2, src3, src0, src1, src2, src3);  \
583                                                                         \
584    p0_asub_q0 = __msa_asub_u_b(src2, src1);                             \
585    p1_asub_p0 = __msa_asub_u_b(src1, src0);                             \
586    q1_asub_q0 = __msa_asub_u_b(src2, src3);                             \
587                                                                         \
588    tc = __msa_fill_h(tc_val);                                           \
589                                                                         \
590    is_less_than_alpha = (p0_asub_q0 < alpha);                           \
591    is_less_than_beta = (p1_asub_p0 < beta);                             \
592    is_less_than = is_less_than_alpha & is_less_than_beta;               \
593    is_less_than_beta = (q1_asub_q0 < beta);                             \
594    is_less_than = is_less_than_beta & is_less_than;                     \
595                                                                         \
596    ILVR_B2_SH(src2, src1, src0, src3, q0_sub_p0, p1_sub_q1);            \
597    HSUB_UB2_SH(q0_sub_p0, p1_sub_q1, q0_sub_p0, p1_sub_q1);             \
598                                                                         \
599    q0_sub_p0 <<= 2;                                                     \
600    delta = q0_sub_p0 + p1_sub_q1;                                       \
601    delta = __msa_srari_h(delta, 3);                                     \
602                                                                         \
603    CLIP_SH(delta, -tc, tc);                                             \
604                                                                         \
605    ILVR_B2_SH(zeros, src1, zeros, src2, res0_r, res1_r);                \
606                                                                         \
607    res0_r += delta;                                                     \
608    res1_r -= delta;                                                     \
609                                                                         \
610    CLIP_SH2_0_255(res0_r, res1_r);                                      \
611    PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1);             \
612                                                                         \
613    res0 = __msa_bmnz_v(src1, res0, is_less_than);                       \
614    res1 = __msa_bmnz_v(src2, res1, is_less_than);                       \
615                                                                         \
616    res = (v16u8) __msa_ilvr_b((v16i8) res1, (v16i8) res0);              \
617}
618
619#define TRANSPOSE2x4_B_UB(in0, in1, out0, out1, out2, out3)  \
620{                                                            \
621    v16i8 zero_m = { 0 };                                    \
622                                                             \
623    out0 = (v16u8) __msa_ilvr_b((v16i8) in1, (v16i8) in0);   \
624    out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 2);    \
625    SLDI_B2_UB(zero_m, out1, zero_m, out2, 2, out2, out3);   \
626}
627
628#define AVC_LPF_H_2BYTE_CHROMA_422(src, stride, tc_val, alpha, beta, res)  \
629{                                                                          \
630    uint32_t load0, load1;                                                 \
631    v16u8 src0 = { 0 };                                                    \
632    v16u8 src1 = { 0 };                                                    \
633    v16u8 src2 = { 0 };                                                    \
634    v16u8 src3 = { 0 };                                                    \
635    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;                              \
636    v16u8 is_less_than, is_less_than_alpha, is_less_than_beta;             \
637    v8i16 tc, q0_sub_p0, p1_sub_q1, delta, res0_r, res1_r;                 \
638    v16i8 zeros = { 0 };                                                   \
639    v16u8 res0, res1;                                                      \
640                                                                           \
641    load0 = LW(src - 2);                                                   \
642    load1 = LW(src - 2 + stride);                                          \
643                                                                           \
644    src0 = (v16u8) __msa_insert_w((v4i32) src0, 0, load0);                 \
645    src1 = (v16u8) __msa_insert_w((v4i32) src1, 0, load1);                 \
646                                                                           \
647    TRANSPOSE2x4_B_UB(src0, src1, src0, src1, src2, src3);                 \
648                                                                           \
649    p0_asub_q0 = __msa_asub_u_b(src2, src1);                               \
650    p1_asub_p0 = __msa_asub_u_b(src1, src0);                               \
651    q1_asub_q0 = __msa_asub_u_b(src2, src3);                               \
652                                                                           \
653    tc = __msa_fill_h(tc_val);                                             \
654                                                                           \
655    is_less_than_alpha = (p0_asub_q0 < alpha);                             \
656    is_less_than_beta = (p1_asub_p0 < beta);                               \
657    is_less_than = is_less_than_alpha & is_less_than_beta;                 \
658    is_less_than_beta = (q1_asub_q0 < beta);                               \
659    is_less_than = is_less_than_beta & is_less_than;                       \
660                                                                           \
661    ILVR_B2_SH(src2, src1, src0, src3, q0_sub_p0, p1_sub_q1);              \
662    HSUB_UB2_SH(q0_sub_p0, p1_sub_q1, q0_sub_p0, p1_sub_q1);               \
663                                                                           \
664    q0_sub_p0 <<= 2;                                                       \
665    delta = q0_sub_p0 + p1_sub_q1;                                         \
666    delta = __msa_srari_h(delta, 3);                                       \
667    CLIP_SH(delta, -tc, tc);                                               \
668                                                                           \
669    ILVR_B2_SH(zeros, src1, zeros, src2, res0_r, res1_r);                  \
670                                                                           \
671    res0_r += delta;                                                       \
672    res1_r -= delta;                                                       \
673                                                                           \
674    CLIP_SH2_0_255(res0_r, res1_r);                                        \
675    PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1);               \
676                                                                           \
677    res0 = __msa_bmnz_v(src1, res0, is_less_than);                         \
678    res1 = __msa_bmnz_v(src2, res1, is_less_than);                         \
679                                                                           \
680    res = (v16u8) __msa_ilvr_b((v16i8) res1, (v16i8) res0);                \
681}
682
683static void avc_loopfilter_luma_intra_edge_hor_msa(uint8_t *data,
684                                                   uint8_t alpha_in,
685                                                   uint8_t beta_in,
686                                                   ptrdiff_t img_width)
687{
688    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
689    v16u8 is_less_than, is_less_than_beta, is_less_than_alpha;
690    v16u8 p1_org, p0_org, q0_org, q1_org;
691
692    LD_UB4(data - (img_width << 1), img_width, p1_org, p0_org, q0_org, q1_org);
693
694    p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
695    p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
696    q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
697
698    is_less_than_alpha = (p0_asub_q0 < alpha_in);
699    is_less_than_beta = (p1_asub_p0 < beta_in);
700    is_less_than = is_less_than_beta & is_less_than_alpha;
701    is_less_than_beta = (q1_asub_q0 < beta_in);
702    is_less_than = is_less_than_beta & is_less_than;
703
704    if (!__msa_test_bz_v(is_less_than)) {
705        v16u8 p2_asub_p0, q2_asub_q0, p0, q0, negate_is_less_than_beta;
706        v8i16 p0_r = { 0 };
707        v8i16 q0_r = { 0 };
708        v8i16 p0_l = { 0 };
709        v8i16 q0_l = { 0 };
710        v16i8 zero = { 0 };
711        v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
712        v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
713        v16u8 q2_org = LD_UB(data + (2 * img_width));
714        v16u8 p2_org = LD_UB(data - (3 * img_width));
715        v16u8 tmp_flag = (v16u8)__msa_fill_b((alpha_in >> 2) + 2);
716
717        UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l);
718        UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
719        UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);
720
721        tmp_flag = (p0_asub_q0 < tmp_flag);
722
723        p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
724        is_less_than_beta = (p2_asub_p0 < beta_in);
725        is_less_than_beta = is_less_than_beta & tmp_flag;
726        negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
727        is_less_than_beta = is_less_than_beta & is_less_than;
728        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
729
730        q1_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q1_org);
731        q1_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q1_org);
732
733        /* combine and store */
734        if (!__msa_test_bz_v(is_less_than_beta)) {
735            v8i16 p3_org_l, p3_org_r;
736            v16u8 p3_org = LD_UB(data - (img_width << 2));
737            v16u8 p2, p1;
738            v8i16 p2_r = { 0 };
739            v8i16 p2_l = { 0 };
740            v8i16 p1_r = { 0 };
741            v8i16 p1_l = { 0 };
742
743            ILVR_B2_SH(zero, p3_org, zero, p2_org, p3_org_r, p2_r);
744            AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_r, p0_org_r, q0_org_r, p1_org_r,
745                                     p2_r, q1_org_r, p0_r, p1_r, p2_r);
746
747            ILVL_B2_SH(zero, p3_org, zero, p2_org, p3_org_l, p2_l);
748            AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_l, p0_org_l, q0_org_l, p1_org_l,
749                                     p2_l, q1_org_l, p0_l, p1_l, p2_l);
750
751            PCKEV_B3_UB(p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p0, p1, p2);
752
753            p0_org = __msa_bmnz_v(p0_org, p0, is_less_than_beta);
754            p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
755            p2_org = __msa_bmnz_v(p2_org, p2, is_less_than_beta);
756
757            ST_UB(p1_org, data - (2 * img_width));
758            ST_UB(p2_org, data - (3 * img_width));
759        }
760
761        AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_r);
762        AVC_LPF_P0_OR_Q0(p0_org_l, q1_org_l, p1_org_l, p0_l);
763
764        /* combine */
765        p0 = (v16u8) __msa_pckev_b((v16i8) p0_l, (v16i8) p0_r);
766        p0_org = __msa_bmnz_v(p0_org, p0, negate_is_less_than_beta);
767
768        ST_UB(p0_org, data - img_width);
769
770        /* if (tmpFlag && (unsigned)ABS(q2-q0) < thresholds->beta_in) */
771        q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
772        is_less_than_beta = (q2_asub_q0 < beta_in);
773        is_less_than_beta = is_less_than_beta & tmp_flag;
774        negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
775        is_less_than_beta = is_less_than_beta & is_less_than;
776        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
777
778        /* combine and store */
779        if (!__msa_test_bz_v(is_less_than_beta)) {
780            v8i16 q3_org_r, q3_org_l;
781            v16u8 q3_org = LD_UB(data + (3 * img_width));
782            v16u8 q1, q2;
783            v8i16 q2_r = { 0 };
784            v8i16 q2_l = { 0 };
785            v8i16 q1_r = { 0 };
786            v8i16 q1_l = { 0 };
787
788            ILVR_B2_SH(zero, q3_org, zero, q2_org, q3_org_r, q2_r);
789            AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_r, q0_org_r, p0_org_r, q1_org_r,
790                                     q2_r, p1_org_r, q0_r, q1_r, q2_r);
791
792            ILVL_B2_SH(zero, q3_org, zero, q2_org, q3_org_l, q2_l);
793            AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_l, q0_org_l, p0_org_l, q1_org_l,
794                                     q2_l, p1_org_l, q0_l, q1_l, q2_l);
795
796            PCKEV_B3_UB(q0_l, q0_r, q1_l, q1_r, q2_l, q2_r, q0, q1, q2);
797            q0_org = __msa_bmnz_v(q0_org, q0, is_less_than_beta);
798            q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
799            q2_org = __msa_bmnz_v(q2_org, q2, is_less_than_beta);
800
801            ST_UB(q1_org, data + img_width);
802            ST_UB(q2_org, data + 2 * img_width);
803        }
804
805        AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_r);
806        AVC_LPF_P0_OR_Q0(q0_org_l, p1_org_l, q1_org_l, q0_l);
807
808        /* combine */
809        q0 = (v16u8) __msa_pckev_b((v16i8) q0_l, (v16i8) q0_r);
810        q0_org = __msa_bmnz_v(q0_org, q0, negate_is_less_than_beta);
811
812        ST_UB(q0_org, data);
813    }
814}
815
816static void avc_loopfilter_luma_intra_edge_ver_msa(uint8_t *data,
817                                                   uint8_t alpha_in,
818                                                   uint8_t beta_in,
819                                                   ptrdiff_t img_width)
820{
821    uint8_t *src = data - 4;
822    v16u8 alpha, beta, p0_asub_q0;
823    v16u8 is_less_than_alpha, is_less_than, is_less_than_beta;
824    v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
825    v16u8 p1_asub_p0, q1_asub_q0;
826
827
828    {
829        v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
830        v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
831
832        LD_UB8(src, img_width, row0, row1, row2, row3, row4, row5, row6, row7);
833        LD_UB8(src + (8 * img_width), img_width,
834               row8, row9, row10, row11, row12, row13, row14, row15);
835
836        TRANSPOSE16x8_UB_UB(row0, row1, row2, row3,
837                            row4, row5, row6, row7,
838                            row8, row9, row10, row11,
839                            row12, row13, row14, row15,
840                            p3_org, p2_org, p1_org, p0_org,
841                            q0_org, q1_org, q2_org, q3_org);
842    }
843
844    p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
845    p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
846    q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
847
848    alpha = (v16u8) __msa_fill_b(alpha_in);
849    beta = (v16u8) __msa_fill_b(beta_in);
850
851    is_less_than_alpha = (p0_asub_q0 < alpha);
852    is_less_than_beta = (p1_asub_p0 < beta);
853    is_less_than = is_less_than_beta & is_less_than_alpha;
854    is_less_than_beta = (q1_asub_q0 < beta);
855    is_less_than = is_less_than_beta & is_less_than;
856
857    if (!__msa_test_bz_v(is_less_than)) {
858        v8i16 p0_r = { 0 };
859        v8i16 q0_r = { 0 };
860        v8i16 p0_l = { 0 };
861        v8i16 q0_l = { 0 };
862        v16i8 zero = { 0 };
863        v16u8 tmp_flag, p0, q0, p2_asub_p0, q2_asub_q0;
864        v16u8 negate_is_less_than_beta;
865        v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
866        v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
867
868        UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l);
869        UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
870        UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);
871        UNPCK_UB_SH(q1_org, q1_org_r, q1_org_l);
872
873        tmp_flag = alpha >> 2;
874        tmp_flag = tmp_flag + 2;
875        tmp_flag = (p0_asub_q0 < tmp_flag);
876
877        p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
878        is_less_than_beta = (p2_asub_p0 < beta);
879        is_less_than_beta = tmp_flag & is_less_than_beta;
880        negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
881        is_less_than_beta = is_less_than_beta & is_less_than;
882        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
883
884        if (!__msa_test_bz_v(is_less_than_beta)) {
885            v16u8 p2, p1;
886            v8i16 p3_org_r, p3_org_l;
887            v8i16 p2_l = { 0 };
888            v8i16 p2_r = { 0 };
889            v8i16 p1_l = { 0 };
890            v8i16 p1_r = { 0 };
891
892            ILVR_B2_SH(zero, p3_org, zero, p2_org, p3_org_r, p2_r);
893            AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_r, p0_org_r, q0_org_r, p1_org_r,
894                                     p2_r, q1_org_r, p0_r, p1_r, p2_r);
895
896            ILVL_B2_SH(zero, p3_org, zero, p2_org, p3_org_l, p2_l);
897            AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_l, p0_org_l, q0_org_l, p1_org_l,
898                                         p2_l, q1_org_l, p0_l, p1_l, p2_l);
899
900            PCKEV_B3_UB(p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p0, p1, p2);
901            p0_org = __msa_bmnz_v(p0_org, p0, is_less_than_beta);
902            p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
903            p2_org = __msa_bmnz_v(p2_org, p2, is_less_than_beta);
904        }
905
906        AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_r);
907        AVC_LPF_P0_OR_Q0(p0_org_l, q1_org_l, p1_org_l, p0_l);
908
909        p0 = (v16u8) __msa_pckev_b((v16i8) p0_l, (v16i8) p0_r);
910        p0_org = __msa_bmnz_v(p0_org, p0, negate_is_less_than_beta);
911
912        q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
913        is_less_than_beta = (q2_asub_q0 < beta);
914
915        is_less_than_beta = is_less_than_beta & tmp_flag;
916        negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
917
918        is_less_than_beta = is_less_than_beta & is_less_than;
919        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
920
921        if (!__msa_test_bz_v(is_less_than_beta)) {
922            v16u8 q1, q2;
923            v8i16 q3_org_r, q3_org_l;
924            v8i16 q1_l = { 0 };
925            v8i16 q1_r = { 0 };
926            v8i16 q2_l = { 0 };
927            v8i16 q2_r = { 0 };
928
929            ILVR_B2_SH(zero, q3_org, zero, q2_org, q3_org_r, q2_r);
930            AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_r, q0_org_r, p0_org_r, q1_org_r,
931                                     q2_r, p1_org_r, q0_r, q1_r, q2_r);
932
933            ILVL_B2_SH(zero, q3_org, zero, q2_org, q3_org_l, q2_l);
934            AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_l, q0_org_l, p0_org_l, q1_org_l,
935                                     q2_l, p1_org_l, q0_l, q1_l, q2_l);
936
937            PCKEV_B3_UB(q0_l, q0_r, q1_l, q1_r, q2_l, q2_r, q0, q1, q2);
938            q0_org = __msa_bmnz_v(q0_org, q0, is_less_than_beta);
939            q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
940            q2_org = __msa_bmnz_v(q2_org, q2, is_less_than_beta);
941        }
942
943        AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_r);
944        AVC_LPF_P0_OR_Q0(q0_org_l, p1_org_l, q1_org_l, q0_l);
945
946        q0 = (v16u8) __msa_pckev_b((v16i8) q0_l, (v16i8) q0_r);
947        q0_org = __msa_bmnz_v(q0_org, q0, negate_is_less_than_beta);
948
949    {
950        v8i16 tp0, tp1, tp2, tp3, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
951
952        ILVRL_B2_SH(p1_org, p2_org, tp0, tp2);
953        ILVRL_B2_SH(q0_org, p0_org, tp1, tp3);
954        ILVRL_B2_SH(q2_org, q1_org, tmp2, tmp5);
955
956        ILVRL_H2_SH(tp1, tp0, tmp3, tmp4);
957        ILVRL_H2_SH(tp3, tp2, tmp6, tmp7);
958
959        src = data - 3;
960        ST_W4(tmp3, 0, 1, 2, 3, src, img_width);
961        ST_H4(tmp2, 0, 1, 2, 3, src + 4, img_width);
962        src += 4 * img_width;
963        ST_W4(tmp4, 0, 1, 2, 3, src, img_width);
964        ST_H4(tmp2, 4, 5, 6, 7, src + 4, img_width);
965        src += 4 * img_width;
966
967        ST_W4(tmp6, 0, 1, 2, 3, src, img_width);
968        ST_H4(tmp5, 0, 1, 2, 3, src + 4, img_width);
969        src += 4 * img_width;
970        ST_W4(tmp7, 0, 1, 2, 3, src, img_width);
971        ST_H4(tmp5, 4, 5, 6, 7, src + 4, img_width);
972    }
973    }
974}
975
976static void avc_h_loop_filter_luma_mbaff_intra_msa(uint8_t *src,
977                                                   ptrdiff_t stride,
978                                                   int32_t alpha_in,
979                                                   int32_t beta_in)
980{
981    uint64_t load0, load1;
982    uint32_t out0, out2;
983    uint16_t out1, out3;
984    v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
985    v8u16 dst0_r, dst1_r, dst4_r, dst5_r;
986    v8u16 dst2_x_r, dst2_y_r, dst3_x_r, dst3_y_r;
987    v16u8 dst0, dst1, dst4, dst5, dst2_x, dst2_y, dst3_x, dst3_y;
988    v8i16 tmp0, tmp1, tmp2, tmp3;
989    v16u8 alpha, beta;
990    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, p2_asub_p0, q2_asub_q0;
991    v16u8 is_less_than, is_less_than_alpha, is_less_than_beta;
992    v16u8 is_less_than_beta1, is_less_than_beta2;
993    v16i8 src0 = { 0 };
994    v16i8 src1 = { 0 };
995    v16i8 src2 = { 0 };
996    v16i8 src3 = { 0 };
997    v16i8 src4 = { 0 };
998    v16i8 src5 = { 0 };
999    v16i8 src6 = { 0 };
1000    v16i8 src7 = { 0 };
1001    v16i8 zeros = { 0 };
1002
1003    load0 = LD(src - 4);
1004    load1 = LD(src + stride - 4);
1005    src0 = (v16i8) __msa_insert_d((v2i64) src0, 0, load0);
1006    src1 = (v16i8) __msa_insert_d((v2i64) src1, 0, load1);
1007
1008    load0 = LD(src + (2 * stride) - 4);
1009    load1 = LD(src + (3 * stride) - 4);
1010    src2 = (v16i8) __msa_insert_d((v2i64) src2, 0, load0);
1011    src3 = (v16i8) __msa_insert_d((v2i64) src3, 0, load1);
1012
1013    load0 = LD(src + (4 * stride) - 4);
1014    load1 = LD(src + (5 * stride) - 4);
1015    src4 = (v16i8) __msa_insert_d((v2i64) src4, 0, load0);
1016    src5 = (v16i8) __msa_insert_d((v2i64) src5, 0, load1);
1017
1018    load0 = LD(src + (6 * stride) - 4);
1019    load1 = LD(src + (7 * stride) - 4);
1020    src6 = (v16i8) __msa_insert_d((v2i64) src6, 0, load0);
1021    src7 = (v16i8) __msa_insert_d((v2i64) src7, 0, load1);
1022
1023    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src7, src6,
1024               src0, src1, src2, src3);
1025
1026    ILVR_H2_SH(src1, src0, src3, src2, tmp0, tmp2);
1027    ILVL_H2_SH(src1, src0, src3, src2, tmp1, tmp3);
1028
1029    ILVR_W2_SB(tmp2, tmp0, tmp3, tmp1, src6, src3);
1030    ILVL_W2_SB(tmp2, tmp0, tmp3, tmp1, src1, src5);
1031    SLDI_B4_SB(zeros, src6, zeros, src1, zeros, src3, zeros, src5,
1032               8, src0, src2, src4, src7);
1033
1034    p0_asub_q0 = __msa_asub_u_b((v16u8) src2, (v16u8) src3);
1035    p1_asub_p0 = __msa_asub_u_b((v16u8) src1, (v16u8) src2);
1036    q1_asub_q0 = __msa_asub_u_b((v16u8) src4, (v16u8) src3);
1037
1038    alpha = (v16u8) __msa_fill_b(alpha_in);
1039    beta = (v16u8) __msa_fill_b(beta_in);
1040
1041    is_less_than_alpha = (p0_asub_q0 < alpha);
1042    is_less_than_beta = (p1_asub_p0 < beta);
1043    is_less_than = is_less_than_alpha & is_less_than_beta;
1044    is_less_than_beta = (q1_asub_q0 < beta);
1045    is_less_than = is_less_than & is_less_than_beta;
1046
1047    alpha >>= 2;
1048    alpha += 2;
1049
1050    is_less_than_alpha = (p0_asub_q0 < alpha);
1051
1052    p2_asub_p0 = __msa_asub_u_b((v16u8) src0, (v16u8) src2);
1053    is_less_than_beta1 = (p2_asub_p0 < beta);
1054    q2_asub_q0 = __msa_asub_u_b((v16u8) src5, (v16u8) src3);
1055    is_less_than_beta2 = (q2_asub_q0 < beta);
1056
1057    ILVR_B4_UH(zeros, src0, zeros, src1, zeros, src2, zeros, src3,
1058               src0_r, src1_r, src2_r, src3_r);
1059    ILVR_B4_UH(zeros, src4, zeros, src5, zeros, src6, zeros, src7,
1060               src4_r, src5_r, src6_r, src7_r);
1061
1062    dst2_x_r = src1_r + src2_r + src3_r;
1063    dst2_x_r = src0_r + (2 * (dst2_x_r)) + src4_r;
1064    dst2_x_r = (v8u16) __msa_srari_h((v8i16) dst2_x_r, 3);
1065    dst1_r = src0_r + src1_r + src2_r + src3_r;
1066    dst1_r = (v8u16) __msa_srari_h((v8i16) dst1_r, 2);
1067
1068    dst0_r = (2 * src6_r) + (3 * src0_r);
1069    dst0_r += src1_r + src2_r + src3_r;
1070    dst0_r = (v8u16) __msa_srari_h((v8i16) dst0_r, 3);
1071    dst2_y_r = (2 * src1_r) + src2_r + src4_r;
1072    dst2_y_r = (v8u16) __msa_srari_h((v8i16) dst2_y_r, 2);
1073
1074    PCKEV_B2_UB(dst2_x_r, dst2_x_r, dst2_y_r, dst2_y_r, dst2_x, dst2_y);
1075    dst2_x = __msa_bmnz_v(dst2_y, dst2_x, is_less_than_beta1);
1076
1077    dst3_x_r = src2_r + src3_r + src4_r;
1078    dst3_x_r = src1_r + (2 * dst3_x_r) + src5_r;
1079    dst3_x_r = (v8u16) __msa_srari_h((v8i16) dst3_x_r, 3);
1080    dst4_r = src2_r + src3_r + src4_r + src5_r;
1081    dst4_r = (v8u16) __msa_srari_h((v8i16) dst4_r, 2);
1082
1083    dst5_r = (2 * src7_r) + (3 * src5_r);
1084    dst5_r += src4_r + src3_r + src2_r;
1085    dst5_r = (v8u16) __msa_srari_h((v8i16) dst5_r, 3);
1086    dst3_y_r = (2 * src4_r) + src3_r + src1_r;
1087    dst3_y_r = (v8u16) __msa_srari_h((v8i16) dst3_y_r, 2);
1088
1089    PCKEV_B2_UB(dst3_x_r, dst3_x_r, dst3_y_r, dst3_y_r, dst3_x, dst3_y);
1090    dst3_x = __msa_bmnz_v(dst3_y, dst3_x, is_less_than_beta2);
1091
1092    dst2_y_r = (2 * src1_r) + src2_r + src4_r;
1093    dst2_y_r = (v8u16) __msa_srari_h((v8i16) dst2_y_r, 2);
1094    dst3_y_r = (2 * src4_r) + src3_r + src1_r;
1095    dst3_y_r = (v8u16) __msa_srari_h((v8i16) dst3_y_r, 2);
1096
1097    PCKEV_B2_UB(dst2_y_r, dst2_y_r, dst3_y_r, dst3_y_r, dst2_y, dst3_y);
1098
1099    dst2_x = __msa_bmnz_v(dst2_y, dst2_x, is_less_than_alpha);
1100    dst3_x = __msa_bmnz_v(dst3_y, dst3_x, is_less_than_alpha);
1101    dst2_x = __msa_bmnz_v((v16u8) src2, dst2_x, is_less_than);
1102    dst3_x = __msa_bmnz_v((v16u8) src3, dst3_x, is_less_than);
1103
1104    is_less_than = is_less_than_alpha & is_less_than;
1105    dst1 = (v16u8) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst1_r);
1106    is_less_than_beta1 = is_less_than_beta1 & is_less_than;
1107    dst1 = __msa_bmnz_v((v16u8) src1, dst1, is_less_than_beta1);
1108
1109    dst0 = (v16u8) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r);
1110    dst0 = __msa_bmnz_v((v16u8) src0, dst0, is_less_than_beta1);
1111    dst4 = (v16u8) __msa_pckev_b((v16i8) dst4_r, (v16i8) dst4_r);
1112    is_less_than_beta2 = is_less_than_beta2 & is_less_than;
1113    dst4 = __msa_bmnz_v((v16u8) src4, dst4, is_less_than_beta2);
1114    dst5 = (v16u8) __msa_pckev_b((v16i8) dst5_r, (v16i8) dst5_r);
1115    dst5 = __msa_bmnz_v((v16u8) src5, dst5, is_less_than_beta2);
1116
1117    ILVR_B2_UB(dst1, dst0, dst3_x, dst2_x, dst0, dst1);
1118    dst2_x = (v16u8) __msa_ilvr_b((v16i8) dst5, (v16i8) dst4);
1119    ILVRL_H2_SH(dst1, dst0, tmp0, tmp1);
1120    ILVRL_H2_SH(zeros, dst2_x, tmp2, tmp3);
1121
1122    ILVR_W2_UB(tmp2, tmp0, tmp3, tmp1, dst0, dst4);
1123    SLDI_B2_UB(zeros, dst0, zeros, dst4, 8, dst1, dst5);
1124    dst2_x = (v16u8) __msa_ilvl_w((v4i32) tmp2, (v4i32) tmp0);
1125    dst2_y = (v16u8) __msa_ilvl_w((v4i32) tmp3, (v4i32) tmp1);
1126    SLDI_B2_UB(zeros, dst2_x, zeros, dst2_y, 8, dst3_x, dst3_y);
1127
1128    out0 = __msa_copy_u_w((v4i32) dst0, 0);
1129    out1 = __msa_copy_u_h((v8i16) dst0, 2);
1130    out2 = __msa_copy_u_w((v4i32) dst1, 0);
1131    out3 = __msa_copy_u_h((v8i16) dst1, 2);
1132
1133    SW(out0, (src - 3));
1134    SH(out1, (src + 1));
1135    src += stride;
1136    SW(out2, (src - 3));
1137    SH(out3, (src + 1));
1138    src += stride;
1139
1140    out0 = __msa_copy_u_w((v4i32) dst2_x, 0);
1141    out1 = __msa_copy_u_h((v8i16) dst2_x, 2);
1142    out2 = __msa_copy_u_w((v4i32) dst3_x, 0);
1143    out3 = __msa_copy_u_h((v8i16) dst3_x, 2);
1144
1145    SW(out0, (src - 3));
1146    SH(out1, (src + 1));
1147    src += stride;
1148    SW(out2, (src - 3));
1149    SH(out3, (src + 1));
1150    src += stride;
1151
1152    out0 = __msa_copy_u_w((v4i32) dst4, 0);
1153    out1 = __msa_copy_u_h((v8i16) dst4, 2);
1154    out2 = __msa_copy_u_w((v4i32) dst5, 0);
1155    out3 = __msa_copy_u_h((v8i16) dst5, 2);
1156
1157    SW(out0, (src - 3));
1158    SH(out1, (src + 1));
1159    src += stride;
1160    SW(out2, (src - 3));
1161    SH(out3, (src + 1));
1162    src += stride;
1163
1164    out0 = __msa_copy_u_w((v4i32) dst2_y, 0);
1165    out1 = __msa_copy_u_h((v8i16) dst2_y, 2);
1166    out2 = __msa_copy_u_w((v4i32) dst3_y, 0);
1167    out3 = __msa_copy_u_h((v8i16) dst3_y, 2);
1168
1169    SW(out0, (src - 3));
1170    SH(out1, (src + 1));
1171    src += stride;
1172    SW(out2, (src - 3));
1173    SH(out3, (src + 1));
1174}
1175
1176static void avc_loopfilter_cb_or_cr_intra_edge_hor_msa(uint8_t *data_cb_or_cr,
1177                                                       uint8_t alpha_in,
1178                                                       uint8_t beta_in,
1179                                                       ptrdiff_t img_width)
1180{
1181    v16u8 alpha, beta;
1182    v16u8 is_less_than;
1183    v8i16 p0_or_q0, q0_or_p0;
1184    v16u8 p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org;
1185    v16i8 zero = { 0 };
1186    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
1187    v16u8 is_less_than_alpha, is_less_than_beta;
1188    v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
1189
1190    alpha = (v16u8) __msa_fill_b(alpha_in);
1191    beta = (v16u8) __msa_fill_b(beta_in);
1192
1193    LD_UB4(data_cb_or_cr - (img_width << 1), img_width,
1194           p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org);
1195
1196    p0_asub_q0 = __msa_asub_u_b(p0_or_q0_org, q0_or_p0_org);
1197    p1_asub_p0 = __msa_asub_u_b(p1_or_q1_org, p0_or_q0_org);
1198    q1_asub_q0 = __msa_asub_u_b(q1_or_p1_org, q0_or_p0_org);
1199
1200    is_less_than_alpha = (p0_asub_q0 < alpha);
1201    is_less_than_beta = (p1_asub_p0 < beta);
1202    is_less_than = is_less_than_beta & is_less_than_alpha;
1203    is_less_than_beta = (q1_asub_q0 < beta);
1204    is_less_than = is_less_than_beta & is_less_than;
1205
1206    is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);
1207
1208    if (!__msa_test_bz_v(is_less_than)) {
1209        ILVR_B4_SH(zero, p1_or_q1_org, zero, p0_or_q0_org, zero, q0_or_p0_org,
1210                   zero, q1_or_p1_org, p1_org_r, p0_org_r, q0_org_r, q1_org_r);
1211        AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_or_q0);
1212        AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_or_p0);
1213        PCKEV_B2_SH(zero, p0_or_q0, zero, q0_or_p0, p0_or_q0, q0_or_p0);
1214
1215        p0_or_q0_org =
1216            __msa_bmnz_v(p0_or_q0_org, (v16u8) p0_or_q0, is_less_than);
1217        q0_or_p0_org =
1218            __msa_bmnz_v(q0_or_p0_org, (v16u8) q0_or_p0, is_less_than);
1219
1220        ST_UB(q0_or_p0_org, data_cb_or_cr);
1221        ST_UB(p0_or_q0_org, data_cb_or_cr - img_width);
1222    }
1223}
1224
1225static void avc_loopfilter_cb_or_cr_intra_edge_ver_msa(uint8_t *data_cb_or_cr,
1226                                                       uint8_t alpha_in,
1227                                                       uint8_t beta_in,
1228                                                       ptrdiff_t img_width)
1229{
1230    v8i16 tmp1;
1231    v16u8 alpha, beta, is_less_than;
1232    v8i16 p0_or_q0, q0_or_p0;
1233    v16u8 p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org;
1234    v16i8 zero = { 0 };
1235    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
1236    v16u8 is_less_than_alpha, is_less_than_beta;
1237    v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
1238
1239    {
1240        v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
1241
1242        LD_UB8((data_cb_or_cr - 2), img_width,
1243               row0, row1, row2, row3, row4, row5, row6, row7);
1244
1245        TRANSPOSE8x4_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
1246                           p1_or_q1_org, p0_or_q0_org,
1247                           q0_or_p0_org, q1_or_p1_org);
1248    }
1249
1250    alpha = (v16u8) __msa_fill_b(alpha_in);
1251    beta = (v16u8) __msa_fill_b(beta_in);
1252
1253    p0_asub_q0 = __msa_asub_u_b(p0_or_q0_org, q0_or_p0_org);
1254    p1_asub_p0 = __msa_asub_u_b(p1_or_q1_org, p0_or_q0_org);
1255    q1_asub_q0 = __msa_asub_u_b(q1_or_p1_org, q0_or_p0_org);
1256
1257    is_less_than_alpha = (p0_asub_q0 < alpha);
1258    is_less_than_beta = (p1_asub_p0 < beta);
1259    is_less_than = is_less_than_beta & is_less_than_alpha;
1260    is_less_than_beta = (q1_asub_q0 < beta);
1261    is_less_than = is_less_than_beta & is_less_than;
1262    is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);
1263
1264    if (!__msa_test_bz_v(is_less_than)) {
1265        ILVR_B4_SH(zero, p1_or_q1_org, zero, p0_or_q0_org, zero, q0_or_p0_org,
1266                   zero, q1_or_p1_org, p1_org_r, p0_org_r, q0_org_r, q1_org_r);
1267
1268        AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_or_q0);
1269        AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_or_p0);
1270
1271        /* convert 16 bit output into 8 bit output */
1272        PCKEV_B2_SH(zero, p0_or_q0, zero, q0_or_p0, p0_or_q0, q0_or_p0);
1273
1274        p0_or_q0_org =
1275            __msa_bmnz_v(p0_or_q0_org, (v16u8) p0_or_q0, is_less_than);
1276        q0_or_p0_org =
1277            __msa_bmnz_v(q0_or_p0_org, (v16u8) q0_or_p0, is_less_than);
1278        tmp1 = (v8i16) __msa_ilvr_b((v16i8) q0_or_p0_org, (v16i8) p0_or_q0_org);
1279
1280        data_cb_or_cr -= 1;
1281        ST_H4(tmp1, 0, 1, 2, 3, data_cb_or_cr, img_width);
1282        data_cb_or_cr += 4 * img_width;
1283        ST_H4(tmp1, 4, 5, 6, 7, data_cb_or_cr, img_width);
1284    }
1285}
1286
1287static void avc_loopfilter_luma_inter_edge_ver_msa(uint8_t* pPix, uint32_t iStride,
1288                                                   uint8_t iAlpha, uint8_t iBeta,
1289                                                   uint8_t* pTc)
1290{
1291    v16u8 p0, p1, p2, q0, q1, q2;
1292    v16i8 iTc, negiTc, negTc, flags, f;
1293    v8i16 p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, q0_l, q0_r, q1_l, q1_r, q2_l, q2_r;
1294    v8i16 tc_l, tc_r, negTc_l, negTc_r;
1295    v8i16 iTc_l, iTc_r, negiTc_l, negiTc_r;
1296    // Use for temporary variable
1297    v8i16 t0, t1, t2, t3;
1298    v16u8 alpha, beta;
1299    v16u8 bDetaP0Q0, bDetaP1P0, bDetaQ1Q0, bDetaP2P0, bDetaQ2Q0;
1300    v16i8 const_1_b = __msa_ldi_b(1);
1301    v8i16 const_1_h = __msa_ldi_h(1);
1302    v8i16 const_4_h = __msa_ldi_h(4);
1303    v8i16 const_not_255_h = __msa_ldi_h(~255);
1304    v16i8 zero = { 0 };
1305    v16i8 tc = { pTc[0  >> 2], pTc[1  >> 2], pTc[2  >> 2], pTc[3  >> 2],
1306                 pTc[4  >> 2], pTc[5  >> 2], pTc[6  >> 2], pTc[7  >> 2],
1307                 pTc[8  >> 2], pTc[9  >> 2], pTc[10 >> 2], pTc[11 >> 2],
1308                 pTc[12 >> 2], pTc[13 >> 2], pTc[14 >> 2], pTc[15 >> 2] };
1309    negTc = zero - tc;
1310    iTc = tc;
1311
1312    // Load data from pPix
1313    LD_SH8(pPix - 3, iStride, t0, t1, t2, t3, q1_l, q1_r, q2_l, q2_r);
1314    LD_SH8(pPix + 8 * iStride - 3, iStride, p0_l, p0_r, p1_l, p1_r,
1315           p2_l, p2_r, q0_l, q0_r);
1316    TRANSPOSE16x8_UB_UB(t0, t1, t2, t3, q1_l, q1_r, q2_l, q2_r,
1317                        p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, q0_l, q0_r,
1318                        p2, p1, p0, q0, q1, q2, alpha, beta);
1319
1320    alpha = (v16u8)__msa_fill_b(iAlpha);
1321    beta  = (v16u8)__msa_fill_b(iBeta);
1322
1323    bDetaP0Q0 = __msa_asub_u_b(p0, q0);
1324    bDetaP1P0 = __msa_asub_u_b(p1, p0);
1325    bDetaQ1Q0 = __msa_asub_u_b(q1, q0);
1326    bDetaP2P0 = __msa_asub_u_b(p2, p0);
1327    bDetaQ2Q0 = __msa_asub_u_b(q2, q0);
1328    bDetaP0Q0 = (v16u8)__msa_clt_u_b(bDetaP0Q0, alpha);
1329    bDetaP1P0 = (v16u8)__msa_clt_u_b(bDetaP1P0, beta);
1330    bDetaQ1Q0 = (v16u8)__msa_clt_u_b(bDetaQ1Q0, beta);
1331    bDetaP2P0 = (v16u8)__msa_clt_u_b(bDetaP2P0, beta);
1332    bDetaQ2Q0 = (v16u8)__msa_clt_u_b(bDetaQ2Q0, beta);
1333
1334    // Unsigned extend p0, p1, p2, q0, q1, q2 from 8 bits to 16 bits
1335    ILVRL_B2_SH(zero, p0, p0_r, p0_l);
1336    ILVRL_B2_SH(zero, p1, p1_r, p1_l);
1337    ILVRL_B2_SH(zero, p2, p2_r, p2_l);
1338    ILVRL_B2_SH(zero, q0, q0_r, q0_l);
1339    ILVRL_B2_SH(zero, q1, q1_r, q1_l);
1340    ILVRL_B2_SH(zero, q2, q2_r, q2_l);
1341    // Signed extend tc, negTc from 8 bits to 16 bits
1342    flags = __msa_clt_s_b(tc, zero);
1343    ILVRL_B2(v8i16, flags, tc, tc_r, tc_l);
1344    flags = __msa_clt_s_b(negTc, zero);
1345    ILVRL_B2(v8i16, flags, negTc, negTc_r, negTc_l);
1346
1347    f = (v16i8)bDetaP0Q0 & (v16i8)bDetaP1P0 & (v16i8)bDetaQ1Q0;
1348    flags = f & (v16i8)bDetaP2P0;
1349    flags = __msa_ceq_b(flags, zero);
1350    iTc += ((~flags) & const_1_b);
1351    flags = f & (v16i8)bDetaQ2Q0;
1352    flags = __msa_ceq_b(flags, zero);
1353    iTc += ((~flags) & const_1_b);
1354    negiTc = zero - iTc;
1355    // Signed extend iTc, negiTc from 8 bits to 16 bits
1356    flags = __msa_clt_s_b(iTc, zero);
1357    ILVRL_B2(v8i16, flags, iTc, iTc_r, iTc_l);
1358    flags = __msa_clt_s_b(negiTc, zero);
1359    ILVRL_B2(v8i16, flags, negiTc, negiTc_r, negiTc_l);
1360
1361    // Calculate the left part
1362    // p1
1363    t0 = (p2_l + ((p0_l + q0_l + const_1_h) >> 1) - (p1_l << 1)) >> 1;
1364    t0 = __msa_max_s_h(negTc_l, t0);
1365    t0 = __msa_min_s_h(tc_l, t0);
1366    t1 = p1_l + t0;
1367    // q1
1368    t0 = (q2_l + ((p0_l + q0_l + const_1_h) >> 1) - (q1_l << 1)) >> 1;
1369    t0 = __msa_max_s_h(negTc_l, t0);
1370    t0 = __msa_min_s_h(tc_l, t0);
1371    t2 = q1_l + t0;
1372    // iDeta
1373    t0 = (((q0_l - p0_l) << 2) + (p1_l - q1_l) + const_4_h) >> 3;
1374    t0 = __msa_max_s_h(negiTc_l, t0);
1375    t0 = __msa_min_s_h(iTc_l, t0);
1376    p1_l = t1;
1377    q1_l = t2;
1378    // p0
1379    t1 = p0_l + t0;
1380    t2 = t1 & const_not_255_h;
1381    t3 = __msa_cle_s_h((v8i16)zero, t1);
1382    flags = (v16i8)__msa_ceq_h(t2, (v8i16)zero);
1383    p0_l = (t1 & (v8i16)flags) + (t3 & (v8i16)(~flags));
1384    // q0
1385    t1 = q0_l - t0;
1386    t2 = t1 & const_not_255_h;
1387    t3 = __msa_cle_s_h((v8i16)zero, t1);
1388    flags = (v16i8)__msa_ceq_h(t2, (v8i16)zero);
1389    q0_l = (t1 & (v8i16)flags) + (t3 & (v8i16)(~flags));
1390
1391    // Calculate the right part
1392    // p1
1393    t0 = (p2_r + ((p0_r + q0_r + const_1_h) >> 1) - (p1_r << 1)) >> 1;
1394    t0 = __msa_max_s_h(negTc_r, t0);
1395    t0 = __msa_min_s_h(tc_r, t0);
1396    t1 = p1_r + t0;
1397    // q1
1398    t0 = (q2_r + ((p0_r + q0_r + const_1_h) >> 1) - (q1_r << 1)) >> 1;
1399    t0 = __msa_max_s_h(negTc_r, t0);
1400    t0 = __msa_min_s_h(tc_r, t0);
1401    t2 = q1_r + t0;
1402    // iDeta
1403    t0 = (((q0_r - p0_r) << 2) + (p1_r - q1_r) + const_4_h) >> 3;
1404    t0 = __msa_max_s_h(negiTc_r, t0);
1405    t0 = __msa_min_s_h(iTc_r, t0);
1406    p1_r = t1;
1407    q1_r = t2;
1408    // p0
1409    t1 = p0_r + t0;
1410    t2 = t1 & const_not_255_h;
1411    t3 = __msa_cle_s_h((v8i16)zero, t1);
1412    flags = (v16i8)__msa_ceq_h(t2, (v8i16)zero);
1413    p0_r = (t1 & (v8i16)flags) + (t3 & (v8i16)(~flags));
1414    // q0
1415    t1 = q0_r - t0;
1416    t2 = t1 & const_not_255_h;
1417    t3 = __msa_cle_s_h((v8i16)zero, t1);
1418    flags = (v16i8)__msa_ceq_h(t2, (v8i16)zero);
1419    q0_r = (t1 & (v8i16)flags) + (t3 & (v8i16)(~flags));
1420
1421    // Combined left and right
1422    PCKEV_B4(v8i16, p1_l, p1_r, p0_l, p0_r, q0_l, q0_r, q1_l, q1_r,
1423             t0, t1, t2, t3);
1424    flags = (v16i8)__msa_cle_s_b(zero, tc);
1425    flags &= f;
1426    p0 = (v16u8)(((v16i8)t1 & flags) + (p0 & (~flags)));
1427    q0 = (v16u8)(((v16i8)t2 & flags) + (q0 & (~flags)));
1428    // Using t1, t2 as temporary flags
1429    t1 = (v8i16)(flags & (~(__msa_ceq_b((v16i8)bDetaP2P0, zero))));
1430    p1 = (v16u8)(t0 & t1) + (p1 & (v16u8)(~t1));
1431    t2 = (v8i16)(flags & (~(__msa_ceq_b((v16i8)bDetaQ2Q0, zero))));
1432    q1 = (v16u8)(t3 & t2) + (q1 & (v16u8)(~t2));
1433
1434    ILVRL_B2_SH(p0, p1, t0, t1);
1435    ILVRL_B2_SH(q1, q0, t2, t3);
1436    ILVRL_H2_UB(t2, t0, p1, p0);
1437    ILVRL_H2_UB(t3, t1, q0, q1);
1438    // Store data to pPix
1439    ST_W8(p1, p0, 0, 1, 2, 3, 0, 1, 2, 3, pPix - 2, iStride);
1440    ST_W8(q0, q1, 0, 1, 2, 3, 0, 1, 2, 3, pPix + 8 * iStride - 2, iStride);
1441}
1442
1443static void avc_loopfilter_luma_inter_edge_hor_msa(uint8_t *data,
1444                                                   uint8_t bs0, uint8_t bs1,
1445                                                   uint8_t bs2, uint8_t bs3,
1446                                                   uint8_t tc0, uint8_t tc1,
1447                                                   uint8_t tc2, uint8_t tc3,
1448                                                   uint8_t alpha_in,
1449                                                   uint8_t beta_in,
1450                                                   ptrdiff_t image_width)
1451{
1452    v16u8 tmp_vec;
1453    v16u8 bs = { 0 };
1454
1455    tmp_vec = (v16u8) __msa_fill_b(bs0);
1456    bs = (v16u8) __msa_insve_w((v4i32) bs, 0, (v4i32) tmp_vec);
1457    tmp_vec = (v16u8) __msa_fill_b(bs1);
1458    bs = (v16u8) __msa_insve_w((v4i32) bs, 1, (v4i32) tmp_vec);
1459    tmp_vec = (v16u8) __msa_fill_b(bs2);
1460    bs = (v16u8) __msa_insve_w((v4i32) bs, 2, (v4i32) tmp_vec);
1461    tmp_vec = (v16u8) __msa_fill_b(bs3);
1462    bs = (v16u8) __msa_insve_w((v4i32) bs, 3, (v4i32) tmp_vec);
1463
1464    if (!__msa_test_bz_v(bs)) {
1465        v16u8 alpha, beta, is_less_than, is_less_than_beta;
1466        v16u8 p0, q0, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org;
1467        v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
1468        v16u8 is_less_than_alpha, is_bs_greater_than0;
1469        v8i16 p0_r, q0_r, p0_l, q0_l;
1470        v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
1471        v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
1472        v16i8 zero = { 0 };
1473        v16i8 tc = { 0 };
1474
1475        tmp_vec = (v16u8) __msa_fill_b(tc0);
1476        tc = (v16i8) __msa_insve_w((v4i32) tc, 0, (v4i32) tmp_vec);
1477        tmp_vec = (v16u8) __msa_fill_b(tc1);
1478        tc = (v16i8) __msa_insve_w((v4i32) tc, 1, (v4i32) tmp_vec);
1479        tmp_vec = (v16u8) __msa_fill_b(tc2);
1480        tc = (v16i8) __msa_insve_w((v4i32) tc, 2, (v4i32) tmp_vec);
1481        tmp_vec = (v16u8) __msa_fill_b(tc3);
1482        tc = (v16i8) __msa_insve_w((v4i32) tc, 3, (v4i32) tmp_vec);
1483
1484        alpha = (v16u8) __msa_fill_b(alpha_in);
1485        beta = (v16u8) __msa_fill_b(beta_in);
1486
1487        LD_UB5(data - (3 * image_width), image_width,
1488               p2_org, p1_org, p0_org, q0_org, q1_org);
1489
1490        is_bs_greater_than0 = ((v16u8) zero < bs);
1491        p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
1492        p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
1493        q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
1494
1495        is_less_than_alpha = (p0_asub_q0 < alpha);
1496        is_less_than_beta = (p1_asub_p0 < beta);
1497        is_less_than = is_less_than_beta & is_less_than_alpha;
1498        is_less_than_beta = (q1_asub_q0 < beta);
1499        is_less_than = is_less_than_beta & is_less_than;
1500        is_less_than = is_less_than & is_bs_greater_than0;
1501
1502        if (!__msa_test_bz_v(is_less_than)) {
1503            v16i8 sign_negate_tc, negate_tc;
1504            v8i16 negate_tc_r, i16_negatetc_l, tc_l, tc_r;
1505            v16u8 p2_asub_p0, q2_asub_q0;
1506
1507            q2_org = LD_UB(data + (2 * image_width));
1508            negate_tc = zero - tc;
1509            sign_negate_tc = __msa_clti_s_b(negate_tc, 0);
1510
1511            ILVRL_B2_SH(sign_negate_tc, negate_tc, negate_tc_r, i16_negatetc_l);
1512
1513            UNPCK_UB_SH(tc, tc_r, tc_l);
1514            UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l);
1515            UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
1516            UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);
1517
1518            p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
1519            is_less_than_beta = (p2_asub_p0 < beta);
1520            is_less_than_beta = is_less_than_beta & is_less_than;
1521
1522            if (!__msa_test_bz_v(is_less_than_beta)) {
1523                v16u8 p1;
1524                v8i16 p1_r = { 0 };
1525                v8i16 p1_l = { 0 };
1526                v8i16 p2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) p2_org);
1527                v8i16 p2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) p2_org);
1528
1529                AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, p1_org_r, p2_org_r,
1530                                 negate_tc_r, tc_r, p1_r);
1531                AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, p1_org_l, p2_org_l,
1532                                 i16_negatetc_l, tc_l, p1_l);
1533
1534                p1 = (v16u8) __msa_pckev_b((v16i8) p1_l, (v16i8) p1_r);
1535                p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
1536                ST_UB(p1_org, data - (2 * image_width));
1537
1538                is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
1539                tc = tc + (v16i8) is_less_than_beta;
1540            }
1541
1542            q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
1543            is_less_than_beta = (q2_asub_q0 < beta);
1544            is_less_than_beta = is_less_than_beta & is_less_than;
1545
1546            q1_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q1_org);
1547            q1_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q1_org);
1548
1549            if (!__msa_test_bz_v(is_less_than_beta)) {
1550                v16u8 q1;
1551                v8i16 q1_r = { 0 };
1552                v8i16 q1_l = { 0 };
1553                v8i16 q2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q2_org);
1554                v8i16 q2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q2_org);
1555
1556                AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, q1_org_r, q2_org_r,
1557                                 negate_tc_r, tc_r, q1_r);
1558                AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, q1_org_l, q2_org_l,
1559                                 i16_negatetc_l, tc_l, q1_l);
1560
1561                q1 = (v16u8) __msa_pckev_b((v16i8) q1_l, (v16i8) q1_r);
1562                q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
1563                ST_UB(q1_org, data + image_width);
1564
1565                is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
1566                tc = tc + (v16i8) is_less_than_beta;
1567            }
1568            {
1569                v16i8 negate_thresh, sign_negate_thresh;
1570                v8i16 threshold_r, threshold_l;
1571                v8i16 negate_thresh_l, negate_thresh_r;
1572
1573                negate_thresh = zero - tc;
1574                sign_negate_thresh = __msa_clti_s_b(negate_thresh, 0);
1575
1576                ILVR_B2_SH(zero, tc, sign_negate_thresh, negate_thresh,
1577                           threshold_r, negate_thresh_r);
1578                AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r,
1579                             negate_thresh_r, threshold_r, p0_r, q0_r);
1580
1581                threshold_l = (v8i16) __msa_ilvl_b(zero, tc);
1582                negate_thresh_l = (v8i16) __msa_ilvl_b(sign_negate_thresh,
1583                                                       negate_thresh);
1584                AVC_LPF_P0Q0(q0_org_l, p0_org_l, p1_org_l, q1_org_l,
1585                             negate_thresh_l, threshold_l, p0_l, q0_l);
1586            }
1587
1588            PCKEV_B2_UB(p0_l, p0_r, q0_l, q0_r, p0, q0);
1589
1590            p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
1591            q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
1592
1593            ST_UB(p0_org, (data - image_width));
1594            ST_UB(q0_org, data);
1595        }
1596    }
1597}
1598
1599static void avc_h_loop_filter_luma_mbaff_msa(uint8_t *in, ptrdiff_t stride,
1600                                             int32_t alpha_in, int32_t beta_in,
1601                                             int8_t *tc0)
1602{
1603    uint8_t *data = in;
1604    uint32_t out0, out1, out2, out3;
1605    uint64_t load;
1606    uint32_t tc_val;
1607    v16u8 alpha, beta;
1608    v16i8 inp0 = { 0 };
1609    v16i8 inp1 = { 0 };
1610    v16i8 inp2 = { 0 };
1611    v16i8 inp3 = { 0 };
1612    v16i8 inp4 = { 0 };
1613    v16i8 inp5 = { 0 };
1614    v16i8 inp6 = { 0 };
1615    v16i8 inp7 = { 0 };
1616    v16i8 src0, src1, src2, src3;
1617    v8i16 src4, src5, src6, src7;
1618    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, p2_asub_p0, q2_asub_q0;
1619    v16u8 is_less_than, is_less_than_alpha, is_less_than_beta;
1620    v16u8 is_less_than_beta1, is_less_than_beta2;
1621    v8i16 tc, tc_orig_r, tc_plus1;
1622    v16u8 is_tc_orig1, is_tc_orig2, tc_orig = { 0 };
1623    v8i16 p0_ilvr_q0, p0_add_q0, q0_sub_p0, p1_sub_q1;
1624    v8i16 src2_r, src3_r;
1625    v8i16 p2_r, p1_r, q2_r, q1_r;
1626    v16u8 p2, q2, p0, q0;
1627    v4i32 dst0, dst1;
1628    v16i8 zeros = { 0 };
1629
1630    alpha = (v16u8) __msa_fill_b(alpha_in);
1631    beta = (v16u8) __msa_fill_b(beta_in);
1632
1633    if (tc0[0] < 0) {
1634        data += (2 * stride);
1635    } else {
1636        load = LD(data - 3);
1637        inp0 = (v16i8) __msa_insert_d((v2i64) inp0, 0, load);
1638        load = LD(data - 3 + stride);
1639        inp1 = (v16i8) __msa_insert_d((v2i64) inp1, 0, load);
1640        data += (2 * stride);
1641    }
1642
1643    if (tc0[1] < 0) {
1644        data += (2 * stride);
1645    } else {
1646        load = LD(data - 3);
1647        inp2 = (v16i8) __msa_insert_d((v2i64) inp2, 0, load);
1648        load = LD(data - 3 + stride);
1649        inp3 = (v16i8) __msa_insert_d((v2i64) inp3, 0, load);
1650        data += (2 * stride);
1651    }
1652
1653    if (tc0[2] < 0) {
1654        data += (2 * stride);
1655    } else {
1656        load = LD(data - 3);
1657        inp4 = (v16i8) __msa_insert_d((v2i64) inp4, 0, load);
1658        load = LD(data - 3 + stride);
1659        inp5 = (v16i8) __msa_insert_d((v2i64) inp5, 0, load);
1660        data += (2 * stride);
1661    }
1662
1663    if (tc0[3] < 0) {
1664        data += (2 * stride);
1665    } else {
1666        load = LD(data - 3);
1667        inp6 = (v16i8) __msa_insert_d((v2i64) inp6, 0, load);
1668        load = LD(data - 3 + stride);
1669        inp7 = (v16i8) __msa_insert_d((v2i64) inp7, 0, load);
1670        data += (2 * stride);
1671    }
1672
1673    ILVR_B4_SB(inp1, inp0, inp3, inp2, inp5, inp4, inp7, inp6,
1674               src0, src1, src2, src3);
1675
1676    ILVR_H2_SH(src1, src0, src3, src2, src4, src6);
1677    ILVL_H2_SH(src1, src0, src3, src2, src5, src7);
1678
1679    src0 = (v16i8) __msa_ilvr_w((v4i32) src6, (v4i32) src4);
1680    src1 = __msa_sldi_b(zeros, (v16i8) src0, 8);
1681    src2 = (v16i8) __msa_ilvl_w((v4i32) src6, (v4i32) src4);
1682    src3 = __msa_sldi_b(zeros, (v16i8) src2, 8);
1683    src4 = (v8i16) __msa_ilvr_w((v4i32) src7, (v4i32) src5);
1684    src5 = (v8i16) __msa_sldi_b(zeros, (v16i8) src4, 8);
1685
1686    p0_asub_q0 = __msa_asub_u_b((v16u8) src2, (v16u8) src3);
1687    p1_asub_p0 = __msa_asub_u_b((v16u8) src1, (v16u8) src2);
1688    q1_asub_q0 = __msa_asub_u_b((v16u8) src4, (v16u8) src3);
1689    p2_asub_p0 = __msa_asub_u_b((v16u8) src0, (v16u8) src2);
1690    q2_asub_q0 = __msa_asub_u_b((v16u8) src5, (v16u8) src3);
1691
1692    is_less_than_alpha = (p0_asub_q0 < alpha);
1693    is_less_than_beta = (p1_asub_p0 < beta);
1694    is_less_than = is_less_than_alpha & is_less_than_beta;
1695    is_less_than_beta = (q1_asub_q0 < beta);
1696    is_less_than = is_less_than_beta & is_less_than;
1697
1698    is_less_than_beta1 = (p2_asub_p0 < beta);
1699    is_less_than_beta2 = (q2_asub_q0 < beta);
1700
1701    p0_ilvr_q0 = (v8i16) __msa_ilvr_b((v16i8) src3, (v16i8) src2);
1702    p0_add_q0 = (v8i16) __msa_hadd_u_h((v16u8) p0_ilvr_q0, (v16u8) p0_ilvr_q0);
1703    p0_add_q0 = __msa_srari_h(p0_add_q0, 1);
1704
1705    ILVR_B2_SH(zeros, src0, zeros, src1, p2_r, p1_r);
1706    p2_r += p0_add_q0;
1707    p2_r >>= 1;
1708    p2_r -= p1_r;
1709    ILVR_B2_SH(zeros, src5, zeros, src4, q2_r, q1_r);
1710    q2_r += p0_add_q0;
1711    q2_r >>= 1;
1712    q2_r -= q1_r;
1713
1714    tc_val = LW(tc0);
1715    tc_orig = (v16u8) __msa_insert_w((v4i32) tc_orig, 0, tc_val);
1716    tc_orig = (v16u8) __msa_ilvr_b((v16i8) tc_orig, (v16i8) tc_orig);
1717    is_tc_orig1 = tc_orig;
1718    is_tc_orig2 = tc_orig;
1719    tc_orig_r = (v8i16) __msa_ilvr_b(zeros, (v16i8) tc_orig);
1720    tc = tc_orig_r;
1721
1722    CLIP_SH(p2_r, -tc_orig_r, tc_orig_r);
1723    CLIP_SH(q2_r, -tc_orig_r, tc_orig_r);
1724
1725    p2_r += p1_r;
1726    q2_r += q1_r;
1727
1728    PCKEV_B2_UB(p2_r, p2_r, q2_r, q2_r, p2, q2);
1729
1730    is_tc_orig1 = (zeros < is_tc_orig1);
1731    is_tc_orig2 = is_tc_orig1;
1732    is_tc_orig1 = is_less_than_beta1 & is_tc_orig1;
1733    is_tc_orig2 = is_less_than_beta2 & is_tc_orig2;
1734    is_tc_orig1 = is_less_than & is_tc_orig1;
1735    is_tc_orig2 = is_less_than & is_tc_orig2;
1736
1737    p2 = __msa_bmnz_v((v16u8) src1, p2, is_tc_orig1);
1738    q2 = __msa_bmnz_v((v16u8) src4, q2, is_tc_orig2);
1739
1740    q0_sub_p0 = __msa_hsub_u_h((v16u8) p0_ilvr_q0, (v16u8) p0_ilvr_q0);
1741    q0_sub_p0 <<= 2;
1742    p1_sub_q1 = p1_r - q1_r;
1743    q0_sub_p0 += p1_sub_q1;
1744    q0_sub_p0 = __msa_srari_h(q0_sub_p0, 3);
1745
1746    tc_plus1 = tc + 1;
1747    is_less_than_beta1 = (v16u8) __msa_ilvr_b((v16i8) is_less_than_beta1,
1748                                              (v16i8) is_less_than_beta1);
1749    tc = (v8i16) __msa_bmnz_v((v16u8) tc, (v16u8) tc_plus1, is_less_than_beta1);
1750    tc_plus1 = tc + 1;
1751    is_less_than_beta2 = (v16u8) __msa_ilvr_b((v16i8) is_less_than_beta2,
1752                                              (v16i8) is_less_than_beta2);
1753    tc = (v8i16) __msa_bmnz_v((v16u8) tc, (v16u8) tc_plus1, is_less_than_beta2);
1754
1755    CLIP_SH(q0_sub_p0, -tc, tc);
1756
1757    ILVR_B2_SH(zeros, src2, zeros, src3, src2_r, src3_r);
1758    src2_r += q0_sub_p0;
1759    src3_r -= q0_sub_p0;
1760
1761    CLIP_SH2_0_255(src2_r, src3_r);
1762
1763    PCKEV_B2_UB(src2_r, src2_r, src3_r, src3_r, p0, q0);
1764
1765    p0 = __msa_bmnz_v((v16u8) src2, p0, is_less_than);
1766    q0 = __msa_bmnz_v((v16u8) src3, q0, is_less_than);
1767
1768    ILVR_B2_UB(p0, p2, q2, q0, p2, q2);
1769
1770    ILVRL_H2_SW(q2, p2, dst0, dst1);
1771
1772    data = in;
1773
1774    out0 = __msa_copy_u_w(dst0, 0);
1775    out1 = __msa_copy_u_w(dst0, 1);
1776    out2 = __msa_copy_u_w(dst0, 2);
1777    out3 = __msa_copy_u_w(dst0, 3);
1778
1779    if (tc0[0] < 0) {
1780        data += (2 * stride);
1781    } else {
1782        SW(out0, (data - 2));
1783        data += stride;
1784        SW(out1, (data - 2));
1785        data += stride;
1786    }
1787
1788    if (tc0[1] < 0) {
1789        data += (2 * stride);
1790    } else {
1791        SW(out2, (data - 2));
1792        data += stride;
1793        SW(out3, (data - 2));
1794        data += stride;
1795    }
1796
1797    out0 = __msa_copy_u_w(dst1, 0);
1798    out1 = __msa_copy_u_w(dst1, 1);
1799    out2 = __msa_copy_u_w(dst1, 2);
1800    out3 = __msa_copy_u_w(dst1, 3);
1801
1802    if (tc0[2] < 0) {
1803        data += (2 * stride);
1804    } else {
1805        SW(out0, (data - 2));
1806        data += stride;
1807        SW(out1, (data - 2));
1808        data += stride;
1809    }
1810
1811    if (tc0[3] >= 0) {
1812        SW(out2, (data - 2));
1813        data += stride;
1814        SW(out3, (data - 2));
1815    }
1816}
1817
1818static void avc_loopfilter_cb_or_cr_inter_edge_hor_msa(uint8_t *data,
1819                                                       uint8_t bs0, uint8_t bs1,
1820                                                       uint8_t bs2, uint8_t bs3,
1821                                                       uint8_t tc0, uint8_t tc1,
1822                                                       uint8_t tc2, uint8_t tc3,
1823                                                       uint8_t alpha_in,
1824                                                       uint8_t beta_in,
1825                                                       ptrdiff_t img_width)
1826{
1827    v16u8 alpha, beta;
1828    v8i16 tmp_vec;
1829    v8i16 bs = { 0 };
1830    v8i16 tc = { 0 };
1831    v16u8 p0, q0, p0_asub_q0, p1_asub_p0, q1_asub_q0;
1832    v16u8 is_less_than;
1833    v16u8 is_less_than_beta, is_less_than_alpha, is_bs_greater_than0;
1834    v8i16 p0_r, q0_r;
1835    v16u8 p1_org, p0_org, q0_org, q1_org;
1836    v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
1837    v16i8 negate_tc, sign_negate_tc;
1838    v8i16 tc_r, negate_tc_r;
1839    v16i8 zero = { 0 };
1840
1841    tmp_vec = (v8i16) __msa_fill_b(bs0);
1842    bs = __msa_insve_h(bs, 0, tmp_vec);
1843    tmp_vec = (v8i16) __msa_fill_b(bs1);
1844    bs = __msa_insve_h(bs, 1, tmp_vec);
1845    tmp_vec = (v8i16) __msa_fill_b(bs2);
1846    bs = __msa_insve_h(bs, 2, tmp_vec);
1847    tmp_vec = (v8i16) __msa_fill_b(bs3);
1848    bs = __msa_insve_h(bs, 3, tmp_vec);
1849
1850    if (!__msa_test_bz_v((v16u8) bs)) {
1851        tmp_vec = (v8i16) __msa_fill_b(tc0);
1852        tc = __msa_insve_h(tc, 0, tmp_vec);
1853        tmp_vec = (v8i16) __msa_fill_b(tc1);
1854        tc = __msa_insve_h(tc, 1, tmp_vec);
1855        tmp_vec = (v8i16) __msa_fill_b(tc2);
1856        tc = __msa_insve_h(tc, 2, tmp_vec);
1857        tmp_vec = (v8i16) __msa_fill_b(tc3);
1858        tc = __msa_insve_h(tc, 3, tmp_vec);
1859
1860        is_bs_greater_than0 = (v16u8) (zero < (v16i8) bs);
1861
1862        alpha = (v16u8) __msa_fill_b(alpha_in);
1863        beta = (v16u8) __msa_fill_b(beta_in);
1864
1865        LD_UB4(data - (img_width << 1), img_width,
1866               p1_org, p0_org, q0_org, q1_org);
1867
1868        p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
1869        p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
1870        q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
1871
1872        is_less_than_alpha = (p0_asub_q0 < alpha);
1873        is_less_than_beta = (p1_asub_p0 < beta);
1874        is_less_than = is_less_than_beta & is_less_than_alpha;
1875        is_less_than_beta = (q1_asub_q0 < beta);
1876        is_less_than = is_less_than_beta & is_less_than;
1877        is_less_than = is_less_than & is_bs_greater_than0;
1878
1879        is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);
1880
1881        if (!__msa_test_bz_v(is_less_than)) {
1882            negate_tc = zero - (v16i8) tc;
1883            sign_negate_tc = __msa_clti_s_b(negate_tc, 0);
1884
1885            ILVR_B2_SH(zero, tc, sign_negate_tc, negate_tc, tc_r, negate_tc_r);
1886
1887            ILVR_B4_SH(zero, p1_org, zero, p0_org, zero, q0_org, zero, q1_org,
1888                       p1_org_r, p0_org_r, q0_org_r, q1_org_r);
1889
1890            AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r, negate_tc_r,
1891                         tc_r, p0_r, q0_r);
1892
1893            PCKEV_B2_UB(zero, p0_r, zero, q0_r, p0, q0);
1894
1895            p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
1896            q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
1897
1898            ST_UB(q0_org, data);
1899            ST_UB(p0_org, (data - img_width));
1900        }
1901    }
1902}
1903
1904static void avc_loopfilter_cb_or_cr_inter_edge_ver_msa(uint8_t *data,
1905                                                       uint8_t bs0, uint8_t bs1,
1906                                                       uint8_t bs2, uint8_t bs3,
1907                                                       uint8_t tc0, uint8_t tc1,
1908                                                       uint8_t tc2, uint8_t tc3,
1909                                                       uint8_t alpha_in,
1910                                                       uint8_t beta_in,
1911                                                       ptrdiff_t img_width)
1912{
1913    uint8_t *src;
1914    v16u8 alpha, beta;
1915    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
1916    v16u8 is_less_than, is_less_than_beta, is_less_than_alpha;
1917    v16u8 p0, q0;
1918    v8i16 p0_r = { 0 };
1919    v8i16 q0_r = { 0 };
1920    v16u8 p1_org, p0_org, q0_org, q1_org;
1921    v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
1922    v16u8 is_bs_greater_than0;
1923    v8i16 tc_r, negate_tc_r;
1924    v16i8 negate_tc, sign_negate_tc;
1925    v16i8 zero = { 0 };
1926    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
1927    v8i16 tmp1, tmp_vec, bs = { 0 };
1928    v8i16 tc = { 0 };
1929
1930    tmp_vec = (v8i16) __msa_fill_b(bs0);
1931    bs = __msa_insve_h(bs, 0, tmp_vec);
1932    tmp_vec = (v8i16) __msa_fill_b(bs1);
1933    bs = __msa_insve_h(bs, 1, tmp_vec);
1934    tmp_vec = (v8i16) __msa_fill_b(bs2);
1935    bs = __msa_insve_h(bs, 2, tmp_vec);
1936    tmp_vec = (v8i16) __msa_fill_b(bs3);
1937    bs = __msa_insve_h(bs, 3, tmp_vec);
1938
1939    if (!__msa_test_bz_v((v16u8) bs)) {
1940        tmp_vec = (v8i16) __msa_fill_b(tc0);
1941        tc = __msa_insve_h(tc, 0, tmp_vec);
1942        tmp_vec = (v8i16) __msa_fill_b(tc1);
1943        tc = __msa_insve_h(tc, 1, tmp_vec);
1944        tmp_vec = (v8i16) __msa_fill_b(tc2);
1945        tc = __msa_insve_h(tc, 2, tmp_vec);
1946        tmp_vec = (v8i16) __msa_fill_b(tc3);
1947        tc = __msa_insve_h(tc, 3, tmp_vec);
1948
1949        is_bs_greater_than0 = (v16u8) (zero < (v16i8) bs);
1950
1951        LD_UB8((data - 2), img_width,
1952               row0, row1, row2, row3, row4, row5, row6, row7);
1953
1954        TRANSPOSE8x4_UB_UB(row0, row1, row2, row3,
1955                           row4, row5, row6, row7,
1956                           p1_org, p0_org, q0_org, q1_org);
1957
1958        p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
1959        p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
1960        q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
1961
1962        alpha = (v16u8) __msa_fill_b(alpha_in);
1963        beta = (v16u8) __msa_fill_b(beta_in);
1964
1965        is_less_than_alpha = (p0_asub_q0 < alpha);
1966        is_less_than_beta = (p1_asub_p0 < beta);
1967        is_less_than = is_less_than_beta & is_less_than_alpha;
1968        is_less_than_beta = (q1_asub_q0 < beta);
1969        is_less_than = is_less_than_beta & is_less_than;
1970        is_less_than = is_bs_greater_than0 & is_less_than;
1971
1972        is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);
1973
1974        if (!__msa_test_bz_v(is_less_than)) {
1975            ILVR_B4_SH(zero, p1_org, zero, p0_org, zero, q0_org, zero, q1_org,
1976                       p1_org_r, p0_org_r, q0_org_r, q1_org_r);
1977
1978            negate_tc = zero - (v16i8) tc;
1979            sign_negate_tc = __msa_clti_s_b(negate_tc, 0);
1980
1981            ILVR_B2_SH(sign_negate_tc, negate_tc, zero, tc, negate_tc_r, tc_r);
1982
1983            AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r, negate_tc_r,
1984                         tc_r, p0_r, q0_r);
1985
1986            PCKEV_B2_UB(zero, p0_r, zero, q0_r, p0, q0);
1987
1988            p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
1989            q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
1990            tmp1 = (v8i16) __msa_ilvr_b((v16i8) q0_org, (v16i8) p0_org);
1991            src = data - 1;
1992            ST_H4(tmp1, 0, 1, 2, 3, src, img_width);
1993            src += 4 * img_width;
1994            ST_H4(tmp1, 4, 5, 6, 7, src, img_width);
1995        }
1996    }
1997}
1998
1999static void avc_h_loop_filter_chroma422_msa(uint8_t *src, ptrdiff_t stride,
2000                                            int32_t alpha_in, int32_t beta_in,
2001                                            int8_t *tc0)
2002{
2003    int32_t col, tc_val;
2004    v16u8 alpha, beta, res;
2005
2006    alpha = (v16u8) __msa_fill_b(alpha_in);
2007    beta = (v16u8) __msa_fill_b(beta_in);
2008
2009    for (col = 0; col < 4; col++) {
2010        tc_val = (tc0[col] - 1) + 1;
2011
2012        if (tc_val <= 0) {
2013            src += (4 * stride);
2014            continue;
2015        }
2016
2017        AVC_LPF_H_CHROMA_422(src, stride, tc_val, alpha, beta, res);
2018        ST_H4(res, 0, 1, 2, 3, (src - 1), stride);
2019        src += (4 * stride);
2020    }
2021}
2022
2023static void avc_h_loop_filter_chroma422_mbaff_msa(uint8_t *src,
2024                                                  ptrdiff_t stride,
2025                                                  int32_t alpha_in,
2026                                                  int32_t beta_in,
2027                                                  int8_t *tc0)
2028{
2029    int32_t col, tc_val;
2030    int16_t out0, out1;
2031    v16u8 alpha, beta, res;
2032
2033    alpha = (v16u8) __msa_fill_b(alpha_in);
2034    beta = (v16u8) __msa_fill_b(beta_in);
2035
2036    for (col = 0; col < 4; col++) {
2037        tc_val = (tc0[col] - 1) + 1;
2038
2039        if (tc_val <= 0) {
2040            src += 4 * stride;
2041            continue;
2042        }
2043
2044        AVC_LPF_H_2BYTE_CHROMA_422(src, stride, tc_val, alpha, beta, res);
2045
2046        out0 = __msa_copy_s_h((v8i16) res, 0);
2047        out1 = __msa_copy_s_h((v8i16) res, 1);
2048
2049        SH(out0, (src - 1));
2050        src += stride;
2051        SH(out1, (src - 1));
2052        src += stride;
2053    }
2054}
2055
2056void ff_h264_h_lpf_luma_inter_msa(uint8_t *data, ptrdiff_t img_width,
2057                                  int alpha, int beta, int8_t *tc)
2058{
2059//    uint8_t bs0 = 1;
2060//    uint8_t bs1 = 1;
2061//    uint8_t bs2 = 1;
2062//    uint8_t bs3 = 1;
2063//
2064//    if (tc[0] < 0)
2065//        bs0 = 0;
2066//    if (tc[1] < 0)
2067//        bs1 = 0;
2068//    if (tc[2] < 0)
2069//        bs2 = 0;
2070//    if (tc[3] < 0)
2071//        bs3 = 0;
2072//
2073//    avc_loopfilter_luma_inter_edge_ver_msa(data, bs0, bs1, bs2, bs3,
2074//                                           tc[0], tc[1], tc[2], tc[3],
2075//                                           alpha, beta, img_width);
2076    avc_loopfilter_luma_inter_edge_ver_msa(data, img_width, alpha, beta, tc);
2077}
2078
2079void ff_h264_v_lpf_luma_inter_msa(uint8_t *data, ptrdiff_t img_width,
2080                                  int alpha, int beta, int8_t *tc)
2081{
2082
2083    uint8_t bs0 = 1;
2084    uint8_t bs1 = 1;
2085    uint8_t bs2 = 1;
2086    uint8_t bs3 = 1;
2087
2088    if (tc[0] < 0)
2089        bs0 = 0;
2090    if (tc[1] < 0)
2091        bs1 = 0;
2092    if (tc[2] < 0)
2093        bs2 = 0;
2094    if (tc[3] < 0)
2095        bs3 = 0;
2096
2097    avc_loopfilter_luma_inter_edge_hor_msa(data, bs0, bs1, bs2, bs3,
2098                                           tc[0], tc[1], tc[2], tc[3],
2099                                           alpha, beta, img_width);
2100}
2101
2102void ff_h264_h_lpf_chroma_inter_msa(uint8_t *data, ptrdiff_t img_width,
2103                                    int alpha, int beta, int8_t *tc)
2104{
2105    uint8_t bs0 = 1;
2106    uint8_t bs1 = 1;
2107    uint8_t bs2 = 1;
2108    uint8_t bs3 = 1;
2109
2110    if (tc[0] < 0)
2111        bs0 = 0;
2112    if (tc[1] < 0)
2113        bs1 = 0;
2114    if (tc[2] < 0)
2115        bs2 = 0;
2116    if (tc[3] < 0)
2117        bs3 = 0;
2118
2119    avc_loopfilter_cb_or_cr_inter_edge_ver_msa(data, bs0, bs1, bs2, bs3,
2120                                               tc[0], tc[1], tc[2], tc[3],
2121                                               alpha, beta, img_width);
2122}
2123
2124void ff_h264_v_lpf_chroma_inter_msa(uint8_t *data, ptrdiff_t img_width,
2125                                    int alpha, int beta, int8_t *tc)
2126{
2127    uint8_t bs0 = 1;
2128    uint8_t bs1 = 1;
2129    uint8_t bs2 = 1;
2130    uint8_t bs3 = 1;
2131
2132    if (tc[0] < 0)
2133        bs0 = 0;
2134    if (tc[1] < 0)
2135        bs1 = 0;
2136    if (tc[2] < 0)
2137        bs2 = 0;
2138    if (tc[3] < 0)
2139        bs3 = 0;
2140
2141    avc_loopfilter_cb_or_cr_inter_edge_hor_msa(data, bs0, bs1, bs2, bs3,
2142                                               tc[0], tc[1], tc[2], tc[3],
2143                                               alpha, beta, img_width);
2144}
2145
2146void ff_h264_h_lpf_luma_intra_msa(uint8_t *data, ptrdiff_t img_width,
2147                                  int alpha, int beta)
2148{
2149    avc_loopfilter_luma_intra_edge_ver_msa(data, (uint8_t) alpha,
2150                                           (uint8_t) beta,
2151                                           img_width);
2152}
2153
2154void ff_h264_v_lpf_luma_intra_msa(uint8_t *data, ptrdiff_t img_width,
2155                                  int alpha, int beta)
2156{
2157    avc_loopfilter_luma_intra_edge_hor_msa(data, (uint8_t) alpha,
2158                                           (uint8_t) beta,
2159                                           img_width);
2160}
2161
2162void ff_h264_h_lpf_chroma_intra_msa(uint8_t *data, ptrdiff_t img_width,
2163                                    int alpha, int beta)
2164{
2165    avc_loopfilter_cb_or_cr_intra_edge_ver_msa(data, (uint8_t) alpha,
2166                                               (uint8_t) beta,
2167                                               img_width);
2168}
2169
2170void ff_h264_v_lpf_chroma_intra_msa(uint8_t *data, ptrdiff_t img_width,
2171                                    int alpha, int beta)
2172{
2173    avc_loopfilter_cb_or_cr_intra_edge_hor_msa(data, (uint8_t) alpha,
2174                                               (uint8_t) beta,
2175                                               img_width);
2176}
2177
2178void ff_h264_h_loop_filter_chroma422_msa(uint8_t *src,
2179                                         ptrdiff_t ystride,
2180                                         int32_t alpha, int32_t beta,
2181                                         int8_t *tc0)
2182{
2183    avc_h_loop_filter_chroma422_msa(src, ystride, alpha, beta, tc0);
2184}
2185
2186void ff_h264_h_loop_filter_chroma422_mbaff_msa(uint8_t *src,
2187                                               ptrdiff_t ystride,
2188                                               int32_t alpha,
2189                                               int32_t beta,
2190                                               int8_t *tc0)
2191{
2192    avc_h_loop_filter_chroma422_mbaff_msa(src, ystride, alpha, beta, tc0);
2193}
2194
2195void ff_h264_h_loop_filter_luma_mbaff_msa(uint8_t *src,
2196                                          ptrdiff_t ystride,
2197                                          int32_t alpha,
2198                                          int32_t beta,
2199                                          int8_t *tc0)
2200{
2201    avc_h_loop_filter_luma_mbaff_msa(src, ystride, alpha, beta, tc0);
2202}
2203
2204void ff_h264_h_loop_filter_luma_mbaff_intra_msa(uint8_t *src,
2205                                                ptrdiff_t ystride,
2206                                                int32_t alpha,
2207                                                int32_t beta)
2208{
2209    avc_h_loop_filter_luma_mbaff_intra_msa(src, ystride, alpha, beta);
2210}
2211
2212void ff_weight_h264_pixels16_8_msa(uint8_t *src, ptrdiff_t stride,
2213                                   int height, int log2_denom,
2214                                   int weight_src, int offset_in)
2215{
2216    uint32_t offset_val;
2217    v16i8 zero = { 0 };
2218    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
2219    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2220    v8i16 src0_l, src1_l, src2_l, src3_l, src0_r, src1_r, src2_r, src3_r;
2221    v8i16 src4_l, src5_l, src6_l, src7_l, src4_r, src5_r, src6_r, src7_r;
2222    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2223    v8i16 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
2224    v8i16 wgt, denom, offset;
2225
2226    offset_val = (unsigned) offset_in << log2_denom;
2227
2228    wgt = __msa_fill_h(weight_src);
2229    offset = __msa_fill_h(offset_val);
2230    denom = __msa_fill_h(log2_denom);
2231
2232    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
2233    ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_r, src1_r,
2234               src2_r, src3_r);
2235    ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_l, src1_l,
2236               src2_l, src3_l);
2237    ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_r, src5_r,
2238               src6_r, src7_r);
2239    ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_l, src5_l,
2240               src6_l, src7_l);
2241    MUL4(wgt, src0_r, wgt, src0_l, wgt, src1_r, wgt, src1_l, tmp0, tmp1, tmp2,
2242         tmp3);
2243    MUL4(wgt, src2_r, wgt, src2_l, wgt, src3_r, wgt, src3_l, tmp4, tmp5, tmp6,
2244         tmp7);
2245    MUL4(wgt, src4_r, wgt, src4_l, wgt, src5_r, wgt, src5_l, tmp8, tmp9, tmp10,
2246         tmp11);
2247    MUL4(wgt, src6_r, wgt, src6_l, wgt, src7_r, wgt, src7_l, tmp12, tmp13,
2248         tmp14, tmp15);
2249    ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0,
2250                tmp1, tmp2, tmp3);
2251    ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset, tmp4,
2252                tmp5, tmp6, tmp7);
2253    ADDS_SH4_SH(tmp8, offset, tmp9, offset, tmp10, offset, tmp11, offset, tmp8,
2254                tmp9, tmp10, tmp11);
2255    ADDS_SH4_SH(tmp12, offset, tmp13, offset, tmp14, offset, tmp15, offset,
2256                tmp12, tmp13, tmp14, tmp15);
2257    MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
2258    MAXI_SH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 0);
2259    SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
2260    SRLR_H8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, denom);
2261    SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
2262    SAT_UH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 7);
2263    PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
2264                dst2, dst3);
2265    PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
2266                dst5, dst6, dst7);
2267    ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, src, stride);
2268    src += 8 * stride;
2269
2270    if (16 == height) {
2271        LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
2272        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_r,
2273                   src1_r, src2_r, src3_r);
2274        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_l,
2275                   src1_l, src2_l, src3_l);
2276        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_r,
2277                   src5_r, src6_r, src7_r);
2278        ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_l,
2279                   src5_l, src6_l, src7_l);
2280        MUL4(wgt, src0_r, wgt, src0_l, wgt, src1_r, wgt, src1_l, tmp0, tmp1,
2281             tmp2, tmp3);
2282        MUL4(wgt, src2_r, wgt, src2_l, wgt, src3_r, wgt, src3_l, tmp4, tmp5,
2283             tmp6, tmp7);
2284        MUL4(wgt, src4_r, wgt, src4_l, wgt, src5_r, wgt, src5_l, tmp8, tmp9,
2285             tmp10, tmp11);
2286        MUL4(wgt, src6_r, wgt, src6_l, wgt, src7_r, wgt, src7_l, tmp12, tmp13,
2287             tmp14, tmp15);
2288        ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset,
2289                    tmp0, tmp1, tmp2, tmp3);
2290        ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset,
2291                    tmp4, tmp5, tmp6, tmp7);
2292        ADDS_SH4_SH(tmp8, offset, tmp9, offset, tmp10, offset, tmp11, offset,
2293                    tmp8, tmp9, tmp10, tmp11);
2294        ADDS_SH4_SH(tmp12, offset, tmp13, offset, tmp14, offset, tmp15, offset,
2295                    tmp12, tmp13, tmp14, tmp15);
2296        MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
2297        MAXI_SH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 0);
2298        SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
2299        SRLR_H8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, denom);
2300        SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
2301        SAT_UH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 7);
2302        PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
2303                    dst2, dst3);
2304        PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
2305                    dst5, dst6, dst7);
2306        ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, src, stride);
2307    }
2308}
2309
2310void ff_weight_h264_pixels8_8_msa(uint8_t *src, ptrdiff_t stride,
2311                                  int height, int log2_denom,
2312                                  int weight_src, int offset)
2313{
2314    if (4 == height) {
2315        avc_wgt_8x4_msa(src, stride, log2_denom, weight_src, offset);
2316    } else if (8 == height) {
2317        avc_wgt_8x8_msa(src, stride, log2_denom, weight_src, offset);
2318    } else {
2319        avc_wgt_8x16_msa(src, stride, log2_denom, weight_src, offset);
2320    }
2321}
2322
2323void ff_weight_h264_pixels4_8_msa(uint8_t *src, ptrdiff_t stride,
2324                                  int height, int log2_denom,
2325                                  int weight_src, int offset)
2326{
2327    if (2 == height) {
2328        avc_wgt_4x2_msa(src, stride, log2_denom, weight_src, offset);
2329    } else if (4 == height) {
2330        avc_wgt_4x4_msa(src, stride, log2_denom, weight_src, offset);
2331    } else {
2332        avc_wgt_4x8_msa(src, stride, log2_denom, weight_src, offset);
2333    }
2334}
2335
2336void ff_biweight_h264_pixels16_8_msa(uint8_t *dst, uint8_t *src,
2337                                     ptrdiff_t stride, int height,
2338                                     int log2_denom, int weight_dst,
2339                                     int weight_src, int offset_in)
2340{
2341    v16i8 src_wgt, dst_wgt, wgt;
2342    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
2343    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2344    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2345    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
2346    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2347    v8i16 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
2348    v8i16 denom, offset;
2349
2350    offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
2351    offset_in += (128 * (weight_src + weight_dst));
2352
2353    src_wgt = __msa_fill_b(weight_src);
2354    dst_wgt = __msa_fill_b(weight_dst);
2355    offset = __msa_fill_h(offset_in);
2356    denom = __msa_fill_h(log2_denom + 1);
2357
2358    wgt = __msa_ilvev_b(dst_wgt, src_wgt);
2359
2360    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
2361    src += 8 * stride;
2362    LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
2363    XORI_B8_128_UB(src0, src1, src2, src3, src4, src5, src6, src7);
2364    XORI_B8_128_UB(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
2365    ILVR_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec0, vec2, vec4,
2366               vec6);
2367    ILVL_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec1, vec3, vec5,
2368               vec7);
2369    ILVR_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec8, vec10,
2370               vec12, vec14);
2371    ILVL_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec9, vec11,
2372               vec13, vec15);
2373    tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
2374    tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
2375    tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
2376    tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
2377    tmp4 = __msa_dpadd_s_h(offset, wgt, vec4);
2378    tmp5 = __msa_dpadd_s_h(offset, wgt, vec5);
2379    tmp6 = __msa_dpadd_s_h(offset, wgt, vec6);
2380    tmp7 = __msa_dpadd_s_h(offset, wgt, vec7);
2381    tmp8 = __msa_dpadd_s_h(offset, wgt, vec8);
2382    tmp9 = __msa_dpadd_s_h(offset, wgt, vec9);
2383    tmp10 = __msa_dpadd_s_h(offset, wgt, vec10);
2384    tmp11 = __msa_dpadd_s_h(offset, wgt, vec11);
2385    tmp12 = __msa_dpadd_s_h(offset, wgt, vec12);
2386    tmp13 = __msa_dpadd_s_h(offset, wgt, vec13);
2387    tmp14 = __msa_dpadd_s_h(offset, wgt, vec14);
2388    tmp15 = __msa_dpadd_s_h(offset, wgt, vec15);
2389    SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
2390    SRA_4V(tmp4, tmp5, tmp6, tmp7, denom);
2391    SRA_4V(tmp8, tmp9, tmp10, tmp11, denom);
2392    SRA_4V(tmp12, tmp13, tmp14, tmp15, denom);
2393    CLIP_SH8_0_255(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
2394    CLIP_SH8_0_255(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15);
2395    PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
2396                dst2, dst3);
2397    PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
2398                dst5, dst6, dst7);
2399    ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
2400    dst += 8 * stride;
2401
2402    if (16 == height) {
2403        LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
2404        LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
2405        XORI_B8_128_UB(src0, src1, src2, src3, src4, src5, src6, src7);
2406        XORI_B8_128_UB(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
2407        ILVR_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec0, vec2,
2408                   vec4, vec6);
2409        ILVL_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec1, vec3,
2410                   vec5, vec7);
2411        ILVR_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec8, vec10,
2412                   vec12, vec14);
2413        ILVL_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec9, vec11,
2414                   vec13, vec15);
2415        tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
2416        tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
2417        tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
2418        tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
2419        tmp4 = __msa_dpadd_s_h(offset, wgt, vec4);
2420        tmp5 = __msa_dpadd_s_h(offset, wgt, vec5);
2421        tmp6 = __msa_dpadd_s_h(offset, wgt, vec6);
2422        tmp7 = __msa_dpadd_s_h(offset, wgt, vec7);
2423        tmp8 = __msa_dpadd_s_h(offset, wgt, vec8);
2424        tmp9 = __msa_dpadd_s_h(offset, wgt, vec9);
2425        tmp10 = __msa_dpadd_s_h(offset, wgt, vec10);
2426        tmp11 = __msa_dpadd_s_h(offset, wgt, vec11);
2427        tmp12 = __msa_dpadd_s_h(offset, wgt, vec12);
2428        tmp13 = __msa_dpadd_s_h(offset, wgt, vec13);
2429        tmp14 = __msa_dpadd_s_h(offset, wgt, vec14);
2430        tmp15 = __msa_dpadd_s_h(offset, wgt, vec15);
2431        SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
2432        SRA_4V(tmp4, tmp5, tmp6, tmp7, denom);
2433        SRA_4V(tmp8, tmp9, tmp10, tmp11, denom);
2434        SRA_4V(tmp12, tmp13, tmp14, tmp15, denom);
2435        CLIP_SH8_0_255(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
2436        CLIP_SH8_0_255(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15);
2437        PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
2438                    dst2, dst3);
2439        PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
2440                    dst5, dst6, dst7);
2441        ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
2442    }
2443}
2444
2445void ff_biweight_h264_pixels8_8_msa(uint8_t *dst, uint8_t *src,
2446                                    ptrdiff_t stride, int height,
2447                                    int log2_denom, int weight_dst,
2448                                    int weight_src, int offset)
2449{
2450    if (4 == height) {
2451        avc_biwgt_8x4_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
2452                          offset);
2453    } else if (8 == height) {
2454        avc_biwgt_8x8_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
2455                          offset);
2456    } else {
2457        avc_biwgt_8x16_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
2458                           offset);
2459    }
2460}
2461
2462void ff_biweight_h264_pixels4_8_msa(uint8_t *dst, uint8_t *src,
2463                                    ptrdiff_t stride, int height,
2464                                    int log2_denom, int weight_dst,
2465                                    int weight_src, int offset)
2466{
2467    if (2 == height) {
2468        avc_biwgt_4x2_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
2469                          offset);
2470    } else if (4 == height) {
2471        avc_biwgt_4x4_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
2472                          offset);
2473    } else {
2474        avc_biwgt_4x8_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
2475                          offset);
2476    }
2477}
2478