/*
 * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mips/generic_macros_msa.h"
#include "libavcodec/mips/hevcdsp_mips.h"

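/* Deblock a horizontal luma edge covering 8 pixel columns. The first four
 * columns use tc[0]/p_is_pcm[0]/q_is_pcm[0], the last four use the [1]
 * entries. A scalar prologue evaluates the HEVC boundary-activity terms for
 * each 4-column segment to decide filter on/off and strong/weak; the actual
 * filtering then runs on all 8 columns at once with MSA vectors. */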
static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride,
                                         int32_t beta, int32_t *tc,
                                         uint8_t *p_is_pcm, uint8_t *q_is_pcm)
{
    uint8_t *p3 = src - (stride << 2);
    uint8_t *p2 = src - ((stride << 1) + stride);
    uint8_t *p1 = src - (stride << 1);
    uint8_t *p0 = src - stride;
    uint8_t *q0 = src;
    uint8_t *q1 = src + stride;
    uint8_t *q2 = src + (stride << 1);
    uint8_t *q3 = src + (stride << 1) + stride;
    uint8_t flag0, flag1;
    int32_t dp00, dq00, dp30, dq30, d00, d30;
    int32_t d0030, d0434;
    int32_t dp04, dq04, dp34, dq34, d04, d34;
    int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250;
    int32_t tc4, p_is_pcm4, q_is_pcm4, tc254, tmp;
    uint64_t dst_val0, dst_val1;
    v16u8 dst0, dst1, dst2, dst3, dst4, dst5;
    v2i64 cmp0, cmp1, cmp2, p_is_pcm_vec, q_is_pcm_vec;
    v2i64 cmp3;
    v8u16 temp0, temp1;
    v8i16 temp2;
    v8i16 tc_pos, tc_neg;
    v8i16 diff0, diff1, delta0, delta1, delta2, abs_delta0;
    v16i8 zero = { 0 };
    v8u16 p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src, q3_src;

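    /* Second-derivative activity measures at columns 0/3 (first segment) and
     * 4/7 (second segment): dpX = |p2 - 2*p1 + p0|, dqX = |q2 - 2*q1 + q0|.
     * Their per-segment sums are compared against beta below to decide
     * whether each segment is filtered at all. */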
    dp00 = abs(p2[0] - (p1[0] << 1) + p0[0]);
    dq00 = abs(q2[0] - (q1[0] << 1) + q0[0]);
    dp30 = abs(p2[3] - (p1[3] << 1) + p0[3]);
    dq30 = abs(q2[3] - (q1[3] << 1) + q0[3]);
    d00 = dp00 + dq00;
    d30 = dp30 + dq30;
    dp04 = abs(p2[4] - (p1[4] << 1) + p0[4]);
    dq04 = abs(q2[4] - (q1[4] << 1) + q0[4]);
    dp34 = abs(p2[7] - (p1[7] << 1) + p0[7]);
    dq34 = abs(q2[7] - (q1[7] << 1) + q0[7]);
    d04 = dp04 + dq04;
    d34 = dp34 + dq34;

    p_is_pcm0 = p_is_pcm[0];
    p_is_pcm4 = p_is_pcm[1];
    q_is_pcm0 = q_is_pcm[0];
    q_is_pcm4 = q_is_pcm[1];

    cmp0 = __msa_fill_d(p_is_pcm0);
    cmp1 = __msa_fill_d(p_is_pcm4);
    p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
    p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);

    d0030 = (d00 + d30) >= beta;
    d0434 = (d04 + d34) >= beta;

    cmp0 = (v2i64) __msa_fill_w(d0030);
    cmp1 = (v2i64) __msa_fill_w(d0434);
    cmp3 = (v2i64) __msa_ilvev_w((v4i32) cmp1, (v4i32) cmp0);
    cmp3 = (v2i64) __msa_ceqi_w((v4i32) cmp3, 0);

    if ((!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) &&
        (!d0030 || !d0434)) {
        p3_src = LD_UH(p3);
        p2_src = LD_UH(p2);
        p1_src = LD_UH(p1);
        p0_src = LD_UH(p0);

        cmp0 = __msa_fill_d(q_is_pcm0);
        cmp1 = __msa_fill_d(q_is_pcm4);
        q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
        q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);

        tc0 = tc[0];
        beta30 = beta >> 3;
        beta20 = beta >> 2;
        tc250 = ((tc0 * 5 + 1) >> 1);
        tc4 = tc[1];
        tc254 = ((tc4 * 5 + 1) >> 1);

        cmp0 = (v2i64) __msa_fill_h(tc0);
        cmp1 = (v2i64) __msa_fill_h(tc4);

        ILVR_B4_UH(zero, p3_src, zero, p2_src, zero, p1_src, zero, p0_src,
                   p3_src, p2_src, p1_src, p0_src);
        q0_src = LD_UH(q0);
        q1_src = LD_UH(q1);
        q2_src = LD_UH(q2);
        q3_src = LD_UH(q3);

        flag0 = abs(p3[0] - p0[0]) + abs(q3[0] - q0[0]) < beta30 &&
                abs(p0[0] - q0[0]) < tc250;
        flag0 = flag0 && (abs(p3[3] - p0[3]) + abs(q3[3] - q0[3]) < beta30 &&
                abs(p0[3] - q0[3]) < tc250 && (d00 << 1) < beta20 &&
                (d30 << 1) < beta20);

        tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0);
        ILVR_B4_UH(zero, q0_src, zero, q1_src, zero, q2_src, zero, q3_src,
                   q0_src, q1_src, q2_src, q3_src);
        flag1 = abs(p3[4] - p0[4]) + abs(q3[4] - q0[4]) < beta30 &&
                abs(p0[4] - q0[4]) < tc254;
        flag1 = flag1 && (abs(p3[7] - p0[7]) + abs(q3[7] - q0[7]) < beta30 &&
                abs(p0[7] - q0[7]) < tc254 && (d04 << 1) < beta20 &&
                (d34 << 1) < beta20);

        cmp0 = (v2i64) __msa_fill_w(flag0);
        cmp1 = (v2i64) __msa_fill_w(flag1);
        cmp2 = (v2i64) __msa_ilvev_w((v4i32) cmp1, (v4i32) cmp0);
        cmp2 = (v2i64) __msa_ceqi_w((v4i32) cmp2, 0);

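        /* flag0/flag1 hold the strong-filter decision for the two 4-column
         * segments. Three cases follow: both strong, both weak, or a mix,
         * where the mixed case computes both filters and selects per segment
         * with the cmp2 mask. */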
        if (flag0 && flag1) { /* strong only */
            /* strong filter */
            tc_pos <<= 1;
            tc_neg = -tc_pos;

            /* p part */
            temp0 = (p1_src + p0_src + q0_src);
            temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
            temp2 = (v8i16) (temp1 - p2_src);
            CLIP_SH(temp2, tc_neg, tc_pos);
            dst0 = (v16u8) (temp2 + (v8i16) p2_src);

            temp1 = temp0 + p2_src;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
            temp2 = (v8i16) (temp1 - p1_src);
            CLIP_SH(temp2, tc_neg, tc_pos);
            dst1 = (v16u8) (temp2 + (v8i16) p1_src);

            temp1 = (temp0 << 1) + p2_src + q1_src;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
            temp2 = (v8i16) (temp1 - p0_src);
            CLIP_SH(temp2, tc_neg, tc_pos);
            dst2 = (v16u8) (temp2 + (v8i16) p0_src);

            dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec);
            dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) p_is_pcm_vec);
            dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) p_is_pcm_vec);

            /* q part */
            temp0 = (q1_src + p0_src + q0_src);

            temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
            temp2 = (v8i16) (temp1 - q2_src);
            CLIP_SH(temp2, tc_neg, tc_pos);
            dst5 = (v16u8) (temp2 + (v8i16) q2_src);

            temp1 = temp0 + q2_src;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
            temp2 = (v8i16) (temp1 - q1_src);
            CLIP_SH(temp2, tc_neg, tc_pos);
            dst4 = (v16u8) (temp2 + (v8i16) q1_src);

            temp1 = (temp0 << 1) + p1_src + q2_src;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
            temp2 = (v8i16) (temp1 - q0_src);
            CLIP_SH(temp2, tc_neg, tc_pos);
            dst3 = (v16u8) (temp2 + (v8i16) q0_src);

            dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec);
            dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) q_is_pcm_vec);
            dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) q_is_pcm_vec);

            /* pack results to 8 bit */
            PCKEV_B2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
            dst2 = (v16u8) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);

            /* pack src to 8 bit */
            PCKEV_B2_UB(p1_src, p2_src, q0_src, p0_src, dst3, dst4);
            dst5 = (v16u8) __msa_pckev_b((v16i8) q2_src, (v16i8) q1_src);

            dst0 = __msa_bmz_v(dst0, dst3, (v16u8) cmp3);
            dst1 = __msa_bmz_v(dst1, dst4, (v16u8) cmp3);
            dst2 = __msa_bmz_v(dst2, dst5, (v16u8) cmp3);

            dst_val0 = __msa_copy_u_d((v2i64) dst2, 0);
            dst_val1 = __msa_copy_u_d((v2i64) dst2, 1);

            ST_D4(dst0, dst1, 0, 1, 0, 1, p2, stride);
            SD(dst_val0, p2 + 4 * stride);
            SD(dst_val1, p2 + 5 * stride);
            /* strong filter ends */
        } else if (flag0 == flag1) { /* weak only */
            /* weak filter */
            tc_neg = -tc_pos;

            diff0 = (v8i16) (q0_src - p0_src);
            diff1 = (v8i16) (q1_src - p1_src);
            diff0 = (diff0 << 3) + diff0;
            diff1 = (diff1 << 1) + diff1;
            delta0 = diff0 - diff1;
            delta0 = __msa_srari_h(delta0, 4);

            temp1 = (v8u16) ((tc_pos << 3) + (tc_pos << 1));
            abs_delta0 = __msa_add_a_h(delta0, (v8i16) zero);
            abs_delta0 = (v8u16) abs_delta0 < temp1;

            CLIP_SH(delta0, tc_neg, tc_pos);

            temp2 = (v8i16) (delta0 + p0_src);
            CLIP_SH_0_255(temp2);
            temp0 = (v8u16) __msa_bmz_v((v16u8) temp2, (v16u8) p0_src,
                                        (v16u8) p_is_pcm_vec);

            temp2 = (v8i16) (q0_src - delta0);
            CLIP_SH_0_255(temp2);
            temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
                                        (v16u8) q_is_pcm_vec);

            p_is_pcm_vec = ~p_is_pcm_vec;
            q_is_pcm_vec = ~q_is_pcm_vec;
            tmp = (beta + (beta >> 1)) >> 3;
            cmp0 = __msa_fill_d(dp00 + dp30 < tmp);
            cmp1 = __msa_fill_d(dp04 + dp34 < tmp);
            cmp0 = __msa_ilvev_d(cmp1, cmp0);
            cmp0 = __msa_ceqi_d(cmp0, 0);
            p_is_pcm_vec = p_is_pcm_vec | cmp0;

            cmp0 = __msa_fill_d(dq00 + dq30 < tmp);
            cmp1 = __msa_fill_d(dq04 + dq34 < tmp);
            cmp0 = __msa_ilvev_d(cmp1, cmp0);
            cmp0 = __msa_ceqi_d(cmp0, 0);
            q_is_pcm_vec = q_is_pcm_vec | cmp0;

            tc_pos >>= 1;
            tc_neg = -tc_pos;

            delta1 = (v8i16) __msa_aver_u_h(p2_src, p0_src);
            delta1 -= (v8i16) p1_src;
            delta1 += delta0;
            delta1 >>= 1;
            CLIP_SH(delta1, tc_neg, tc_pos);
            delta1 = (v8i16) p1_src + (v8i16) delta1;
            CLIP_SH_0_255(delta1);
            delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src,
                                          (v16u8) p_is_pcm_vec);

            delta2 = (v8i16) __msa_aver_u_h(q0_src, q2_src);
            delta2 = delta2 - (v8i16) q1_src;
            delta2 = delta2 - delta0;
            delta2 = delta2 >> 1;
            CLIP_SH(delta2, tc_neg, tc_pos);
            delta2 = (v8i16) q1_src + (v8i16) delta2;
            CLIP_SH_0_255(delta2);
            delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src,
                                          (v16u8) q_is_pcm_vec);

            dst1 = (v16u8) __msa_bmz_v((v16u8) delta1, (v16u8) p1_src,
                                       (v16u8) abs_delta0);
            dst2 = (v16u8) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
                                       (v16u8) abs_delta0);
            dst3 = (v16u8) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
                                       (v16u8) abs_delta0);
            dst4 = (v16u8) __msa_bmz_v((v16u8) delta2, (v16u8) q1_src,
                                       (v16u8) abs_delta0);
            /* pack results to 8 bit */
            PCKEV_B2_UB(dst2, dst1, dst4, dst3, dst0, dst1);

            /* pack src to 8 bit */
            PCKEV_B2_UB(p0_src, p1_src, q1_src, q0_src, dst2, dst3);

            dst0 = __msa_bmz_v(dst0, dst2, (v16u8) cmp3);
            dst1 = __msa_bmz_v(dst1, dst3, (v16u8) cmp3);

            p2 += stride;
            ST_D4(dst0, dst1, 0, 1, 0, 1, p2, stride);
            /* weak filter ends */
        } else { /* strong + weak */
            /* strong filter */
            tc_pos <<= 1;
            tc_neg = -tc_pos;

            /* p part */
            temp0 = (p1_src + p0_src + q0_src);
            temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
            temp2 = (v8i16) (temp1 - p2_src);
            CLIP_SH(temp2, tc_neg, tc_pos);
            dst0 = (v16u8) (temp2 + (v8i16) p2_src);

            temp1 = temp0 + p2_src;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
            temp2 = (v8i16) (temp1 - p1_src);
            CLIP_SH(temp2, tc_neg, tc_pos);
            dst1 = (v16u8) (temp2 + (v8i16) p1_src);

            temp1 = (temp0 << 1) + p2_src + q1_src;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
            temp2 = (v8i16) (temp1 - p0_src);
            CLIP_SH(temp2, tc_neg, tc_pos);
            dst2 = (v16u8) (temp2 + (v8i16) p0_src);

            dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec);
            dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) p_is_pcm_vec);
            dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) p_is_pcm_vec);

            /* q part */
            temp0 = (q1_src + p0_src + q0_src);

            temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
            temp2 = (v8i16) (temp1 - q2_src);
            CLIP_SH(temp2, tc_neg, tc_pos);
            dst5 = (v16u8) (temp2 + (v8i16) q2_src);

            temp1 = temp0 + q2_src;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
            temp2 = (v8i16) (temp1 - q1_src);
            CLIP_SH(temp2, tc_neg, tc_pos);
            dst4 = (v16u8) (temp2 + (v8i16) q1_src);

            temp1 = (temp0 << 1) + p1_src + q2_src;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
            temp2 = (v8i16) (temp1 - q0_src);
            CLIP_SH(temp2, tc_neg, tc_pos);
            dst3 = (v16u8) (temp2 + (v8i16) q0_src);

            dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec);
            dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) q_is_pcm_vec);
            dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) q_is_pcm_vec);

            /* pack strong results to 8 bit */
            PCKEV_B2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
            dst2 = (v16u8) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
            /* strong filter ends */

            /* weak filter */
            tc_pos >>= 1;
            tc_neg = -tc_pos;

            diff0 = (v8i16) (q0_src - p0_src);
            diff1 = (v8i16) (q1_src - p1_src);
            diff0 = (diff0 << 3) + diff0;
            diff1 = (diff1 << 1) + diff1;
            delta0 = diff0 - diff1;
            delta0 = __msa_srari_h(delta0, 4);

            temp1 = (v8u16) ((tc_pos << 3) + (tc_pos << 1));
            abs_delta0 = __msa_add_a_h(delta0, (v8i16) zero);
            abs_delta0 = (v8u16) abs_delta0 < temp1;

            CLIP_SH(delta0, tc_neg, tc_pos);

            temp2 = (v8i16) (delta0 + p0_src);
            CLIP_SH_0_255(temp2);
            temp0 = (v8u16) __msa_bmz_v((v16u8) temp2, (v16u8) p0_src,
                                        (v16u8) p_is_pcm_vec);

            temp2 = (v8i16) (q0_src - delta0);
            CLIP_SH_0_255(temp2);
            temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
                                        (v16u8) q_is_pcm_vec);

            p_is_pcm_vec = ~p_is_pcm_vec;
            q_is_pcm_vec = ~q_is_pcm_vec;
            tmp = (beta + (beta >> 1)) >> 3;
            cmp0 = __msa_fill_d(dp00 + dp30 < tmp);
            cmp1 = __msa_fill_d(dp04 + dp34 < tmp);
            cmp0 = __msa_ilvev_d(cmp1, cmp0);
            p_is_pcm_vec = p_is_pcm_vec | __msa_ceqi_d(cmp0, 0);

            cmp0 = __msa_fill_d(dq00 + dq30 < tmp);
            cmp1 = __msa_fill_d(dq04 + dq34 < tmp);
            cmp0 = __msa_ilvev_d(cmp1, cmp0);
            q_is_pcm_vec = q_is_pcm_vec | __msa_ceqi_d(cmp0, 0);

            tc_pos >>= 1;
            tc_neg = -tc_pos;

            delta1 = (v8i16) __msa_aver_u_h(p2_src, p0_src);
            delta1 -= (v8i16) p1_src;
            delta1 += delta0;
            delta1 >>= 1;
            CLIP_SH(delta1, tc_neg, tc_pos);
            delta1 = (v8i16) p1_src + (v8i16) delta1;
            CLIP_SH_0_255(delta1);
            delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src,
                                          (v16u8) p_is_pcm_vec);

            delta2 = (v8i16) __msa_aver_u_h(q0_src, q2_src);
            delta2 = delta2 - (v8i16) q1_src;
            delta2 = delta2 - delta0;
            delta2 = delta2 >> 1;
            CLIP_SH(delta2, tc_neg, tc_pos);
            delta2 = (v8i16) q1_src + (v8i16) delta2;
            CLIP_SH_0_255(delta2);
            delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src,
                                          (v16u8) q_is_pcm_vec);

            delta1 = (v8i16) __msa_bmz_v((v16u8) delta1, (v16u8) p1_src,
                                         (v16u8) abs_delta0);
            temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
                                        (v16u8) abs_delta0);
            temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
                                        (v16u8) abs_delta0);
            delta2 = (v8i16) __msa_bmz_v((v16u8) delta2, (v16u8) q1_src,
                                         (v16u8) abs_delta0);
            /* weak filter ends */

            /* pack weak results to 8 bit */
            PCKEV_B2_UB(delta1, p2_src, temp2, temp0, dst3, dst4);
            dst5 = (v16u8) __msa_pckev_b((v16i8) q2_src, (v16i8) delta2);

            /* select between weak or strong */
            dst0 = __msa_bmnz_v(dst0, dst3, (v16u8) cmp2);
            dst1 = __msa_bmnz_v(dst1, dst4, (v16u8) cmp2);
            dst2 = __msa_bmnz_v(dst2, dst5, (v16u8) cmp2);

            /* pack src to 8 bit */
            PCKEV_B2_UB(p1_src, p2_src, q0_src, p0_src, dst3, dst4);
            dst5 = (v16u8) __msa_pckev_b((v16i8) q2_src, (v16i8) q1_src);

            dst0 = __msa_bmz_v(dst0, dst3, (v16u8) cmp3);
            dst1 = __msa_bmz_v(dst1, dst4, (v16u8) cmp3);
            dst2 = __msa_bmz_v(dst2, dst5, (v16u8) cmp3);

            dst_val0 = __msa_copy_u_d((v2i64) dst2, 0);
            dst_val1 = __msa_copy_u_d((v2i64) dst2, 1);

            ST_D4(dst0, dst1, 0, 1, 0, 1, p2, stride);
            SD(dst_val0, p2 + 4 * stride);
            SD(dst_val1, p2 + 5 * stride);
        }
    }
}

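/* Deblock a vertical luma edge covering 8 rows. Same decision logic as the
 * horizontal case, but the 8x8 neighbourhood is loaded row by row, transposed
 * so that p3..q3 become vectors, filtered, then transposed back and written
 * with narrow per-row stores covering only the columns that may change. */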
static void hevc_loopfilter_luma_ver_msa(uint8_t *src, int32_t stride,
                                         int32_t beta, int32_t *tc,
                                         uint8_t *p_is_pcm, uint8_t *q_is_pcm)
{
    uint8_t *p3 = src;
    uint8_t *p2 = src + 3 * stride;
    uint8_t *p1 = src + (stride << 2);
    uint8_t *p0 = src + 7 * stride;
    uint8_t flag0, flag1;
    uint16_t tmp0, tmp1;
    uint32_t tmp2, tmp3;
    int32_t dp00, dq00, dp30, dq30, d00, d30;
    int32_t d0030, d0434;
    int32_t dp04, dq04, dp34, dq34, d04, d34;
    int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250;
    int32_t tc4, p_is_pcm4, q_is_pcm4, tc254, tmp;
    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v2i64 cmp0, cmp1, cmp2, p_is_pcm_vec, q_is_pcm_vec;
    v2i64 cmp3;
    v8u16 temp0, temp1;
    v8i16 temp2;
    v8i16 tc_pos, tc_neg;
    v8i16 diff0, diff1, delta0, delta1, delta2, abs_delta0;
    v16i8 zero = { 0 };
    v8u16 p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src, q3_src;

    dp00 = abs(p3[-3] - (p3[-2] << 1) + p3[-1]);
    dq00 = abs(p3[2] - (p3[1] << 1) + p3[0]);
    dp30 = abs(p2[-3] - (p2[-2] << 1) + p2[-1]);
    dq30 = abs(p2[2] - (p2[1] << 1) + p2[0]);
    d00 = dp00 + dq00;
    d30 = dp30 + dq30;
    p_is_pcm0 = p_is_pcm[0];
    q_is_pcm0 = q_is_pcm[0];

    dp04 = abs(p1[-3] - (p1[-2] << 1) + p1[-1]);
    dq04 = abs(p1[2] - (p1[1] << 1) + p1[0]);
    dp34 = abs(p0[-3] - (p0[-2] << 1) + p0[-1]);
    dq34 = abs(p0[2] - (p0[1] << 1) + p0[0]);
    d04 = dp04 + dq04;
    d34 = dp34 + dq34;
    p_is_pcm4 = p_is_pcm[1];
    q_is_pcm4 = q_is_pcm[1];

    cmp0 = __msa_fill_d(p_is_pcm0);
    cmp1 = __msa_fill_d(p_is_pcm4);
    p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
    p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);

    d0030 = (d00 + d30) >= beta;
    d0434 = (d04 + d34) >= beta;

    cmp0 = __msa_fill_d(d0030);
    cmp1 = __msa_fill_d(d0434);
    cmp3 = __msa_ilvev_d(cmp1, cmp0);
    cmp3 = (v2i64) __msa_ceqi_d(cmp3, 0);

    if ((!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) &&
        (!d0030 || !d0434)) {
        src -= 4;
        LD_UH8(src, stride, p3_src, p2_src, p1_src, p0_src, q0_src, q1_src,
               q2_src, q3_src);

        cmp0 = __msa_fill_d(q_is_pcm0);
        cmp1 = __msa_fill_d(q_is_pcm4);
        q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
        q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);

        tc0 = tc[0];
        beta30 = beta >> 3;
        beta20 = beta >> 2;
        tc250 = ((tc0 * 5 + 1) >> 1);

        tc4 = tc[1];
        tc254 = ((tc4 * 5 + 1) >> 1);
        cmp0 = (v2i64) __msa_fill_h(tc0 << 1);
        cmp1 = (v2i64) __msa_fill_h(tc4 << 1);
        tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0);

        TRANSPOSE8x8_UB_UH(p3_src, p2_src, p1_src, p0_src, q0_src, q1_src,
                           q2_src, q3_src, p3_src, p2_src, p1_src, p0_src,
                           q0_src, q1_src, q2_src, q3_src);

        flag0 = abs(p3[-4] - p3[-1]) + abs(p3[3] - p3[0]) < beta30 &&
                abs(p3[-1] - p3[0]) < tc250;
        flag0 = flag0 && (abs(p2[-4] - p2[-1]) + abs(p2[3] - p2[0]) < beta30 &&
                abs(p2[-1] - p2[0]) < tc250 && (d00 << 1) < beta20 &&
                (d30 << 1) < beta20);
        cmp0 = __msa_fill_d(flag0);
        ILVR_B4_UH(zero, p3_src, zero, p2_src, zero, p1_src, zero, p0_src,
                   p3_src, p2_src, p1_src, p0_src);

        flag1 = abs(p1[-4] - p1[-1]) + abs(p1[3] - p1[0]) < beta30 &&
                abs(p1[-1] - p1[0]) < tc254;
        flag1 = flag1 && (abs(p0[-4] - p0[-1]) + abs(p0[3] - p0[0]) < beta30 &&
                abs(p0[-1] - p0[0]) < tc254 && (d04 << 1) < beta20 &&
                (d34 << 1) < beta20);
        ILVR_B4_UH(zero, q0_src, zero, q1_src, zero, q2_src, zero, q3_src,
                   q0_src, q1_src, q2_src, q3_src);

        cmp1 = __msa_fill_d(flag1);
        cmp2 = __msa_ilvev_d(cmp1, cmp0);
        cmp2 = __msa_ceqi_d(cmp2, 0);

        if (flag0 && flag1) { /* strong only */
            /* strong filter */
            tc_neg = -tc_pos;

            /* p part */
            temp0 = (p1_src + p0_src + q0_src);

            temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
            temp2 = (v8i16) (temp1 - p2_src);
            CLIP_SH(temp2, tc_neg, tc_pos);
            dst0 = (v16u8) (temp2 + (v8i16) p2_src);

            temp1 = temp0 + p2_src;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
            temp2 = (v8i16) (temp1 - p1_src);
            CLIP_SH(temp2, tc_neg, tc_pos);
            dst1 = (v16u8) (temp2 + (v8i16) p1_src);

            temp1 = (temp0 << 1) + p2_src + q1_src;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
            temp2 = (v8i16) (temp1 - p0_src);
            CLIP_SH(temp2, tc_neg, tc_pos);
            dst2 = (v16u8) (temp2 + (v8i16) p0_src);

            dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec);
            dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) p_is_pcm_vec);
            dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) p_is_pcm_vec);

            /* q part */
            temp0 = (q1_src + p0_src + q0_src);
            temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
            temp2 = (v8i16) (temp1 - q2_src);
            CLIP_SH(temp2, tc_neg, tc_pos);
            dst5 = (v16u8) (temp2 + (v8i16) q2_src);

            temp1 = temp0 + q2_src;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
            temp2 = (v8i16) (temp1 - q1_src);
            CLIP_SH(temp2, tc_neg, tc_pos);
            dst4 = (v16u8) (temp2 + (v8i16) q1_src);

            temp1 = (temp0 << 1) + p1_src + q2_src;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
            temp2 = (v8i16) (temp1 - q0_src);
            CLIP_SH(temp2, tc_neg, tc_pos);
            dst3 = (v16u8) (temp2 + (v8i16) q0_src);

            dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec);
            dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) q_is_pcm_vec);
            dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) q_is_pcm_vec);
            /* strong filter ends */
        } else if (flag0 == flag1) { /* weak only */
            /* weak filter */
            tc_pos >>= 1;
            tc_neg = -tc_pos;

            diff0 = (v8i16) (q0_src - p0_src);
            diff1 = (v8i16) (q1_src - p1_src);
            diff0 = (diff0 << 3) + diff0;
            diff1 = (diff1 << 1) + diff1;
            delta0 = diff0 - diff1;
            delta0 = __msa_srari_h(delta0, 4);

            temp1 = (v8u16) ((tc_pos << 3) + (tc_pos << 1));
            abs_delta0 = __msa_add_a_h(delta0, (v8i16) zero);
            abs_delta0 = (v8u16) abs_delta0 < temp1;

            CLIP_SH(delta0, tc_neg, tc_pos);
            temp2 = (v8i16) (delta0 + p0_src);
            CLIP_SH_0_255(temp2);
            temp0 = (v8u16) __msa_bmz_v((v16u8) temp2, (v16u8) p0_src,
                                        (v16u8) p_is_pcm_vec);

            temp2 = (v8i16) (q0_src - delta0);
            CLIP_SH_0_255(temp2);
            temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
                                        (v16u8) q_is_pcm_vec);

            tmp = ((beta + (beta >> 1)) >> 3);
            cmp0 = __msa_fill_d(!p_is_pcm0 && ((dp00 + dp30) < tmp));
            cmp1 = __msa_fill_d(!p_is_pcm4 && ((dp04 + dp34) < tmp));
            p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
            p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);

            cmp0 = (v2i64) __msa_fill_h((!q_is_pcm0) && (dq00 + dq30 < tmp));
            cmp1 = (v2i64) __msa_fill_h((!q_is_pcm4) && (dq04 + dq34 < tmp));
            q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
            q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);

            tc_pos >>= 1;
            tc_neg = -tc_pos;

            delta1 = (v8i16) __msa_aver_u_h(p2_src, p0_src);
            delta1 -= (v8i16) p1_src;
            delta1 += delta0;
            delta1 >>= 1;
            CLIP_SH(delta1, tc_neg, tc_pos);
            delta1 = (v8i16) p1_src + (v8i16) delta1;
            CLIP_SH_0_255(delta1);
            delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src,
                                          (v16u8) p_is_pcm_vec);

            delta2 = (v8i16) __msa_aver_u_h(q0_src, q2_src);
            delta2 = delta2 - (v8i16) q1_src;
            delta2 = delta2 - delta0;
            delta2 = delta2 >> 1;
            CLIP_SH(delta2, tc_neg, tc_pos);
            delta2 = (v8i16) q1_src + (v8i16) delta2;
            CLIP_SH_0_255(delta2);
            delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src,
                                          (v16u8) q_is_pcm_vec);

            dst0 = __msa_bmz_v((v16u8) delta1, (v16u8) p1_src,
                               (v16u8) abs_delta0);
            dst1 = __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
                               (v16u8) abs_delta0);
            dst2 = __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
                               (v16u8) abs_delta0);
            dst3 = __msa_bmz_v((v16u8) delta2, (v16u8) q1_src,
                               (v16u8) abs_delta0);
            /* weak filter ends */

            dst0 = __msa_bmz_v(dst0, (v16u8) p1_src, (v16u8) cmp3);
            dst1 = __msa_bmz_v(dst1, (v16u8) p0_src, (v16u8) cmp3);
            dst2 = __msa_bmz_v(dst2, (v16u8) q0_src, (v16u8) cmp3);
            dst3 = __msa_bmz_v(dst3, (v16u8) q1_src, (v16u8) cmp3);

            PCKEV_B2_UB(dst2, dst0, dst3, dst1, dst0, dst1);

            /* transpose */
            ILVRL_B2_UB(dst1, dst0, dst4, dst5);
            ILVRL_H2_UB(dst5, dst4, dst0, dst1);

            src += 2;

            tmp2 = __msa_copy_u_w((v4i32) dst0, 0);
            tmp3 = __msa_copy_u_w((v4i32) dst0, 1);
            SW(tmp2, src);
            src += stride;
            SW(tmp3, src);
            src += stride;

            tmp2 = __msa_copy_u_w((v4i32) dst0, 2);
            tmp3 = __msa_copy_u_w((v4i32) dst0, 3);
            SW(tmp2, src);
            src += stride;
            SW(tmp3, src);
            src += stride;

            tmp2 = __msa_copy_u_w((v4i32) dst1, 0);
            tmp3 = __msa_copy_u_w((v4i32) dst1, 1);
            SW(tmp2, src);
            src += stride;
            SW(tmp3, src);
            src += stride;

            tmp2 = __msa_copy_u_w((v4i32) dst1, 2);
            tmp3 = __msa_copy_u_w((v4i32) dst1, 3);
            SW(tmp2, src);
            src += stride;
            SW(tmp3, src);

            return;
        } else { /* strong + weak */
            /* strong filter */
            tc_neg = -tc_pos;

            /* p part */
            temp0 = (p1_src + p0_src + q0_src);

            temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
            temp2 = (v8i16) (temp1 - p2_src);
            CLIP_SH(temp2, tc_neg, tc_pos);
            dst0 = (v16u8) (temp2 + (v8i16) p2_src);

            temp1 = temp0 + p2_src;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
            temp2 = (v8i16) (temp1 - p1_src);
            CLIP_SH(temp2, tc_neg, tc_pos);
            dst1 = (v16u8) (temp2 + (v8i16) p1_src);

            temp1 = (temp0 << 1) + p2_src + q1_src;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
            temp2 = (v8i16) (temp1 - p0_src);
            CLIP_SH(temp2, tc_neg, tc_pos);
            dst2 = (v16u8) (temp2 + (v8i16) p0_src);

            dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec);
            dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) p_is_pcm_vec);
            dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) p_is_pcm_vec);

            /* q part */
            temp0 = (q1_src + p0_src + q0_src);
            temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
            temp2 = (v8i16) (temp1 - q2_src);
            CLIP_SH(temp2, tc_neg, tc_pos);
            dst5 = (v16u8) (temp2 + (v8i16) q2_src);

            temp1 = temp0 + q2_src;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
            temp2 = (v8i16) (temp1 - q1_src);
            CLIP_SH(temp2, tc_neg, tc_pos);
            dst4 = (v16u8) (temp2 + (v8i16) q1_src);

            temp1 = (temp0 << 1) + p1_src + q2_src;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
            temp2 = (v8i16) (temp1 - q0_src);
            CLIP_SH(temp2, tc_neg, tc_pos);
            dst3 = (v16u8) (temp2 + (v8i16) q0_src);

            dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec);
            dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) q_is_pcm_vec);
            dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) q_is_pcm_vec);
            /* strong filter ends */

            /* weak filter */
            tc_pos >>= 1;
            tc_neg = -tc_pos;

            diff0 = (v8i16) (q0_src - p0_src);
            diff1 = (v8i16) (q1_src - p1_src);
            diff0 = (diff0 << 3) + diff0;
            diff1 = (diff1 << 1) + diff1;
            delta0 = diff0 - diff1;
            delta0 = __msa_srari_h(delta0, 4);

            temp1 = (v8u16) ((tc_pos << 3) + (tc_pos << 1));
            abs_delta0 = __msa_add_a_h(delta0, (v8i16) zero);
            abs_delta0 = (v8u16) abs_delta0 < temp1;

            CLIP_SH(delta0, tc_neg, tc_pos);

            temp2 = (v8i16) (delta0 + p0_src);
            CLIP_SH_0_255(temp2);
            temp0 = (v8u16) __msa_bmz_v((v16u8) temp2, (v16u8) p0_src,
                                        (v16u8) p_is_pcm_vec);

            temp2 = (v8i16) (q0_src - delta0);
            CLIP_SH_0_255(temp2);
            temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
                                        (v16u8) q_is_pcm_vec);

            tmp = (beta + (beta >> 1)) >> 3;
            cmp0 = __msa_fill_d(!p_is_pcm0 && ((dp00 + dp30) < tmp));
            cmp1 = __msa_fill_d(!p_is_pcm4 && ((dp04 + dp34) < tmp));
            p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
            p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);

            cmp0 = (v2i64) __msa_fill_h((!q_is_pcm0) && (dq00 + dq30 < tmp));
            cmp1 = (v2i64) __msa_fill_h((!q_is_pcm4) && (dq04 + dq34 < tmp));
            q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
            q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);

            tc_pos >>= 1;
            tc_neg = -tc_pos;

            delta1 = (v8i16) __msa_aver_u_h(p2_src, p0_src);
            delta1 -= (v8i16) p1_src;
            delta1 += delta0;
            delta1 >>= 1;
            CLIP_SH(delta1, tc_neg, tc_pos);
            delta1 = (v8i16) p1_src + (v8i16) delta1;
            CLIP_SH_0_255(delta1);
            delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src,
                                          (v16u8) p_is_pcm_vec);

            delta2 = (v8i16) __msa_aver_u_h(q0_src, q2_src);
            delta2 = delta2 - (v8i16) q1_src;
            delta2 = delta2 - delta0;
            delta2 = delta2 >> 1;
            CLIP_SH(delta2, tc_neg, tc_pos);
            delta2 = (v8i16) q1_src + (v8i16) delta2;
            CLIP_SH_0_255(delta2);
            delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src,
                                          (v16u8) q_is_pcm_vec);
            delta1 = (v8i16) __msa_bmz_v((v16u8) delta1, (v16u8) p1_src,
                                         (v16u8) abs_delta0);
            temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
                                        (v16u8) abs_delta0);
            temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
                                        (v16u8) abs_delta0);
            delta2 = (v8i16) __msa_bmz_v((v16u8) delta2, (v16u8) q1_src,
                                         (v16u8) abs_delta0);
            /* weak filter ends */

            /* select between weak or strong */
            dst2 = __msa_bmnz_v(dst2, (v16u8) temp0, (v16u8) cmp2);
            dst3 = __msa_bmnz_v(dst3, (v16u8) temp2, (v16u8) cmp2);
            dst1 = __msa_bmnz_v(dst1, (v16u8) delta1, (v16u8) cmp2);
            dst4 = __msa_bmnz_v(dst4, (v16u8) delta2, (v16u8) cmp2);
            dst0 = __msa_bmnz_v(dst0, (v16u8) p2_src, (v16u8) cmp2);
            dst5 = __msa_bmnz_v(dst5, (v16u8) q2_src, (v16u8) cmp2);
        }

        dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) cmp3);
        dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) cmp3);
        dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) cmp3);
        dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) cmp3);
        dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) cmp3);
        dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) cmp3);

        /* pack results to 8 bit */
        PCKEV_B4_UB(dst2, dst0, dst3, dst1, dst4, dst4, dst5, dst5, dst0, dst1,
                    dst2, dst3);

        /* transpose */
        ILVRL_B2_UB(dst1, dst0, dst4, dst5);
        ILVRL_B2_UB(dst3, dst2, dst6, dst7);
        ILVRL_H2_UB(dst5, dst4, dst0, dst1);
        ILVRL_H2_UB(dst7, dst6, dst2, dst3);

        src += 1;

        tmp2 = __msa_copy_u_w((v4i32) dst0, 0);
        tmp3 = __msa_copy_u_w((v4i32) dst0, 1);
        tmp0 = __msa_copy_u_h((v8i16) dst2, 0);
        tmp1 = __msa_copy_u_h((v8i16) dst2, 2);
        SW(tmp2, src);
        SH(tmp0, src + 4);
        src += stride;
        SW(tmp3, src);
        SH(tmp1, src + 4);
        src += stride;

        tmp2 = __msa_copy_u_w((v4i32) dst0, 2);
        tmp3 = __msa_copy_u_w((v4i32) dst0, 3);
        tmp0 = __msa_copy_u_h((v8i16) dst2, 4);
        tmp1 = __msa_copy_u_h((v8i16) dst2, 6);
        SW(tmp2, src);
        SH(tmp0, src + 4);
        src += stride;
        SW(tmp3, src);
        SH(tmp1, src + 4);
        src += stride;

        tmp2 = __msa_copy_u_w((v4i32) dst1, 0);
        tmp3 = __msa_copy_u_w((v4i32) dst1, 1);
        tmp0 = __msa_copy_u_h((v8i16) dst3, 0);
        tmp1 = __msa_copy_u_h((v8i16) dst3, 2);
        SW(tmp2, src);
        SH(tmp0, src + 4);
        src += stride;
        SW(tmp3, src);
        SH(tmp1, src + 4);
        src += stride;

        tmp2 = __msa_copy_u_w((v4i32) dst1, 2);
        tmp3 = __msa_copy_u_w((v4i32) dst1, 3);
        tmp0 = __msa_copy_u_h((v8i16) dst3, 4);
        tmp1 = __msa_copy_u_h((v8i16) dst3, 6);
        SW(tmp2, src);
        SH(tmp0, src + 4);
        src += stride;
        SW(tmp3, src);
        SH(tmp1, src + 4);
    }
}

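/* Deblock a horizontal chroma edge covering 8 pixel columns. Chroma uses the
 * single weak filter only: delta = clip(-tc, tc, ((q0 - p0) * 4 + p1 - q1 + 4) >> 3),
 * added to p0 and subtracted from q0, skipped for PCM blocks and for segments
 * whose tc is not positive. */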
static void hevc_loopfilter_chroma_hor_msa(uint8_t *src, int32_t stride,
                                           int32_t *tc, uint8_t *p_is_pcm,
                                           uint8_t *q_is_pcm)
{
    uint8_t *p1_ptr = src - (stride << 1);
    uint8_t *p0_ptr = src - stride;
    uint8_t *q0_ptr = src;
    uint8_t *q1_ptr = src + stride;
    v2i64 cmp0, cmp1, p_is_pcm_vec, q_is_pcm_vec;
    v8u16 p1, p0, q0, q1;
    v8i16 tc_pos, tc_neg;
    v16i8 zero = { 0 };
    v8i16 temp0, temp1, delta;

    if (!(tc[0] <= 0) || !(tc[1] <= 0)) {
        cmp0 = (v2i64) __msa_fill_h(tc[0]);
        cmp1 = (v2i64) __msa_fill_h(tc[1]);
        tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0);
        tc_neg = -tc_pos;

        cmp0 = __msa_fill_d(p_is_pcm[0]);
        cmp1 = __msa_fill_d(p_is_pcm[1]);
        p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
        p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);

        cmp0 = __msa_fill_d(q_is_pcm[0]);
        cmp1 = __msa_fill_d(q_is_pcm[1]);
        q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
        q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);

        p1 = LD_UH(p1_ptr);
        p0 = LD_UH(p0_ptr);
        q0 = LD_UH(q0_ptr);
        q1 = LD_UH(q1_ptr);

        ILVR_B4_UH(zero, p1, zero, p0, zero, q0, zero, q1, p1, p0, q0, q1);

        temp0 = (v8i16) (q0 - p0);
        temp1 = (v8i16) (p1 - q1);
        temp0 <<= 2;
        temp0 += temp1;
        delta = __msa_srari_h((v8i16) temp0, 3);
        CLIP_SH(delta, tc_neg, tc_pos);

        temp0 = (v8i16) ((v8i16) p0 + delta);
        CLIP_SH_0_255(temp0);
        temp0 = (v8i16) __msa_bmz_v((v16u8) temp0, (v16u8) p0,
                                    (v16u8) p_is_pcm_vec);

        temp1 = (v8i16) ((v8i16) q0 - delta);
        CLIP_SH_0_255(temp1);
        temp1 = (v8i16) __msa_bmz_v((v16u8) temp1, (v16u8) q0,
                                    (v16u8) q_is_pcm_vec);

        tc_pos = (v8i16) __msa_clei_s_d((v2i64) tc_pos, 0);
        temp0 = (v8i16) __msa_bmnz_v((v16u8) temp0, (v16u8) p0, (v16u8) tc_pos);
        temp1 = (v8i16) __msa_bmnz_v((v16u8) temp1, (v16u8) q0, (v16u8) tc_pos);

        temp0 = (v8i16) __msa_pckev_b((v16i8) temp1, (v16i8) temp0);
        ST_D2(temp0, 0, 1, p0_ptr, stride);
    }
}

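/* Vertical-edge variant of the chroma deblock filter: loads 8 rows of the two
 * columns on each side of the edge, transposes them into p1/p0/q0/q1 vectors,
 * applies the same delta computation, and writes the filtered p0/q0 pair back
 * column-wise with 2-byte stores. */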
static void hevc_loopfilter_chroma_ver_msa(uint8_t *src, int32_t stride,
                                           int32_t *tc, uint8_t *p_is_pcm,
                                           uint8_t *q_is_pcm)
{
    v2i64 cmp0, cmp1, p_is_pcm_vec, q_is_pcm_vec;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8u16 p1, p0, q0, q1;
    v8i16 tc_pos, tc_neg;
    v16i8 zero = { 0 };
    v8i16 temp0, temp1, delta;

    if (!(tc[0] <= 0) || !(tc[1] <= 0)) {
        cmp0 = (v2i64) __msa_fill_h(tc[0]);
        cmp1 = (v2i64) __msa_fill_h(tc[1]);
        tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0);
        tc_neg = -tc_pos;

        cmp0 = __msa_fill_d(p_is_pcm[0]);
        cmp1 = __msa_fill_d(p_is_pcm[1]);
        p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
        p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);

        cmp0 = __msa_fill_d(q_is_pcm[0]);
        cmp1 = __msa_fill_d(q_is_pcm[1]);
        q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
        q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);

        src -= 2;
        LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
        TRANSPOSE8x4_UB_UH(src0, src1, src2, src3, src4, src5, src6, src7,
                           p1, p0, q0, q1);
        ILVR_B4_UH(zero, p1, zero, p0, zero, q0, zero, q1, p1, p0, q0, q1);

        temp0 = (v8i16) (q0 - p0);
        temp1 = (v8i16) (p1 - q1);
        temp0 <<= 2;
        temp0 += temp1;
        delta = __msa_srari_h((v8i16) temp0, 3);
        CLIP_SH(delta, tc_neg, tc_pos);

        temp0 = (v8i16) ((v8i16) p0 + delta);
        CLIP_SH_0_255(temp0);
        temp0 = (v8i16) __msa_bmz_v((v16u8) temp0, (v16u8) p0,
                                    (v16u8) p_is_pcm_vec);

        temp1 = (v8i16) ((v8i16) q0 - delta);
        CLIP_SH_0_255(temp1);
        temp1 = (v8i16) __msa_bmz_v((v16u8) temp1, (v16u8) q0,
                                    (v16u8) q_is_pcm_vec);

        tc_pos = (v8i16) __msa_clei_s_d((v2i64) tc_pos, 0);
        temp0 = (v8i16) __msa_bmnz_v((v16u8) temp0, (v16u8) p0, (v16u8) tc_pos);
        temp1 = (v8i16) __msa_bmnz_v((v16u8) temp1, (v16u8) q0, (v16u8) tc_pos);

        temp0 = (v8i16) __msa_ilvev_b((v16i8) temp1, (v16i8) temp0);

        src += 1;
        ST_H8(temp0, 0, 1, 2, 3, 4, 5, 6, 7, src, stride);
    }
}

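/* SAO band offset, 4 pixels wide. Each pixel's band is its value >> 3 (32
 * bands of width 8); the four offsets loaded from sao_offset_val[1..4] apply
 * to the four consecutive bands starting at sao_left_class. The offsets are
 * placed into a 32-entry table (offset0/offset1) indexed via vshf, and the
 * add uses saturating signed arithmetic with the usual +/-128 bias trick. */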
static void hevc_sao_band_filter_4width_msa(uint8_t *dst, int32_t dst_stride,
                                            uint8_t *src, int32_t src_stride,
                                            int32_t sao_left_class,
                                            int16_t *sao_offset_val,
                                            int32_t height)
{
    v16u8 src0, src1, src2, src3;
    v16i8 src0_r, src1_r;
    v16i8 offset, offset_val, mask;
    v16i8 dst0, offset0, offset1;
    v16i8 zero = { 0 };

    offset_val = LD_SB(sao_offset_val + 1);
    offset_val = (v16i8) __msa_pckev_d((v2i64) offset_val, (v2i64) offset_val);

    offset_val = __msa_pckev_b(offset_val, offset_val);
    offset1 = (v16i8) __msa_insve_w((v4i32) zero, 3, (v4i32) offset_val);
    offset0 = __msa_sld_b(offset1, zero, 28 - ((sao_left_class) & 31));
    offset1 = __msa_sld_b(zero, offset1, 28 - ((sao_left_class) & 31));

    /* load in advance. */
    LD_UB4(src, src_stride, src0, src1, src2, src3);

    if (!((sao_left_class > 12) & (sao_left_class < 29))) {
        SWAP(offset0, offset1);
    }

    for (height -= 4; height; height -= 4) {
        src += (4 * src_stride);

        ILVEV_D2_SB(src0, src1, src2, src3, src0_r, src1_r);

        src0_r = (v16i8) __msa_pckev_w((v4i32) src1_r, (v4i32) src0_r);
        mask = __msa_srli_b(src0_r, 3);
        offset = __msa_vshf_b(mask, offset1, offset0);

        src0_r = (v16i8) __msa_xori_b((v16u8) src0_r, 128);
        dst0 = __msa_adds_s_b(src0_r, offset);
        dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);

        /* load in advance. */
        LD_UB4(src, src_stride, src0, src1, src2, src3);

        /* store results */
        ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);
    }

    ILVEV_D2_SB(src0, src1, src2, src3, src0_r, src1_r);

    src0_r = (v16i8) __msa_pckev_w((v4i32) src1_r, (v4i32) src0_r);
    mask = __msa_srli_b(src0_r, 3);
    offset = __msa_vshf_b(mask, offset1, offset0);

    src0_r = (v16i8) __msa_xori_b((v16u8) src0_r, 128);
    dst0 = __msa_adds_s_b(src0_r, offset);
    dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);

    /* store results */
    ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
}

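/* 8-pixel-wide variant of the SAO band filter: identical offset-table setup,
 * but two rows are packed into each 16-byte vector so four rows are processed
 * per iteration. */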
static void hevc_sao_band_filter_8width_msa(uint8_t *dst, int32_t dst_stride,
                                            uint8_t *src, int32_t src_stride,
                                            int32_t sao_left_class,
                                            int16_t *sao_offset_val,
                                            int32_t height)
{
    v16u8 src0, src1, src2, src3;
    v16i8 src0_r, src1_r, mask0, mask1;
    v16i8 offset_mask0, offset_mask1, offset_val;
    v16i8 offset0, offset1, dst0, dst1;
    v16i8 zero = { 0 };

    offset_val = LD_SB(sao_offset_val + 1);
    offset_val = (v16i8) __msa_pckev_d((v2i64) offset_val, (v2i64) offset_val);
    offset_val = __msa_pckev_b(offset_val, offset_val);
    offset1 = (v16i8) __msa_insve_w((v4i32) zero, 3, (v4i32) offset_val);
    offset0 = __msa_sld_b(offset1, zero, 28 - ((sao_left_class) & 31));
    offset1 = __msa_sld_b(zero, offset1, 28 - ((sao_left_class) & 31));

    /* load in advance. */
    LD_UB4(src, src_stride, src0, src1, src2, src3);

    if (!((sao_left_class > 12) & (sao_left_class < 29))) {
        SWAP(offset0, offset1);
    }

    for (height -= 4; height; height -= 4) {
        src += src_stride << 2;

        ILVR_D2_SB(src1, src0, src3, src2, src0_r, src1_r);

        mask0 = __msa_srli_b(src0_r, 3);
        mask1 = __msa_srli_b(src1_r, 3);

        offset_mask0 = __msa_vshf_b(mask0, offset1, offset0);
        offset_mask1 = __msa_vshf_b(mask1, offset1, offset0);

        /* load in advance. */
        LD_UB4(src, src_stride, src0, src1, src2, src3);

        XORI_B2_128_SB(src0_r, src1_r);

        dst0 = __msa_adds_s_b(src0_r, offset_mask0);
        dst1 = __msa_adds_s_b(src1_r, offset_mask1);

        XORI_B2_128_SB(dst0, dst1);

        /* store results */
        ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
        dst += dst_stride << 2;
    }

    ILVR_D2_SB(src1, src0, src3, src2, src0_r, src1_r);

    mask0 = __msa_srli_b(src0_r, 3);
    mask1 = __msa_srli_b(src1_r, 3);

    offset_mask0 = __msa_vshf_b(mask0, offset1, offset0);
    offset_mask1 = __msa_vshf_b(mask1, offset1, offset0);

    XORI_B2_128_SB(src0_r, src1_r);

    dst0 = __msa_adds_s_b(src0_r, offset_mask0);
    dst1 = __msa_adds_s_b(src1_r, offset_mask1);

    XORI_B2_128_SB(dst0, dst1);

    /* store results */
    ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
}

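/* SAO band filter for widths that are multiples of 16: the offset table is
 * built once, then the block is walked in 16-byte columns, four rows at a
 * time, with the next column's loads issued before the current stores. */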
static void hevc_sao_band_filter_16multiple_msa(uint8_t *dst,
                                                int32_t dst_stride,
                                                uint8_t *src,
                                                int32_t src_stride,
                                                int32_t sao_left_class,
                                                int16_t *sao_offset_val,
                                                int32_t width, int32_t height)
{
    int32_t w_cnt;
    v16u8 src0, src1, src2, src3;
    v16i8 out0, out1, out2, out3;
    v16i8 mask0, mask1, mask2, mask3;
    v16i8 tmp0, tmp1, tmp2, tmp3, offset_val;
    v16i8 offset0, offset1;
    v16i8 zero = { 0 };

    offset_val = LD_SB(sao_offset_val + 1);
    offset_val = (v16i8) __msa_pckev_d((v2i64) offset_val, (v2i64) offset_val);
    offset_val = __msa_pckev_b(offset_val, offset_val);
    offset1 = (v16i8) __msa_insve_w((v4i32) zero, 3, (v4i32) offset_val);
    offset0 = __msa_sld_b(offset1, zero, 28 - ((sao_left_class) & 31));
    offset1 = __msa_sld_b(zero, offset1, 28 - ((sao_left_class) & 31));

    if (!((sao_left_class > 12) & (sao_left_class < 29))) {
        SWAP(offset0, offset1);
    }

    while (height > 0) {
        /* load in advance */
        LD_UB4(src, src_stride, src0, src1, src2, src3);

        for (w_cnt = 16; w_cnt < width; w_cnt += 16) {
            mask0 = __msa_srli_b((v16i8) src0, 3);
            mask1 = __msa_srli_b((v16i8) src1, 3);
            mask2 = __msa_srli_b((v16i8) src2, 3);
            mask3 = __msa_srli_b((v16i8) src3, 3);

            VSHF_B2_SB(offset0, offset1, offset0, offset1, mask0, mask1,
                       tmp0, tmp1);
            VSHF_B2_SB(offset0, offset1, offset0, offset1, mask2, mask3,
                       tmp2, tmp3);
            XORI_B4_128_UB(src0, src1, src2, src3);

            out0 = __msa_adds_s_b((v16i8) src0, tmp0);
            out1 = __msa_adds_s_b((v16i8) src1, tmp1);
            out2 = __msa_adds_s_b((v16i8) src2, tmp2);
            out3 = __msa_adds_s_b((v16i8) src3, tmp3);

            /* load for next iteration */
            LD_UB4(src + w_cnt, src_stride, src0, src1, src2, src3);

            XORI_B4_128_SB(out0, out1, out2, out3);

            ST_SB4(out0, out1, out2, out3, dst + w_cnt - 16, dst_stride);
        }

        mask0 = __msa_srli_b((v16i8) src0, 3);
        mask1 = __msa_srli_b((v16i8) src1, 3);
        mask2 = __msa_srli_b((v16i8) src2, 3);
        mask3 = __msa_srli_b((v16i8) src3, 3);

        VSHF_B2_SB(offset0, offset1, offset0, offset1, mask0, mask1, tmp0,
                   tmp1);
        VSHF_B2_SB(offset0, offset1, offset0, offset1, mask2, mask3, tmp2,
                   tmp3);
        XORI_B4_128_UB(src0, src1, src2, src3);

        out0 = __msa_adds_s_b((v16i8) src0, tmp0);
        out1 = __msa_adds_s_b((v16i8) src1, tmp1);
        out2 = __msa_adds_s_b((v16i8) src2, tmp2);
        out3 = __msa_adds_s_b((v16i8) src3, tmp3);

        XORI_B4_128_SB(out0, out1, out2, out3);

        ST_SB4(out0, out1, out2, out3, dst + w_cnt - 16, dst_stride);

        src += src_stride << 2;
        dst += dst_stride << 2;
        height -= 4;
    }
}

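/* SAO edge offset, class 0 (horizontal direction), 4 pixels wide. For each
 * pixel the left and right neighbours are compared to produce sign terms in
 * {-1, 0, +1}; their sum plus 2 indexes edge_idx, which in turn selects an
 * offset from the table loaded from sao_offset_val. Two rows are processed
 * per iteration. */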
1255static void hevc_sao_edge_filter_0degree_4width_msa(uint8_t *dst,
1256                                                    int32_t dst_stride,
1257                                                    uint8_t *src,
1258                                                    int32_t src_stride,
1259                                                    int16_t *sao_offset_val,
1260                                                    int32_t height)
1261{
1262    uint32_t dst_val0, dst_val1;
1263    v16u8 cmp_minus10, diff_minus10, diff_minus11, src_minus10, src_minus11;
1264    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1265    v16i8 sao_offset = LD_SB(sao_offset_val);
1266    v16i8 src_plus10, offset, src0, dst0;
1267    v16u8 const1 = (v16u8) __msa_ldi_b(1);
1268    v16i8 zero = { 0 };
1269
1270    sao_offset = __msa_pckev_b(sao_offset, sao_offset);
1271    src -= 1;
1272
1273    /* load in advance */
1274    LD_UB2(src, src_stride, src_minus10, src_minus11);
1275
1276    for (height -= 2; height; height -= 2) {
1277        src += (2 * src_stride);
1278
1279        src_minus10 = (v16u8) __msa_pckev_d((v2i64) src_minus11,
1280                                            (v2i64) src_minus10);
1281
1282        src0 = (v16i8) __msa_sldi_b(zero, (v16i8) src_minus10, 1);
1283        src_plus10 = (v16i8) __msa_sldi_b(zero, (v16i8) src_minus10, 2);
1284
1285        cmp_minus10 = ((v16u8) src0 == src_minus10);
1286        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1287        cmp_minus10 = (src_minus10 < (v16u8) src0);
1288        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1289
1290        cmp_minus10 = ((v16u8) src0 == (v16u8) src_plus10);
1291        diff_minus11 = __msa_nor_v(cmp_minus10, cmp_minus10);
1292        cmp_minus10 = ((v16u8) src_plus10 < (v16u8) src0);
1293        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus10);
1294
1295        offset = (v16i8) diff_minus10 + (v16i8) diff_minus11 + 2;
1296
1297        /* load in advance */
1298        LD_UB2(src, src_stride, src_minus10, src_minus11);
1299
1300        VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
1301                   offset, offset);
1302
1303        src0 = (v16i8) __msa_xori_b((v16u8) src0, 128);
1304        dst0 = __msa_adds_s_b(src0, offset);
1305        dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1306
1307        dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
1308        dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
1309        SW(dst_val0, dst);
1310        dst += dst_stride;
1311        SW(dst_val1, dst);
1312        dst += dst_stride;
1313    }
1314
1315    src_minus10 = (v16u8) __msa_pckev_d((v2i64) src_minus11,
1316                                        (v2i64) src_minus10);
1317
1318    src0 = (v16i8) __msa_sldi_b(zero, (v16i8) src_minus10, 1);
1319    src_plus10 = (v16i8) __msa_sldi_b(zero, (v16i8) src_minus10, 2);
1320
1321    cmp_minus10 = ((v16u8) src0 == src_minus10);
1322    diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1323    cmp_minus10 = (src_minus10 < (v16u8) src0);
1324    diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1325
1326    cmp_minus10 = ((v16u8) src0 == (v16u8) src_plus10);
1327    diff_minus11 = __msa_nor_v(cmp_minus10, cmp_minus10);
1328    cmp_minus10 = ((v16u8) src_plus10 < (v16u8) src0);
1329    diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus10);
1330
1331    offset = (v16i8) diff_minus10 + (v16i8) diff_minus11 + 2;
1332    VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
1333               offset, offset);
1334
1335    src0 = (v16i8) __msa_xori_b((v16u8) src0, 128);
1336    dst0 = __msa_adds_s_b(src0, offset);
1337    dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1338
1339    dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
1340    dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
1341
1342    SW(dst_val0, dst);
1343    dst += dst_stride;
1344    SW(dst_val1, dst);
1345}
1346
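/* SAO edge offset, 0 degree (horizontal) class, 8 pixels wide.
 * For each pixel c with left/right neighbours n0 and n1, the biased sign
 * sum 2 + sign(c - n0) + sign(c - n1) is remapped through edge_idx
 * ({ 1, 2, 0, 3, 4 }) and then used to pick a byte from the packed
 * sao_offset table; the offset is applied with a signed saturating add
 * around a +/-128 bias.  Two rows are processed per iteration and the
 * final two rows are handled after the loop. */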
1347static void hevc_sao_edge_filter_0degree_8width_msa(uint8_t *dst,
1348                                                    int32_t dst_stride,
1349                                                    uint8_t *src,
1350                                                    int32_t src_stride,
1351                                                    int16_t *sao_offset_val,
1352                                                    int32_t height)
1353{
1354    uint64_t dst_val0, dst_val1;
1355    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1356    v16u8 const1 = (v16u8) __msa_ldi_b(1);
1357    v16u8 cmp_minus10, diff_minus10, diff_minus11;
1358    v16u8 src0, src1, dst0, src_minus10, src_minus11, src_plus10, src_plus11;
1359    v16i8 offset, sao_offset = LD_SB(sao_offset_val);
1360    v16i8 zeros = { 0 };
1361
1362    sao_offset = __msa_pckev_b(sao_offset, sao_offset);
1363    src -= 1;
1364
1365    /* load in advance */
1366    LD_UB2(src, src_stride, src_minus10, src_minus11);
1367
1368    for (height -= 2; height; height -= 2) {
1369        src += (src_stride << 1);
1370
1371        SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 1, src0, src1);
        SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 2, src_plus10,
                   src_plus11);
1373
1374        PCKEV_D2_UB(src_minus11, src_minus10, src_plus11, src_plus10,
1375                    src_minus10, src_plus10);
1376        src0 = (v16u8) __msa_pckev_d((v2i64) src1, (v2i64) src0);
1377
1378        cmp_minus10 = (src0 == src_minus10);
1379        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1380        cmp_minus10 = (src_minus10 < src0);
1381        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1382
1383        cmp_minus10 = (src0 == src_plus10);
1384        diff_minus11 = __msa_nor_v(cmp_minus10, cmp_minus10);
1385        cmp_minus10 = (src_plus10 < src0);
1386        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus10);
1387
1388        offset = (v16i8) diff_minus10 + (v16i8) diff_minus11 + 2;
1389
1390        /* load in advance */
1391        LD_UB2(src, src_stride, src_minus10, src_minus11);
1392
1393        VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
1394                   offset, offset);
1395
1396        src0 = __msa_xori_b(src0, 128);
1397        dst0 = (v16u8) __msa_adds_s_b((v16i8) src0, offset);
1398        dst0 = __msa_xori_b(dst0, 128);
1399
1400        dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
1401        dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
1402        SD(dst_val0, dst);
1403        dst += dst_stride;
1404        SD(dst_val1, dst);
1405        dst += dst_stride;
1406    }
1407
1408    SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 1, src0, src1);
    SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 2, src_plus10,
               src_plus11);
1410
1411    PCKEV_D2_UB(src_minus11, src_minus10, src_plus11, src_plus10, src_minus10,
1412                src_plus10);
1413    src0 = (v16u8) __msa_pckev_d((v2i64) src1, (v2i64) src0);
1414
    cmp_minus10 = (src0 == src_minus10);
    diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
    cmp_minus10 = (src_minus10 < src0);
    diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);

    cmp_minus10 = (src0 == src_plus10);
    diff_minus11 = __msa_nor_v(cmp_minus10, cmp_minus10);
    cmp_minus10 = (src_plus10 < src0);
1423    diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus10);
1424
1425    offset = (v16i8) diff_minus10 + (v16i8) diff_minus11 + 2;
1426
1427    VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
1428               offset, offset);
1429
1430    src0 = __msa_xori_b(src0, 128);
1431    dst0 = (v16u8) __msa_adds_s_b((v16i8) src0, offset);
1432    dst0 = __msa_xori_b(dst0, 128);
1433
1434    dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
1435    dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
1436    SD(dst_val0, dst);
1437    dst += dst_stride;
1438    SD(dst_val1, dst);
1439}
1440
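/* SAO edge offset, 0 degree class, for widths that are multiples of 16:
 * the block is walked in 16-column tiles, four rows at a time. */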
1441static void hevc_sao_edge_filter_0degree_16multiple_msa(uint8_t *dst,
1442                                                        int32_t dst_stride,
1443                                                        uint8_t *src,
1444                                                        int32_t src_stride,
1445                                                        int16_t *sao_offset_val,
1446                                                        int32_t width,
1447                                                        int32_t height)
1448{
1449    uint8_t *dst_ptr, *src_minus1;
1450    int32_t v_cnt;
1451    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1452    v16u8 const1 = (v16u8) __msa_ldi_b(1);
1453    v16i8 sao_offset;
1454    v16u8 cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
1455    v16u8 cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
1456    v16u8 diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
1457    v16u8 diff_plus13;
1458    v16u8 src10, src11, src12, src13, dst0, dst1, dst2, dst3;
1459    v16u8 src_minus10, src_minus11, src_minus12, src_minus13;
1460    v16i8 offset_mask0, offset_mask1, offset_mask2, offset_mask3;
1461    v16i8 src_zero0, src_zero1, src_zero2, src_zero3;
1462    v16i8 src_plus10, src_plus11, src_plus12, src_plus13;
1463
1464    sao_offset = LD_SB(sao_offset_val);
1465    sao_offset = __msa_pckev_b(sao_offset, sao_offset);
1466
1467    for (; height; height -= 4) {
1468        src_minus1 = src - 1;
1469        LD_UB4(src_minus1, src_stride,
1470               src_minus10, src_minus11, src_minus12, src_minus13);
1471
1472        for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
1473            src_minus1 += 16;
1474            dst_ptr = dst + v_cnt;
1475            LD_UB4(src_minus1, src_stride, src10, src11, src12, src13);
1476
1477            SLDI_B4_SB(src10, src_minus10, src11, src_minus11,
1478                       src12, src_minus12, src13, src_minus13, 1,
1479                       src_zero0, src_zero1, src_zero2, src_zero3);
1480            SLDI_B4_SB(src10, src_minus10, src11, src_minus11,
1481                       src12, src_minus12, src13, src_minus13, 2,
1482                       src_plus10, src_plus11, src_plus12, src_plus13);
1483
1484            cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
1485            cmp_plus10 = ((v16u8) src_zero0 == (v16u8) src_plus10);
1486            cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
1487            cmp_plus11 = ((v16u8) src_zero1 == (v16u8) src_plus11);
1488            cmp_minus12 = ((v16u8) src_zero2 == src_minus12);
1489            cmp_plus12 = ((v16u8) src_zero2 == (v16u8) src_plus12);
1490            cmp_minus13 = ((v16u8) src_zero3 == src_minus13);
1491            cmp_plus13 = ((v16u8) src_zero3 == (v16u8) src_plus13);
1492
1493            diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1494            diff_plus10 = __msa_nor_v(cmp_plus10, cmp_plus10);
1495            diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
1496            diff_plus11 = __msa_nor_v(cmp_plus11, cmp_plus11);
1497            diff_minus12 = __msa_nor_v(cmp_minus12, cmp_minus12);
1498            diff_plus12 = __msa_nor_v(cmp_plus12, cmp_plus12);
1499            diff_minus13 = __msa_nor_v(cmp_minus13, cmp_minus13);
1500            diff_plus13 = __msa_nor_v(cmp_plus13, cmp_plus13);
1501
1502            cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
1503            cmp_plus10 = ((v16u8) src_plus10 < (v16u8) src_zero0);
1504            cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
1505            cmp_plus11 = ((v16u8) src_plus11 < (v16u8) src_zero1);
1506            cmp_minus12 = (src_minus12 < (v16u8) src_zero2);
1507            cmp_plus12 = ((v16u8) src_plus12 < (v16u8) src_zero2);
1508            cmp_minus13 = (src_minus13 < (v16u8) src_zero3);
1509            cmp_plus13 = ((v16u8) src_plus13 < (v16u8) src_zero3);
1510
1511            diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1512            diff_plus10 = __msa_bmnz_v(diff_plus10, const1, cmp_plus10);
1513            diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
1514            diff_plus11 = __msa_bmnz_v(diff_plus11, const1, cmp_plus11);
1515            diff_minus12 = __msa_bmnz_v(diff_minus12, const1, cmp_minus12);
1516            diff_plus12 = __msa_bmnz_v(diff_plus12, const1, cmp_plus12);
1517            diff_minus13 = __msa_bmnz_v(diff_minus13, const1, cmp_minus13);
1518            diff_plus13 = __msa_bmnz_v(diff_plus13, const1, cmp_plus13);
1519
1520            offset_mask0 = 2 + (v16i8) diff_minus10 + (v16i8) diff_plus10;
1521            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask0,
1522                       offset_mask0, offset_mask0, offset_mask0);
1523            offset_mask1 = 2 + (v16i8) diff_minus11 + (v16i8) diff_plus11;
1524            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask1,
1525                       offset_mask1, offset_mask1, offset_mask1);
1526            offset_mask2 = 2 + (v16i8) diff_minus12 + (v16i8) diff_plus12;
1527            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask2,
1528                       offset_mask2, offset_mask2, offset_mask2);
1529            offset_mask3 = 2 + (v16i8) diff_minus13 + (v16i8) diff_plus13;
1530            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask3,
1531                       offset_mask3, offset_mask3, offset_mask3);
1532
1533            XORI_B4_128_SB(src_zero0, src_zero1, src_zero2, src_zero3);
1534
1535            dst0 = (v16u8) __msa_adds_s_b((v16i8) src_zero0, offset_mask0);
1536            dst1 = (v16u8) __msa_adds_s_b((v16i8) src_zero1, offset_mask1);
1537            dst2 = (v16u8) __msa_adds_s_b((v16i8) src_zero2, offset_mask2);
1538            dst3 = (v16u8) __msa_adds_s_b((v16i8) src_zero3, offset_mask3);
1539
1540            XORI_B4_128_UB(dst0, dst1, dst2, dst3);
1541
1542            src_minus10 = src10;
1543            ST_UB(dst0, dst_ptr);
1544            src_minus11 = src11;
1545            ST_UB(dst1, dst_ptr + dst_stride);
1546            src_minus12 = src12;
1547            ST_UB(dst2, dst_ptr + (dst_stride << 1));
1548            src_minus13 = src13;
1549            ST_UB(dst3, dst_ptr + (dst_stride * 3));
1550        }
1551
1552        src += (src_stride << 2);
1553        dst += (dst_stride << 2);
1554    }
1555}
1556
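/* SAO edge offset, 90 degree (vertical) class, 4 pixels wide.
 * The above and below neighbours are interleaved pairwise and the centre
 * pixels duplicated; the two per-pixel sign flags are then summed with a
 * halfword horizontal add, biased by 2 and remapped through edge_idx. */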
1557static void hevc_sao_edge_filter_90degree_4width_msa(uint8_t *dst,
1558                                                     int32_t dst_stride,
1559                                                     uint8_t *src,
1560                                                     int32_t src_stride,
1561                                                     int16_t *sao_offset_val,
1562                                                     int32_t height)
1563{
1564    uint32_t dst_val0, dst_val1;
1565    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1566    v16u8 const1 = (v16u8) __msa_ldi_b(1);
1567    v16i8 dst0;
1568    v16i8 sao_offset = LD_SB(sao_offset_val);
1569    v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
1570    v16u8 src_minus10, src_minus11, src10, src11;
1571    v16i8 src_zero0, src_zero1;
1572    v16i8 offset;
1573    v8i16 offset_mask0, offset_mask1;
1574
1575    sao_offset = __msa_pckev_b(sao_offset, sao_offset);
1576
1577    /* load in advance */
1578    LD_UB2(src - src_stride, src_stride, src_minus10, src_minus11);
1579    LD_UB2(src + src_stride, src_stride, src10, src11);
1580
1581    for (height -= 2; height; height -= 2) {
1582        src += (src_stride << 1);
1583
1584        src_minus10 = (v16u8) __msa_ilvr_b((v16i8) src10, (v16i8) src_minus10);
1585        src_zero0 = __msa_ilvr_b((v16i8) src_minus11, (v16i8) src_minus11);
1586        src_minus11 = (v16u8) __msa_ilvr_b((v16i8) src11, (v16i8) src_minus11);
1587        src_zero1 = __msa_ilvr_b((v16i8) src10, (v16i8) src10);
1588
1589        cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
1590        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1591        cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
1592        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1593
1594        cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
1595        diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
1596        cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
1597        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
1598
1599        offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
1600        offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
1601
1602        offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
1603        dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
1604
1605        VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
1606                   offset, offset);
1607
1608        dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1609        dst0 = __msa_adds_s_b(dst0, offset);
1610        dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1611
1612        src_minus10 = src10;
1613        src_minus11 = src11;
1614
1615        /* load in advance */
1616        LD_UB2(src + src_stride, src_stride, src10, src11);
1617
1618        dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
1619        dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
1620        SW(dst_val0, dst);
1621        dst += dst_stride;
1622        SW(dst_val1, dst);
1623
1624        dst += dst_stride;
1625    }
1626
1627    src_minus10 = (v16u8) __msa_ilvr_b((v16i8) src10, (v16i8) src_minus10);
1628    src_zero0 = __msa_ilvr_b((v16i8) src_minus11, (v16i8) src_minus11);
1629    src_minus11 = (v16u8) __msa_ilvr_b((v16i8) src11, (v16i8) src_minus11);
1630    src_zero1 = __msa_ilvr_b((v16i8) src10, (v16i8) src10);
1631
1632    cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
1633    diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1634    cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
1635    diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1636
1637    cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
1638    diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
1639    cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
1640    diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
1641
1642    offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
1643    offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
1644
1645    offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
1646    dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
1647
1648    VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset,
1649               offset, offset, offset);
1650
1651    dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1652    dst0 = __msa_adds_s_b(dst0, offset);
1653    dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1654
1655    dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
1656    dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
1657    SW(dst_val0, dst);
1658    dst += dst_stride;
1659    SW(dst_val1, dst);
1660}
1661
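/* SAO edge offset, 90 degree (vertical) class, 8 pixels wide; same scheme
 * as the 4-wide version but storing 8 bytes per row. */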
1662static void hevc_sao_edge_filter_90degree_8width_msa(uint8_t *dst,
1663                                                     int32_t dst_stride,
1664                                                     uint8_t *src,
1665                                                     int32_t src_stride,
1666                                                     int16_t *sao_offset_val,
1667                                                     int32_t height)
1668{
1669    uint64_t dst_val0, dst_val1;
1670    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1671    v16u8 const1 = (v16u8) __msa_ldi_b(1);
1672    v16i8 offset, sao_offset = LD_SB(sao_offset_val);
1673    v16i8 src_zero0, src_zero1, dst0;
1674    v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
1675    v16u8 src_minus10, src_minus11, src10, src11;
1676    v8i16 offset_mask0, offset_mask1;
1677
1678    sao_offset = __msa_pckev_b(sao_offset, sao_offset);
1679
1680    /* load in advance */
1681    LD_UB2(src - src_stride, src_stride, src_minus10, src_minus11);
1682    LD_UB2(src + src_stride, src_stride, src10, src11);
1683
1684    for (height -= 2; height; height -= 2) {
1685        src += (src_stride << 1);
1686
1687        src_minus10 = (v16u8) __msa_ilvr_b((v16i8) src10, (v16i8) src_minus10);
1688        src_zero0 = __msa_ilvr_b((v16i8) src_minus11, (v16i8) src_minus11);
1689        src_minus11 = (v16u8) __msa_ilvr_b((v16i8) src11, (v16i8) src_minus11);
1690        src_zero1 = __msa_ilvr_b((v16i8) src10, (v16i8) src10);
1691
1692        cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
1693        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1694        cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
1695        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1696
1697        cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
1698        diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
1699        cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
1700        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
1701
1702        offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
1703        offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
1704
1705        offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
1706        dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
1707
1708        VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset,
1709                   offset, offset, offset);
1710
1711        dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1712        dst0 = __msa_adds_s_b(dst0, offset);
1713        dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1714
1715        src_minus10 = src10;
1716        src_minus11 = src11;
1717
1718        /* load in advance */
1719        LD_UB2(src + src_stride, src_stride, src10, src11);
1720
1721        dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
1722        dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
1723        SD(dst_val0, dst);
1724        dst += dst_stride;
1725        SD(dst_val1, dst);
1726        dst += dst_stride;
1727    }
1728
1729    src_minus10 = (v16u8) __msa_ilvr_b((v16i8) src10, (v16i8) src_minus10);
1730    src_zero0 = __msa_ilvr_b((v16i8) src_minus11, (v16i8) src_minus11);
1731    src_minus11 = (v16u8) __msa_ilvr_b((v16i8) src11, (v16i8) src_minus11);
1732    src_zero1 = __msa_ilvr_b((v16i8) src10, (v16i8) src10);
1733
1734    cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
1735    diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1736    cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
1737    diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1738
1739    cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
1740    diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
1741    cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
1742    diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
1743
1744    offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
1745    offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
1746
1747    offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
1748    dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
1749
1750    VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
1751               offset, offset);
1752
1753    dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1754    dst0 = __msa_adds_s_b(dst0, offset);
1755    dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1756
1757    dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
1758    dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
1759    SD(dst_val0, dst);
1760    dst += dst_stride;
1761    SD(dst_val1, dst);
1762}
1763
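/* SAO edge offset, 90 degree class, for widths that are multiples of 16:
 * each 16-column tile is processed top to bottom, four rows per pass,
 * carrying the last two fetched rows over to the next pass. */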
1764static void hevc_sao_edge_filter_90degree_16multiple_msa(uint8_t *dst,
1765                                                         int32_t dst_stride,
1766                                                         uint8_t *src,
1767                                                         int32_t src_stride,
1768                                                         int16_t *
1769                                                         sao_offset_val,
1770                                                         int32_t width,
1771                                                         int32_t height)
1772{
1773    uint8_t *src_orig = src;
1774    uint8_t *dst_orig = dst;
1775    int32_t h_cnt, v_cnt;
1776    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1777    v16u8 const1 = (v16u8) __msa_ldi_b(1);
1778    v16u8 cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
1779    v16u8 cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
1780    v16u8 diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
1781    v16u8 diff_plus13;
1782    v16u8 src10, src_minus10, dst0, src11, src_minus11, dst1;
1783    v16u8 src12, dst2, src13, dst3;
1784    v16i8 offset_mask0, offset_mask1, offset_mask2, offset_mask3, sao_offset;
1785
1786    sao_offset = LD_SB(sao_offset_val);
1787    sao_offset = __msa_pckev_b(sao_offset, sao_offset);
1788
1789    for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
1790        src = src_orig + v_cnt;
1791        dst = dst_orig + v_cnt;
1792
1793        LD_UB2(src - src_stride, src_stride, src_minus10, src_minus11);
1794
1795        for (h_cnt = (height >> 2); h_cnt--;) {
1796            LD_UB4(src + src_stride, src_stride, src10, src11, src12, src13);
1797
1798            cmp_minus10 = (src_minus11 == src_minus10);
1799            cmp_plus10 = (src_minus11 == src10);
1800            cmp_minus11 = (src10 == src_minus11);
1801            cmp_plus11 = (src10 == src11);
1802            cmp_minus12 = (src11 == src10);
1803            cmp_plus12 = (src11 == src12);
1804            cmp_minus13 = (src12 == src11);
1805            cmp_plus13 = (src12 == src13);
1806
1807            diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1808            diff_plus10 = __msa_nor_v(cmp_plus10, cmp_plus10);
1809            diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
1810            diff_plus11 = __msa_nor_v(cmp_plus11, cmp_plus11);
1811            diff_minus12 = __msa_nor_v(cmp_minus12, cmp_minus12);
1812            diff_plus12 = __msa_nor_v(cmp_plus12, cmp_plus12);
1813            diff_minus13 = __msa_nor_v(cmp_minus13, cmp_minus13);
1814            diff_plus13 = __msa_nor_v(cmp_plus13, cmp_plus13);
1815
1816            cmp_minus10 = (src_minus10 < src_minus11);
1817            cmp_plus10 = (src10 < src_minus11);
1818            cmp_minus11 = (src_minus11 < src10);
1819            cmp_plus11 = (src11 < src10);
1820            cmp_minus12 = (src10 < src11);
1821            cmp_plus12 = (src12 < src11);
1822            cmp_minus13 = (src11 < src12);
1823            cmp_plus13 = (src13 < src12);
1824
1825            diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1826            diff_plus10 = __msa_bmnz_v(diff_plus10, const1, cmp_plus10);
1827            diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
1828            diff_plus11 = __msa_bmnz_v(diff_plus11, const1, cmp_plus11);
1829            diff_minus12 = __msa_bmnz_v(diff_minus12, const1, cmp_minus12);
1830            diff_plus12 = __msa_bmnz_v(diff_plus12, const1, cmp_plus12);
1831            diff_minus13 = __msa_bmnz_v(diff_minus13, const1, cmp_minus13);
1832            diff_plus13 = __msa_bmnz_v(diff_plus13, const1, cmp_plus13);
1833
1834            offset_mask0 = 2 + (v16i8) diff_minus10 + (v16i8) diff_plus10;
1835            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
1836                       offset_mask0, offset_mask0, offset_mask0, offset_mask0);
1837            offset_mask1 = 2 + (v16i8) diff_minus11 + (v16i8) diff_plus11;
1838            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
1839                       offset_mask1, offset_mask1, offset_mask1, offset_mask1);
1840            offset_mask2 = 2 + (v16i8) diff_minus12 + (v16i8) diff_plus12;
1841            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
1842                       offset_mask2, offset_mask2, offset_mask2, offset_mask2);
1843            offset_mask3 = 2 + (v16i8) diff_minus13 + (v16i8) diff_plus13;
1844            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
1845                       offset_mask3, offset_mask3, offset_mask3, offset_mask3);
1846
1847            src_minus10 = src12;
1848            XORI_B4_128_UB(src_minus11, src10, src11, src12);
1849
1850            dst0 = (v16u8) __msa_adds_s_b((v16i8) src_minus11, offset_mask0);
1851            dst1 = (v16u8) __msa_adds_s_b((v16i8) src10, offset_mask1);
1852            dst2 = (v16u8) __msa_adds_s_b((v16i8) src11, offset_mask2);
1853            dst3 = (v16u8) __msa_adds_s_b((v16i8) src12, offset_mask3);
1854
1855            XORI_B4_128_UB(dst0, dst1, dst2, dst3);
1856            src_minus11 = src13;
1857
1858            ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
1859
1860            src += (src_stride << 2);
1861            dst += (dst_stride << 2);
1862        }
1863    }
1864}
1865
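/* SAO edge offset, 45 degree diagonal variant, 4 pixels wide: each pixel
 * at (row, col) is compared against its (row - 1, col - 1) and
 * (row + 1, col + 1) neighbours, two rows per iteration. */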
1866static void hevc_sao_edge_filter_45degree_4width_msa(uint8_t *dst,
1867                                                     int32_t dst_stride,
1868                                                     uint8_t *src,
1869                                                     int32_t src_stride,
1870                                                     int16_t *sao_offset_val,
1871                                                     int32_t height)
1872{
1873    uint8_t *src_orig;
1874    uint32_t dst_val0, dst_val1;
1875    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1876    v16u8 const1 = (v16u8) __msa_ldi_b(1);
1877    v16i8 offset, sao_offset = LD_SB(sao_offset_val);
1878    v16u8 cmp_minus10, diff_minus10, src_minus10, cmp_minus11, diff_minus11;
1879    v16u8 src_minus11, src10, src11;
1880    v16i8 src_plus0, src_zero0, src_plus1, src_zero1, dst0;
1881    v8i16 offset_mask0, offset_mask1;
1882    v16i8 zeros = { 0 };
1883
1884    sao_offset = __msa_pckev_b(sao_offset, sao_offset);
1885
1886    src_orig = src - 1;
1887
1888    /* load in advance */
1889    LD_UB2(src_orig - src_stride, src_stride, src_minus10, src_minus11);
1890    LD_UB2(src_orig + src_stride, src_stride, src10, src11);
1891
1892    for (height -= 2; height; height -= 2) {
1893        src_orig += (src_stride << 1);
1894
1895        SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1);
1896        SLDI_B2_SB(zeros, src10, zeros, src11, 2, src_plus0, src_plus1);
1897
1898        ILVR_B2_UB(src_plus0, src_minus10, src_plus1, src_minus11, src_minus10,
1899                   src_minus11);
1900        ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
1901                   src_zero1);
1902
1903        cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
1904        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1905        cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
1906        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1907
1908        cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
1909        diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
1910        cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
1911        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
1912
1913        offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
1914        offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
1915
1916        offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
1917        dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
1918
1919        VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset,
1920                   offset, offset, offset);
1921
1922        dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1923        dst0 = __msa_adds_s_b(dst0, offset);
1924        dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1925
1926        src_minus10 = src10;
1927        src_minus11 = src11;
1928
1929        /* load in advance */
1930        LD_UB2(src_orig + src_stride, src_stride, src10, src11);
1931
1932        dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
1933        dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
1934        SW(dst_val0, dst);
1935        dst += dst_stride;
1936        SW(dst_val1, dst);
1937
1938        dst += dst_stride;
1939    }
1940
1941    SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1);
1942    SLDI_B2_SB(zeros, src10, zeros, src11, 2, src_plus0, src_plus1);
1943
1944    ILVR_B2_UB(src_plus0, src_minus10, src_plus1, src_minus11, src_minus10,
1945               src_minus11);
1946    ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
1947               src_zero1);
1948
1949    cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
1950    diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1951    cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
1952    diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1953
1954    cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
1955    diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
1956    cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
1957    diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
1958
1959    offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
1960    offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
1961
1962    offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
1963    dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
1964
1965    VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
1966               offset, offset);
1967
1968    dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1969    dst0 = __msa_adds_s_b(dst0, offset);
1970    dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1971
1972    dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
1973    dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
1974    SW(dst_val0, dst);
1975    dst += dst_stride;
1976    SW(dst_val1, dst);
1977}
1978
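/* SAO edge offset, 45 degree diagonal variant, 8 pixels wide; same
 * neighbour pattern as the 4-wide version. */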
1979static void hevc_sao_edge_filter_45degree_8width_msa(uint8_t *dst,
1980                                                     int32_t dst_stride,
1981                                                     uint8_t *src,
1982                                                     int32_t src_stride,
1983                                                     int16_t *sao_offset_val,
1984                                                     int32_t height)
1985{
1986    uint8_t *src_orig;
1987    uint64_t dst_val0, dst_val1;
1988    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1989    v16u8 const1 = (v16u8) __msa_ldi_b(1);
1990    v16i8 offset, sao_offset = LD_SB(sao_offset_val);
1991    v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
1992    v16u8 src_minus10, src10, src_minus11, src11;
1993    v16i8 src_zero0, src_plus10, src_zero1, src_plus11, dst0;
1994    v8i16 offset_mask0, offset_mask1;
1995    v16i8 zeros = { 0 };
1996
1997    sao_offset = __msa_pckev_b(sao_offset, sao_offset);
1998    src_orig = src - 1;
1999
2000    /* load in advance */
2001    LD_UB2(src_orig - src_stride, src_stride, src_minus10, src_minus11);
2002    LD_UB2(src_orig + src_stride, src_stride, src10, src11);
2003
2004    for (height -= 2; height; height -= 2) {
2005        src_orig += (src_stride << 1);
2006
2007        SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1);
2008        SLDI_B2_SB(zeros, src10, zeros, src11, 2, src_plus10, src_plus11);
2009
2010        ILVR_B2_UB(src_plus10, src_minus10, src_plus11, src_minus11,
2011                   src_minus10, src_minus11);
2012        ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1,
2013                   src_zero0, src_zero1);
2014
2015        cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
2016        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
2017        cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
2018        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
2019
2020        cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
2021        diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
2022        cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
2023        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
2024
2025        offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
2026        offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
2027
2028        offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
2029        dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
2030
2031        VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
2032                   offset, offset);
2033
2034        dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
2035        dst0 = __msa_adds_s_b(dst0, offset);
2036        dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
2037
2038        src_minus10 = src10;
2039        src_minus11 = src11;
2040
2041        /* load in advance */
2042        LD_UB2(src_orig + src_stride, src_stride, src10, src11);
2043
2044        dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
2045        dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
2046        SD(dst_val0, dst);
2047        dst += dst_stride;
2048        SD(dst_val1, dst);
2049        dst += dst_stride;
2050    }
2051
2052    SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1);
2053    SLDI_B2_SB(zeros, src10, zeros, src11, 2, src_plus10, src_plus11);
2054    ILVR_B2_UB(src_plus10, src_minus10, src_plus11, src_minus11, src_minus10,
2055               src_minus11);
2056    ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
2057               src_zero1);
2058
2059    cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
2060    diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
2061    cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
2062    diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
2063
2064    cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
2065    diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
2066    cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
2067    diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
2068
2069    offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
2070    offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
2071
2072    offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
2073    dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
2074
2075    VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
2076               offset, offset);
2077
2078    dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
2079    dst0 = __msa_adds_s_b(dst0, offset);
2080    dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
2081
2082    src_minus10 = src10;
2083    src_minus11 = src11;
2084
2085    /* load in advance */
2086    LD_UB2(src_orig + src_stride, src_stride, src10, src11);
2087
2088    dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
2089    dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
2090    SD(dst_val0, dst);
2091    dst += dst_stride;
2092    SD(dst_val1, dst);
2093}
2094
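/* SAO edge offset, 45 degree diagonal variant, widths multiple of 16:
 * four rows per pass, with the rows just above and below the strip
 * supplying the diagonal neighbours of the first and last rows. */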
2095static void hevc_sao_edge_filter_45degree_16multiple_msa(uint8_t *dst,
2096                                                         int32_t dst_stride,
2097                                                         uint8_t *src,
2098                                                         int32_t src_stride,
2099                                                         int16_t *
2100                                                         sao_offset_val,
2101                                                         int32_t width,
2102                                                         int32_t height)
2103{
2104    uint8_t *src_orig = src;
2105    uint8_t *dst_orig = dst;
2106    int32_t v_cnt;
2107    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
2108    v16u8 const1 = (v16u8) __msa_ldi_b(1);
2109    v16u8 cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
2110    v16u8 cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
2111    v16u8 diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
2112    v16u8 diff_plus13, src_minus14, src_plus13;
2113    v16i8 offset_mask0, offset_mask1, offset_mask2, offset_mask3;
2114    v16u8 src10, src_minus10, dst0, src11, src_minus11, dst1;
2115    v16u8 src12, src_minus12, dst2, src13, src_minus13, dst3;
2116    v16i8 src_zero0, src_plus10, src_zero1, src_plus11, src_zero2, src_plus12;
2117    v16i8 src_zero3, sao_offset;
2118
2119    sao_offset = LD_SB(sao_offset_val);
2120    sao_offset = __msa_pckev_b(sao_offset, sao_offset);
2121
2122    for (; height; height -= 4) {
2123        src_orig = src - 1;
2124        dst_orig = dst;
2125        LD_UB4(src_orig, src_stride, src_minus11, src_minus12, src_minus13,
2126               src_minus14);
2127
2128        for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
2129            src_minus10 = LD_UB(src_orig - src_stride);
2130            LD_UB4(src_orig + 16, src_stride, src10, src11, src12, src13);
2131            src_plus13 = LD_UB(src + 1 + v_cnt + (src_stride << 2));
2132            src_orig += 16;
2133
2134            SLDI_B4_SB(src10, src_minus11, src11, src_minus12,
2135                       src12, src_minus13, src13, src_minus14, 1,
2136                       src_zero0, src_zero1, src_zero2, src_zero3);
2137            SLDI_B2_SB(src11, src_minus12, src12, src_minus13, 2, src_plus10,
2138                       src_plus11);
2139
2140            src_plus12 = __msa_sldi_b((v16i8) src13, (v16i8) src_minus14, 2);
2141
2142            cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
2143            cmp_plus10 = ((v16u8) src_zero0 == (v16u8) src_plus10);
2144            cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
2145            cmp_plus11 = ((v16u8) src_zero1 == (v16u8) src_plus11);
2146            cmp_minus12 = ((v16u8) src_zero2 == src_minus12);
2147            cmp_plus12 = ((v16u8) src_zero2 == (v16u8) src_plus12);
2148            cmp_minus13 = ((v16u8) src_zero3 == src_minus13);
2149            cmp_plus13 = ((v16u8) src_zero3 == src_plus13);
2150
2151            diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
2152            diff_plus10 = __msa_nor_v(cmp_plus10, cmp_plus10);
2153            diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
2154            diff_plus11 = __msa_nor_v(cmp_plus11, cmp_plus11);
2155            diff_minus12 = __msa_nor_v(cmp_minus12, cmp_minus12);
2156            diff_plus12 = __msa_nor_v(cmp_plus12, cmp_plus12);
2157            diff_minus13 = __msa_nor_v(cmp_minus13, cmp_minus13);
2158            diff_plus13 = __msa_nor_v(cmp_plus13, cmp_plus13);
2159
2160            cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
2161            cmp_plus10 = ((v16u8) src_plus10 < (v16u8) src_zero0);
2162            cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
2163            cmp_plus11 = ((v16u8) src_plus11 < (v16u8) src_zero1);
2164            cmp_minus12 = (src_minus12 < (v16u8) src_zero2);
2165            cmp_plus12 = ((v16u8) src_plus12 < (v16u8) src_zero2);
2166            cmp_minus13 = (src_minus13 < (v16u8) src_zero3);
2167            cmp_plus13 = (src_plus13 < (v16u8) src_zero3);
2168
2169            diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
2170            diff_plus10 = __msa_bmnz_v(diff_plus10, const1, cmp_plus10);
2171            diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
2172            diff_plus11 = __msa_bmnz_v(diff_plus11, const1, cmp_plus11);
2173            diff_minus12 = __msa_bmnz_v(diff_minus12, const1, cmp_minus12);
2174            diff_plus12 = __msa_bmnz_v(diff_plus12, const1, cmp_plus12);
2175            diff_minus13 = __msa_bmnz_v(diff_minus13, const1, cmp_minus13);
2176            diff_plus13 = __msa_bmnz_v(diff_plus13, const1, cmp_plus13);
2177
2178            offset_mask0 = 2 + (v16i8) diff_minus10 + (v16i8) diff_plus10;
2179            offset_mask1 = 2 + (v16i8) diff_minus11 + (v16i8) diff_plus11;
2180            offset_mask2 = 2 + (v16i8) diff_minus12 + (v16i8) diff_plus12;
2181            offset_mask3 = 2 + (v16i8) diff_minus13 + (v16i8) diff_plus13;
2182
2183            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
2184                       offset_mask0, offset_mask0, offset_mask0, offset_mask0);
2185            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
2186                       offset_mask1, offset_mask1, offset_mask1, offset_mask1);
2187            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
2188                       offset_mask2, offset_mask2, offset_mask2, offset_mask2);
2189            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
2190                       offset_mask3, offset_mask3, offset_mask3, offset_mask3);
2191
2192            XORI_B4_128_SB(src_zero0, src_zero1, src_zero2, src_zero3);
2193
2194            dst0 = (v16u8) __msa_adds_s_b((v16i8) src_zero0, offset_mask0);
2195            dst1 = (v16u8) __msa_adds_s_b((v16i8) src_zero1, offset_mask1);
2196            dst2 = (v16u8) __msa_adds_s_b((v16i8) src_zero2, offset_mask2);
2197            dst3 = (v16u8) __msa_adds_s_b((v16i8) src_zero3, offset_mask3);
2198
2199            XORI_B4_128_UB(dst0, dst1, dst2, dst3);
2200
2201            src_minus11 = src10;
2202            src_minus12 = src11;
2203            src_minus13 = src12;
2204            src_minus14 = src13;
2205
2206            ST_UB4(dst0, dst1, dst2, dst3, dst_orig, dst_stride);
2207            dst_orig += 16;
2208        }
2209
2210        src += (src_stride << 2);
2211        dst += (dst_stride << 2);
2212    }
2213}
2214
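/* SAO edge offset, 135 degree diagonal variant, 4 pixels wide: each pixel
 * at (row, col) is compared against its (row - 1, col + 1) and
 * (row + 1, col - 1) neighbours, two rows per iteration. */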
2215static void hevc_sao_edge_filter_135degree_4width_msa(uint8_t *dst,
2216                                                      int32_t dst_stride,
2217                                                      uint8_t *src,
2218                                                      int32_t src_stride,
2219                                                      int16_t *sao_offset_val,
2220                                                      int32_t height)
2221{
2222    uint8_t *src_orig;
2223    uint32_t dst_val0, dst_val1;
2224    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
2225    v16u8 const1 = (v16u8) __msa_ldi_b(1);
2226    v16i8 offset, sao_offset = LD_SB(sao_offset_val);
2227    v16i8 src_zero0, src_zero1, dst0;
2228    v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
2229    v16u8 src_minus10, src10, src_minus11, src11;
2230    v8i16 offset_mask0, offset_mask1;
2231    v16i8 zeros = { 0 };
2232
2233    sao_offset = __msa_pckev_b(sao_offset, sao_offset);
2234    src_orig = src - 1;
2235
2236    /* load in advance */
2237    LD_UB2(src_orig - src_stride, src_stride, src_minus10, src_minus11);
2238    LD_UB2(src_orig + src_stride, src_stride, src10, src11);
2239
2240    for (height -= 2; height; height -= 2) {
2241        src_orig += (src_stride << 1);
2242
2243        SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1);
        SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 2, src_minus10,
                   src_minus11);
2245
2246        ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10,
2247                   src_minus11);
2248        ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
2249                   src_zero1);
2250
2251        cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
2252        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
2253        cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
2254        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
2255
2256        cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
2257        diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
2258        cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
2259        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
2260
2261        offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
2262        offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
2263
2264        offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
2265        dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
2266
2267        VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
2268                   offset, offset);
2269
2270        dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
2271        dst0 = __msa_adds_s_b(dst0, offset);
2272        dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
2273
2274        src_minus10 = src10;
2275        src_minus11 = src11;
2276
2277        /* load in advance */
2278        LD_UB2(src_orig + src_stride, src_stride, src10, src11);
2279
2280        dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
2281        dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
2282
2283        SW(dst_val0, dst);
2284        dst += dst_stride;
2285        SW(dst_val1, dst);
2286
2287        dst += dst_stride;
2288    }
2289
2290    SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1);
    SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 2, src_minus10,
               src_minus11);
2292
2293    ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10,
2294               src_minus11);
2295    ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
2296               src_zero1);
2297
2298    cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
2299    diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
2300    cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
2301    diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
2302
2303    cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
2304    diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
2305    cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
2306    diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
2307
2308    offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
2309    offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
2310
2311    offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
2312    dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
2313
2314    VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
2315               offset, offset);
2316
2317    dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
2318    dst0 = __msa_adds_s_b(dst0, offset);
2319    dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
2320
2321    dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
2322    dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
2323
2324    SW(dst_val0, dst);
2325    dst += dst_stride;
2326    SW(dst_val1, dst);
2327    dst += dst_stride;
2328}
2329
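/* SAO edge offset, 135 degree diagonal variant, 8 pixels wide; same
 * neighbour pattern as the 4-wide version. */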
2330static void hevc_sao_edge_filter_135degree_8width_msa(uint8_t *dst,
2331                                                      int32_t dst_stride,
2332                                                      uint8_t *src,
2333                                                      int32_t src_stride,
2334                                                      int16_t *sao_offset_val,
2335                                                      int32_t height)
2336{
2337    uint8_t *src_orig;
2338    uint64_t dst_val0, dst_val1;
2339    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
2340    v16u8 const1 = (v16u8) __msa_ldi_b(1);
2341    v16i8 offset, sao_offset = LD_SB(sao_offset_val);
2342    v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
2343    v16u8 src_minus10, src10, src_minus11, src11;
2344    v16i8 src_zero0, src_zero1, dst0;
2345    v8i16 offset_mask0, offset_mask1;
2346    v16i8 zeros = { 0 };
2347
2348    sao_offset = __msa_pckev_b(sao_offset, sao_offset);
2349    src_orig = src - 1;
2350
2351    /* load in advance */
2352    LD_UB2(src_orig - src_stride, src_stride, src_minus10, src_minus11);
2353    LD_UB2(src_orig + src_stride, src_stride, src10, src11);
2354
2355    for (height -= 2; height; height -= 2) {
2356        src_orig += (src_stride << 1);
2357
2358        SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1);
        SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 2, src_minus10,
                   src_minus11);
2360        ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10,
2361                   src_minus11);
2362        ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
2363                   src_zero1);
2364
2365        cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
2366        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
2367        cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
2368        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
2369
2370        cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
2371        diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
2372        cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
2373        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
2374
2375        offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
2376        offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
2377
2378        offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
2379        dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
2380
2381        VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
2382                   offset, offset);
2383
2384        dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
2385        dst0 = __msa_adds_s_b(dst0, offset);
2386        dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
2387
2388        src_minus10 = src10;
2389        src_minus11 = src11;
2390
2391        /* load in advance */
2392        LD_UB2(src_orig + src_stride, src_stride, src10, src11);
2393
2394        dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
2395        dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
2396
2397        SD(dst_val0, dst);
2398        dst += dst_stride;
2399        SD(dst_val1, dst);
2400        dst += dst_stride;
2401    }
2402
2403    SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1);
    SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 2, src_minus10,
               src_minus11);
2405    ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10,
2406               src_minus11);
2407    ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
2408               src_zero1);
2409
2410    cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
2411    diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
2412    cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
2413    diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
2414
2415    cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
2416    diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
2417    cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
2418    diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
2419
2420    offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
2421    offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
2422
2423    offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
2424    dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
2425
2426    VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
2427               offset, offset);
2428
2429    dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
2430    dst0 = __msa_adds_s_b(dst0, offset);
2431    dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
2432
2433    dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
2434    dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
2435
2436    SD(dst_val0, dst);
2437    dst += dst_stride;
2438    SD(dst_val1, dst);
2439    dst += dst_stride;
2440}
2441
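/* SAO edge offset, 135 degree diagonal variant, widths multiple of 16:
 * four rows per pass, with the rows just above and below the strip
 * supplying the diagonal neighbours of the first and last rows. */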
2442static void hevc_sao_edge_filter_135degree_16multiple_msa(uint8_t *dst,
2443                                                          int32_t dst_stride,
2444                                                          uint8_t *src,
2445                                                          int32_t src_stride,
2446                                                          int16_t *
2447                                                          sao_offset_val,
2448                                                          int32_t width,
2449                                                          int32_t height)
2450{
2451    uint8_t *src_orig, *dst_orig;
2452    int32_t v_cnt;
2453    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
2454    v16u8 const1 = (v16u8) __msa_ldi_b(1);
2455    v16u8 dst0, dst1, dst2, dst3;
2456    v16u8 cmp_minus10, cmp_minus11, cmp_minus12, cmp_minus13, cmp_plus10;
2457    v16u8 cmp_plus11, cmp_plus12, cmp_plus13, diff_minus10, diff_minus11;
2458    v16u8 diff_minus12, diff_minus13, diff_plus10, diff_plus11, diff_plus12;
2459    v16u8 diff_plus13, src10, src11, src12, src13, src_minus10, src_minus11;
2460    v16u8 src_plus10, src_plus11, src_plus12, src_plus13;
2461    v16i8 src_minus12, src_minus13, src_zero0, src_zero1, src_zero2, src_zero3;
2462    v16i8 offset_mask0, offset_mask1, offset_mask2, offset_mask3, sao_offset;
2463
2464    sao_offset = LD_SB(sao_offset_val);
2465    sao_offset = __msa_pckev_b(sao_offset, sao_offset);
2466
2467    for (; height; height -= 4) {
2468        src_orig = src - 1;
2469        dst_orig = dst;
2470
2471        LD_UB4(src_orig, src_stride, src_minus11, src_plus10, src_plus11,
2472               src_plus12);
2473
2474        for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
2475            src_minus10 = LD_UB(src_orig + 2 - src_stride);
2476            LD_UB4(src_orig + 16, src_stride, src10, src11, src12, src13);
2477            src_plus13 = LD_UB(src_orig + (src_stride << 2));
2478            src_orig += 16;
2479
2480            src_zero0 = __msa_sldi_b((v16i8) src10, (v16i8) src_minus11, 1);
2481            cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
2482            cmp_plus10 = ((v16u8) src_zero0 == src_plus10);
2483
2484            src_zero1 = __msa_sldi_b((v16i8) src11, (v16i8) src_plus10, 1);
2485            src_minus11 = (v16u8) __msa_sldi_b((v16i8) src10,
2486                                               (v16i8) src_minus11, 2);
2487            cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
2488            cmp_plus11 = ((v16u8) src_zero1 == src_plus11);
2489
2490            src_zero2 = __msa_sldi_b((v16i8) src12, (v16i8) src_plus11, 1);
2491            src_minus12 = __msa_sldi_b((v16i8) src11, (v16i8) src_plus10, 2);
2492            cmp_minus12 = ((v16u8) src_zero2 == (v16u8) src_minus12);
2493            cmp_plus12 = ((v16u8) src_zero2 == src_plus12);
2494
2495            src_zero3 = __msa_sldi_b((v16i8) src13, (v16i8) src_plus12, 1);
2496            src_minus13 = __msa_sldi_b((v16i8) src12, (v16i8) src_plus11, 2);
2497            cmp_minus13 = ((v16u8) src_zero3 == (v16u8) src_minus13);
2498            cmp_plus13 = ((v16u8) src_zero3 == src_plus13);
2499
            diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
            diff_plus10 = __msa_nor_v(cmp_plus10, cmp_plus10);
            diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
            diff_plus11 = __msa_nor_v(cmp_plus11, cmp_plus11);
            diff_minus12 = __msa_nor_v(cmp_minus12, cmp_minus12);
            diff_plus12 = __msa_nor_v(cmp_plus12, cmp_plus12);
            diff_minus13 = __msa_nor_v(cmp_minus13, cmp_minus13);
            diff_plus13 = __msa_nor_v(cmp_plus13, cmp_plus13);

            cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
            cmp_plus10 = (src_plus10 < (v16u8) src_zero0);
            cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
            cmp_plus11 = (src_plus11 < (v16u8) src_zero1);
            cmp_minus12 = ((v16u8) src_minus12 < (v16u8) src_zero2);
            cmp_plus12 = (src_plus12 < (v16u8) src_zero2);
            cmp_minus13 = ((v16u8) src_minus13 < (v16u8) src_zero3);
            cmp_plus13 = (src_plus13 < (v16u8) src_zero3);

            diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
            diff_plus10 = __msa_bmnz_v(diff_plus10, const1, cmp_plus10);
            diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
            diff_plus11 = __msa_bmnz_v(diff_plus11, const1, cmp_plus11);
            diff_minus12 = __msa_bmnz_v(diff_minus12, const1, cmp_minus12);
            diff_plus12 = __msa_bmnz_v(diff_plus12, const1, cmp_plus12);
            diff_minus13 = __msa_bmnz_v(diff_minus13, const1, cmp_minus13);
            diff_plus13 = __msa_bmnz_v(diff_plus13, const1, cmp_plus13);

            offset_mask0 = 2 + (v16i8) diff_minus10 + (v16i8) diff_plus10;
            offset_mask1 = 2 + (v16i8) diff_minus11 + (v16i8) diff_plus11;
            offset_mask2 = 2 + (v16i8) diff_minus12 + (v16i8) diff_plus12;
            offset_mask3 = 2 + (v16i8) diff_minus13 + (v16i8) diff_plus13;

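            /* offset_mask now holds the edge class (0..4); remap it through
             * the edge_idx table and then through the packed SAO offsets. */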
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
                       offset_mask0, offset_mask0, offset_mask0, offset_mask0);
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
                       offset_mask1, offset_mask1, offset_mask1, offset_mask1);
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
                       offset_mask2, offset_mask2, offset_mask2, offset_mask2);
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
                       offset_mask3, offset_mask3, offset_mask3, offset_mask3);

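            /* Apply the offsets with a signed saturating add: bias the
             * pixels by 128 (XOR with 128), add, then remove the bias. */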
            XORI_B4_128_SB(src_zero0, src_zero1, src_zero2, src_zero3);

            dst0 = (v16u8) __msa_adds_s_b((v16i8) src_zero0, offset_mask0);
            dst1 = (v16u8) __msa_adds_s_b((v16i8) src_zero1, offset_mask1);
            dst2 = (v16u8) __msa_adds_s_b((v16i8) src_zero2, offset_mask2);
            dst3 = (v16u8) __msa_adds_s_b((v16i8) src_zero3, offset_mask3);

            XORI_B4_128_UB(dst0, dst1, dst2, dst3);

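            /* The current right-hand loads become the left-hand context of
             * the next 16-pixel column. */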
            src_minus11 = src10;
            src_plus10 = src11;
            src_plus11 = src12;
            src_plus12 = src13;

            ST_UB4(dst0, dst1, dst2, dst3, dst_orig, dst_stride);
            dst_orig += 16;
        }

        src += (src_stride << 2);
        dst += (dst_stride << 2);
    }
}

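/* Exported entry points: thin wrappers around the static MSA kernels in this
 * file. */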
void ff_hevc_loop_filter_luma_h_8_msa(uint8_t *src,
                                      ptrdiff_t src_stride,
                                      int32_t beta, int32_t *tc,
                                      uint8_t *no_p, uint8_t *no_q)
{
    hevc_loopfilter_luma_hor_msa(src, src_stride, beta, tc, no_p, no_q);
}

void ff_hevc_loop_filter_luma_v_8_msa(uint8_t *src,
                                      ptrdiff_t src_stride,
                                      int32_t beta, int32_t *tc,
                                      uint8_t *no_p, uint8_t *no_q)
{
    hevc_loopfilter_luma_ver_msa(src, src_stride, beta, tc, no_p, no_q);
}

void ff_hevc_loop_filter_chroma_h_8_msa(uint8_t *src,
                                        ptrdiff_t src_stride,
                                        int32_t *tc, uint8_t *no_p,
                                        uint8_t *no_q)
{
    hevc_loopfilter_chroma_hor_msa(src, src_stride, tc, no_p, no_q);
}

void ff_hevc_loop_filter_chroma_v_8_msa(uint8_t *src,
                                        ptrdiff_t src_stride,
                                        int32_t *tc, uint8_t *no_p,
                                        uint8_t *no_q)
{
    hevc_loopfilter_chroma_ver_msa(src, src_stride, tc, no_p, no_q);
}

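/* SAO band filter dispatch: handle the largest multiple-of-16 part of the
 * width first, then an 8-pixel column, then any remaining 4-pixel column. */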
void ff_hevc_sao_band_filter_0_8_msa(uint8_t *dst, uint8_t *src,
                                     ptrdiff_t stride_dst, ptrdiff_t stride_src,
                                     int16_t *sao_offset_val, int sao_left_class,
                                     int width, int height)
{
    if (width >> 4) {
        hevc_sao_band_filter_16multiple_msa(dst, stride_dst, src, stride_src,
                                            sao_left_class, sao_offset_val,
                                            width - (width % 16), height);
        dst += width - (width % 16);
        src += width - (width % 16);
        width %= 16;
    }

    if (width >> 3) {
        hevc_sao_band_filter_8width_msa(dst, stride_dst, src, stride_src,
                                        sao_left_class, sao_offset_val, height);
        dst += 8;
        src += 8;
        width %= 8;
    }

    if (width) {
        hevc_sao_band_filter_4width_msa(dst, stride_dst, src, stride_src,
                                        sao_left_class, sao_offset_val, height);
    }
}

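/* SAO edge filter dispatch: eo selects the direction (0: 0-degree,
 * 1: 90-degree, 2: 45-degree, 3: 135-degree); each case splits the width the
 * same way as the band filter above. */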
void ff_hevc_sao_edge_filter_8_msa(uint8_t *dst, uint8_t *src,
                                   ptrdiff_t stride_dst,
                                   int16_t *sao_offset_val,
                                   int eo, int width, int height)
{
    ptrdiff_t stride_src = (2 * MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE) / sizeof(uint8_t);

    switch (eo) {
    case 0:
        if (width >> 4) {
            hevc_sao_edge_filter_0degree_16multiple_msa(dst, stride_dst,
                                                        src, stride_src,
                                                        sao_offset_val,
                                                        width - (width % 16),
                                                        height);
            dst += width - (width % 16);
            src += width - (width % 16);
            width %= 16;
        }

        if (width >> 3) {
            hevc_sao_edge_filter_0degree_8width_msa(dst, stride_dst,
                                                    src, stride_src,
                                                    sao_offset_val, height);
            dst += 8;
            src += 8;
            width %= 8;
        }

        if (width) {
            hevc_sao_edge_filter_0degree_4width_msa(dst, stride_dst,
                                                    src, stride_src,
                                                    sao_offset_val, height);
        }
        break;

    case 1:
        if (width >> 4) {
            hevc_sao_edge_filter_90degree_16multiple_msa(dst, stride_dst,
                                                         src, stride_src,
                                                         sao_offset_val,
                                                         width - (width % 16),
                                                         height);
            dst += width - (width % 16);
            src += width - (width % 16);
            width %= 16;
        }

        if (width >> 3) {
            hevc_sao_edge_filter_90degree_8width_msa(dst, stride_dst,
                                                     src, stride_src,
                                                     sao_offset_val, height);
            dst += 8;
            src += 8;
            width %= 8;
        }

        if (width) {
            hevc_sao_edge_filter_90degree_4width_msa(dst, stride_dst,
                                                     src, stride_src,
                                                     sao_offset_val, height);
        }
        break;

    case 2:
        if (width >> 4) {
            hevc_sao_edge_filter_45degree_16multiple_msa(dst, stride_dst,
                                                         src, stride_src,
                                                         sao_offset_val,
                                                         width - (width % 16),
                                                         height);
            dst += width - (width % 16);
            src += width - (width % 16);
            width %= 16;
        }

        if (width >> 3) {
            hevc_sao_edge_filter_45degree_8width_msa(dst, stride_dst,
                                                     src, stride_src,
                                                     sao_offset_val, height);
            dst += 8;
            src += 8;
            width %= 8;
        }

        if (width) {
            hevc_sao_edge_filter_45degree_4width_msa(dst, stride_dst,
                                                     src, stride_src,
                                                     sao_offset_val, height);
        }
        break;

    case 3:
        if (width >> 4) {
            hevc_sao_edge_filter_135degree_16multiple_msa(dst, stride_dst,
                                                          src, stride_src,
                                                          sao_offset_val,
                                                          width - (width % 16),
                                                          height);
            dst += width - (width % 16);
            src += width - (width % 16);
            width %= 16;
        }

        if (width >> 3) {
            hevc_sao_edge_filter_135degree_8width_msa(dst, stride_dst,
                                                      src, stride_src,
                                                      sao_offset_val, height);
            dst += 8;
            src += 8;
            width %= 8;
        }

        if (width) {
            hevc_sao_edge_filter_135degree_4width_msa(dst, stride_dst,
                                                      src, stride_src,
                                                      sao_offset_val, height);
        }
        break;
    }
}