1/*
2 * Copyright (c) 2015 - 2017 Shivraj Patil (Shivraj.Patil@imgtec.com)
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21#include "libavcodec/vp9dsp.h"
22#include "libavutil/mips/generic_macros_msa.h"
23#include "vp9dsp_mips.h"
24
25#define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in,  \
26                           p1_out, p0_out, q0_out, q1_out)               \
27{                                                                        \
28    v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt, filt1, filt2;         \
29    const v16i8 cnst4b = __msa_ldi_b(4);                                 \
30    const v16i8 cnst3b = __msa_ldi_b(3);                                 \
31                                                                         \
32    p1_m = (v16i8) __msa_xori_b(p1_in, 0x80);                            \
33    p0_m = (v16i8) __msa_xori_b(p0_in, 0x80);                            \
34    q0_m = (v16i8) __msa_xori_b(q0_in, 0x80);                            \
35    q1_m = (v16i8) __msa_xori_b(q1_in, 0x80);                            \
36                                                                         \
37    filt = __msa_subs_s_b(p1_m, q1_m);                                   \
38                                                                         \
39    filt = filt & (v16i8) hev_in;                                        \
40                                                                         \
41    q0_sub_p0 = __msa_subs_s_b(q0_m, p0_m);                              \
42    filt = __msa_adds_s_b(filt, q0_sub_p0);                              \
43    filt = __msa_adds_s_b(filt, q0_sub_p0);                              \
44    filt = __msa_adds_s_b(filt, q0_sub_p0);                              \
45    filt = filt & (v16i8) mask_in;                                       \
46                                                                         \
47    filt1 = __msa_adds_s_b(filt, cnst4b);                                \
48    filt1 >>= 3;                                                         \
49                                                                         \
50    filt2 = __msa_adds_s_b(filt, cnst3b);                                \
51    filt2 >>= 3;                                                         \
52                                                                         \
53    q0_m = __msa_subs_s_b(q0_m, filt1);                                  \
54    q0_out = __msa_xori_b((v16u8) q0_m, 0x80);                           \
55    p0_m = __msa_adds_s_b(p0_m, filt2);                                  \
56    p0_out = __msa_xori_b((v16u8) p0_m, 0x80);                           \
57                                                                         \
58    filt = __msa_srari_b(filt1, 1);                                      \
59    hev_in = __msa_xori_b((v16u8) hev_in, 0xff);                         \
60    filt = filt & (v16i8) hev_in;                                        \
61                                                                         \
62    q1_m = __msa_subs_s_b(q1_m, filt);                                   \
63    q1_out = __msa_xori_b((v16u8) q1_m, 0x80);                           \
64    p1_m = __msa_adds_s_b(p1_m, filt);                                   \
65    p1_out = __msa_xori_b((v16u8) p1_m, 0x80);                           \
66}
67
68#define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out)  \
69{                                                                      \
70    v16u8 tmp, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0;     \
71    v16u8 zero_in = { 0 };                                             \
72                                                                       \
73    tmp = __msa_ori_b(zero_in, 1);                                     \
74    p2_a_sub_p0 = __msa_asub_u_b(p2_in, p0_in);                        \
75    q2_a_sub_q0 = __msa_asub_u_b(q2_in, q0_in);                        \
76    p3_a_sub_p0 = __msa_asub_u_b(p3_in, p0_in);                        \
77    q3_a_sub_q0 = __msa_asub_u_b(q3_in, q0_in);                        \
78                                                                       \
79    p2_a_sub_p0 = __msa_max_u_b(p2_a_sub_p0, q2_a_sub_q0);             \
80    flat_out = __msa_max_u_b(p2_a_sub_p0, flat_out);                   \
81    p3_a_sub_p0 = __msa_max_u_b(p3_a_sub_p0, q3_a_sub_q0);             \
82    flat_out = __msa_max_u_b(p3_a_sub_p0, flat_out);                   \
83                                                                       \
84    flat_out = (tmp < (v16u8) flat_out);                               \
85    flat_out = __msa_xori_b(flat_out, 0xff);                           \
86    flat_out = flat_out & (mask);                                      \
87}
88
89#define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in,  \
90                  q5_in, q6_in, q7_in, flat_in, flat2_out)          \
91{                                                                   \
92    v16u8 tmp, zero_in = { 0 };                                     \
93    v16u8 p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0;       \
94    v16u8 p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0;       \
95                                                                    \
96    tmp = __msa_ori_b(zero_in, 1);                                  \
97    p4_a_sub_p0 = __msa_asub_u_b(p4_in, p0_in);                     \
98    q4_a_sub_q0 = __msa_asub_u_b(q4_in, q0_in);                     \
99    p5_a_sub_p0 = __msa_asub_u_b(p5_in, p0_in);                     \
100    q5_a_sub_q0 = __msa_asub_u_b(q5_in, q0_in);                     \
101    p6_a_sub_p0 = __msa_asub_u_b(p6_in, p0_in);                     \
102    q6_a_sub_q0 = __msa_asub_u_b(q6_in, q0_in);                     \
103    p7_a_sub_p0 = __msa_asub_u_b(p7_in, p0_in);                     \
104    q7_a_sub_q0 = __msa_asub_u_b(q7_in, q0_in);                     \
105                                                                    \
106    p4_a_sub_p0 = __msa_max_u_b(p4_a_sub_p0, q4_a_sub_q0);          \
107    flat2_out = __msa_max_u_b(p5_a_sub_p0, q5_a_sub_q0);            \
108    flat2_out = __msa_max_u_b(p4_a_sub_p0, flat2_out);              \
109    p6_a_sub_p0 = __msa_max_u_b(p6_a_sub_p0, q6_a_sub_q0);          \
110    flat2_out = __msa_max_u_b(p6_a_sub_p0, flat2_out);              \
111    p7_a_sub_p0 = __msa_max_u_b(p7_a_sub_p0, q7_a_sub_q0);          \
112    flat2_out = __msa_max_u_b(p7_a_sub_p0, flat2_out);              \
113                                                                    \
114    flat2_out = (tmp < (v16u8) flat2_out);                          \
115    flat2_out = __msa_xori_b(flat2_out, 0xff);                      \
116    flat2_out = flat2_out & flat_in;                                \
117}
118
119#define VP9_FILTER8(p3_in, p2_in, p1_in, p0_in,                \
120                    q0_in, q1_in, q2_in, q3_in,                \
121                    p2_filt8_out, p1_filt8_out, p0_filt8_out,  \
122                    q0_filt8_out, q1_filt8_out, q2_filt8_out)  \
123{                                                              \
124    v8u16 tmp0, tmp1, tmp2;                                    \
125                                                               \
126    tmp2 = p2_in + p1_in + p0_in;                              \
127    tmp0 = p3_in << 1;                                         \
128                                                               \
129    tmp0 = tmp0 + tmp2 + q0_in;                                \
130    tmp1 = tmp0 + p3_in + p2_in;                               \
131    p2_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3);     \
132                                                               \
133    tmp1 = tmp0 + p1_in + q1_in;                               \
134    p1_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3);     \
135                                                               \
136    tmp1 = q2_in + q1_in + q0_in;                              \
137    tmp2 = tmp2 + tmp1;                                        \
138    tmp0 = tmp2 + (p0_in);                                     \
139    tmp0 = tmp0 + (p3_in);                                     \
140    p0_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp0, 3);     \
141                                                               \
142    tmp0 = q2_in + q3_in;                                      \
143    tmp0 = p0_in + tmp1 + tmp0;                                \
144    tmp1 = q3_in + q3_in;                                      \
145    tmp1 = tmp1 + tmp0;                                        \
146    q2_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3);     \
147                                                               \
148    tmp0 = tmp2 + q3_in;                                       \
149    tmp1 = tmp0 + q0_in;                                       \
150    q0_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3);     \
151                                                               \
152    tmp1 = tmp0 - p2_in;                                       \
153    tmp0 = q1_in + q3_in;                                      \
154    tmp1 = tmp0 + tmp1;                                        \
155    q1_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3);     \
156}
157
158#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in,                   \
159                     q0_in, q1_in, q2_in, q3_in,                   \
160                     limit_in, b_limit_in, thresh_in,              \
161                     hev_out, mask_out, flat_out)                  \
162{                                                                  \
163    v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m;  \
164    v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m;  \
165                                                                   \
166    /* absolute subtraction of pixel values */                     \
167    p3_asub_p2_m = __msa_asub_u_b(p3_in, p2_in);                   \
168    p2_asub_p1_m = __msa_asub_u_b(p2_in, p1_in);                   \
169    p1_asub_p0_m = __msa_asub_u_b(p1_in, p0_in);                   \
170    q1_asub_q0_m = __msa_asub_u_b(q1_in, q0_in);                   \
171    q2_asub_q1_m = __msa_asub_u_b(q2_in, q1_in);                   \
172    q3_asub_q2_m = __msa_asub_u_b(q3_in, q2_in);                   \
173    p0_asub_q0_m = __msa_asub_u_b(p0_in, q0_in);                   \
174    p1_asub_q1_m = __msa_asub_u_b(p1_in, q1_in);                   \
175                                                                   \
176    /* calculation of hev */                                       \
177    flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m);          \
178    hev_out = thresh_in < (v16u8) flat_out;                        \
179                                                                   \
180    /* calculation of mask */                                      \
181    p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m);     \
182    p1_asub_q1_m >>= 1;                                            \
183    p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m);     \
184                                                                   \
185    mask_out = b_limit_in < p0_asub_q0_m;                          \
186    mask_out = __msa_max_u_b(flat_out, mask_out);                  \
187    p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m);      \
188    mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out);              \
189    q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m);      \
190    mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out);              \
191                                                                   \
192    mask_out = limit_in < (v16u8) mask_out;                        \
193    mask_out = __msa_xori_b(mask_out, 0xff);                       \
194}
195
196void ff_loop_filter_v_4_8_msa(uint8_t *src, ptrdiff_t pitch,
197                              int32_t b_limit_ptr,
198                              int32_t limit_ptr,
199                              int32_t thresh_ptr)
200{
201    uint64_t p1_d, p0_d, q0_d, q1_d;
202    v16u8 mask, hev, flat, thresh, b_limit, limit;
203    v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out;
204
205    /* load vector elements */
206    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
207
208    thresh = (v16u8) __msa_fill_b(thresh_ptr);
209    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
210    limit = (v16u8) __msa_fill_b(limit_ptr);
211
212    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
213                 hev, mask, flat);
214    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
215                       q1_out);
216
217    p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
218    p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
219    q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
220    q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
221    SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
222}
223
224
225void ff_loop_filter_v_44_16_msa(uint8_t *src, ptrdiff_t pitch,
226                                int32_t b_limit_ptr,
227                                int32_t limit_ptr,
228                                int32_t thresh_ptr)
229{
230    v16u8 mask, hev, flat, thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
231    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
232
233    /* load vector elements */
234    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
235
236    thresh0 = (v16u8) __msa_fill_b(thresh_ptr);
237    thresh1 = (v16u8) __msa_fill_b(thresh_ptr >> 8);
238    thresh0 = (v16u8) __msa_ilvr_d((v2i64) thresh1, (v2i64) thresh0);
239
240    b_limit0 = (v16u8) __msa_fill_b(b_limit_ptr);
241    b_limit1 = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
242    b_limit0 = (v16u8) __msa_ilvr_d((v2i64) b_limit1, (v2i64) b_limit0);
243
244    limit0 = (v16u8) __msa_fill_b(limit_ptr);
245    limit1 = (v16u8) __msa_fill_b(limit_ptr >> 8);
246    limit0 = (v16u8) __msa_ilvr_d((v2i64) limit1, (v2i64) limit0);
247
248    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
249                 hev, mask, flat);
250    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
251
252    ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch);
253}
254
255void ff_loop_filter_v_8_8_msa(uint8_t *src, ptrdiff_t pitch,
256                              int32_t b_limit_ptr,
257                              int32_t limit_ptr,
258                              int32_t thresh_ptr)
259{
260    uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
261    v16u8 mask, hev, flat, thresh, b_limit, limit;
262    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
263    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
264    v8i16 p2_filter8, p1_filter8, p0_filter8;
265    v8i16 q0_filter8, q1_filter8, q2_filter8;
266    v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
267    v16i8 zero = { 0 };
268
269    /* load vector elements */
270    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
271
272    thresh = (v16u8) __msa_fill_b(thresh_ptr);
273    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
274    limit = (v16u8) __msa_fill_b(limit_ptr);
275
276    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
277                 hev, mask, flat);
278    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
279    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
280                       q1_out);
281
282    flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
283
284    /* if flat is zero for all pixels, then no need to calculate other filter */
285    if (__msa_test_bz_v(flat)) {
286        p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
287        p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
288        q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
289        q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
290        SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
291    } else {
292        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
293                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
294                   q2_r, q3_r);
295        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8,
296                    p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);
297
298        /* convert 16 bit output data into 8 bit */
299        PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8,
300                    zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8,
301                    q0_filter8);
302        PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);
303
304        /* store pixel values */
305        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filter8, flat);
306        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filter8, flat);
307        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filter8, flat);
308        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filter8, flat);
309        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filter8, flat);
310        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filter8, flat);
311
312        p2_d = __msa_copy_u_d((v2i64) p2_out, 0);
313        p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
314        p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
315        q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
316        q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
317        q2_d = __msa_copy_u_d((v2i64) q2_out, 0);
318
319        src -= 3 * pitch;
320
321        SD4(p2_d, p1_d, p0_d, q0_d, src, pitch);
322        src += (4 * pitch);
323        SD(q1_d, src);
324        src += pitch;
325        SD(q2_d, src);
326    }
327}
328
329void ff_loop_filter_v_88_16_msa(uint8_t *src, ptrdiff_t pitch,
330                                int32_t b_limit_ptr,
331                                int32_t limit_ptr,
332                                int32_t thresh_ptr)
333{
334    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
335    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
336    v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
337    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
338    v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
339    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
340    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
341    v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
342    v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
343    v16u8 zero = { 0 };
344
345    /* load vector elements */
346    LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
347
348    thresh = (v16u8) __msa_fill_b(thresh_ptr);
349    tmp = (v16u8) __msa_fill_b(thresh_ptr >> 8);
350    thresh = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) thresh);
351
352    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
353    tmp = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
354    b_limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) b_limit);
355
356    limit = (v16u8) __msa_fill_b(limit_ptr);
357    tmp = (v16u8) __msa_fill_b(limit_ptr >> 8);
358    limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) limit);
359
360    /* mask and hev */
361    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
362                 hev, mask, flat);
363    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
364    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
365                       q1_out);
366
367    /* if flat is zero for all pixels, then no need to calculate other filter */
368    if (__msa_test_bz_v(flat)) {
369        ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
370    } else {
371        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
372                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
373                   q2_r, q3_r);
374        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
375                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
376
377        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
378                   p0_l);
379        ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
380                   q3_l);
381        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
382                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
383
384        /* convert 16 bit output data into 8 bit */
385        PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
386                    p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
387                    p0_filt8_r, q0_filt8_r);
388        PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r,
389                    q1_filt8_r, q2_filt8_r);
390
391        /* store pixel values */
392        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
393        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
394        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
395        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
396        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
397        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
398
399        src -= 3 * pitch;
400
401        ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
402        src += (4 * pitch);
403        ST_UB2(q1_out, q2_out, src, pitch);
404        src += (2 * pitch);
405    }
406}
407
408void ff_loop_filter_v_84_16_msa(uint8_t *src, ptrdiff_t pitch,
409                                int32_t b_limit_ptr,
410                                int32_t limit_ptr,
411                                int32_t thresh_ptr)
412{
413    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
414    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
415    v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
416    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
417    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
418    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
419    v16u8 zero = { 0 };
420
421    /* load vector elements */
422    LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
423
424    thresh = (v16u8) __msa_fill_b(thresh_ptr);
425    tmp = (v16u8) __msa_fill_b(thresh_ptr >> 8);
426    thresh = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) thresh);
427
428    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
429    tmp = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
430    b_limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) b_limit);
431
432    limit = (v16u8) __msa_fill_b(limit_ptr);
433    tmp = (v16u8) __msa_fill_b(limit_ptr >> 8);
434    limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) limit);
435
436    /* mask and hev */
437    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
438                 hev, mask, flat);
439    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
440    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
441                       q1_out);
442
443    flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
444
445    /* if flat is zero for all pixels, then no need to calculate other filter */
446    if (__msa_test_bz_v(flat)) {
447        ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
448    } else {
449        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
450                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
451                   q2_r, q3_r);
452        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
453                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
454
455        /* convert 16 bit output data into 8 bit */
456        PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r,
457                    p0_filt8_r, p0_filt8_r, q0_filt8_r, q0_filt8_r,
458                    p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r);
459        PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r,
460                    q1_filt8_r, q2_filt8_r);
461
462        /* store pixel values */
463        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
464        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
465        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
466        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
467        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
468        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
469
470        src -= 3 * pitch;
471
472        ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
473        src += (4 * pitch);
474        ST_UB2(q1_out, q2_out, src, pitch);
475        src += (2 * pitch);
476    }
477}
478
479void ff_loop_filter_v_48_16_msa(uint8_t *src, ptrdiff_t pitch,
480                                int32_t b_limit_ptr,
481                                int32_t limit_ptr,
482                                int32_t thresh_ptr)
483{
484    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
485    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
486    v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
487    v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
488    v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
489    v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
490    v16u8 zero = { 0 };
491
492    /* load vector elements */
493    LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
494
495    thresh = (v16u8) __msa_fill_b(thresh_ptr);
496    tmp = (v16u8) __msa_fill_b(thresh_ptr >> 8);
497    thresh = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) thresh);
498
499    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
500    tmp = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
501    b_limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) b_limit);
502
503    limit = (v16u8) __msa_fill_b(limit_ptr);
504    tmp = (v16u8) __msa_fill_b(limit_ptr >> 8);
505    limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) limit);
506
507    /* mask and hev */
508    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
509                 hev, mask, flat);
510    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
511    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
512                       q1_out);
513
514    flat = (v16u8) __msa_insve_d((v2i64) flat, 0, (v2i64) zero);
515
516    /* if flat is zero for all pixels, then no need to calculate other filter */
517    if (__msa_test_bz_v(flat)) {
518        ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
519    } else {
520        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
521                   p0_l);
522        ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
523                   q3_l);
524        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
525                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
526
527        /* convert 16 bit output data into 8 bit */
528        PCKEV_B4_SH(p2_filt8_l, p2_filt8_l, p1_filt8_l, p1_filt8_l,
529                    p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l,
530                    p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l);
531        PCKEV_B2_SH(q1_filt8_l, q1_filt8_l, q2_filt8_l, q2_filt8_l,
532                    q1_filt8_l, q2_filt8_l);
533
534        /* store pixel values */
535        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_l, flat);
536        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_l, flat);
537        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_l, flat);
538        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_l, flat);
539        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_l, flat);
540        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_l, flat);
541
542        src -= 3 * pitch;
543
544        ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
545        src += (4 * pitch);
546        ST_UB2(q1_out, q2_out, src, pitch);
547        src += (2 * pitch);
548    }
549}
550
551static int32_t vp9_hz_lpf_t4_and_t8_16w(uint8_t *src, ptrdiff_t pitch,
552                                        uint8_t *filter48,
553                                        int32_t b_limit_ptr,
554                                        int32_t limit_ptr,
555                                        int32_t thresh_ptr)
556{
557    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
558    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
559    v16u8 flat, mask, hev, thresh, b_limit, limit;
560    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
561    v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
562    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
563    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
564    v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
565    v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
566    v16u8 zero = { 0 };
567
568    /* load vector elements */
569    LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
570
571    thresh = (v16u8) __msa_fill_b(thresh_ptr);
572    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
573    limit = (v16u8) __msa_fill_b(limit_ptr);
574
575    /* mask and hev */
576    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
577                 hev, mask, flat);
578    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
579    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
580                       q1_out);
581
582    /* if flat is zero for all pixels, then no need to calculate other filter */
583    if (__msa_test_bz_v(flat)) {
584        ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
585
586        return 1;
587    } else {
588        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
589                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
590                   q2_r, q3_r);
591        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
592                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
593
594        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
595                   p0_l);
596        ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
597                   q3_l);
598        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
599                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
600
601        /* convert 16 bit output data into 8 bit */
602        PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
603                    p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
604                    p0_filt8_r, q0_filt8_r);
605        PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
606                    q2_filt8_r);
607
608        /* store pixel values */
609        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
610        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
611        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
612        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
613        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
614        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
615
616        ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
617        filter48 += (4 * 16);
618        ST_UB2(q1_out, q2_out, filter48, 16);
619        filter48 += (2 * 16);
620        ST_UB(flat, filter48);
621
622        return 0;
623    }
624}
625
626static void vp9_hz_lpf_t16_16w(uint8_t *src, ptrdiff_t pitch, uint8_t *filter48)
627{
628    v16u8 flat, flat2, filter8;
629    v16i8 zero = { 0 };
630    v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
631    v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in;
632    v8u16 p3_r_in, p2_r_in, p1_r_in, p0_r_in;
633    v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in;
634    v8u16 q3_r_in, q2_r_in, q1_r_in, q0_r_in;
635    v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in;
636    v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in;
637    v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in;
638    v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in;
639    v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
640    v8i16 l_out, r_out;
641
642    flat = LD_UB(filter48 + 96);
643
644    LD_UB8((src - 8 * pitch), pitch, p7, p6, p5, p4, p3, p2, p1, p0);
645    LD_UB8(src, pitch, q0, q1, q2, q3, q4, q5, q6, q7);
646    VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
647
648    /* if flat2 is zero for all pixels, then no need to calculate other filter */
649    if (__msa_test_bz_v(flat2)) {
650        LD_UB4(filter48, 16, p2, p1, p0, q0);
651        LD_UB2(filter48 + 4 * 16, 16, q1, q2);
652
653        src -= 3 * pitch;
654        ST_UB4(p2, p1, p0, q0, src, pitch);
655        src += (4 * pitch);
656        ST_UB2(q1, q2, src, pitch);
657    } else {
658        src -= 7 * pitch;
659
660        ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
661                   zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
662                   p3_r_in, p2_r_in, p1_r_in, p0_r_in);
663
664        q0_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q0);
665
666        tmp0_r = p7_r_in << 3;
667        tmp0_r -= p7_r_in;
668        tmp0_r += p6_r_in;
669        tmp0_r += q0_r_in;
670        tmp1_r = p6_r_in + p5_r_in;
671        tmp1_r += p4_r_in;
672        tmp1_r += p3_r_in;
673        tmp1_r += p2_r_in;
674        tmp1_r += p1_r_in;
675        tmp1_r += p0_r_in;
676        tmp1_r += tmp0_r;
677        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
678
679        ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
680                   p5_l_in, p4_l_in);
681        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
682                   p1_l_in, p0_l_in);
683        q0_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q0);
684
685        tmp0_l = p7_l_in << 3;
686        tmp0_l -= p7_l_in;
687        tmp0_l += p6_l_in;
688        tmp0_l += q0_l_in;
689        tmp1_l = p6_l_in + p5_l_in;
690        tmp1_l += p4_l_in;
691        tmp1_l += p3_l_in;
692        tmp1_l += p2_l_in;
693        tmp1_l += p1_l_in;
694        tmp1_l += p0_l_in;
695        tmp1_l += tmp0_l;
696        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
697
698        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
699        p6 = __msa_bmnz_v(p6, (v16u8) r_out, flat2);
700        ST_UB(p6, src);
701        src += pitch;
702
703        /* p5 */
704        q1_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q1);
705        tmp0_r = p5_r_in - p6_r_in;
706        tmp0_r += q1_r_in;
707        tmp0_r -= p7_r_in;
708        tmp1_r += tmp0_r;
709        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
710
711        q1_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q1);
712        tmp0_l = p5_l_in - p6_l_in;
713        tmp0_l += q1_l_in;
714        tmp0_l -= p7_l_in;
715        tmp1_l += tmp0_l;
716        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
717
718        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
719        p5 = __msa_bmnz_v(p5, (v16u8) r_out, flat2);
720        ST_UB(p5, src);
721        src += pitch;
722
723        /* p4 */
724        q2_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q2);
725        tmp0_r = p4_r_in - p5_r_in;
726        tmp0_r += q2_r_in;
727        tmp0_r -= p7_r_in;
728        tmp1_r += tmp0_r;
729        r_out = (v8i16) __msa_srari_h((v8i16) tmp1_r, 4);
730
731        q2_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q2);
732        tmp0_l = p4_l_in - p5_l_in;
733        tmp0_l += q2_l_in;
734        tmp0_l -= p7_l_in;
735        tmp1_l += tmp0_l;
736        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
737
738        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
739        p4 = __msa_bmnz_v(p4, (v16u8) r_out, flat2);
740        ST_UB(p4, src);
741        src += pitch;
742
743        /* p3 */
744        q3_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q3);
745        tmp0_r = p3_r_in - p4_r_in;
746        tmp0_r += q3_r_in;
747        tmp0_r -= p7_r_in;
748        tmp1_r += tmp0_r;
749        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
750
751        q3_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q3);
752        tmp0_l = p3_l_in - p4_l_in;
753        tmp0_l += q3_l_in;
754        tmp0_l -= p7_l_in;
755        tmp1_l += tmp0_l;
756        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
757
758        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
759        p3 = __msa_bmnz_v(p3, (v16u8) r_out, flat2);
760        ST_UB(p3, src);
761        src += pitch;
762
763        /* p2 */
764        q4_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q4);
765        filter8 = LD_UB(filter48);
766        tmp0_r = p2_r_in - p3_r_in;
767        tmp0_r += q4_r_in;
768        tmp0_r -= p7_r_in;
769        tmp1_r += tmp0_r;
770        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
771
772        q4_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q4);
773        tmp0_l = p2_l_in - p3_l_in;
774        tmp0_l += q4_l_in;
775        tmp0_l -= p7_l_in;
776        tmp1_l += tmp0_l;
777        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
778
779        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
780        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
781        ST_UB(filter8, src);
782        src += pitch;
783
784        /* p1 */
785        q5_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q5);
786        filter8 = LD_UB(filter48 + 16);
787        tmp0_r = p1_r_in - p2_r_in;
788        tmp0_r += q5_r_in;
789        tmp0_r -= p7_r_in;
790        tmp1_r += tmp0_r;
791        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
792
793        q5_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q5);
794        tmp0_l = p1_l_in - p2_l_in;
795        tmp0_l += q5_l_in;
796        tmp0_l -= p7_l_in;
797        tmp1_l += tmp0_l;
798        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
799
800        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
801        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
802        ST_UB(filter8, src);
803        src += pitch;
804
805        /* p0 */
806        q6_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q6);
807        filter8 = LD_UB(filter48 + 32);
808        tmp0_r = p0_r_in - p1_r_in;
809        tmp0_r += q6_r_in;
810        tmp0_r -= p7_r_in;
811        tmp1_r += tmp0_r;
812        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
813
814        q6_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q6);
815        tmp0_l = p0_l_in - p1_l_in;
816        tmp0_l += q6_l_in;
817        tmp0_l -= p7_l_in;
818        tmp1_l += tmp0_l;
819        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
820
821        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
822        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
823        ST_UB(filter8, src);
824        src += pitch;
825
826        /* q0 */
827        q7_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q7);
828        filter8 = LD_UB(filter48 + 48);
829        tmp0_r = q7_r_in - p0_r_in;
830        tmp0_r += q0_r_in;
831        tmp0_r -= p7_r_in;
832        tmp1_r += tmp0_r;
833        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
834
835        q7_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q7);
836        tmp0_l = q7_l_in - p0_l_in;
837        tmp0_l += q0_l_in;
838        tmp0_l -= p7_l_in;
839        tmp1_l += tmp0_l;
840        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
841
842        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
843        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
844        ST_UB(filter8, src);
845        src += pitch;
846
847        /* q1 */
848        filter8 = LD_UB(filter48 + 64);
849        tmp0_r = q7_r_in - q0_r_in;
850        tmp0_r += q1_r_in;
851        tmp0_r -= p6_r_in;
852        tmp1_r += tmp0_r;
853        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
854
855        tmp0_l = q7_l_in - q0_l_in;
856        tmp0_l += q1_l_in;
857        tmp0_l -= p6_l_in;
858        tmp1_l += tmp0_l;
859        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
860
861        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
862        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
863        ST_UB(filter8, src);
864        src += pitch;
865
866        /* q2 */
867        filter8 = LD_UB(filter48 + 80);
868        tmp0_r = q7_r_in - q1_r_in;
869        tmp0_r += q2_r_in;
870        tmp0_r -= p5_r_in;
871        tmp1_r += tmp0_r;
872        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
873
874        tmp0_l = q7_l_in - q1_l_in;
875        tmp0_l += q2_l_in;
876        tmp0_l -= p5_l_in;
877        tmp1_l += tmp0_l;
878        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
879
880        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
881        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
882        ST_UB(filter8, src);
883        src += pitch;
884
885        /* q3 */
886        tmp0_r = q7_r_in - q2_r_in;
887        tmp0_r += q3_r_in;
888        tmp0_r -= p4_r_in;
889        tmp1_r += tmp0_r;
890        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
891
892        tmp0_l = q7_l_in - q2_l_in;
893        tmp0_l += q3_l_in;
894        tmp0_l -= p4_l_in;
895        tmp1_l += tmp0_l;
896        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
897
898        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
899        q3 = __msa_bmnz_v(q3, (v16u8) r_out, flat2);
900        ST_UB(q3, src);
901        src += pitch;
902
903        /* q4 */
904        tmp0_r = q7_r_in - q3_r_in;
905        tmp0_r += q4_r_in;
906        tmp0_r -= p3_r_in;
907        tmp1_r += tmp0_r;
908        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
909
910        tmp0_l = q7_l_in - q3_l_in;
911        tmp0_l += q4_l_in;
912        tmp0_l -= p3_l_in;
913        tmp1_l += tmp0_l;
914        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
915
916        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
917        q4 = __msa_bmnz_v(q4, (v16u8) r_out, flat2);
918        ST_UB(q4, src);
919        src += pitch;
920
921        /* q5 */
922        tmp0_r = q7_r_in - q4_r_in;
923        tmp0_r += q5_r_in;
924        tmp0_r -= p2_r_in;
925        tmp1_r += tmp0_r;
926        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
927
928        tmp0_l = q7_l_in - q4_l_in;
929        tmp0_l += q5_l_in;
930        tmp0_l -= p2_l_in;
931        tmp1_l += tmp0_l;
932        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
933
934        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
935        q5 = __msa_bmnz_v(q5, (v16u8) r_out, flat2);
936        ST_UB(q5, src);
937        src += pitch;
938
939        /* q6 */
940        tmp0_r = q7_r_in - q5_r_in;
941        tmp0_r += q6_r_in;
942        tmp0_r -= p1_r_in;
943        tmp1_r += tmp0_r;
944        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
945
946        tmp0_l = q7_l_in - q5_l_in;
947        tmp0_l += q6_l_in;
948        tmp0_l -= p1_l_in;
949        tmp1_l += tmp0_l;
950        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
951
952        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
953        q6 = __msa_bmnz_v(q6, (v16u8) r_out, flat2);
954        ST_UB(q6, src);
955    }
956}
957
958void ff_loop_filter_v_16_16_msa(uint8_t *src, ptrdiff_t pitch,
959                                int32_t b_limit_ptr,
960                                int32_t limit_ptr,
961                                int32_t thresh_ptr)
962{
963    uint8_t filter48[16 * 8] ALLOC_ALIGNED(ALIGNMENT);
964    uint8_t early_exit = 0;
965
966    early_exit = vp9_hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0],
967                                          b_limit_ptr, limit_ptr, thresh_ptr);
968
969    if (0 == early_exit) {
970        vp9_hz_lpf_t16_16w(src, pitch, filter48);
971    }
972}
973
974void ff_loop_filter_v_16_8_msa(uint8_t *src, ptrdiff_t pitch,
975                               int32_t b_limit_ptr,
976                               int32_t limit_ptr,
977                               int32_t thresh_ptr)
978{
979    uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
980    uint64_t dword0, dword1;
981    v16u8 flat2, mask, hev, flat, thresh, b_limit, limit;
982    v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7;
983    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
984    v16u8 p0_filter16, p1_filter16;
985    v8i16 p2_filter8, p1_filter8, p0_filter8;
986    v8i16 q0_filter8, q1_filter8, q2_filter8;
987    v8u16 p7_r, p6_r, p5_r, p4_r, q7_r, q6_r, q5_r, q4_r;
988    v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
989    v16i8 zero = { 0 };
990    v8u16 tmp0, tmp1, tmp2;
991
992    /* load vector elements */
993    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
994
995    thresh = (v16u8) __msa_fill_b(thresh_ptr);
996    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
997    limit = (v16u8) __msa_fill_b(limit_ptr);
998
999    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1000                 hev, mask, flat);
1001    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1002    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1003                       q1_out);
1004
1005    flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
1006
1007    /* if flat is zero for all pixels, then no need to calculate other filter */
1008    if (__msa_test_bz_v(flat)) {
1009        p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
1010        p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
1011        q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
1012        q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
1013        SD4(p1_d, p0_d, q0_d, q1_d, src - 2 * pitch, pitch);
1014    } else {
1015        /* convert 8 bit input data into 16 bit */
1016        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero,
1017                   q1, zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r,
1018                   q1_r, q2_r, q3_r);
1019        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r,
1020                    p2_filter8, p1_filter8, p0_filter8, q0_filter8,
1021                    q1_filter8, q2_filter8);
1022
1023        /* convert 16 bit output data into 8 bit */
1024        PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8,
1025                    zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8,
1026                    q0_filter8);
1027        PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8,
1028                    q2_filter8);
1029
1030        /* store pixel values */
1031        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filter8, flat);
1032        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filter8, flat);
1033        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filter8, flat);
1034        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filter8, flat);
1035        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filter8, flat);
1036        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filter8, flat);
1037
1038        /* load 16 vector elements */
1039        LD_UB4((src - 8 * pitch), pitch, p7, p6, p5, p4);
1040        LD_UB4(src + (4 * pitch), pitch, q4, q5, q6, q7);
1041
1042        VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
1043
1044        /* if flat2 is zero for all pixels, then no need to calculate other filter */
1045        if (__msa_test_bz_v(flat2)) {
1046            p2_d = __msa_copy_u_d((v2i64) p2_out, 0);
1047            p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
1048            p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
1049            q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
1050            q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
1051            q2_d = __msa_copy_u_d((v2i64) q2_out, 0);
1052
1053            SD4(p2_d, p1_d, p0_d, q0_d, src - 3 * pitch, pitch);
1054            SD(q1_d, src + pitch);
1055            SD(q2_d, src + 2 * pitch);
1056        } else {
1057            /* LSB(right) 8 pixel operation */
1058            ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, q4,
1059                       zero, q5, zero, q6, zero, q7, p7_r, p6_r, p5_r, p4_r,
1060                       q4_r, q5_r, q6_r, q7_r);
1061
1062            tmp0 = p7_r << 3;
1063            tmp0 -= p7_r;
1064            tmp0 += p6_r;
1065            tmp0 += q0_r;
1066
1067            src -= 7 * pitch;
1068
1069            /* calculation of p6 and p5 */
1070            tmp1 = p6_r + p5_r + p4_r + p3_r;
1071            tmp1 += (p2_r + p1_r + p0_r);
1072            tmp1 += tmp0;
1073            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1074            tmp0 = p5_r - p6_r + q1_r - p7_r;
1075            tmp1 += tmp0;
1076            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1077            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
1078                        p0_filter16, p1_filter16);
1079            p0_filter16 = __msa_bmnz_v(p6, p0_filter16, flat2);
1080            p1_filter16 = __msa_bmnz_v(p5, p1_filter16, flat2);
1081            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1082            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1083            SD(dword0, src);
1084            src += pitch;
1085            SD(dword1, src);
1086            src += pitch;
1087
1088            /* calculation of p4 and p3 */
1089            tmp0 = p4_r - p5_r + q2_r - p7_r;
1090            tmp2 = p3_r - p4_r + q3_r - p7_r;
1091            tmp1 += tmp0;
1092            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1093            tmp1 += tmp2;
1094            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1095            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
1096                        p0_filter16, p1_filter16);
1097            p0_filter16 = __msa_bmnz_v(p4, p0_filter16, flat2);
1098            p1_filter16 = __msa_bmnz_v(p3, p1_filter16, flat2);
1099            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1100            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1101            SD(dword0, src);
1102            src += pitch;
1103            SD(dword1, src);
1104            src += pitch;
1105
1106            /* calculation of p2 and p1 */
1107            tmp0 = p2_r - p3_r + q4_r - p7_r;
1108            tmp2 = p1_r - p2_r + q5_r - p7_r;
1109            tmp1 += tmp0;
1110            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1111            tmp1 += tmp2;
1112            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1113            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
1114                        p0_filter16, p1_filter16);
1115            p0_filter16 = __msa_bmnz_v(p2_out, p0_filter16, flat2);
1116            p1_filter16 = __msa_bmnz_v(p1_out, p1_filter16, flat2);
1117            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1118            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1119            SD(dword0, src);
1120            src += pitch;
1121            SD(dword1, src);
1122            src += pitch;
1123
1124            /* calculation of p0 and q0 */
1125            tmp0 = (p0_r - p1_r) + (q6_r - p7_r);
1126            tmp2 = (q7_r - p0_r) + (q0_r - p7_r);
1127            tmp1 += tmp0;
1128            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1129            tmp1 += tmp2;
1130            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1131            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
1132                        p0_filter16, p1_filter16);
1133            p0_filter16 = __msa_bmnz_v(p0_out, p0_filter16, flat2);
1134            p1_filter16 = __msa_bmnz_v(q0_out, p1_filter16, flat2);
1135            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1136            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1137            SD(dword0, src);
1138            src += pitch;
1139            SD(dword1, src);
1140            src += pitch;
1141
1142            /* calculation of q1 and q2 */
1143            tmp0 = q7_r - q0_r + q1_r - p6_r;
1144            tmp2 = q7_r - q1_r + q2_r - p5_r;
1145            tmp1 += tmp0;
1146            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1147            tmp1 += tmp2;
1148            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1149            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
1150                        p0_filter16, p1_filter16);
1151            p0_filter16 = __msa_bmnz_v(q1_out, p0_filter16, flat2);
1152            p1_filter16 = __msa_bmnz_v(q2_out, p1_filter16, flat2);
1153            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1154            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1155            SD(dword0, src);
1156            src += pitch;
1157            SD(dword1, src);
1158            src += pitch;
1159
1160            /* calculation of q3 and q4 */
1161            tmp0 = (q7_r - q2_r) + (q3_r - p4_r);
1162            tmp2 = (q7_r - q3_r) + (q4_r - p3_r);
1163            tmp1 += tmp0;
1164            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1165            tmp1 += tmp2;
1166            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1167            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
1168                        p0_filter16, p1_filter16);
1169            p0_filter16 = __msa_bmnz_v(q3, p0_filter16, flat2);
1170            p1_filter16 = __msa_bmnz_v(q4, p1_filter16, flat2);
1171            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1172            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1173            SD(dword0, src);
1174            src += pitch;
1175            SD(dword1, src);
1176            src += pitch;
1177
1178            /* calculation of q5 and q6 */
1179            tmp0 = (q7_r - q4_r) + (q5_r - p2_r);
1180            tmp2 = (q7_r - q5_r) + (q6_r - p1_r);
1181            tmp1 += tmp0;
1182            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1183            tmp1 += tmp2;
1184            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1185            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
1186                        p0_filter16, p1_filter16);
1187            p0_filter16 = __msa_bmnz_v(q5, p0_filter16, flat2);
1188            p1_filter16 = __msa_bmnz_v(q6, p1_filter16, flat2);
1189            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1190            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1191            SD(dword0, src);
1192            src += pitch;
1193            SD(dword1, src);
1194        }
1195    }
1196}
1197
1198void ff_loop_filter_h_4_8_msa(uint8_t *src, ptrdiff_t pitch,
1199                              int32_t b_limit_ptr,
1200                              int32_t limit_ptr,
1201                              int32_t thresh_ptr)
1202{
1203    v16u8 mask, hev, flat, limit, thresh, b_limit;
1204    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1205    v8i16 vec0, vec1, vec2, vec3;
1206
1207    LD_UB8((src - 4), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
1208
1209    thresh = (v16u8) __msa_fill_b(thresh_ptr);
1210    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1211    limit = (v16u8) __msa_fill_b(limit_ptr);
1212
1213    TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3,
1214                       p3, p2, p1, p0, q0, q1, q2, q3);
1215    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1216                 hev, mask, flat);
1217    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
1218    ILVR_B2_SH(p0, p1, q1, q0, vec0, vec1);
1219    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1220
1221    src -= 2;
1222    ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
1223}
1224
1225void ff_loop_filter_h_44_16_msa(uint8_t *src, ptrdiff_t pitch,
1226                                int32_t b_limit_ptr,
1227                                int32_t limit_ptr,
1228                                int32_t thresh_ptr)
1229{
1230    v16u8 mask, hev, flat;
1231    v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
1232    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1233    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
1234    v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
1235    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
1236
1237    LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
1238    LD_UB8(src - 4 + (8 * pitch), pitch,
1239           row8, row9, row10, row11, row12, row13, row14, row15);
1240
1241    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
1242                        row8, row9, row10, row11, row12, row13, row14, row15,
1243                        p3, p2, p1, p0, q0, q1, q2, q3);
1244
1245    thresh0 = (v16u8) __msa_fill_b(thresh_ptr);
1246    thresh1 = (v16u8) __msa_fill_b(thresh_ptr >> 8);
1247    thresh0 = (v16u8) __msa_ilvr_d((v2i64) thresh1, (v2i64) thresh0);
1248
1249    b_limit0 = (v16u8) __msa_fill_b(b_limit_ptr);
1250    b_limit1 = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
1251    b_limit0 = (v16u8) __msa_ilvr_d((v2i64) b_limit1, (v2i64) b_limit0);
1252
1253    limit0 = (v16u8) __msa_fill_b(limit_ptr);
1254    limit1 = (v16u8) __msa_fill_b(limit_ptr >> 8);
1255    limit0 = (v16u8) __msa_ilvr_d((v2i64) limit1, (v2i64) limit0);
1256
1257    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
1258                 hev, mask, flat);
1259    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
1260    ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
1261    ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3);
1262    ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
1263    ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5);
1264
1265    src -= 2;
1266
1267    ST_W8(tmp2, tmp3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
1268    ST_W8(tmp4, tmp5, 0, 1, 2, 3, 0, 1, 2, 3, src + 8 * pitch, pitch);
1269}
1270
1271void ff_loop_filter_h_8_8_msa(uint8_t *src, ptrdiff_t pitch,
1272                              int32_t b_limit_ptr,
1273                              int32_t limit_ptr,
1274                              int32_t thresh_ptr)
1275{
1276    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1277    v16u8 p1_out, p0_out, q0_out, q1_out;
1278    v16u8 flat, mask, hev, thresh, b_limit, limit;
1279    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
1280    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
1281    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
1282    v16u8 zero = { 0 };
1283    v8i16 vec0, vec1, vec2, vec3, vec4;
1284
1285    /* load vector elements */
1286    LD_UB8(src - 4, pitch, p3, p2, p1, p0, q0, q1, q2, q3);
1287
1288    TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3,
1289                       p3, p2, p1, p0, q0, q1, q2, q3);
1290
1291    thresh = (v16u8) __msa_fill_b(thresh_ptr);
1292    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1293    limit = (v16u8) __msa_fill_b(limit_ptr);
1294
1295    /* mask and hev */
1296    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1297                 hev, mask, flat);
1298    /* flat4 */
1299    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1300    /* filter4 */
1301    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1302                       q1_out);
1303
1304    flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
1305
1306    /* if flat is zero for all pixels, then no need to calculate other filter */
1307    if (__msa_test_bz_v(flat)) {
1308        /* Store 4 pixels p1-_q1 */
1309        ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1310        ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1311
1312        src -= 2;
1313        ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
1314    } else {
1315        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
1316                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
1317                   q3_r);
1318        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
1319                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
1320        /* convert 16 bit output data into 8 bit */
1321        PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r, p0_filt8_r,
1322                    p0_filt8_r, q0_filt8_r, q0_filt8_r, p2_filt8_r, p1_filt8_r,
1323                    p0_filt8_r, q0_filt8_r);
1324        PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r, q1_filt8_r,
1325                    q2_filt8_r);
1326
1327        /* store pixel values */
1328        p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
1329        p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
1330        p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
1331        q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
1332        q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
1333        q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
1334
1335        /* Store 6 pixels p2-_q2 */
1336        ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
1337        ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1338        vec4 = (v8i16) __msa_ilvr_b((v16i8) q2, (v16i8) q1);
1339
1340        src -= 3;
1341        ST_W4(vec2, 0, 1, 2, 3, src, pitch);
1342        ST_H4(vec4, 0, 1, 2, 3, src + 4, pitch);
1343        src += (4 * pitch);
1344        ST_W4(vec3, 0, 1, 2, 3, src, pitch);
1345        ST_H4(vec4, 4, 5, 6, 7, src + 4, pitch);
1346    }
1347}
1348
1349void ff_loop_filter_h_88_16_msa(uint8_t *src, ptrdiff_t pitch,
1350                                int32_t b_limit_ptr,
1351                                int32_t limit_ptr,
1352                                int32_t thresh_ptr)
1353{
1354    uint8_t *temp_src;
1355    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1356    v16u8 p1_out, p0_out, q0_out, q1_out;
1357    v16u8 flat, mask, hev, thresh, b_limit, limit;
1358    v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
1359    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
1360    v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
1361    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
1362    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
1363    v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
1364    v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
1365    v16u8 zero = { 0 };
1366    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1367
1368    temp_src = src - 4;
1369
1370    LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
1371    temp_src += (8 * pitch);
1372    LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);
1373
1374    /* transpose 16x8 matrix into 8x16 */
1375    TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7,
1376                        q3, q2, q1, q0, row12, row13, row14, row15,
1377                        p3, p2, p1, p0, q0, q1, q2, q3);
1378
1379    thresh = (v16u8) __msa_fill_b(thresh_ptr);
1380    vec0 = (v8i16) __msa_fill_b(thresh_ptr >> 8);
1381    thresh = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) thresh);
1382
1383    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1384    vec0 = (v8i16) __msa_fill_b(b_limit_ptr >> 8);
1385    b_limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) b_limit);
1386
1387    limit = (v16u8) __msa_fill_b(limit_ptr);
1388    vec0 = (v8i16) __msa_fill_b(limit_ptr >> 8);
1389    limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) limit);
1390
1391    /* mask and hev */
1392    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1393                 hev, mask, flat);
1394    /* flat4 */
1395    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1396    /* filter4 */
1397    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1398                       q1_out);
1399
1400    /* if flat is zero for all pixels, then no need to calculate other filter */
1401    if (__msa_test_bz_v(flat)) {
1402        ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1403        ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1404        ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1405        ILVRL_H2_SH(vec1, vec0, vec4, vec5);
1406
1407        src -= 2;
1408        ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
1409        ST_W8(vec4, vec5, 0, 1, 2, 3, 0, 1, 2, 3, src + 8 * pitch, pitch);
1410    } else {
1411        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
1412                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
1413                   q3_r);
1414        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
1415                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
1416
1417        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
1418                   p0_l);
1419        ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
1420                   q3_l);
1421
1422        /* filter8 */
1423        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
1424                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
1425
1426        /* convert 16 bit output data into 8 bit */
1427        PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
1428                    p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
1429                    p0_filt8_r, q0_filt8_r);
1430        PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
1431                    q2_filt8_r);
1432
1433        /* store pixel values */
1434        p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
1435        p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
1436        p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
1437        q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
1438        q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
1439        q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
1440
1441        ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
1442        ILVRL_H2_SH(vec1, vec0, vec3, vec4);
1443        ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
1444        ILVRL_H2_SH(vec1, vec0, vec6, vec7);
1445        ILVRL_B2_SH(q2, q1, vec2, vec5);
1446
1447        src -= 3;
1448        ST_W4(vec3, 0, 1, 2, 3, src, pitch);
1449        ST_H4(vec2, 0, 1, 2, 3, src + 4, pitch);
1450        src += (4 * pitch);
1451        ST_W4(vec4, 0, 1, 2, 3, src, pitch);
1452        ST_H4(vec2, 4, 5, 6, 7, src + 4, pitch);
1453        src += (4 * pitch);
1454        ST_W4(vec6, 0, 1, 2, 3, src, pitch);
1455        ST_H4(vec5, 0, 1, 2, 3, src + 4, pitch);
1456        src += (4 * pitch);
1457        ST_W4(vec7, 0, 1, 2, 3, src, pitch);
1458        ST_H4(vec5, 4, 5, 6, 7, src + 4, pitch);
1459    }
1460}
1461
1462void ff_loop_filter_h_84_16_msa(uint8_t *src, ptrdiff_t pitch,
1463                                int32_t b_limit_ptr,
1464                                int32_t limit_ptr,
1465                                int32_t thresh_ptr)
1466{
1467    uint8_t *temp_src;
1468    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1469    v16u8 p1_out, p0_out, q0_out, q1_out;
1470    v16u8 flat, mask, hev, thresh, b_limit, limit;
1471    v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
1472    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
1473    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
1474    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
1475    v16u8 zero = { 0 };
1476    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1477
1478    temp_src = src - 4;
1479
1480    LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
1481    temp_src += (8 * pitch);
1482    LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);
1483
1484    /* transpose 16x8 matrix into 8x16 */
1485    TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7,
1486                        q3, q2, q1, q0, row12, row13, row14, row15,
1487                        p3, p2, p1, p0, q0, q1, q2, q3);
1488
1489    thresh = (v16u8) __msa_fill_b(thresh_ptr);
1490    vec0 = (v8i16) __msa_fill_b(thresh_ptr >> 8);
1491    thresh = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) thresh);
1492
1493    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1494    vec0 = (v8i16) __msa_fill_b(b_limit_ptr >> 8);
1495    b_limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) b_limit);
1496
1497    limit = (v16u8) __msa_fill_b(limit_ptr);
1498    vec0 = (v8i16) __msa_fill_b(limit_ptr >> 8);
1499    limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) limit);
1500
1501    /* mask and hev */
1502    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1503                 hev, mask, flat);
1504    /* flat4 */
1505    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1506    /* filter4 */
1507    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1508                       q1_out);
1509
1510    flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
1511
1512    /* if flat is zero for all pixels, then no need to calculate other filter */
1513    if (__msa_test_bz_v(flat)) {
1514        ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1515        ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1516        ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1517        ILVRL_H2_SH(vec1, vec0, vec4, vec5);
1518
1519        src -= 2;
1520        ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
1521        ST_W8(vec4, vec5, 0, 1, 2, 3, 0, 1, 2, 3, src + 8 * pitch, pitch);
1522    } else {
1523        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
1524                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
1525                   q3_r);
1526        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
1527                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
1528
1529        /* convert 16 bit output data into 8 bit */
1530        PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r,
1531                    p0_filt8_r, p0_filt8_r, q0_filt8_r, q0_filt8_r,
1532                    p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r);
1533        PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r,
1534                    q1_filt8_r, q2_filt8_r);
1535
1536        /* store pixel values */
1537        p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
1538        p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
1539        p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
1540        q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
1541        q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
1542        q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
1543
1544        ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
1545        ILVRL_H2_SH(vec1, vec0, vec3, vec4);
1546        ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
1547        ILVRL_H2_SH(vec1, vec0, vec6, vec7);
1548        ILVRL_B2_SH(q2, q1, vec2, vec5);
1549
1550        src -= 3;
1551        ST_W4(vec3, 0, 1, 2, 3, src, pitch);
1552        ST_H4(vec2, 0, 1, 2, 3, src + 4, pitch);
1553        src += (4 * pitch);
1554        ST_W4(vec4, 0, 1, 2, 3, src, pitch);
1555        ST_H4(vec2, 4, 5, 6, 7, src + 4, pitch);
1556        src += (4 * pitch);
1557        ST_W4(vec6, 0, 1, 2, 3, src, pitch);
1558        ST_H4(vec5, 0, 1, 2, 3, src + 4, pitch);
1559        src += (4 * pitch);
1560        ST_W4(vec7, 0, 1, 2, 3, src, pitch);
1561        ST_H4(vec5, 4, 5, 6, 7, src + 4, pitch);
1562    }
1563}
1564
1565void ff_loop_filter_h_48_16_msa(uint8_t *src, ptrdiff_t pitch,
1566                                int32_t b_limit_ptr,
1567                                int32_t limit_ptr,
1568                                int32_t thresh_ptr)
1569{
1570    uint8_t *temp_src;
1571    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1572    v16u8 p1_out, p0_out, q0_out, q1_out;
1573    v16u8 flat, mask, hev, thresh, b_limit, limit;
1574    v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
1575    v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
1576    v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
1577    v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
1578    v16u8 zero = { 0 };
1579    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1580
1581    temp_src = src - 4;
1582
1583    LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
1584    temp_src += (8 * pitch);
1585    LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);
1586
1587    /* transpose 16x8 matrix into 8x16 */
1588    TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7,
1589                        q3, q2, q1, q0, row12, row13, row14, row15,
1590                        p3, p2, p1, p0, q0, q1, q2, q3);
1591
1592    thresh = (v16u8) __msa_fill_b(thresh_ptr);
1593    vec0 = (v8i16) __msa_fill_b(thresh_ptr >> 8);
1594    thresh = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) thresh);
1595
1596    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1597    vec0 = (v8i16) __msa_fill_b(b_limit_ptr >> 8);
1598    b_limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) b_limit);
1599
1600    limit = (v16u8) __msa_fill_b(limit_ptr);
1601    vec0 = (v8i16) __msa_fill_b(limit_ptr >> 8);
1602    limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) limit);
1603
1604    /* mask and hev */
1605    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1606                 hev, mask, flat);
1607    /* flat4 */
1608    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1609    /* filter4 */
1610    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1611                       q1_out);
1612
1613    flat = (v16u8) __msa_insve_d((v2i64) flat, 0, (v2i64) zero);
1614
1615    /* if flat is zero for all pixels, then no need to calculate other filter */
1616    if (__msa_test_bz_v(flat)) {
1617        ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1618        ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1619        ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1620        ILVRL_H2_SH(vec1, vec0, vec4, vec5);
1621
1622        src -= 2;
1623        ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
1624        ST_W8(vec4, vec5, 0, 1, 2, 3, 0, 1, 2, 3, src + 8 * pitch, pitch);
1625    } else {
1626        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
1627                   p0_l);
1628        ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
1629                   q3_l);
1630
1631        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
1632                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
1633
1634        /* convert 16 bit output data into 8 bit */
1635        PCKEV_B4_SH(p2_filt8_l, p2_filt8_l, p1_filt8_l, p1_filt8_l,
1636                    p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l,
1637                    p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l);
1638        PCKEV_B2_SH(q1_filt8_l, q1_filt8_l, q2_filt8_l, q2_filt8_l,
1639                    q1_filt8_l, q2_filt8_l);
1640
1641        /* store pixel values */
1642        p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_l, flat);
1643        p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_l, flat);
1644        p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_l, flat);
1645        q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_l, flat);
1646        q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_l, flat);
1647        q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_l, flat);
1648
1649        ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
1650        ILVRL_H2_SH(vec1, vec0, vec3, vec4);
1651        ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
1652        ILVRL_H2_SH(vec1, vec0, vec6, vec7);
1653        ILVRL_B2_SH(q2, q1, vec2, vec5);
1654
1655        src -= 3;
1656        ST_W4(vec3, 0, 1, 2, 3, src, pitch);
1657        ST_H4(vec2, 0, 1, 2, 3, src + 4, pitch);
1658        src += (4 * pitch);
1659        ST_W4(vec4, 0, 1, 2, 3, src, pitch);
1660        ST_H4(vec2, 4, 5, 6, 7, src + 4, pitch);
1661        src += (4 * pitch);
1662        ST_W4(vec6, 0, 1, 2, 3, src, pitch);
1663        ST_H4(vec5, 0, 1, 2, 3, src + 4, pitch);
1664        src += (4 * pitch);
1665        ST_W4(vec7, 0, 1, 2, 3, src, pitch);
1666        ST_H4(vec5, 4, 5, 6, 7, src + 4, pitch);
1667    }
1668}
1669
1670static void vp9_transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch,
1671                                       uint8_t *output, int32_t out_pitch)
1672{
1673    v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org;
1674    v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1675    v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
1676    v16i8 zeros = { 0 };
1677
1678    LD_UB8(input, in_pitch,
1679           p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org);
1680    /* 8x8 transpose */
1681    TRANSPOSE8x8_UB_UB(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org,
1682                       p0_org, p7, p6, p5, p4, p3, p2, p1, p0);
1683    /* 8x8 transpose */
1684    ILVL_B4_SB(p5_org, p7_org, p4_org, p6_org, p1_org, p3_org, p0_org, p2_org,
1685               tmp0, tmp1, tmp2, tmp3);
1686    ILVR_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp4, tmp6);
1687    ILVL_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp7);
1688    ILVR_W2_UB(tmp6, tmp4, tmp7, tmp5, q0, q4);
1689    ILVL_W2_UB(tmp6, tmp4, tmp7, tmp5, q2, q6);
1690    SLDI_B4_UB(zeros, q0, zeros, q2, zeros, q4, zeros, q6, 8, q1, q3, q5, q7);
1691
1692    ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
1693    output += (8 * out_pitch);
1694    ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
1695}
1696
1697static void vp9_transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch,
1698                                       uint8_t *output, int32_t out_pitch)
1699{
1700    v16u8 p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o;
1701    v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
1702
1703    LD_UB8(input, in_pitch, p7, p6, p5, p4, p3, p2, p1, p0);
1704    LD_UB8(input + (8 * in_pitch), in_pitch, q0, q1, q2, q3, q4, q5, q6, q7);
1705    TRANSPOSE16x8_UB_UB(p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5,
1706                        q6, q7, p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o);
1707    ST_UB8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o, output, out_pitch);
1708}
1709
1710static void vp9_transpose_16x16(uint8_t *input, int32_t in_pitch,
1711                                uint8_t *output, int32_t out_pitch)
1712{
1713    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
1714    v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
1715    v8i16 tmp0, tmp1, tmp4, tmp5, tmp6, tmp7;
1716    v4i32 tmp2, tmp3;
1717    v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
1718
1719    LD_UB8(input, in_pitch, row0, row1, row2, row3, row4, row5, row6, row7);
1720    input += (8 * in_pitch);
1721    LD_UB8(input, in_pitch,
1722           row8, row9, row10, row11, row12, row13, row14, row15);
1723
1724    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
1725                        row8, row9, row10, row11, row12, row13, row14, row15,
1726                        p7, p6, p5, p4, p3, p2, p1, p0);
1727
1728    /* transpose 16x8 matrix into 8x16 */
1729    /* total 8 intermediate register and 32 instructions */
1730    q7 = (v16u8) __msa_ilvod_d((v2i64) row8, (v2i64) row0);
1731    q6 = (v16u8) __msa_ilvod_d((v2i64) row9, (v2i64) row1);
1732    q5 = (v16u8) __msa_ilvod_d((v2i64) row10, (v2i64) row2);
1733    q4 = (v16u8) __msa_ilvod_d((v2i64) row11, (v2i64) row3);
1734    q3 = (v16u8) __msa_ilvod_d((v2i64) row12, (v2i64) row4);
1735    q2 = (v16u8) __msa_ilvod_d((v2i64) row13, (v2i64) row5);
1736    q1 = (v16u8) __msa_ilvod_d((v2i64) row14, (v2i64) row6);
1737    q0 = (v16u8) __msa_ilvod_d((v2i64) row15, (v2i64) row7);
1738
1739    ILVEV_B2_SH(q7, q6, q5, q4, tmp0, tmp1);
1740    tmp4 = (v8i16) __msa_ilvod_b((v16i8) q6, (v16i8) q7);
1741    tmp5 = (v8i16) __msa_ilvod_b((v16i8) q4, (v16i8) q5);
1742
1743    ILVEV_B2_UB(q3, q2, q1, q0, q5, q7);
1744    tmp6 = (v8i16) __msa_ilvod_b((v16i8) q2, (v16i8) q3);
1745    tmp7 = (v8i16) __msa_ilvod_b((v16i8) q0, (v16i8) q1);
1746
1747    ILVEV_H2_SW(tmp0, tmp1, q5, q7, tmp2, tmp3);
1748    q0 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
1749    q4 = (v16u8) __msa_ilvod_w(tmp3, tmp2);
1750
1751    tmp2 = (v4i32) __msa_ilvod_h(tmp1, tmp0);
1752    tmp3 = (v4i32) __msa_ilvod_h((v8i16) q7, (v8i16) q5);
1753    q2 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
1754    q6 = (v16u8) __msa_ilvod_w(tmp3, tmp2);
1755
1756    ILVEV_H2_SW(tmp4, tmp5, tmp6, tmp7, tmp2, tmp3);
1757    q1 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
1758    q5 = (v16u8) __msa_ilvod_w(tmp3, tmp2);
1759
1760    tmp2 = (v4i32) __msa_ilvod_h(tmp5, tmp4);
1761    tmp3 = (v4i32) __msa_ilvod_h(tmp7, tmp6);
1762    q3 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
1763    q7 = (v16u8) __msa_ilvod_w(tmp3, tmp2);
1764
1765    ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
1766    output += (8 * out_pitch);
1767    ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
1768}
1769
1770static int32_t vp9_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48,
1771                                       uint8_t *src_org, int32_t pitch_org,
1772                                       int32_t b_limit_ptr,
1773                                       int32_t limit_ptr,
1774                                       int32_t thresh_ptr)
1775{
1776    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1777    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
1778    v16u8 flat, mask, hev, thresh, b_limit, limit;
1779    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
1780    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
1781    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
1782    v16i8 zero = { 0 };
1783    v8i16 vec0, vec1, vec2, vec3;
1784
1785    /* load vector elements */
1786    LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);
1787
1788    thresh = (v16u8) __msa_fill_b(thresh_ptr);
1789    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1790    limit = (v16u8) __msa_fill_b(limit_ptr);
1791
1792    /* mask and hev */
1793    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1794                 hev, mask, flat);
1795    /* flat4 */
1796    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1797    /* filter4 */
1798    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1799                       q1_out);
1800
1801    flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
1802
1803    /* if flat is zero for all pixels, then no need to calculate other filter */
1804    if (__msa_test_bz_v(flat)) {
1805        ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1806        ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1807        ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, (src_org - 2), pitch_org);
1808        return 1;
1809    } else {
1810        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
1811                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
1812                   q3_r);
1813        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
1814                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
1815
1816        /* convert 16 bit output data into 8 bit */
1817        p2_r = (v8u16) __msa_pckev_b((v16i8) p2_filt8_r, (v16i8) p2_filt8_r);
1818        p1_r = (v8u16) __msa_pckev_b((v16i8) p1_filt8_r, (v16i8) p1_filt8_r);
1819        p0_r = (v8u16) __msa_pckev_b((v16i8) p0_filt8_r, (v16i8) p0_filt8_r);
1820        q0_r = (v8u16) __msa_pckev_b((v16i8) q0_filt8_r, (v16i8) q0_filt8_r);
1821        q1_r = (v8u16) __msa_pckev_b((v16i8) q1_filt8_r, (v16i8) q1_filt8_r);
1822        q2_r = (v8u16) __msa_pckev_b((v16i8) q2_filt8_r, (v16i8) q2_filt8_r);
1823
1824        /* store pixel values */
1825        p2_out = __msa_bmnz_v(p2, (v16u8) p2_r, flat);
1826        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_r, flat);
1827        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_r, flat);
1828        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_r, flat);
1829        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_r, flat);
1830        q2_out = __msa_bmnz_v(q2, (v16u8) q2_r, flat);
1831
1832        ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
1833        filter48 += (4 * 16);
1834        ST_UB2(q1_out, q2_out, filter48, 16);
1835        filter48 += (2 * 16);
1836        ST_UB(flat, filter48);
1837
1838        return 0;
1839    }
1840}
1841
1842static int32_t vp9_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, ptrdiff_t pitch,
1843                                 uint8_t *filter48)
1844{
1845    v16i8 zero = { 0 };
1846    v16u8 filter8, flat, flat2;
1847    v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
1848    v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in;
1849    v8u16 p3_r_in, p2_r_in, p1_r_in, p0_r_in;
1850    v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in;
1851    v8u16 q3_r_in, q2_r_in, q1_r_in, q0_r_in;
1852    v8u16 tmp0_r, tmp1_r;
1853    v8i16 r_out;
1854
1855    flat = LD_UB(filter48 + 6 * 16);
1856
1857    LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
1858    LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);
1859
1860    VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
1861
1862    /* if flat2 is zero for all pixels, then no need to calculate other filter */
1863    if (__msa_test_bz_v(flat2)) {
1864        v8i16 vec0, vec1, vec2, vec3, vec4;
1865
1866        LD_UB4(filter48, 16, p2, p1, p0, q0);
1867        LD_UB2(filter48 + 4 * 16, 16, q1, q2);
1868
1869        ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
1870        ILVRL_H2_SH(vec1, vec0, vec3, vec4);
1871        vec2 = (v8i16) __msa_ilvr_b((v16i8) q2, (v16i8) q1);
1872
1873        src_org -= 3;
1874        ST_W4(vec3, 0, 1, 2, 3, src_org, pitch);
1875        ST_H4(vec2, 0, 1, 2, 3, (src_org + 4), pitch);
1876        src_org += (4 * pitch);
1877        ST_W4(vec4, 0, 1, 2, 3, src_org, pitch);
1878        ST_H4(vec2, 4, 5, 6, 7, (src_org + 4), pitch);
1879
1880        return 1;
1881    } else {
1882        src -= 7 * 16;
1883
1884        ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
1885                   zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
1886                   p3_r_in, p2_r_in, p1_r_in, p0_r_in);
1887        q0_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q0);
1888
1889        tmp0_r = p7_r_in << 3;
1890        tmp0_r -= p7_r_in;
1891        tmp0_r += p6_r_in;
1892        tmp0_r += q0_r_in;
1893        tmp1_r = p6_r_in + p5_r_in;
1894        tmp1_r += p4_r_in;
1895        tmp1_r += p3_r_in;
1896        tmp1_r += p2_r_in;
1897        tmp1_r += p1_r_in;
1898        tmp1_r += p0_r_in;
1899        tmp1_r += tmp0_r;
1900
1901        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1902        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1903        p6 = __msa_bmnz_v(p6, (v16u8) r_out, flat2);
1904        ST_D1(p6, 0, src);
1905        src += 16;
1906
1907        /* p5 */
1908        q1_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q1);
1909        tmp0_r = p5_r_in - p6_r_in;
1910        tmp0_r += q1_r_in;
1911        tmp0_r -= p7_r_in;
1912        tmp1_r += tmp0_r;
1913        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1914        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1915        p5 = __msa_bmnz_v(p5, (v16u8) r_out, flat2);
1916        ST_D1(p5, 0, src);
1917        src += 16;
1918
1919        /* p4 */
1920        q2_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q2);
1921        tmp0_r = p4_r_in - p5_r_in;
1922        tmp0_r += q2_r_in;
1923        tmp0_r -= p7_r_in;
1924        tmp1_r += tmp0_r;
1925        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1926        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1927        p4 = __msa_bmnz_v(p4, (v16u8) r_out, flat2);
1928        ST_D1(p4, 0, src);
1929        src += 16;
1930
1931        /* p3 */
1932        q3_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q3);
1933        tmp0_r = p3_r_in - p4_r_in;
1934        tmp0_r += q3_r_in;
1935        tmp0_r -= p7_r_in;
1936        tmp1_r += tmp0_r;
1937        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1938        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1939        p3 = __msa_bmnz_v(p3, (v16u8) r_out, flat2);
1940        ST_D1(p3, 0, src);
1941        src += 16;
1942
1943        /* p2 */
1944        q4_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q4);
1945        filter8 = LD_UB(filter48);
1946        tmp0_r = p2_r_in - p3_r_in;
1947        tmp0_r += q4_r_in;
1948        tmp0_r -= p7_r_in;
1949        tmp1_r += tmp0_r;
1950        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1951        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1952        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
1953        ST_D1(filter8, 0, src);
1954        src += 16;
1955
1956        /* p1 */
1957        q5_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q5);
1958        filter8 = LD_UB(filter48 + 16);
1959        tmp0_r = p1_r_in - p2_r_in;
1960        tmp0_r += q5_r_in;
1961        tmp0_r -= p7_r_in;
1962        tmp1_r += tmp0_r;
1963        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1964        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1965        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
1966        ST_D1(filter8, 0, src);
1967        src += 16;
1968
1969        /* p0 */
1970        q6_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q6);
1971        filter8 = LD_UB(filter48 + 32);
1972        tmp0_r = p0_r_in - p1_r_in;
1973        tmp0_r += q6_r_in;
1974        tmp0_r -= p7_r_in;
1975        tmp1_r += tmp0_r;
1976        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1977        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1978        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
1979        ST_D1(filter8, 0, src);
1980        src += 16;
1981
1982        /* q0 */
1983        q7_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q7);
1984        filter8 = LD_UB(filter48 + 48);
1985        tmp0_r = q7_r_in - p0_r_in;
1986        tmp0_r += q0_r_in;
1987        tmp0_r -= p7_r_in;
1988        tmp1_r += tmp0_r;
1989        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1990        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1991        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
1992        ST_D1(filter8, 0, src);
1993        src += 16;
1994
1995        /* q1 */
1996        filter8 = LD_UB(filter48 + 64);
1997        tmp0_r = q7_r_in - q0_r_in;
1998        tmp0_r += q1_r_in;
1999        tmp0_r -= p6_r_in;
2000        tmp1_r += tmp0_r;
2001        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2002        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2003        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2004        ST_D1(filter8, 0, src);
2005        src += 16;
2006
2007        /* q2 */
2008        filter8 = LD_UB(filter48 + 80);
2009        tmp0_r = q7_r_in - q1_r_in;
2010        tmp0_r += q2_r_in;
2011        tmp0_r -= p5_r_in;
2012        tmp1_r += tmp0_r;
2013        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2014        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2015        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2016        ST_D1(filter8, 0, src);
2017        src += 16;
2018
2019        /* q3 */
2020        tmp0_r = q7_r_in - q2_r_in;
2021        tmp0_r += q3_r_in;
2022        tmp0_r -= p4_r_in;
2023        tmp1_r += tmp0_r;
2024        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2025        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2026        q3 = __msa_bmnz_v(q3, (v16u8) r_out, flat2);
2027        ST_D1(q3, 0, src);
2028        src += 16;
2029
2030        /* q4 */
2031        tmp0_r = q7_r_in - q3_r_in;
2032        tmp0_r += q4_r_in;
2033        tmp0_r -= p3_r_in;
2034        tmp1_r += tmp0_r;
2035        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2036        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2037        q4 = __msa_bmnz_v(q4, (v16u8) r_out, flat2);
2038        ST_D1(q4, 0, src);
2039        src += 16;
2040
2041        /* q5 */
2042        tmp0_r = q7_r_in - q4_r_in;
2043        tmp0_r += q5_r_in;
2044        tmp0_r -= p2_r_in;
2045        tmp1_r += tmp0_r;
2046        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2047        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2048        q5 = __msa_bmnz_v(q5, (v16u8) r_out, flat2);
2049        ST_D1(q5, 0, src);
2050        src += 16;
2051
2052        /* q6 */
2053        tmp0_r = q7_r_in - q5_r_in;
2054        tmp0_r += q6_r_in;
2055        tmp0_r -= p1_r_in;
2056        tmp1_r += tmp0_r;
2057        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2058        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2059        q6 = __msa_bmnz_v(q6, (v16u8) r_out, flat2);
2060        ST_D1(q6, 0, src);
2061
2062        return 0;
2063    }
2064}
2065
2066void ff_loop_filter_h_16_8_msa(uint8_t *src, ptrdiff_t pitch,
2067                               int32_t b_limit_ptr,
2068                               int32_t limit_ptr,
2069                               int32_t thresh_ptr)
2070{
2071    uint8_t early_exit = 0;
2072    uint8_t transposed_input[16 * 24] ALLOC_ALIGNED(ALIGNMENT);
2073    uint8_t *filter48 = &transposed_input[16 * 16];
2074
2075    vp9_transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16);
2076
2077    early_exit = vp9_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8),
2078                                         &filter48[0], src, pitch,
2079                                         b_limit_ptr, limit_ptr, thresh_ptr);
2080
2081    if (0 == early_exit) {
2082        early_exit = vp9_vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch,
2083                                       &filter48[0]);
2084
2085        if (0 == early_exit) {
2086            vp9_transpose_8x16_to_16x8(transposed_input, 16, src - 8, pitch);
2087        }
2088    }
2089}
2090
2091static int32_t vp9_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48,
2092                                        uint8_t *src_org, ptrdiff_t pitch,
2093                                        int32_t b_limit_ptr,
2094                                        int32_t limit_ptr,
2095                                        int32_t thresh_ptr)
2096{
2097    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
2098    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
2099    v16u8 flat, mask, hev, thresh, b_limit, limit;
2100    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
2101    v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
2102    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
2103    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
2104    v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
2105    v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
2106    v16i8 zero = { 0 };
2107    v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
2108
2109    /* load vector elements */
2110    LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);
2111
2112    thresh = (v16u8) __msa_fill_b(thresh_ptr);
2113    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
2114    limit = (v16u8) __msa_fill_b(limit_ptr);
2115
2116    /* mask and hev */
2117    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
2118                 hev, mask, flat);
2119    /* flat4 */
2120    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
2121    /* filter4 */
2122    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
2123                       q1_out);
2124
2125    /* if flat is zero for all pixels, then no need to calculate other filter */
2126    if (__msa_test_bz_v(flat)) {
2127        ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
2128        ILVRL_H2_SH(vec1, vec0, vec2, vec3);
2129        ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
2130        ILVRL_H2_SH(vec1, vec0, vec4, vec5);
2131
2132        src_org -= 2;
2133        ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src_org, pitch);
2134        ST_W8(vec4, vec5, 0, 1, 2, 3, 0, 1, 2, 3, src_org + 8 * pitch, pitch);
2135
2136        return 1;
2137    } else {
2138        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
2139                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
2140                   q3_r);
2141        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
2142                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
2143        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
2144                   p0_l);
2145        ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
2146                   q3_l);
2147        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
2148                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
2149
2150        /* convert 16 bit output data into 8 bit */
2151        PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
2152                    p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
2153                    p0_filt8_r, q0_filt8_r);
2154        PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
2155                    q2_filt8_r);
2156
2157        /* store pixel values */
2158        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
2159        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
2160        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
2161        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
2162        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
2163        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
2164
2165        ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
2166        filter48 += (4 * 16);
2167        ST_UB2(q1_out, q2_out, filter48, 16);
2168        filter48 += (2 * 16);
2169        ST_UB(flat, filter48);
2170
2171        return 0;
2172    }
2173}
2174
2175static int32_t vp9_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, ptrdiff_t pitch,
2176                                  uint8_t *filter48)
2177{
2178    v16u8 flat, flat2, filter8;
2179    v16i8 zero = { 0 };
2180    v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
2181    v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in;
2182    v8u16 p3_r_in, p2_r_in, p1_r_in, p0_r_in;
2183    v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in;
2184    v8u16 q3_r_in, q2_r_in, q1_r_in, q0_r_in;
2185    v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in;
2186    v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in;
2187    v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in;
2188    v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in;
2189    v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
2190    v8i16 l_out, r_out;
2191
2192    flat = LD_UB(filter48 + 6 * 16);
2193
2194    LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
2195    LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);
2196
2197    VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
2198
2199    /* if flat2 is zero for all pixels, then no need to calculate other filter */
2200    if (__msa_test_bz_v(flat2)) {
2201        v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2202
2203        LD_UB4(filter48, 16, p2, p1, p0, q0);
2204        LD_UB2(filter48 + 4 * 16, 16, q1, q2);
2205
2206        ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
2207        ILVRL_H2_SH(vec1, vec0, vec3, vec4);
2208        ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
2209        ILVRL_H2_SH(vec1, vec0, vec6, vec7);
2210        ILVRL_B2_SH(q2, q1, vec2, vec5);
2211
2212        src_org -= 3;
2213        ST_W4(vec3, 0, 1, 2, 3, src_org, pitch);
2214        ST_H4(vec2, 0, 1, 2, 3, (src_org + 4), pitch);
2215        src_org += (4 * pitch);
2216        ST_W4(vec4, 0, 1, 2, 3, src_org, pitch);
2217        ST_H4(vec2, 4, 5, 6, 7, (src_org + 4), pitch);
2218        src_org += (4 * pitch);
2219        ST_W4(vec6, 0, 1, 2, 3, src_org, pitch);
2220        ST_H4(vec5, 0, 1, 2, 3, (src_org + 4), pitch);
2221        src_org += (4 * pitch);
2222        ST_W4(vec7, 0, 1, 2, 3, src_org, pitch);
2223        ST_H4(vec5, 4, 5, 6, 7, (src_org + 4), pitch);
2224
2225        return 1;
2226    } else {
2227        src -= 7 * 16;
2228
2229        ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
2230                   zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
2231                   p3_r_in, p2_r_in, p1_r_in, p0_r_in);
2232        q0_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q0);
2233
2234        tmp0_r = p7_r_in << 3;
2235        tmp0_r -= p7_r_in;
2236        tmp0_r += p6_r_in;
2237        tmp0_r += q0_r_in;
2238        tmp1_r = p6_r_in + p5_r_in;
2239        tmp1_r += p4_r_in;
2240        tmp1_r += p3_r_in;
2241        tmp1_r += p2_r_in;
2242        tmp1_r += p1_r_in;
2243        tmp1_r += p0_r_in;
2244        tmp1_r += tmp0_r;
2245        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2246
2247        ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
2248                   p5_l_in, p4_l_in);
2249        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
2250                   p1_l_in, p0_l_in);
2251        q0_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q0);
2252
2253        tmp0_l = p7_l_in << 3;
2254        tmp0_l -= p7_l_in;
2255        tmp0_l += p6_l_in;
2256        tmp0_l += q0_l_in;
2257        tmp1_l = p6_l_in + p5_l_in;
2258        tmp1_l += p4_l_in;
2259        tmp1_l += p3_l_in;
2260        tmp1_l += p2_l_in;
2261        tmp1_l += p1_l_in;
2262        tmp1_l += p0_l_in;
2263        tmp1_l += tmp0_l;
2264        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2265
2266        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2267        p6 = __msa_bmnz_v(p6, (v16u8) r_out, flat2);
2268        ST_UB(p6, src);
2269        src += 16;
2270
2271        /* p5 */
2272        q1_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q1);
2273        tmp0_r = p5_r_in - p6_r_in;
2274        tmp0_r += q1_r_in;
2275        tmp0_r -= p7_r_in;
2276        tmp1_r += tmp0_r;
2277        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2278        q1_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q1);
2279        tmp0_l = p5_l_in - p6_l_in;
2280        tmp0_l += q1_l_in;
2281        tmp0_l -= p7_l_in;
2282        tmp1_l += tmp0_l;
2283        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2284        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2285        p5 = __msa_bmnz_v(p5, (v16u8) r_out, flat2);
2286        ST_UB(p5, src);
2287        src += 16;
2288
2289        /* p4 */
2290        q2_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q2);
2291        tmp0_r = p4_r_in - p5_r_in;
2292        tmp0_r += q2_r_in;
2293        tmp0_r -= p7_r_in;
2294        tmp1_r += tmp0_r;
2295        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2296        q2_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q2);
2297        tmp0_l = p4_l_in - p5_l_in;
2298        tmp0_l += q2_l_in;
2299        tmp0_l -= p7_l_in;
2300        tmp1_l += tmp0_l;
2301        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2302        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2303        p4 = __msa_bmnz_v(p4, (v16u8) r_out, flat2);
2304        ST_UB(p4, src);
2305        src += 16;
2306
2307        /* p3 */
2308        q3_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q3);
2309        tmp0_r = p3_r_in - p4_r_in;
2310        tmp0_r += q3_r_in;
2311        tmp0_r -= p7_r_in;
2312        tmp1_r += tmp0_r;
2313        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2314        q3_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q3);
2315        tmp0_l = p3_l_in - p4_l_in;
2316        tmp0_l += q3_l_in;
2317        tmp0_l -= p7_l_in;
2318        tmp1_l += tmp0_l;
2319        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2320        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2321        p3 = __msa_bmnz_v(p3, (v16u8) r_out, flat2);
2322        ST_UB(p3, src);
2323        src += 16;
2324
2325        /* p2 */
2326        q4_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q4);
2327        filter8 = LD_UB(filter48);
2328        tmp0_r = p2_r_in - p3_r_in;
2329        tmp0_r += q4_r_in;
2330        tmp0_r -= p7_r_in;
2331        tmp1_r += tmp0_r;
2332        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2333        q4_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q4);
2334        tmp0_l = p2_l_in - p3_l_in;
2335        tmp0_l += q4_l_in;
2336        tmp0_l -= p7_l_in;
2337        tmp1_l += tmp0_l;
2338        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2339        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2340        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2341        ST_UB(filter8, src);
2342        src += 16;
2343
2344        /* p1 */
2345        q5_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q5);
2346        filter8 = LD_UB(filter48 + 16);
2347        tmp0_r = p1_r_in - p2_r_in;
2348        tmp0_r += q5_r_in;
2349        tmp0_r -= p7_r_in;
2350        tmp1_r += tmp0_r;
2351        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2352        q5_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q5);
2353        tmp0_l = p1_l_in - p2_l_in;
2354        tmp0_l += q5_l_in;
2355        tmp0_l -= p7_l_in;
2356        tmp1_l += tmp0_l;
2357        l_out = __msa_srari_h((v8i16) (tmp1_l), 4);
2358        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2359        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2360        ST_UB(filter8, src);
2361        src += 16;
2362
2363        /* p0 */
2364        q6_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q6);
2365        filter8 = LD_UB(filter48 + 32);
2366        tmp0_r = p0_r_in - p1_r_in;
2367        tmp0_r += q6_r_in;
2368        tmp0_r -= p7_r_in;
2369        tmp1_r += tmp0_r;
2370        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2371        q6_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q6);
2372        tmp0_l = p0_l_in - p1_l_in;
2373        tmp0_l += q6_l_in;
2374        tmp0_l -= p7_l_in;
2375        tmp1_l += tmp0_l;
2376        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2377        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2378        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2379        ST_UB(filter8, src);
2380        src += 16;
2381
2382        /* q0 */
2383        q7_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q7);
2384        filter8 = LD_UB(filter48 + 48);
2385        tmp0_r = q7_r_in - p0_r_in;
2386        tmp0_r += q0_r_in;
2387        tmp0_r -= p7_r_in;
2388        tmp1_r += tmp0_r;
2389        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2390        q7_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q7);
2391        tmp0_l = q7_l_in - p0_l_in;
2392        tmp0_l += q0_l_in;
2393        tmp0_l -= p7_l_in;
2394        tmp1_l += tmp0_l;
2395        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2396        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2397        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2398        ST_UB(filter8, src);
2399        src += 16;
2400
2401        /* q1 */
2402        filter8 = LD_UB(filter48 + 64);
2403        tmp0_r = q7_r_in - q0_r_in;
2404        tmp0_r += q1_r_in;
2405        tmp0_r -= p6_r_in;
2406        tmp1_r += tmp0_r;
2407        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2408        tmp0_l = q7_l_in - q0_l_in;
2409        tmp0_l += q1_l_in;
2410        tmp0_l -= p6_l_in;
2411        tmp1_l += tmp0_l;
2412        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2413        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2414        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2415        ST_UB(filter8, src);
2416        src += 16;
2417
2418        /* q2 */
2419        filter8 = LD_UB(filter48 + 80);
2420        tmp0_r = q7_r_in - q1_r_in;
2421        tmp0_r += q2_r_in;
2422        tmp0_r -= p5_r_in;
2423        tmp1_r += tmp0_r;
2424        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2425        tmp0_l = q7_l_in - q1_l_in;
2426        tmp0_l += q2_l_in;
2427        tmp0_l -= p5_l_in;
2428        tmp1_l += tmp0_l;
2429        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2430        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2431        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2432        ST_UB(filter8, src);
2433        src += 16;
2434
2435        /* q3 */
2436        tmp0_r = q7_r_in - q2_r_in;
2437        tmp0_r += q3_r_in;
2438        tmp0_r -= p4_r_in;
2439        tmp1_r += tmp0_r;
2440        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2441        tmp0_l = q7_l_in - q2_l_in;
2442        tmp0_l += q3_l_in;
2443        tmp0_l -= p4_l_in;
2444        tmp1_l += tmp0_l;
2445        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2446        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2447        q3 = __msa_bmnz_v(q3, (v16u8) r_out, flat2);
2448        ST_UB(q3, src);
2449        src += 16;
2450
2451        /* q4 */
2452        tmp0_r = q7_r_in - q3_r_in;
2453        tmp0_r += q4_r_in;
2454        tmp0_r -= p3_r_in;
2455        tmp1_r += tmp0_r;
2456        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2457        tmp0_l = q7_l_in - q3_l_in;
2458        tmp0_l += q4_l_in;
2459        tmp0_l -= p3_l_in;
2460        tmp1_l += tmp0_l;
2461        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2462        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2463        q4 = __msa_bmnz_v(q4, (v16u8) r_out, flat2);
2464        ST_UB(q4, src);
2465        src += 16;
2466
2467        /* q5 */
2468        tmp0_r = q7_r_in - q4_r_in;
2469        tmp0_r += q5_r_in;
2470        tmp0_r -= p2_r_in;
2471        tmp1_r += tmp0_r;
2472        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2473        tmp0_l = q7_l_in - q4_l_in;
2474        tmp0_l += q5_l_in;
2475        tmp0_l -= p2_l_in;
2476        tmp1_l += tmp0_l;
2477        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2478        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2479        q5 = __msa_bmnz_v(q5, (v16u8) r_out, flat2);
2480        ST_UB(q5, src);
2481        src += 16;
2482
2483        /* q6 */
2484        tmp0_r = q7_r_in - q5_r_in;
2485        tmp0_r += q6_r_in;
2486        tmp0_r -= p1_r_in;
2487        tmp1_r += tmp0_r;
2488        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2489        tmp0_l = q7_l_in - q5_l_in;
2490        tmp0_l += q6_l_in;
2491        tmp0_l -= p1_l_in;
2492        tmp1_l += tmp0_l;
2493        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2494        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2495        q6 = __msa_bmnz_v(q6, (v16u8) r_out, flat2);
2496        ST_UB(q6, src);
2497
2498        return 0;
2499    }
2500}
2501
2502void ff_loop_filter_h_16_16_msa(uint8_t *src, ptrdiff_t pitch,
2503                                int32_t b_limit_ptr,
2504                                int32_t limit_ptr,
2505                                int32_t thresh_ptr)
2506{
2507    uint8_t early_exit = 0;
2508    uint8_t transposed_input[16 * 24] ALLOC_ALIGNED(ALIGNMENT);
2509    uint8_t *filter48 = &transposed_input[16 * 16];
2510
2511    vp9_transpose_16x16((src - 8), pitch, &transposed_input[0], 16);
2512
2513    early_exit = vp9_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8),
2514                                          &filter48[0], src, pitch,
2515                                          b_limit_ptr, limit_ptr, thresh_ptr);
2516
2517    if (0 == early_exit) {
2518        early_exit = vp9_vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch,
2519                                        &filter48[0]);
2520
2521        if (0 == early_exit) {
2522            vp9_transpose_16x16(transposed_input, 16, (src - 8), pitch);
2523        }
2524    }
2525}
2526