/*
 * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mips/generic_macros_msa.h"
#include "h263dsp_mips.h"

/*
 * Loop-filter strength lookup used by the filter routines in this file.
 * It is indexed directly by the qscale argument; the table holds exactly
 * 32 entries, so the index must lie in the range 0..31.
 */
static const uint8_t h263_loop_filter_strength_msa[32] = {
    0, 1, 1, 2, 2, 3, 3, 4, 4, 4, 5, 5, 6, 6, 7, 7,
    7, 8, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 12
};
29cabdff1aSopenharmony_cistatic void h263_h_loop_filter_msa(uint8_t *src, int32_t stride, int32_t qscale)
30cabdff1aSopenharmony_ci{
31cabdff1aSopenharmony_ci    int32_t strength = h263_loop_filter_strength_msa[qscale];
32cabdff1aSopenharmony_ci    v16u8 in0, in1, in2, in3, in4, in5, in6, in7;
33cabdff1aSopenharmony_ci    v8i16 temp0, temp1, temp2;
34cabdff1aSopenharmony_ci    v8i16 diff0, diff2, diff4, diff6, diff8;
35cabdff1aSopenharmony_ci    v8i16 d0, a_d0, str_x2, str;
36cabdff1aSopenharmony_ci
37cabdff1aSopenharmony_ci    src -= 2;
38cabdff1aSopenharmony_ci    LD_UB8(src, stride, in0, in1, in2, in3, in4, in5, in6, in7);
39cabdff1aSopenharmony_ci    TRANSPOSE8x4_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,
40cabdff1aSopenharmony_ci                       in0, in3, in2, in1);
41cabdff1aSopenharmony_ci
42cabdff1aSopenharmony_ci    temp0 = (v8i16) __msa_ilvr_b((v16i8) in0, (v16i8) in1);
43cabdff1aSopenharmony_ci    a_d0 = __msa_hsub_u_h((v16u8) temp0, (v16u8) temp0);
44cabdff1aSopenharmony_ci    temp2 = (v8i16) __msa_ilvr_b((v16i8) in2, (v16i8) in3);
45cabdff1aSopenharmony_ci    temp2 = __msa_hsub_u_h((v16u8) temp2, (v16u8) temp2);
46cabdff1aSopenharmony_ci    temp2 <<= 2;
47cabdff1aSopenharmony_ci    diff0 = a_d0 + temp2;
48cabdff1aSopenharmony_ci    diff2 = -(-diff0 >> 3);
49cabdff1aSopenharmony_ci    str_x2 = __msa_fill_h(-(strength << 1));
50cabdff1aSopenharmony_ci    temp0 = (str_x2 <= diff2);
51cabdff1aSopenharmony_ci    diff2 = (v8i16) __msa_bmz_v((v16u8) diff2, (v16u8) temp0, (v16u8) temp0);
52cabdff1aSopenharmony_ci    temp2 = str_x2 - diff2;
53cabdff1aSopenharmony_ci    str = __msa_fill_h(-strength);
54cabdff1aSopenharmony_ci    temp0 = (diff2 < str);
55cabdff1aSopenharmony_ci    diff2 = (v8i16) __msa_bmnz_v((v16u8) diff2, (v16u8) temp2, (v16u8) temp0);
56cabdff1aSopenharmony_ci    diff4 = diff0 >> 3;
57cabdff1aSopenharmony_ci    str_x2 = __msa_fill_h(strength << 1);
58cabdff1aSopenharmony_ci    temp0 = (diff4 <= str_x2);
59cabdff1aSopenharmony_ci    diff4 = (v8i16) __msa_bmz_v((v16u8) diff4, (v16u8) temp0, (v16u8) temp0);
60cabdff1aSopenharmony_ci    temp2 = str_x2 - diff4;
61cabdff1aSopenharmony_ci    str = __msa_fill_h(strength);
62cabdff1aSopenharmony_ci    temp0 = (str < diff4);
63cabdff1aSopenharmony_ci    diff4 = (v8i16) __msa_bmnz_v((v16u8) diff4, (v16u8) temp2, (v16u8) temp0);
64cabdff1aSopenharmony_ci    temp0 = __msa_clti_s_h(diff0, 0);
65cabdff1aSopenharmony_ci    d0 = (v8i16) __msa_bmnz_v((v16u8) diff4, (v16u8) diff2, (v16u8) temp0);
66cabdff1aSopenharmony_ci    diff2 = -diff2 >> 1;
67cabdff1aSopenharmony_ci    diff4 >>= 1;
68cabdff1aSopenharmony_ci    diff8 = (v8i16) __msa_bmnz_v((v16u8) diff4, (v16u8) diff2, (v16u8) temp0);
69cabdff1aSopenharmony_ci    diff6 = (-a_d0) >> 2;
70cabdff1aSopenharmony_ci    diff6 = -(diff6);
71cabdff1aSopenharmony_ci    temp2 = -diff8;
72cabdff1aSopenharmony_ci    temp0 = (diff6 < temp2);
73cabdff1aSopenharmony_ci    diff6 = (v8i16) __msa_bmnz_v((v16u8) diff6, (v16u8) temp2, (v16u8) temp0);
74cabdff1aSopenharmony_ci    diff2 = a_d0 >> 2;
75cabdff1aSopenharmony_ci    temp0 = (diff2 <= diff8);
76cabdff1aSopenharmony_ci    diff2 = (v8i16) __msa_bmz_v((v16u8) diff2, (v16u8) diff8, (v16u8) temp0);
77cabdff1aSopenharmony_ci    temp0 = __msa_clti_s_h(a_d0, 0);
78cabdff1aSopenharmony_ci    diff6 = (v8i16) __msa_bmz_v((v16u8) diff6, (v16u8) diff2, (v16u8) temp0);
79cabdff1aSopenharmony_ci    PCKEV_B2_SH(a_d0, diff6, a_d0, d0, diff6, d0);
80cabdff1aSopenharmony_ci    in0 = (v16u8) ((v16i8) in0 - (v16i8) diff6);
81cabdff1aSopenharmony_ci    in1 = (v16u8) ((v16i8) in1 + (v16i8) diff6);
82cabdff1aSopenharmony_ci    in3 = __msa_xori_b(in3, 128);
83cabdff1aSopenharmony_ci    in3 = (v16u8) __msa_adds_s_b((v16i8) in3, (v16i8) d0);
84cabdff1aSopenharmony_ci    in3 = __msa_xori_b(in3, 128);
85cabdff1aSopenharmony_ci    in2 = __msa_subsus_u_b(in2, (v16i8) d0);
86cabdff1aSopenharmony_ci    ILVR_B2_SH(in3, in0, in1, in2, temp0, temp1);
87cabdff1aSopenharmony_ci    in0 = (v16u8) __msa_ilvr_h(temp1, temp0);
88cabdff1aSopenharmony_ci    in3 = (v16u8) __msa_ilvl_h(temp1, temp0);
89cabdff1aSopenharmony_ci    ST_W8(in0, in3, 0, 1, 2, 3, 0, 1, 2, 3, src, stride);
90cabdff1aSopenharmony_ci}
92cabdff1aSopenharmony_cistatic void h263_v_loop_filter_msa(uint8_t *src, int32_t stride, int32_t qscale)
93cabdff1aSopenharmony_ci{
94cabdff1aSopenharmony_ci    int32_t strength = h263_loop_filter_strength_msa[qscale];
95cabdff1aSopenharmony_ci    uint64_t res0, res1, res2, res3;
96cabdff1aSopenharmony_ci    v16u8 in0, in1, in2, in3;
97cabdff1aSopenharmony_ci    v8i16 temp0, temp2, diff0, diff2, diff4, diff6, diff8;
98cabdff1aSopenharmony_ci    v8i16 d0, a_d0, str_x2, str;
99cabdff1aSopenharmony_ci
100cabdff1aSopenharmony_ci    src -= 2 * stride;
101cabdff1aSopenharmony_ci    LD_UB4(src, stride, in0, in3, in2, in1);
102cabdff1aSopenharmony_ci    temp0 = (v8i16) __msa_ilvr_b((v16i8) in0, (v16i8) in1);
103cabdff1aSopenharmony_ci    a_d0 = __msa_hsub_u_h((v16u8) temp0, (v16u8) temp0);
104cabdff1aSopenharmony_ci    temp2 = (v8i16) __msa_ilvr_b((v16i8) in2, (v16i8) in3);
105cabdff1aSopenharmony_ci    temp2 = __msa_hsub_u_h((v16u8) temp2, (v16u8) temp2);
106cabdff1aSopenharmony_ci    temp2 <<= 2;
107cabdff1aSopenharmony_ci    diff0 = a_d0 + temp2;
108cabdff1aSopenharmony_ci    diff2 = -(-diff0 >> 3);
109cabdff1aSopenharmony_ci    str_x2 = __msa_fill_h(-(strength << 1));
110cabdff1aSopenharmony_ci    temp0 = (str_x2 <= diff2);
111cabdff1aSopenharmony_ci    diff2 = (v8i16) __msa_bmz_v((v16u8) diff2, (v16u8) temp0, (v16u8) temp0);
112cabdff1aSopenharmony_ci    temp2 = str_x2 - diff2;
113cabdff1aSopenharmony_ci    str = __msa_fill_h(-strength);
114cabdff1aSopenharmony_ci    temp0 = (diff2 < str);
115cabdff1aSopenharmony_ci    diff2 = (v8i16) __msa_bmnz_v((v16u8) diff2, (v16u8) temp2, (v16u8) temp0);
116cabdff1aSopenharmony_ci    diff4 = diff0 >> 3;
117cabdff1aSopenharmony_ci    str_x2 = __msa_fill_h(strength << 1);
118cabdff1aSopenharmony_ci    temp0 = (diff4 <= str_x2);
119cabdff1aSopenharmony_ci    diff4 = (v8i16) __msa_bmz_v((v16u8) diff4, (v16u8) temp0, (v16u8) temp0);
120cabdff1aSopenharmony_ci    temp2 = str_x2 - diff4;
121cabdff1aSopenharmony_ci    str = __msa_fill_h(strength);
122cabdff1aSopenharmony_ci    temp0 = (str < diff4);
123cabdff1aSopenharmony_ci    diff4 = (v8i16) __msa_bmnz_v((v16u8) diff4, (v16u8) temp2, (v16u8) temp0);
124cabdff1aSopenharmony_ci    temp0 = __msa_clti_s_h(diff0, 0);
125cabdff1aSopenharmony_ci    d0 = (v8i16) __msa_bmnz_v((v16u8) diff4, (v16u8) diff2, (v16u8) temp0);
126cabdff1aSopenharmony_ci    diff2 = -diff2 >> 1;
127cabdff1aSopenharmony_ci    diff4 >>= 1;
128cabdff1aSopenharmony_ci    diff8 = (v8i16) __msa_bmnz_v((v16u8) diff4, (v16u8) diff2, (v16u8) temp0);
129cabdff1aSopenharmony_ci    diff6 = (-a_d0) >> 2;
130cabdff1aSopenharmony_ci    diff6 = -(diff6);
131cabdff1aSopenharmony_ci    temp2 = -diff8;
132cabdff1aSopenharmony_ci    temp0 = (diff6 < temp2);
133cabdff1aSopenharmony_ci    diff6 = (v8i16) __msa_bmnz_v((v16u8) diff6, (v16u8) temp2, (v16u8) temp0);
134cabdff1aSopenharmony_ci    diff2 = a_d0 >> 2;
135cabdff1aSopenharmony_ci    temp0 = (diff2 <= diff8);
136cabdff1aSopenharmony_ci    diff2 = (v8i16) __msa_bmz_v((v16u8) diff2, (v16u8) diff8, (v16u8) temp0);
137cabdff1aSopenharmony_ci    temp0 = __msa_clti_s_h(a_d0, 0);
138cabdff1aSopenharmony_ci    diff6 = (v8i16) __msa_bmz_v((v16u8) diff6, (v16u8) diff2, (v16u8) temp0);
139cabdff1aSopenharmony_ci    PCKEV_B2_SH(a_d0, diff6, a_d0, d0, diff6, d0);
140cabdff1aSopenharmony_ci    in0 = (v16u8) ((v16i8) in0 - (v16i8) diff6);
141cabdff1aSopenharmony_ci    in1 = (v16u8) ((v16i8) in1 + (v16i8) diff6);
142cabdff1aSopenharmony_ci    in3 = __msa_xori_b(in3, 128);
143cabdff1aSopenharmony_ci    in3 = (v16u8) __msa_adds_s_b((v16i8) in3, (v16i8) d0);
144cabdff1aSopenharmony_ci    in3 = __msa_xori_b(in3, 128);
145cabdff1aSopenharmony_ci    in2 = __msa_subsus_u_b(in2, (v16i8) d0);
146cabdff1aSopenharmony_ci    res0 = __msa_copy_u_d((v2i64) in0, 0);
147cabdff1aSopenharmony_ci    res1 = __msa_copy_u_d((v2i64) in3, 0);
148cabdff1aSopenharmony_ci    res2 = __msa_copy_u_d((v2i64) in2, 0);
149cabdff1aSopenharmony_ci    res3 = __msa_copy_u_d((v2i64) in1, 0);
150cabdff1aSopenharmony_ci    SD4(res0, res1, res2, res3, src, stride);
151cabdff1aSopenharmony_ci}
/*
 * Exported entry point for the MSA horizontal loop filter.
 * Forwards its arguments unchanged to h263_h_loop_filter_msa().
 *
 * src     - pixel pointer handed to the filter
 * stride  - byte distance between successive rows
 * q_scale - strength-table index; must be within 0..31
 */
void ff_h263_h_loop_filter_msa(uint8_t *src, int32_t stride, int32_t q_scale)
{
    h263_h_loop_filter_msa(src, stride, q_scale);
}
/*
 * Exported entry point for the MSA vertical loop filter.
 * Forwards its arguments unchanged to h263_v_loop_filter_msa().
 *
 * src     - pixel pointer handed to the filter
 * stride  - byte distance between successive rows
 * q_scale - strength-table index; must be within 0..31
 */
void ff_h263_v_loop_filter_msa(uint8_t *src, int32_t stride, int32_t q_scale)
{
    h263_v_loop_filter_msa(src, stride, q_scale);
}