1cabdff1aSopenharmony_ci/*
2cabdff1aSopenharmony_ci * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
3cabdff1aSopenharmony_ci *
4cabdff1aSopenharmony_ci * This file is part of FFmpeg.
5cabdff1aSopenharmony_ci *
6cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or
7cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public
8cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either
9cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version.
10cabdff1aSopenharmony_ci *
11cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful,
12cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of
13cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14cabdff1aSopenharmony_ci * Lesser General Public License for more details.
15cabdff1aSopenharmony_ci *
16cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public
17cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software
18cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19cabdff1aSopenharmony_ci */
20cabdff1aSopenharmony_ci
21cabdff1aSopenharmony_ci#include "libavutil/mips/generic_macros_msa.h"
22cabdff1aSopenharmony_ci#include "h263dsp_mips.h"
23cabdff1aSopenharmony_ci
24cabdff1aSopenharmony_cistatic void h263_dct_unquantize_msa(int16_t *block, int16_t qmul,
25cabdff1aSopenharmony_ci                                    int16_t qadd, int8_t n_coeffs,
26cabdff1aSopenharmony_ci                                    uint8_t loop_start)
27cabdff1aSopenharmony_ci{
28cabdff1aSopenharmony_ci    int16_t *block_dup = block;
29cabdff1aSopenharmony_ci    int32_t level, cnt;
30cabdff1aSopenharmony_ci    v8i16 block_vec, qmul_vec, qadd_vec, sub;
31cabdff1aSopenharmony_ci    v8i16 add, mask, mul, zero_mask;
32cabdff1aSopenharmony_ci
33cabdff1aSopenharmony_ci    qmul_vec = __msa_fill_h(qmul);
34cabdff1aSopenharmony_ci    qadd_vec = __msa_fill_h(qadd);
35cabdff1aSopenharmony_ci    for (cnt = 0; cnt < (n_coeffs >> 3); cnt++) {
36cabdff1aSopenharmony_ci        block_vec = LD_SH(block_dup + loop_start);
37cabdff1aSopenharmony_ci        mask = __msa_clti_s_h(block_vec, 0);
38cabdff1aSopenharmony_ci        zero_mask = __msa_ceqi_h(block_vec, 0);
39cabdff1aSopenharmony_ci        mul = block_vec * qmul_vec;
40cabdff1aSopenharmony_ci        sub = mul - qadd_vec;
41cabdff1aSopenharmony_ci        add = mul + qadd_vec;
42cabdff1aSopenharmony_ci        add = (v8i16) __msa_bmnz_v((v16u8) add, (v16u8) sub, (v16u8) mask);
43cabdff1aSopenharmony_ci        block_vec = (v8i16) __msa_bmnz_v((v16u8) add, (v16u8) block_vec,
44cabdff1aSopenharmony_ci                                         (v16u8) zero_mask);
45cabdff1aSopenharmony_ci        ST_SH(block_vec, block_dup + loop_start);
46cabdff1aSopenharmony_ci        block_dup += 8;
47cabdff1aSopenharmony_ci    }
48cabdff1aSopenharmony_ci
49cabdff1aSopenharmony_ci    cnt = ((n_coeffs >> 3) * 8) + loop_start;
50cabdff1aSopenharmony_ci
51cabdff1aSopenharmony_ci    for (; cnt <= n_coeffs; cnt++) {
52cabdff1aSopenharmony_ci        level = block[cnt];
53cabdff1aSopenharmony_ci        if (level) {
54cabdff1aSopenharmony_ci            if (level < 0) {
55cabdff1aSopenharmony_ci                level = level * qmul - qadd;
56cabdff1aSopenharmony_ci            } else {
57cabdff1aSopenharmony_ci                level = level * qmul + qadd;
58cabdff1aSopenharmony_ci            }
59cabdff1aSopenharmony_ci            block[cnt] = level;
60cabdff1aSopenharmony_ci        }
61cabdff1aSopenharmony_ci    }
62cabdff1aSopenharmony_ci}
63cabdff1aSopenharmony_ci
64cabdff1aSopenharmony_cistatic int32_t mpeg2_dct_unquantize_inter_msa(int16_t *block,
65cabdff1aSopenharmony_ci                                              int32_t qscale,
66cabdff1aSopenharmony_ci                                              const int16_t *quant_matrix)
67cabdff1aSopenharmony_ci{
68cabdff1aSopenharmony_ci    int32_t cnt, sum_res = -1;
69cabdff1aSopenharmony_ci    v8i16 block_vec, block_neg, qscale_vec, mask;
70cabdff1aSopenharmony_ci    v8i16 block_org0, block_org1, block_org2, block_org3;
71cabdff1aSopenharmony_ci    v8i16 quant_m0, quant_m1, quant_m2, quant_m3;
72cabdff1aSopenharmony_ci    v8i16 sum, mul, zero_mask;
73cabdff1aSopenharmony_ci    v4i32 mul_vec, qscale_l, qscale_r, quant_m_r, quant_m_l;
74cabdff1aSopenharmony_ci    v4i32 block_l, block_r, sad;
75cabdff1aSopenharmony_ci
76cabdff1aSopenharmony_ci    qscale_vec = __msa_fill_h(qscale);
77cabdff1aSopenharmony_ci    for (cnt = 0; cnt < 2; cnt++) {
78cabdff1aSopenharmony_ci        LD_SH4(block, 8, block_org0, block_org1, block_org2, block_org3);
79cabdff1aSopenharmony_ci        LD_SH4(quant_matrix, 8, quant_m0, quant_m1, quant_m2, quant_m3);
80cabdff1aSopenharmony_ci        mask = __msa_clti_s_h(block_org0, 0);
81cabdff1aSopenharmony_ci        zero_mask = __msa_ceqi_h(block_org0, 0);
82cabdff1aSopenharmony_ci        block_neg = -block_org0;
83cabdff1aSopenharmony_ci        block_vec = (v8i16) __msa_bmnz_v((v16u8) block_org0, (v16u8) block_neg,
84cabdff1aSopenharmony_ci                                         (v16u8) mask);
85cabdff1aSopenharmony_ci        block_vec <<= 1;
86cabdff1aSopenharmony_ci        block_vec += 1;
87cabdff1aSopenharmony_ci        UNPCK_SH_SW(block_vec, block_r, block_l);
88cabdff1aSopenharmony_ci        UNPCK_SH_SW(qscale_vec, qscale_r, qscale_l);
89cabdff1aSopenharmony_ci        UNPCK_SH_SW(quant_m0, quant_m_r, quant_m_l);
90cabdff1aSopenharmony_ci        mul_vec = block_l * qscale_l;
91cabdff1aSopenharmony_ci        mul_vec *= quant_m_l;
92cabdff1aSopenharmony_ci        block_l = mul_vec >> 4;
93cabdff1aSopenharmony_ci        mul_vec = block_r * qscale_r;
94cabdff1aSopenharmony_ci        mul_vec *= quant_m_r;
95cabdff1aSopenharmony_ci        block_r = mul_vec >> 4;
96cabdff1aSopenharmony_ci        mul = (v8i16) __msa_pckev_h((v8i16) block_l, (v8i16) block_r);
97cabdff1aSopenharmony_ci        block_neg = - mul;
98cabdff1aSopenharmony_ci        sum = (v8i16) __msa_bmnz_v((v16u8) mul, (v16u8) block_neg,
99cabdff1aSopenharmony_ci                                   (v16u8) mask);
100cabdff1aSopenharmony_ci        sum = (v8i16) __msa_bmnz_v((v16u8) sum, (v16u8) block_org0,
101cabdff1aSopenharmony_ci                                   (v16u8) zero_mask);
102cabdff1aSopenharmony_ci        ST_SH(sum, block);
103cabdff1aSopenharmony_ci        block += 8;
104cabdff1aSopenharmony_ci        quant_matrix += 8;
105cabdff1aSopenharmony_ci        sad = __msa_hadd_s_w(sum, sum);
106cabdff1aSopenharmony_ci        sum_res += HADD_SW_S32(sad);
107cabdff1aSopenharmony_ci        mask = __msa_clti_s_h(block_org1, 0);
108cabdff1aSopenharmony_ci        zero_mask = __msa_ceqi_h(block_org1, 0);
109cabdff1aSopenharmony_ci        block_neg = - block_org1;
110cabdff1aSopenharmony_ci        block_vec = (v8i16) __msa_bmnz_v((v16u8) block_org1, (v16u8) block_neg,
111cabdff1aSopenharmony_ci                                         (v16u8) mask);
112cabdff1aSopenharmony_ci        block_vec <<= 1;
113cabdff1aSopenharmony_ci        block_vec += 1;
114cabdff1aSopenharmony_ci        UNPCK_SH_SW(block_vec, block_r, block_l);
115cabdff1aSopenharmony_ci        UNPCK_SH_SW(qscale_vec, qscale_r, qscale_l);
116cabdff1aSopenharmony_ci        UNPCK_SH_SW(quant_m1, quant_m_r, quant_m_l);
117cabdff1aSopenharmony_ci        mul_vec = block_l * qscale_l;
118cabdff1aSopenharmony_ci        mul_vec *= quant_m_l;
119cabdff1aSopenharmony_ci        block_l = mul_vec >> 4;
120cabdff1aSopenharmony_ci        mul_vec = block_r * qscale_r;
121cabdff1aSopenharmony_ci        mul_vec *= quant_m_r;
122cabdff1aSopenharmony_ci        block_r = mul_vec >> 4;
123cabdff1aSopenharmony_ci        mul = __msa_pckev_h((v8i16) block_l, (v8i16) block_r);
124cabdff1aSopenharmony_ci        block_neg = - mul;
125cabdff1aSopenharmony_ci        sum = (v8i16) __msa_bmnz_v((v16u8) mul, (v16u8) block_neg,
126cabdff1aSopenharmony_ci                                   (v16u8) mask);
127cabdff1aSopenharmony_ci        sum = (v8i16) __msa_bmnz_v((v16u8) sum, (v16u8) block_org1,
128cabdff1aSopenharmony_ci                                   (v16u8) zero_mask);
129cabdff1aSopenharmony_ci        ST_SH(sum, block);
130cabdff1aSopenharmony_ci
131cabdff1aSopenharmony_ci        block += 8;
132cabdff1aSopenharmony_ci        quant_matrix += 8;
133cabdff1aSopenharmony_ci        sad = __msa_hadd_s_w(sum, sum);
134cabdff1aSopenharmony_ci        sum_res += HADD_SW_S32(sad);
135cabdff1aSopenharmony_ci        mask = __msa_clti_s_h(block_org2, 0);
136cabdff1aSopenharmony_ci        zero_mask = __msa_ceqi_h(block_org2, 0);
137cabdff1aSopenharmony_ci        block_neg = - block_org2;
138cabdff1aSopenharmony_ci        block_vec = (v8i16) __msa_bmnz_v((v16u8) block_org2, (v16u8) block_neg,
139cabdff1aSopenharmony_ci                                         (v16u8) mask);
140cabdff1aSopenharmony_ci        block_vec <<= 1;
141cabdff1aSopenharmony_ci        block_vec += 1;
142cabdff1aSopenharmony_ci        UNPCK_SH_SW(block_vec, block_r, block_l);
143cabdff1aSopenharmony_ci        UNPCK_SH_SW(qscale_vec, qscale_r, qscale_l);
144cabdff1aSopenharmony_ci        UNPCK_SH_SW(quant_m2, quant_m_r, quant_m_l);
145cabdff1aSopenharmony_ci        mul_vec = block_l * qscale_l;
146cabdff1aSopenharmony_ci        mul_vec *= quant_m_l;
147cabdff1aSopenharmony_ci        block_l = mul_vec >> 4;
148cabdff1aSopenharmony_ci        mul_vec = block_r * qscale_r;
149cabdff1aSopenharmony_ci        mul_vec *= quant_m_r;
150cabdff1aSopenharmony_ci        block_r = mul_vec >> 4;
151cabdff1aSopenharmony_ci        mul = __msa_pckev_h((v8i16) block_l, (v8i16) block_r);
152cabdff1aSopenharmony_ci        block_neg = - mul;
153cabdff1aSopenharmony_ci        sum = (v8i16) __msa_bmnz_v((v16u8) mul, (v16u8) block_neg,
154cabdff1aSopenharmony_ci                                   (v16u8) mask);
155cabdff1aSopenharmony_ci        sum = (v8i16) __msa_bmnz_v((v16u8) sum, (v16u8) block_org2,
156cabdff1aSopenharmony_ci                                   (v16u8) zero_mask);
157cabdff1aSopenharmony_ci        ST_SH(sum, block);
158cabdff1aSopenharmony_ci
159cabdff1aSopenharmony_ci        block += 8;
160cabdff1aSopenharmony_ci        quant_matrix += 8;
161cabdff1aSopenharmony_ci        sad = __msa_hadd_s_w(sum, sum);
162cabdff1aSopenharmony_ci        sum_res += HADD_SW_S32(sad);
163cabdff1aSopenharmony_ci        mask = __msa_clti_s_h(block_org3, 0);
164cabdff1aSopenharmony_ci        zero_mask = __msa_ceqi_h(block_org3, 0);
165cabdff1aSopenharmony_ci        block_neg = - block_org3;
166cabdff1aSopenharmony_ci        block_vec = (v8i16) __msa_bmnz_v((v16u8) block_org3, (v16u8) block_neg,
167cabdff1aSopenharmony_ci                                         (v16u8) mask);
168cabdff1aSopenharmony_ci        block_vec <<= 1;
169cabdff1aSopenharmony_ci        block_vec += 1;
170cabdff1aSopenharmony_ci        UNPCK_SH_SW(block_vec, block_r, block_l);
171cabdff1aSopenharmony_ci        UNPCK_SH_SW(qscale_vec, qscale_r, qscale_l);
172cabdff1aSopenharmony_ci        UNPCK_SH_SW(quant_m3, quant_m_r, quant_m_l);
173cabdff1aSopenharmony_ci        mul_vec = block_l * qscale_l;
174cabdff1aSopenharmony_ci        mul_vec *= quant_m_l;
175cabdff1aSopenharmony_ci        block_l = mul_vec >> 4;
176cabdff1aSopenharmony_ci        mul_vec = block_r * qscale_r;
177cabdff1aSopenharmony_ci        mul_vec *= quant_m_r;
178cabdff1aSopenharmony_ci        block_r = mul_vec >> 4;
179cabdff1aSopenharmony_ci        mul = __msa_pckev_h((v8i16) block_l, (v8i16) block_r);
180cabdff1aSopenharmony_ci        block_neg = - mul;
181cabdff1aSopenharmony_ci        sum = (v8i16) __msa_bmnz_v((v16u8) mul, (v16u8) block_neg,
182cabdff1aSopenharmony_ci                                   (v16u8) mask);
183cabdff1aSopenharmony_ci        sum = (v8i16) __msa_bmnz_v((v16u8) sum, (v16u8) block_org3,
184cabdff1aSopenharmony_ci                                   (v16u8) zero_mask);
185cabdff1aSopenharmony_ci        ST_SH(sum, block);
186cabdff1aSopenharmony_ci
187cabdff1aSopenharmony_ci        block += 8;
188cabdff1aSopenharmony_ci        quant_matrix += 8;
189cabdff1aSopenharmony_ci        sad = __msa_hadd_s_w(sum, sum);
190cabdff1aSopenharmony_ci        sum_res += HADD_SW_S32(sad);
191cabdff1aSopenharmony_ci    }
192cabdff1aSopenharmony_ci
193cabdff1aSopenharmony_ci    return sum_res;
194cabdff1aSopenharmony_ci}
195cabdff1aSopenharmony_ci
196cabdff1aSopenharmony_civoid ff_dct_unquantize_h263_intra_msa(MpegEncContext *s,
197cabdff1aSopenharmony_ci                                      int16_t *block, int32_t index,
198cabdff1aSopenharmony_ci                                      int32_t qscale)
199cabdff1aSopenharmony_ci{
200cabdff1aSopenharmony_ci    int32_t qmul, qadd;
201cabdff1aSopenharmony_ci    int32_t nCoeffs;
202cabdff1aSopenharmony_ci
203cabdff1aSopenharmony_ci    av_assert2(s->block_last_index[index] >= 0 || s->h263_aic);
204cabdff1aSopenharmony_ci
205cabdff1aSopenharmony_ci    qmul = qscale << 1;
206cabdff1aSopenharmony_ci
207cabdff1aSopenharmony_ci    if (!s->h263_aic) {
208cabdff1aSopenharmony_ci        block[0] *= index < 4 ? s->y_dc_scale : s->c_dc_scale;
209cabdff1aSopenharmony_ci        qadd = (qscale - 1) | 1;
210cabdff1aSopenharmony_ci    } else {
211cabdff1aSopenharmony_ci        qadd = 0;
212cabdff1aSopenharmony_ci    }
213cabdff1aSopenharmony_ci    if (s->ac_pred)
214cabdff1aSopenharmony_ci        nCoeffs = 63;
215cabdff1aSopenharmony_ci    else
216cabdff1aSopenharmony_ci        nCoeffs = s->inter_scantable.raster_end[s->block_last_index[index]];
217cabdff1aSopenharmony_ci
218cabdff1aSopenharmony_ci    h263_dct_unquantize_msa(block, qmul, qadd, nCoeffs, 1);
219cabdff1aSopenharmony_ci}
220cabdff1aSopenharmony_ci
221cabdff1aSopenharmony_civoid ff_dct_unquantize_h263_inter_msa(MpegEncContext *s,
222cabdff1aSopenharmony_ci                                      int16_t *block, int32_t index,
223cabdff1aSopenharmony_ci                                      int32_t qscale)
224cabdff1aSopenharmony_ci{
225cabdff1aSopenharmony_ci    int32_t qmul, qadd;
226cabdff1aSopenharmony_ci    int32_t nCoeffs;
227cabdff1aSopenharmony_ci
228cabdff1aSopenharmony_ci    av_assert2(s->block_last_index[index] >= 0);
229cabdff1aSopenharmony_ci
230cabdff1aSopenharmony_ci    qadd = (qscale - 1) | 1;
231cabdff1aSopenharmony_ci    qmul = qscale << 1;
232cabdff1aSopenharmony_ci
233cabdff1aSopenharmony_ci    nCoeffs = s->inter_scantable.raster_end[s->block_last_index[index]];
234cabdff1aSopenharmony_ci
235cabdff1aSopenharmony_ci    h263_dct_unquantize_msa(block, qmul, qadd, nCoeffs, 0);
236cabdff1aSopenharmony_ci}
237cabdff1aSopenharmony_ci
238cabdff1aSopenharmony_civoid ff_dct_unquantize_mpeg2_inter_msa(MpegEncContext *s,
239cabdff1aSopenharmony_ci                                       int16_t *block, int32_t index,
240cabdff1aSopenharmony_ci                                       int32_t qscale)
241cabdff1aSopenharmony_ci{
242cabdff1aSopenharmony_ci    const uint16_t *quant_matrix;
243cabdff1aSopenharmony_ci    int32_t sum = -1;
244cabdff1aSopenharmony_ci
245cabdff1aSopenharmony_ci    quant_matrix = s->inter_matrix;
246cabdff1aSopenharmony_ci
247cabdff1aSopenharmony_ci    sum = mpeg2_dct_unquantize_inter_msa(block, qscale, quant_matrix);
248cabdff1aSopenharmony_ci
249cabdff1aSopenharmony_ci    block[63] ^= sum & 1;
250cabdff1aSopenharmony_ci}
251