/*
 * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mips/generic_macros_msa.h"
#include "h263dsp_mips.h"

/* H.263 dequantization: level = level * qmul + qadd for positive
 * coefficients, level * qmul - qadd for negative ones; zero coefficients
 * are left untouched. */
static void h263_dct_unquantize_msa(int16_t *block, int16_t qmul,
                                    int16_t qadd, int8_t n_coeffs,
                                    uint8_t loop_start)
{
    int16_t *block_dup = block;
    int32_t level, cnt;
    v8i16 block_vec, qmul_vec, qadd_vec, sub;
    v8i16 add, mask, mul, zero_mask;

    qmul_vec = __msa_fill_h(qmul);
    qadd_vec = __msa_fill_h(qadd);

    /* Vector loop: eight coefficients per iteration. */
    for (cnt = 0; cnt < (n_coeffs >> 3); cnt++) {
        block_vec = LD_SH(block_dup + loop_start);
        mask = __msa_clti_s_h(block_vec, 0);
        zero_mask = __msa_ceqi_h(block_vec, 0);
        mul = block_vec * qmul_vec;
        sub = mul - qadd_vec;
        add = mul + qadd_vec;
        /* Select mul - qadd where the coefficient is negative. */
        add = (v8i16) __msa_bmnz_v((v16u8) add, (v16u8) sub, (v16u8) mask);
        /* Keep the original value where the coefficient is zero. */
        block_vec = (v8i16) __msa_bmnz_v((v16u8) add, (v16u8) block_vec,
                                         (v16u8) zero_mask);
        ST_SH(block_vec, block_dup + loop_start);
        block_dup += 8;
    }

    /* Scalar tail for the coefficients not covered by the vector loop. */
    cnt = ((n_coeffs >> 3) * 8) + loop_start;

    for (; cnt <= n_coeffs; cnt++) {
        level = block[cnt];
        if (level) {
            if (level < 0) {
                level = level * qmul - qadd;
            } else {
                level = level * qmul + qadd;
            }
            block[cnt] = level;
        }
    }
}
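/*
 * Scalar reference for the MPEG-2 inter dequantization implemented below
 * (an illustrative sketch only, modelled on the generic C path; the local
 * names i, level, v and sum are not part of this file):
 *
 *     int32_t i, level, sum = -1;
 *     for (i = 0; i < 64; i++) {
 *         level = block[i];
 *         if (level) {
 *             int32_t v = (2 * FFABS(level) + 1) * qscale * quant_matrix[i];
 *             level = (level < 0) ? -(v >> 4) : (v >> 4);
 *             block[i] = level;
 *         }
 *         sum += level;
 *     }
 *     return sum;
 *
 * The vector version keeps the same selection logic with bitwise masks:
 * clti_s_h builds a "negative" mask, ceqi_h a "zero" mask, and bmnz_v
 * merges the positive/negative/zero results under those masks.
 */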
/* MPEG-2 inter dequantization: each nonzero coefficient becomes
 * sign(level) * (((2 * |level| + 1) * qscale * quant_matrix[i]) >> 4);
 * zero coefficients stay zero.  The returned running sum is used by the
 * caller for mismatch control.  The loop body is unrolled four rows at a
 * time, so two iterations cover all 64 coefficients. */
static int32_t mpeg2_dct_unquantize_inter_msa(int16_t *block,
                                              int32_t qscale,
                                              const int16_t *quant_matrix)
{
    int32_t cnt, sum_res = -1;
    v8i16 block_vec, block_neg, qscale_vec, mask;
    v8i16 block_org0, block_org1, block_org2, block_org3;
    v8i16 quant_m0, quant_m1, quant_m2, quant_m3;
    v8i16 sum, mul, zero_mask;
    v4i32 mul_vec, qscale_l, qscale_r, quant_m_r, quant_m_l;
    v4i32 block_l, block_r, sad;

    qscale_vec = __msa_fill_h(qscale);
    for (cnt = 0; cnt < 2; cnt++) {
        LD_SH4(block, 8, block_org0, block_org1, block_org2, block_org3);
        LD_SH4(quant_matrix, 8, quant_m0, quant_m1, quant_m2, quant_m3);

        /* Row 0: take |level|, form 2 * |level| + 1, widen to 32 bit,
         * multiply by qscale and the quant matrix, shift right by 4, then
         * restore the sign and re-insert the zeros. */
        mask = __msa_clti_s_h(block_org0, 0);
        zero_mask = __msa_ceqi_h(block_org0, 0);
        block_neg = -block_org0;
        block_vec = (v8i16) __msa_bmnz_v((v16u8) block_org0, (v16u8) block_neg,
                                         (v16u8) mask);
        block_vec <<= 1;
        block_vec += 1;
        UNPCK_SH_SW(block_vec, block_r, block_l);
        UNPCK_SH_SW(qscale_vec, qscale_r, qscale_l);
        UNPCK_SH_SW(quant_m0, quant_m_r, quant_m_l);
        mul_vec = block_l * qscale_l;
        mul_vec *= quant_m_l;
        block_l = mul_vec >> 4;
        mul_vec = block_r * qscale_r;
        mul_vec *= quant_m_r;
        block_r = mul_vec >> 4;
        mul = __msa_pckev_h((v8i16) block_l, (v8i16) block_r);
        block_neg = -mul;
        sum = (v8i16) __msa_bmnz_v((v16u8) mul, (v16u8) block_neg,
                                   (v16u8) mask);
        sum = (v8i16) __msa_bmnz_v((v16u8) sum, (v16u8) block_org0,
                                   (v16u8) zero_mask);
        ST_SH(sum, block);
        block += 8;
        quant_matrix += 8;
        sad = __msa_hadd_s_w(sum, sum);
        sum_res += HADD_SW_S32(sad);

        /* Row 1: same steps as row 0. */
        mask = __msa_clti_s_h(block_org1, 0);
        zero_mask = __msa_ceqi_h(block_org1, 0);
        block_neg = -block_org1;
        block_vec = (v8i16) __msa_bmnz_v((v16u8) block_org1, (v16u8) block_neg,
                                         (v16u8) mask);
        block_vec <<= 1;
        block_vec += 1;
        UNPCK_SH_SW(block_vec, block_r, block_l);
        UNPCK_SH_SW(qscale_vec, qscale_r, qscale_l);
        UNPCK_SH_SW(quant_m1, quant_m_r, quant_m_l);
        mul_vec = block_l * qscale_l;
        mul_vec *= quant_m_l;
        block_l = mul_vec >> 4;
        mul_vec = block_r * qscale_r;
        mul_vec *= quant_m_r;
        block_r = mul_vec >> 4;
        mul = __msa_pckev_h((v8i16) block_l, (v8i16) block_r);
        block_neg = -mul;
        sum = (v8i16) __msa_bmnz_v((v16u8) mul, (v16u8) block_neg,
                                   (v16u8) mask);
        sum = (v8i16) __msa_bmnz_v((v16u8) sum, (v16u8) block_org1,
                                   (v16u8) zero_mask);
        ST_SH(sum, block);
        block += 8;
        quant_matrix += 8;
        sad = __msa_hadd_s_w(sum, sum);
        sum_res += HADD_SW_S32(sad);

        /* Row 2. */
        mask = __msa_clti_s_h(block_org2, 0);
        zero_mask = __msa_ceqi_h(block_org2, 0);
        block_neg = -block_org2;
        block_vec = (v8i16) __msa_bmnz_v((v16u8) block_org2, (v16u8) block_neg,
                                         (v16u8) mask);
        block_vec <<= 1;
        block_vec += 1;
        UNPCK_SH_SW(block_vec, block_r, block_l);
        UNPCK_SH_SW(qscale_vec, qscale_r, qscale_l);
        UNPCK_SH_SW(quant_m2, quant_m_r, quant_m_l);
        mul_vec = block_l * qscale_l;
        mul_vec *= quant_m_l;
        block_l = mul_vec >> 4;
        mul_vec = block_r * qscale_r;
        mul_vec *= quant_m_r;
        block_r = mul_vec >> 4;
        mul = __msa_pckev_h((v8i16) block_l, (v8i16) block_r);
        block_neg = -mul;
        sum = (v8i16) __msa_bmnz_v((v16u8) mul, (v16u8) block_neg,
                                   (v16u8) mask);
        sum = (v8i16) __msa_bmnz_v((v16u8) sum, (v16u8) block_org2,
                                   (v16u8) zero_mask);
        ST_SH(sum, block);
        block += 8;
        quant_matrix += 8;
        sad = __msa_hadd_s_w(sum, sum);
        sum_res += HADD_SW_S32(sad);

        /* Row 3. */
        mask = __msa_clti_s_h(block_org3, 0);
        zero_mask = __msa_ceqi_h(block_org3, 0);
        block_neg = -block_org3;
        block_vec = (v8i16) __msa_bmnz_v((v16u8) block_org3, (v16u8) block_neg,
                                         (v16u8) mask);
        block_vec <<= 1;
        block_vec += 1;
        UNPCK_SH_SW(block_vec, block_r, block_l);
        UNPCK_SH_SW(qscale_vec, qscale_r, qscale_l);
        UNPCK_SH_SW(quant_m3, quant_m_r, quant_m_l);
        mul_vec = block_l * qscale_l;
        mul_vec *= quant_m_l;
        block_l = mul_vec >> 4;
        mul_vec = block_r * qscale_r;
        mul_vec *= quant_m_r;
        block_r = mul_vec >> 4;
        mul = __msa_pckev_h((v8i16) block_l, (v8i16) block_r);
        block_neg = -mul;
        sum = (v8i16) __msa_bmnz_v((v16u8) mul, (v16u8) block_neg,
                                   (v16u8) mask);
        sum = (v8i16) __msa_bmnz_v((v16u8) sum, (v16u8) block_org3,
                                   (v16u8) zero_mask);
        ST_SH(sum, block);
        block += 8;
        quant_matrix += 8;
        sad = __msa_hadd_s_w(sum, sum);
        sum_res += HADD_SW_S32(sad);
    }

    return sum_res;
}
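/*
 * Exported wrappers.  For the H.263 paths, qmul = 2 * qscale and
 * qadd = (qscale - 1) | 1, i.e. qadd is forced odd; when H.263 advanced
 * intra coding (AIC) is in use, qadd is 0 and the DC coefficient is not
 * rescaled here.  The intra wrapper starts the kernel at coefficient 1
 * (loop_start = 1) because DC is scaled separately via
 * y_dc_scale/c_dc_scale; the inter wrapper covers all coefficients
 * (loop_start = 0).
 */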
void ff_dct_unquantize_h263_intra_msa(MpegEncContext *s,
                                      int16_t *block, int32_t index,
                                      int32_t qscale)
{
    int32_t qmul, qadd;
    int32_t nCoeffs;

    av_assert2(s->block_last_index[index] >= 0 || s->h263_aic);

    qmul = qscale << 1;

    if (!s->h263_aic) {
        block[0] *= index < 4 ? s->y_dc_scale : s->c_dc_scale;
        qadd = (qscale - 1) | 1;
    } else {
        qadd = 0;
    }
    if (s->ac_pred)
        nCoeffs = 63;
    else
        nCoeffs = s->inter_scantable.raster_end[s->block_last_index[index]];

    h263_dct_unquantize_msa(block, qmul, qadd, nCoeffs, 1);
}

void ff_dct_unquantize_h263_inter_msa(MpegEncContext *s,
                                      int16_t *block, int32_t index,
                                      int32_t qscale)
{
    int32_t qmul, qadd;
    int32_t nCoeffs;

    av_assert2(s->block_last_index[index] >= 0);

    qadd = (qscale - 1) | 1;
    qmul = qscale << 1;

    nCoeffs = s->inter_scantable.raster_end[s->block_last_index[index]];

    h263_dct_unquantize_msa(block, qmul, qadd, nCoeffs, 0);
}

void ff_dct_unquantize_mpeg2_inter_msa(MpegEncContext *s,
                                       int16_t *block, int32_t index,
                                       int32_t qscale)
{
    /* s->inter_matrix holds uint16_t entries small enough for int16_t, so
     * reinterpreting the pointer for the int16_t-based kernel is safe and
     * avoids an incompatible-pointer-type warning. */
    const int16_t *quant_matrix = (const int16_t *) s->inter_matrix;
    int32_t sum;

    sum = mpeg2_dct_unquantize_inter_msa(block, qscale, quant_matrix);

    /* MPEG-2 mismatch control: toggle the LSB of the last coefficient when
     * the sum of all coefficients is even. */
    block[63] ^= sum & 1;
}
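/*
 * These functions are not called directly; on MSA-capable builds they are
 * installed as the MpegEncContext dequantization hooks at init time (the
 * hookup lives outside this file, in the MIPS mpegvideo init code), roughly:
 *
 *     s->dct_unquantize_h263_intra  = ff_dct_unquantize_h263_intra_msa;
 *     s->dct_unquantize_h263_inter  = ff_dct_unquantize_h263_inter_msa;
 *     s->dct_unquantize_mpeg2_inter = ff_dct_unquantize_mpeg2_inter_msa;
 */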