/*
 * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mips/generic_macros_msa.h"
#include "idctdsp_mips.h"

static void simple_idct_msa(int16_t *block)
{
    int32_t const_val;
    v8i16 weights = { 0, 22725, 21407, 19266, 16383, 12873, 8867, 4520 };
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 w1, w3, w5, w7;
    v8i16 const0, const1, const2, const3, const4, const5, const6, const7;
    v4i32 temp0_r, temp1_r, temp2_r, temp3_r;
    v4i32 temp0_l, temp1_l, temp2_l, temp3_l;
    v4i32 a0_r, a1_r, a2_r, a3_r, a0_l, a1_l, a2_l, a3_l;
    v4i32 b0_r, b1_r, b2_r, b3_r, b0_l, b1_l, b2_l, b3_l;
    v4i32 w2, w4, w6;
    v8i16 select_vec, temp;
    v8i16 zero = { 0 };
    v4i32 const_val0 = __msa_ldi_w(1);
    v4i32 const_val1 = __msa_ldi_w(1);

    LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7);
    const_val0 <<= 10;
    const_val = 16383 * ((1 << 19) / 16383);
    const_val1 = __msa_insert_w(const_val0, 0, const_val);
    const_val1 = __msa_splati_w(const_val1, 0);
    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                       in0, in1, in2, in3, in4, in5, in6, in7);
    select_vec = in1 | in2 | in3 | in4 | in5 | in6 | in7;
    select_vec = __msa_clti_u_h((v8u16) select_vec, 1);
    UNPCK_SH_SW(in0, a0_r, a0_l);
    UNPCK_SH_SW(in2, temp3_r, temp3_l);
    temp = in0 << 3;
    w2 = (v4i32) __msa_splati_h(weights, 2);
    w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2);
    w4 = (v4i32) __msa_splati_h(weights, 4);
    w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4);
    w6 = (v4i32) __msa_splati_h(weights, 6);
    w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6);
    MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
    ADD2(a0_r, const_val0, a0_l, const_val0, temp0_r, temp0_l);
    MUL4(w2, temp3_r, w2, temp3_l, w6, temp3_r, w6, temp3_l,
         temp1_r, temp1_l, temp2_r, temp2_l);
    BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
                temp2_l, temp2_r, temp1_l, temp1_r,
                a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
    UNPCK_SH_SW(in4, temp0_r, temp0_l);
    UNPCK_SH_SW(in6, temp3_r, temp3_l);
    MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l);
    MUL4(w2, temp3_r, w2, temp3_l, w6, temp3_r, w6, temp3_l,
         temp2_r, temp2_l, temp1_r, temp1_l);
    ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l);
    SUB4(a1_r, temp0_r, a1_l, temp0_l, a2_r, temp0_r, a2_l, temp0_l,
         a1_r, a1_l, a2_r, a2_l);
    ADD4(a3_r, temp0_r, a3_l, temp0_l, a0_r, temp1_r, a0_l, temp1_l,
         a3_r, a3_l, a0_r, a0_l);
    SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l);
    ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l);
    SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l);
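    /* Odd part of the first pass: interleave the odd coefficient rows and
     * multiply them against the w1/w3/w5/w7 weights via dot products, then
     * combine with the even part (a0..a3) in the butterfly below. */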
    ILVRL_H2_SW(in1, in3, b3_r, b3_l);
    SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7);
    ILVRL_H2_SW(in5, in7, temp0_r, temp0_l);
    ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5,
               const0, const1, const2, const3);
    ILVR_H2_SH(w5, w7, w7, w3, const4, const6);
    const5 = __msa_ilvod_h(-w1, -w5);
    const7 = __msa_ilvod_h(w3, -w1);
    DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
                b0_r, b1_r, b2_r, b3_r);
    DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r,
                 const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
    DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
                b0_l, b1_l, b2_l, b3_l);
    DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l,
                 const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
    BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
                 b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
                 temp0_r, temp0_l, temp1_r, temp1_l,
                 temp2_r, temp2_l, temp3_r, temp3_l,
                 a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
    SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 11);
    SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 11);
    PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r,
                temp2_l, temp2_r, temp3_l, temp3_r,
                temp0_r, temp1_r, temp2_r, temp3_r);
    in0 = (v8i16) __msa_bmnz_v((v16u8) temp0_r, (v16u8) temp,
                               (v16u8) select_vec);
    in1 = (v8i16) __msa_bmnz_v((v16u8) temp1_r, (v16u8) temp,
                               (v16u8) select_vec);
    in2 = (v8i16) __msa_bmnz_v((v16u8) temp2_r, (v16u8) temp,
                               (v16u8) select_vec);
    in3 = (v8i16) __msa_bmnz_v((v16u8) temp3_r, (v16u8) temp,
                               (v16u8) select_vec);
    SRA_4V(a3_r, a3_l, a2_r, a2_l, 11);
    SRA_4V(a1_r, a1_l, a0_r, a0_l, 11);
    PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
                a0_r, a1_r, a2_r, a3_r);
    in4 = (v8i16) __msa_bmnz_v((v16u8) a3_r, (v16u8) temp, (v16u8) select_vec);
    in5 = (v8i16) __msa_bmnz_v((v16u8) a2_r, (v16u8) temp, (v16u8) select_vec);
    in6 = (v8i16) __msa_bmnz_v((v16u8) a1_r, (v16u8) temp, (v16u8) select_vec);
    in7 = (v8i16) __msa_bmnz_v((v16u8) a0_r, (v16u8) temp, (v16u8) select_vec);
    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                       in0, in1, in2, in3, in4, in5, in6, in7);
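
    /* Second pass over the transposed data: the same even/odd butterfly,
     * but rounding with const_val1 and a final right shift of 20. */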
    UNPCK_SH_SW(in0, a0_r, a0_l);
    UNPCK_SH_SW(in2, temp3_r, temp3_l);
    w2 = (v4i32) __msa_splati_h(weights, 2);
    w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2);
    w4 = (v4i32) __msa_splati_h(weights, 4);
    w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4);
    w6 = (v4i32) __msa_splati_h(weights, 6);
    w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6);
    MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
    ADD2(a0_r, const_val1, a0_l, const_val1, temp0_r, temp0_l);
    MUL4(w2, temp3_r, w2, temp3_l, w6, temp3_r, w6, temp3_l,
         temp1_r, temp1_l, temp2_r, temp2_l);
    BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
                temp2_l, temp2_r, temp1_l, temp1_r,
                a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
    UNPCK_SH_SW(in4, temp0_r, temp0_l);
    UNPCK_SH_SW(in6, temp3_r, temp3_l);
    MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l);
    MUL4(w2, temp3_r, w2, temp3_l, w6, temp3_r, w6, temp3_l,
         temp2_r, temp2_l, temp1_r, temp1_l);
    ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l);
    SUB4(a1_r, temp0_r, a1_l, temp0_l, a2_r, temp0_r, a2_l, temp0_l,
         a1_r, a1_l, a2_r, a2_l);
    ADD4(a3_r, temp0_r, a3_l, temp0_l, a0_r, temp1_r, a0_l, temp1_l,
         a3_r, a3_l, a0_r, a0_l);
    SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l);
    ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l);
    SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l);
    ILVRL_H2_SW(in1, in3, b3_r, b3_l);
    SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7);
    ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5,
               const0, const1, const2, const3);
    DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
                b0_r, b1_r, b2_r, b3_r);
    DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
                b0_l, b1_l, b2_l, b3_l);
    ILVRL_H2_SW(in5, in7, temp0_r, temp0_l);
    ILVR_H2_SH(w5, w7, w7, w3, const4, const6);
    const5 = __msa_ilvod_h(-w1, -w5);
    const7 = __msa_ilvod_h(w3, -w1);
    DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r,
                 const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
    DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l,
                 const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
    BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
                 b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
                 temp0_r, temp0_l, temp1_r, temp1_l,
                 temp2_r, temp2_l, temp3_r, temp3_l,
                 a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
    SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 20);
    SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 20);
    PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r, temp2_l, temp2_r,
                temp3_l, temp3_r, temp0_r, temp1_r, temp2_r, temp3_r);
    SRA_4V(a3_r, a3_l, a2_r, a2_l, 20);
    SRA_4V(a1_r, a1_l, a0_r, a0_l, 20);
    PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
                a0_r, a1_r, a2_r, a3_r);
    ST_SW8(temp0_r, temp1_r, temp2_r, temp3_r, a3_r, a2_r, a1_r, a0_r,
           block, 8);
}
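
/* As simple_idct_msa(), but the reconstructed samples are clipped to
 * [0, 255], packed to bytes and written to dst instead of back to block. */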
static void simple_idct_put_msa(uint8_t *dst, int32_t dst_stride,
                                int16_t *block)
{
    int32_t const_val;
    uint64_t tmp0, tmp1, tmp2, tmp3;
    v8i16 weights = { 0, 22725, 21407, 19266, 16383, 12873, 8867, 4520 };
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 w1, w3, w5, w7;
    v8i16 const0, const1, const2, const3, const4, const5, const6, const7;
    v4i32 temp0_r, temp1_r, temp2_r, temp3_r;
    v4i32 temp0_l, temp1_l, temp2_l, temp3_l;
    v4i32 a0_r, a1_r, a2_r, a3_r, a0_l, a1_l, a2_l, a3_l;
    v4i32 b0_r, b1_r, b2_r, b3_r, b0_l, b1_l, b2_l, b3_l;
    v4i32 w2, w4, w6;
    v8i16 select_vec, temp;
    v8i16 zero = { 0 };
    v4i32 const_val0 = __msa_ldi_w(1);
    v4i32 const_val1 = __msa_ldi_w(1);

    LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7);
    const_val0 <<= 10;
    const_val = 16383 * ((1 << 19) / 16383);
    const_val1 = __msa_insert_w(const_val0, 0, const_val);
    const_val1 = __msa_splati_w(const_val1, 0);
    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                       in0, in1, in2, in3, in4, in5, in6, in7);
    select_vec = in1 | in2 | in3 | in4 | in5 | in6 | in7;
    select_vec = __msa_clti_u_h((v8u16) select_vec, 1);
    UNPCK_SH_SW(in0, a0_r, a0_l);
    UNPCK_SH_SW(in2, temp3_r, temp3_l);
    temp = in0 << 3;
    w2 = (v4i32) __msa_splati_h(weights, 2);
    w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2);
    w4 = (v4i32) __msa_splati_h(weights, 4);
    w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4);
    w6 = (v4i32) __msa_splati_h(weights, 6);
    w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6);
    MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
    ADD2(a0_r, const_val0, a0_l, const_val0, temp0_r, temp0_l);
    MUL2(w2, temp3_r, w2, temp3_l, temp1_r, temp1_l);
    MUL2(w6, temp3_r, w6, temp3_l, temp2_r, temp2_l);
    BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
                temp2_l, temp2_r, temp1_l, temp1_r,
                a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
    UNPCK_SH_SW(in4, temp0_r, temp0_l);
    UNPCK_SH_SW(in6, temp3_r, temp3_l);
    MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l);
    MUL2(w2, temp3_r, w2, temp3_l, temp2_r, temp2_l);
    MUL2(w6, temp3_r, w6, temp3_l, temp1_r, temp1_l);
    ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l);
    SUB2(a1_r, temp0_r, a1_l, temp0_l, a1_r, a1_l);
    SUB2(a2_r, temp0_r, a2_l, temp0_l, a2_r, a2_l);
    ADD2(a3_r, temp0_r, a3_l, temp0_l, a3_r, a3_l);
    ADD2(a0_r, temp1_r, a0_l, temp1_l, a0_r, a0_l);
    SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l);
    ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l);
    SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l);
    ILVRL_H2_SW(in1, in3, b3_r, b3_l);
    SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7);
    ILVRL_H2_SW(in5, in7, temp0_r, temp0_l);
    ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5,
               const0, const1, const2, const3);
    ILVR_H2_SH(w5, w7, w7, w3, const4, const6);
    const5 = __msa_ilvod_h(-w1, -w5);
    const7 = __msa_ilvod_h(w3, -w1);
    DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
                b0_r, b1_r, b2_r, b3_r);
    DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r,
                 const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
    DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
                b0_l, b1_l, b2_l, b3_l);
    DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l,
                 const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
    BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
                 b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
                 temp0_r, temp0_l, temp1_r, temp1_l,
                 temp2_r, temp2_l, temp3_r, temp3_l,
                 a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
    SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 11);
    SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 11);
    PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r,
                temp2_l, temp2_r, temp3_l, temp3_r,
                temp0_r, temp1_r, temp2_r, temp3_r);
    in0 = (v8i16) __msa_bmnz_v((v16u8) temp0_r, (v16u8) temp,
                               (v16u8) select_vec);
    in1 = (v8i16) __msa_bmnz_v((v16u8) temp1_r, (v16u8) temp,
                               (v16u8) select_vec);
    in2 = (v8i16) __msa_bmnz_v((v16u8) temp2_r, (v16u8) temp,
                               (v16u8) select_vec);
    in3 = (v8i16) __msa_bmnz_v((v16u8) temp3_r, (v16u8) temp,
                               (v16u8) select_vec);
    SRA_4V(a3_r, a3_l, a2_r, a2_l, 11);
    SRA_4V(a1_r, a1_l, a0_r, a0_l, 11);
    PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
                a0_r, a1_r, a2_r, a3_r);
    in4 = (v8i16) __msa_bmnz_v((v16u8) a3_r, (v16u8) temp, (v16u8) select_vec);
    in5 = (v8i16) __msa_bmnz_v((v16u8) a2_r, (v16u8) temp, (v16u8) select_vec);
    in6 = (v8i16) __msa_bmnz_v((v16u8) a1_r, (v16u8) temp, (v16u8) select_vec);
    in7 = (v8i16) __msa_bmnz_v((v16u8) a0_r, (v16u8) temp, (v16u8) select_vec);
    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                       in0, in1, in2, in3, in4, in5, in6, in7);
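    /* Second pass; the results, shifted by 20, are clipped to [0, 255]
     * and packed to bytes for the stores below. */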
    UNPCK_SH_SW(in0, a0_r, a0_l);
    UNPCK_SH_SW(in2, temp3_r, temp3_l);
    w2 = (v4i32) __msa_splati_h(weights, 2);
    w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2);
    w4 = (v4i32) __msa_splati_h(weights, 4);
    w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4);
    w6 = (v4i32) __msa_splati_h(weights, 6);
    w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6);
    MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
    ADD2(a0_r, const_val1, a0_l, const_val1, temp0_r, temp0_l);
    MUL2(w2, temp3_r, w2, temp3_l, temp1_r, temp1_l);
    MUL2(w6, temp3_r, w6, temp3_l, temp2_r, temp2_l);
    BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
                temp2_l, temp2_r, temp1_l, temp1_r,
                a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
    UNPCK_SH_SW(in4, temp0_r, temp0_l);
    UNPCK_SH_SW(in6, temp3_r, temp3_l);
    MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l);
    MUL2(w2, temp3_r, w2, temp3_l, temp2_r, temp2_l);
    MUL2(w6, temp3_r, w6, temp3_l, temp1_r, temp1_l);
    ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l);
    SUB2(a1_r, temp0_r, a1_l, temp0_l, a1_r, a1_l);
    SUB2(a2_r, temp0_r, a2_l, temp0_l, a2_r, a2_l);
    ADD2(a3_r, temp0_r, a3_l, temp0_l, a3_r, a3_l);
    ADD2(a0_r, temp1_r, a0_l, temp1_l, a0_r, a0_l);
    SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l);
    ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l);
    SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l);
    ILVRL_H2_SW(in1, in3, b3_r, b3_l);
    SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7);
    ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5,
               const0, const1, const2, const3);
    DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
                b0_r, b1_r, b2_r, b3_r);
    DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
                b0_l, b1_l, b2_l, b3_l);
    ILVRL_H2_SW(in5, in7, temp0_r, temp0_l);
    ILVR_H2_SH(w5, w7, w7, w3, const4, const6);
    const5 = __msa_ilvod_h(-w1, -w5);
    const7 = __msa_ilvod_h(w3, -w1);
    DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r,
                 const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
    DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l,
                 const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
    BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
                 b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
                 temp0_r, temp0_l, temp1_r, temp1_l,
                 temp2_r, temp2_l, temp3_r, temp3_l,
                 a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
    SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 20);
    SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 20);
    SRA_4V(a3_r, a3_l, a2_r, a2_l, 20);
    SRA_4V(a1_r, a1_l, a0_r, a0_l, 20);
    PCKEV_H4_SH(temp0_l, temp0_r, temp1_l, temp1_r, temp2_l, temp2_r,
                temp3_l, temp3_r, in0, in1, in2, in3);
    PCKEV_H4_SH(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
                in4, in5, in6, in7);
    CLIP_SH4_0_255(in0, in1, in2, in3);
    PCKEV_B4_SH(in0, in0, in1, in1, in2, in2, in3, in3,
                in0, in1, in2, in3);
    tmp0 = __msa_copy_u_d((v2i64) in0, 1);
    tmp1 = __msa_copy_u_d((v2i64) in1, 1);
    tmp2 = __msa_copy_u_d((v2i64) in2, 1);
    tmp3 = __msa_copy_u_d((v2i64) in3, 1);
    SD4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
    CLIP_SH4_0_255(in4, in5, in6, in7);
    PCKEV_B4_SH(in4, in4, in5, in5, in6, in6, in7, in7,
                in4, in5, in6, in7);
    tmp3 = __msa_copy_u_d((v2i64) in4, 1);
    tmp2 = __msa_copy_u_d((v2i64) in5, 1);
    tmp1 = __msa_copy_u_d((v2i64) in6, 1);
    tmp0 = __msa_copy_u_d((v2i64) in7, 1);
    SD4(tmp0, tmp1, tmp2, tmp3, dst + 4 * dst_stride, dst_stride);
}
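
/* As simple_idct_put_msa(), but the transform result is added to the
 * existing dst samples before clipping and storing. */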
static void simple_idct_add_msa(uint8_t *dst, int32_t dst_stride,
                                int16_t *block)
{
    int32_t const_val;
    uint64_t tmp0, tmp1, tmp2, tmp3;
    v8i16 weights = { 0, 22725, 21407, 19266, 16383, 12873, 8867, 4520 };
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 w1, w3, w5, w7;
    v8i16 const0, const1, const2, const3, const4, const5, const6, const7;
    v4i32 temp0_r, temp1_r, temp2_r, temp3_r;
    v4i32 temp4_r, temp5_r, temp6_r, temp7_r, temp8_r;
    v4i32 temp0_l, temp1_l, temp2_l, temp3_l;
    v4i32 temp4_l, temp5_l, temp6_l, temp7_l, temp8_l;
    v4i32 a0_r, a1_r, a2_r, a3_r, a0_l, a1_l, a2_l, a3_l;
    v4i32 b0_r, b1_r, b2_r, b3_r, b0_l, b1_l, b2_l, b3_l;
    v4i32 w2, w4, w6;
    v8i16 select_vec, temp;
    v8i16 zero = { 0 };
    v4i32 const_val0 = __msa_ldi_w(1);
    v4i32 const_val1 = __msa_ldi_w(1);

    const_val0 <<= 10;
    const_val = 16383 * ((1 << 19) / 16383);
    const_val1 = __msa_insert_w(const_val0, 0, const_val);
    const_val1 = __msa_splati_w(const_val1, 0);
    LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7);
    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                       in0, in1, in2, in3, in4, in5, in6, in7);

    select_vec = in1 | in2 | in3 | in4 | in5 | in6 | in7;
    select_vec = __msa_clti_u_h((v8u16) select_vec, 1);
    UNPCK_SH_SW(in0, a0_r, a0_l);
    UNPCK_SH_SW(in2, temp3_r, temp3_l);
    ILVRL_H2_SW(in1, in3, b3_r, b3_l);
    UNPCK_SH_SW(in4, temp4_r, temp4_l);
    UNPCK_SH_SW(in6, temp7_r, temp7_l);
    ILVRL_H2_SW(in5, in7, temp8_r, temp8_l);
    temp = in0 << 3;
    SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7);
    ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5,
               const0, const1, const2, const3);
    ILVR_H2_SH(w5, w7, w7, w3, const4, const6);
    const5 = __msa_ilvod_h(-w1, -w5);
    const7 = __msa_ilvod_h(w3, -w1);
    DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
                b0_r, b1_r, b2_r, b3_r);
    DPADD_SH4_SW(temp8_r, temp8_r, temp8_r, temp8_r,
                 const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
    DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
                b0_l, b1_l, b2_l, b3_l);
    DPADD_SH4_SW(temp8_l, temp8_l, temp8_l, temp8_l,
                 const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
    w2 = (v4i32) __msa_splati_h(weights, 2);
    w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2);
    w4 = (v4i32) __msa_splati_h(weights, 4);
    w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4);
    w6 = (v4i32) __msa_splati_h(weights, 6);
    w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6);
    MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
    ADD2(a0_r, const_val0, a0_l, const_val0, temp0_r, temp0_l);
    MUL2(w2, temp3_r, w2, temp3_l, temp1_r, temp1_l);
    MUL2(w6, temp3_r, w6, temp3_l, temp2_r, temp2_l);
    BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
                temp2_l, temp2_r, temp1_l, temp1_r,
                a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
    MUL2(temp4_r, w4, temp4_l, w4, temp4_r, temp4_l);
    MUL2(temp7_r, w2, temp7_l, w2, temp6_r, temp6_l);
    MUL2(temp7_r, w6, temp7_l, w6, temp5_r, temp5_l);
    ADD2(a0_r, temp4_r, a0_l, temp4_l, a0_r, a0_l);
    SUB2(a1_r, temp4_r, a1_l, temp4_l, a1_r, a1_l);
    SUB2(a2_r, temp4_r, a2_l, temp4_l, a2_r, a2_l);
    ADD2(a3_r, temp4_r, a3_l, temp4_l, a3_r, a3_l);
    ADD2(a0_r, temp5_r, a0_l, temp5_l, a0_r, a0_l);
    SUB2(a1_r, temp6_r, a1_l, temp6_l, a1_r, a1_l);
    ADD2(a2_r, temp6_r, a2_l, temp6_l, a2_r, a2_l);
    SUB2(a3_r, temp5_r, a3_l, temp5_l, a3_r, a3_l);
    BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
                 b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
                 temp0_r, temp0_l, temp1_r, temp1_l,
                 temp2_r, temp2_l, temp3_r, temp3_l,
                 a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
    SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 11);
    SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 11);
    PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r,
                temp2_l, temp2_r, temp3_l, temp3_r,
                temp0_r, temp1_r, temp2_r, temp3_r);
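    /* DC-only shortcut: after the transpose, lane j of select_vec is all
     * ones when every AC coefficient of source row j was zero, so
     * __msa_bmnz_v substitutes `temp` (dc << 3) for those rows. */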
    in0 = (v8i16) __msa_bmnz_v((v16u8) temp0_r, (v16u8) temp,
                               (v16u8) select_vec);
    in1 = (v8i16) __msa_bmnz_v((v16u8) temp1_r, (v16u8) temp,
                               (v16u8) select_vec);
    in2 = (v8i16) __msa_bmnz_v((v16u8) temp2_r, (v16u8) temp,
                               (v16u8) select_vec);
    in3 = (v8i16) __msa_bmnz_v((v16u8) temp3_r, (v16u8) temp,
                               (v16u8) select_vec);
    SRA_4V(a3_r, a3_l, a2_r, a2_l, 11);
    SRA_4V(a1_r, a1_l, a0_r, a0_l, 11);
    PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
                a0_r, a1_r, a2_r, a3_r);
    in4 = (v8i16) __msa_bmnz_v((v16u8) a3_r, (v16u8) temp, (v16u8) select_vec);
    in5 = (v8i16) __msa_bmnz_v((v16u8) a2_r, (v16u8) temp, (v16u8) select_vec);
    in6 = (v8i16) __msa_bmnz_v((v16u8) a1_r, (v16u8) temp, (v16u8) select_vec);
    in7 = (v8i16) __msa_bmnz_v((v16u8) a0_r, (v16u8) temp, (v16u8) select_vec);
    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                       in0, in1, in2, in3, in4, in5, in6, in7);

    UNPCK_SH_SW(in0, a0_r, a0_l);
    UNPCK_SH_SW(in2, temp3_r, temp3_l);
    MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
    ADD2(a0_r, const_val1, a0_l, const_val1, temp0_r, temp0_l);
    MUL2(w2, temp3_r, w2, temp3_l, temp1_r, temp1_l);
    MUL2(w6, temp3_r, w6, temp3_l, temp2_r, temp2_l);
    BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
                temp2_l, temp2_r, temp1_l, temp1_r,
                a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
    UNPCK_SH_SW(in4, temp0_r, temp0_l);
    UNPCK_SH_SW(in6, temp3_r, temp3_l);
    MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l);
    MUL2(w2, temp3_r, w2, temp3_l, temp2_r, temp2_l);
    MUL2(w6, temp3_r, w6, temp3_l, temp1_r, temp1_l);
    ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l);
    SUB2(a1_r, temp0_r, a1_l, temp0_l, a1_r, a1_l);
    SUB2(a2_r, temp0_r, a2_l, temp0_l, a2_r, a2_l);
    ADD2(a3_r, temp0_r, a3_l, temp0_l, a3_r, a3_l);
    ADD2(a0_r, temp1_r, a0_l, temp1_l, a0_r, a0_l);
    SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l);
    ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l);
    SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l);
    ILVRL_H2_SW(in1, in3, b3_r, b3_l);
    ILVRL_H2_SW(in5, in7, temp0_r, temp0_l);
    DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
                b0_r, b1_r, b2_r, b3_r);
    DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
                b0_l, b1_l, b2_l, b3_l);
    DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r,
                 const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
    DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l,
                 const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
    BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
                 b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
                 temp0_r, temp0_l, temp1_r, temp1_l,
                 temp2_r, temp2_l, temp3_r, temp3_l,
                 a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
    SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 20);
    SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 20);
    LD_SH4(dst, dst_stride, in0, in1, in2, in3);
    PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r, temp2_l, temp2_r,
                temp3_l, temp3_r, temp0_r, temp1_r, temp2_r, temp3_r);
    ILVR_B4_SW(zero, in0, zero, in1, zero, in2, zero, in3,
               temp0_l, temp1_l, temp2_l, temp3_l);
    in0 = (v8i16) (temp0_r) + (v8i16) (temp0_l);
    in1 = (v8i16) (temp1_r) + (v8i16) (temp1_l);
    in2 = (v8i16) (temp2_r) + (v8i16) (temp2_l);
    in3 = (v8i16) (temp3_r) + (v8i16) (temp3_l);
    CLIP_SH4_0_255(in0, in1, in2, in3);
    PCKEV_B4_SH(in0, in0, in1, in1, in2, in2, in3, in3,
                in0, in1, in2, in3);
    tmp0 = __msa_copy_u_d((v2i64) in0, 1);
    tmp1 = __msa_copy_u_d((v2i64) in1, 1);
    tmp2 = __msa_copy_u_d((v2i64) in2, 1);
    tmp3 = __msa_copy_u_d((v2i64) in3, 1);
    SD4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
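
    /* Rows 4..7: the same shift, add-to-dst, clip and store sequence. */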
    SRA_4V(a3_r, a3_l, a2_r, a2_l, 20);
    SRA_4V(a1_r, a1_l, a0_r, a0_l, 20);
    LD_SH4(dst + 4 * dst_stride, dst_stride, in4, in5, in6, in7);
    PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
                a0_r, a1_r, a2_r, a3_r);
    ILVR_B4_SW(zero, in4, zero, in5, zero, in6, zero, in7,
               a3_l, a2_l, a1_l, a0_l);
    in4 = (v8i16) (a3_r) + (v8i16) (a3_l);
    in5 = (v8i16) (a2_r) + (v8i16) (a2_l);
    in6 = (v8i16) (a1_r) + (v8i16) (a1_l);
    in7 = (v8i16) (a0_r) + (v8i16) (a0_l);
    CLIP_SH4_0_255(in4, in5, in6, in7);
    PCKEV_B4_SH(in4, in4, in5, in5, in6, in6, in7, in7,
                in4, in5, in6, in7);
    tmp0 = __msa_copy_u_d((v2i64) in4, 1);
    tmp1 = __msa_copy_u_d((v2i64) in5, 1);
    tmp2 = __msa_copy_u_d((v2i64) in6, 1);
    tmp3 = __msa_copy_u_d((v2i64) in7, 1);
    SD4(tmp0, tmp1, tmp2, tmp3, dst + 4 * dst_stride, dst_stride);
}

void ff_simple_idct_msa(int16_t *block)
{
    simple_idct_msa(block);
}

void ff_simple_idct_put_msa(uint8_t *dst, ptrdiff_t dst_stride, int16_t *block)
{
    simple_idct_put_msa(dst, dst_stride, block);
}

void ff_simple_idct_add_msa(uint8_t *dst, ptrdiff_t dst_stride, int16_t *block)
{
    simple_idct_add_msa(dst, dst_stride, block);
}