/*
 * Loongson SIMD optimized vp8dsp
 *
 * Copyright (c) 2016 Loongson Technology Corporation Limited
 * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "vp8dsp_mips.h"
#include "constants.h"
#include "libavutil/attributes.h"
#include "libavutil/mips/mmiutils.h"
#include "libavutil/mem_internal.h"

#define DECLARE_DOUBLE_1        double db_1
#define DECLARE_DOUBLE_2        double db_2
#define DECLARE_UINT32_T        uint32_t it_1
#define RESTRICT_ASM_DOUBLE_1   [db_1]"=&f"(db_1)
#define RESTRICT_ASM_DOUBLE_2   [db_2]"=&f"(db_2)
#define RESTRICT_ASM_UINT32_T   [it_1]"=&r"(it_1)

#define MMI_PCMPGTUB(dst, src1, src2)                                   \
        "pcmpeqb    %[db_1],    "#src1",    "#src2"         \n\t"       \
        "pmaxub     %[db_2],    "#src1",    "#src2"         \n\t"       \
        "pcmpeqb    %[db_2],    %[db_2],    "#src1"         \n\t"       \
        "pxor       "#dst",     %[db_2],    %[db_1]         \n\t"

#define MMI_BTOH(dst_l, dst_r, src)                                     \
        "pxor       %[db_1],    %[db_1],    %[db_1]         \n\t"       \
        "pcmpgtb    %[db_2],    %[db_1],    "#src"          \n\t"       \
        "punpcklbh  "#dst_r",   "#src",     %[db_2]         \n\t"       \
        "punpckhbh  "#dst_l",   "#src",     %[db_2]         \n\t"

#define MMI_VP8_LOOP_FILTER                                             \
        /* Calculation of hev */                                        \
        "dmtc1      %[thresh],  %[ftmp3]                    \n\t"       \
        "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp3]        \n\t"       \
        "punpcklhw  %[ftmp3],   %[ftmp3],   %[ftmp3]        \n\t"       \
        "punpcklwd  %[ftmp3],   %[ftmp3],   %[ftmp3]        \n\t"       \
        "pasubub    %[ftmp0],   %[p1],      %[p0]           \n\t"       \
        "pasubub    %[ftmp1],   %[q1],      %[q0]           \n\t"       \
        "pmaxub     %[ftmp0],   %[ftmp0],   %[ftmp1]        \n\t"       \
        MMI_PCMPGTUB(%[hev], %[ftmp0], %[ftmp3])                        \
        /* Calculation of mask */                                       \
        "pasubub    %[ftmp1],   %[p0],      %[q0]           \n\t"       \
        "paddusb    %[ftmp1],   %[ftmp1],   %[ftmp1]        \n\t"       \
        "pasubub    %[ftmp2],   %[p1],      %[q1]           \n\t"       \
        "li         %[tmp0],    0x09                        \n\t"       \
        "dmtc1      %[tmp0],    %[ftmp3]                    \n\t"       \
        PSRLB_MMI(%[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5], %[ftmp2])     \
        "paddusb    %[ftmp1],   %[ftmp1],   %[ftmp2]        \n\t"       \
        "dmtc1      %[e],       %[ftmp3]                    \n\t"       \
        "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp3]        \n\t"       \
        "punpcklhw  %[ftmp3],   %[ftmp3],   %[ftmp3]        \n\t"       \
        "punpcklwd  %[ftmp3],   %[ftmp3],   %[ftmp3]        \n\t"       \
        MMI_PCMPGTUB(%[mask], %[ftmp1], %[ftmp3])                       \
        "pmaxub     %[mask],    %[mask],    %[ftmp0]        \n\t"       \
        "pasubub    %[ftmp1],   %[p3],      %[p2]           \n\t"       \
        "pasubub    %[ftmp2],   %[p2],      %[p1]           \n\t"       \
        "pmaxub     %[ftmp1],   %[ftmp1],   %[ftmp2]        \n\t"       \
        "pmaxub     %[mask],    %[mask],    %[ftmp1]        \n\t"       \
        "pasubub    %[ftmp1],   %[q3],      %[q2]           \n\t"       \
        "pasubub    %[ftmp2],   %[q2],      %[q1]           \n\t"       \
        "pmaxub     %[ftmp1],   %[ftmp1],   %[ftmp2]        \n\t"       \
        "pmaxub     %[mask],    %[mask],    %[ftmp1]        \n\t"       \
        "dmtc1      %[i],       %[ftmp3]                    \n\t"       \
        "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp3]        \n\t"       \
        "punpcklhw  %[ftmp3],   %[ftmp3],   %[ftmp3]        \n\t"       \
        "punpcklwd  %[ftmp3],   %[ftmp3],   %[ftmp3]        \n\t"       \
        MMI_PCMPGTUB(%[mask], %[mask], %[ftmp3])                        \
        "pcmpeqw    %[ftmp3],   %[ftmp3],   %[ftmp3]        \n\t"       \
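        /* ftmp3 is now all ones; the xor below inverts mask so that   */  \
        /* bytes passing the normal-filter limit test end up as 0xff   */  \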
"pxor %[mask], %[mask], %[ftmp3] \n\t" \ 88 /* VP8_MBFILTER */ \ 89 "li %[tmp0], 0x80808080 \n\t" \ 90 "dmtc1 %[tmp0], %[ftmp7] \n\t" \ 91 "punpcklwd %[ftmp7], %[ftmp7], %[ftmp7] \n\t" \ 92 "pxor %[p2], %[p2], %[ftmp7] \n\t" \ 93 "pxor %[p1], %[p1], %[ftmp7] \n\t" \ 94 "pxor %[p0], %[p0], %[ftmp7] \n\t" \ 95 "pxor %[q0], %[q0], %[ftmp7] \n\t" \ 96 "pxor %[q1], %[q1], %[ftmp7] \n\t" \ 97 "pxor %[q2], %[q2], %[ftmp7] \n\t" \ 98 "psubsb %[ftmp4], %[p1], %[q1] \n\t" \ 99 "psubb %[ftmp5], %[q0], %[p0] \n\t" \ 100 MMI_BTOH(%[ftmp1], %[ftmp0], %[ftmp5]) \ 101 MMI_BTOH(%[ftmp3], %[ftmp2], %[ftmp4]) \ 102 /* Right part */ \ 103 "paddh %[ftmp5], %[ftmp0], %[ftmp0] \n\t" \ 104 "paddh %[ftmp0], %[ftmp0], %[ftmp5] \n\t" \ 105 "paddh %[ftmp0], %[ftmp2], %[ftmp0] \n\t" \ 106 /* Left part */ \ 107 "paddh %[ftmp5], %[ftmp1], %[ftmp1] \n\t" \ 108 "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" \ 109 "paddh %[ftmp1], %[ftmp3], %[ftmp1] \n\t" \ 110 /* Combine left and right part */ \ 111 "packsshb %[ftmp1], %[ftmp0], %[ftmp1] \n\t" \ 112 "pand %[ftmp1], %[ftmp1], %[mask] \n\t" \ 113 "pand %[ftmp2], %[ftmp1], %[hev] \n\t" \ 114 "li %[tmp0], 0x04040404 \n\t" \ 115 "dmtc1 %[tmp0], %[ftmp0] \n\t" \ 116 "punpcklwd %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \ 117 "paddsb %[ftmp3], %[ftmp2], %[ftmp0] \n\t" \ 118 "li %[tmp0], 0x0B \n\t" \ 119 "dmtc1 %[tmp0], %[ftmp4] \n\t" \ 120 PSRAB_MMI(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6], %[ftmp3]) \ 121 "li %[tmp0], 0x03030303 \n\t" \ 122 "dmtc1 %[tmp0], %[ftmp0] \n\t" \ 123 "punpcklwd %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \ 124 "paddsb %[ftmp4], %[ftmp2], %[ftmp0] \n\t" \ 125 "li %[tmp0], 0x0B \n\t" \ 126 "dmtc1 %[tmp0], %[ftmp2] \n\t" \ 127 PSRAB_MMI(%[ftmp4], %[ftmp2], %[ftmp5], %[ftmp6], %[ftmp4]) \ 128 "psubsb %[q0], %[q0], %[ftmp3] \n\t" \ 129 "paddsb %[p0], %[p0], %[ftmp4] \n\t" \ 130 /* filt_val &= ~hev */ \ 131 "pcmpeqw %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \ 132 "pxor %[hev], %[hev], %[ftmp0] \n\t" \ 133 "pand %[ftmp1], %[ftmp1], %[hev] \n\t" \ 134 MMI_BTOH(%[ftmp5], %[ftmp6], %[ftmp1]) \ 135 "li %[tmp0], 0x07 \n\t" \ 136 "dmtc1 %[tmp0], %[ftmp2] \n\t" \ 137 "li %[tmp0], 0x001b001b \n\t" \ 138 "dmtc1 %[tmp0], %[ftmp1] \n\t" \ 139 "punpcklwd %[ftmp1], %[ftmp1], %[ftmp1] \n\t" \ 140 "li %[tmp0], 0x003f003f \n\t" \ 141 "dmtc1 %[tmp0], %[ftmp0] \n\t" \ 142 "punpcklwd %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \ 143 /* Right part */ \ 144 "pmullh %[ftmp3], %[ftmp6], %[ftmp1] \n\t" \ 145 "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \ 146 "psrah %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \ 147 /* Left part */ \ 148 "pmullh %[ftmp4], %[ftmp5], %[ftmp1] \n\t" \ 149 "paddh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \ 150 "psrah %[ftmp4], %[ftmp4], %[ftmp2] \n\t" \ 151 /* Combine left and right part */ \ 152 "packsshb %[ftmp4], %[ftmp3], %[ftmp4] \n\t" \ 153 "psubsb %[q0], %[q0], %[ftmp4] \n\t" \ 154 "pxor %[q0], %[q0], %[ftmp7] \n\t" \ 155 "paddsb %[p0], %[p0], %[ftmp4] \n\t" \ 156 "pxor %[p0], %[p0], %[ftmp7] \n\t" \ 157 "li %[tmp0], 0x00120012 \n\t" \ 158 "dmtc1 %[tmp0], %[ftmp1] \n\t" \ 159 "punpcklwd %[ftmp1], %[ftmp1], %[ftmp1] \n\t" \ 160 /* Right part */ \ 161 "pmullh %[ftmp3], %[ftmp6], %[ftmp1] \n\t" \ 162 "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \ 163 "psrah %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \ 164 /* Left part */ \ 165 "pmullh %[ftmp4], %[ftmp5], %[ftmp1] \n\t" \ 166 "paddh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \ 167 "psrah %[ftmp4], %[ftmp4], %[ftmp2] \n\t" \ 168 /* Combine left and right part */ \ 169 "packsshb %[ftmp4], %[ftmp3], %[ftmp4] \n\t" \ 170 "psubsb %[q1], %[q1], %[ftmp4] \n\t" \ 171 "pxor %[q1], 
%[q1], %[ftmp7] \n\t" \ 172 "paddsb %[p1], %[p1], %[ftmp4] \n\t" \ 173 "pxor %[p1], %[p1], %[ftmp7] \n\t" \ 174 "li %[tmp0], 0x03 \n\t" \ 175 "dmtc1 %[tmp0], %[ftmp1] \n\t" \ 176 /* Right part */ \ 177 "psllh %[ftmp3], %[ftmp6], %[ftmp1] \n\t" \ 178 "paddh %[ftmp3], %[ftmp3], %[ftmp6] \n\t" \ 179 "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \ 180 "psrah %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \ 181 /* Left part */ \ 182 "psllh %[ftmp4], %[ftmp5], %[ftmp1] \n\t" \ 183 "paddh %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \ 184 "paddh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \ 185 "psrah %[ftmp4], %[ftmp4], %[ftmp2] \n\t" \ 186 /* Combine left and right part */ \ 187 "packsshb %[ftmp4], %[ftmp3], %[ftmp4] \n\t" \ 188 "psubsb %[q2], %[q2], %[ftmp4] \n\t" \ 189 "pxor %[q2], %[q2], %[ftmp7] \n\t" \ 190 "paddsb %[p2], %[p2], %[ftmp4] \n\t" \ 191 "pxor %[p2], %[p2], %[ftmp7] \n\t" 192 193#define PUT_VP8_EPEL4_H6_MMI(src, dst) \ 194 MMI_ULWC1(%[ftmp1], src, 0x00) \ 195 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ 196 "pmullh %[ftmp3], %[ftmp2], %[filter2] \n\t" \ 197 \ 198 MMI_ULWC1(%[ftmp1], src, -0x01) \ 199 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ 200 "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \ 201 "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \ 202 \ 203 MMI_ULWC1(%[ftmp1], src, -0x02) \ 204 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ 205 "pmullh %[ftmp2], %[ftmp2], %[filter0] \n\t" \ 206 "paddsh %[ftmp5], %[ftmp3], %[ftmp2] \n\t" \ 207 \ 208 MMI_ULWC1(%[ftmp1], src, 0x01) \ 209 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ 210 "pmullh %[ftmp3], %[ftmp2], %[filter3] \n\t" \ 211 \ 212 MMI_ULWC1(%[ftmp1], src, 0x02) \ 213 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ 214 "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \ 215 "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \ 216 \ 217 MMI_ULWC1(%[ftmp1], src, 0x03) \ 218 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ 219 "pmullh %[ftmp2], %[ftmp2], %[filter5] \n\t" \ 220 "paddsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \ 221 \ 222 "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \ 223 "paddsh %[ftmp3], %[ftmp3], %[ff_pw_64] \n\t" \ 224 "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \ 225 "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \ 226 \ 227 MMI_SWC1(%[ftmp1], dst, 0x00) 228 229 230#define PUT_VP8_EPEL4_H4_MMI(src, dst) \ 231 MMI_ULWC1(%[ftmp1], src, 0x00) \ 232 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ 233 "pmullh %[ftmp3], %[ftmp2], %[filter2] \n\t" \ 234 \ 235 MMI_ULWC1(%[ftmp1], src, -0x01) \ 236 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ 237 "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \ 238 "psubsh %[ftmp5], %[ftmp3], %[ftmp2] \n\t" \ 239 \ 240 MMI_ULWC1(%[ftmp1], src, 0x01) \ 241 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ 242 "pmullh %[ftmp3], %[ftmp2], %[filter3] \n\t" \ 243 \ 244 MMI_ULWC1(%[ftmp1], src, 0x02) \ 245 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ 246 "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \ 247 "psubh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \ 248 \ 249 "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \ 250 \ 251 "paddsh %[ftmp3], %[ftmp3], %[ff_pw_64] \n\t" \ 252 "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \ 253 \ 254 "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \ 255 MMI_SWC1(%[ftmp1], dst, 0x00) 256 257 258#define PUT_VP8_EPEL4_V6_MMI(src, src1, dst, srcstride) \ 259 MMI_ULWC1(%[ftmp1], src, 0x00) \ 260 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ 261 "pmullh %[ftmp3], %[ftmp2], %[filter2] \n\t" \ 262 \ 263 PTR_SUBU ""#src1", "#src", "#srcstride" \n\t" \ 264 MMI_ULWC1(%[ftmp1], src1, 0x00) \ 265 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] 
\n\t" \ 266 "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \ 267 "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \ 268 \ 269 PTR_SUBU ""#src1", "#src1", "#srcstride" \n\t" \ 270 MMI_ULWC1(%[ftmp1], src1, 0x00) \ 271 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ 272 "pmullh %[ftmp2], %[ftmp2], %[filter0] \n\t" \ 273 "paddsh %[ftmp5], %[ftmp3], %[ftmp2] \n\t" \ 274 \ 275 PTR_ADDU ""#src1", "#src", "#srcstride" \n\t" \ 276 MMI_ULWC1(%[ftmp1], src1, 0x00) \ 277 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ 278 "pmullh %[ftmp3], %[ftmp2], %[filter3] \n\t" \ 279 \ 280 PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \ 281 MMI_ULWC1(%[ftmp1], src1, 0x00) \ 282 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ 283 "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \ 284 "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \ 285 \ 286 PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \ 287 MMI_ULWC1(%[ftmp1], src1, 0x00) \ 288 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ 289 "pmullh %[ftmp2], %[ftmp2], %[filter5] \n\t" \ 290 "paddsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \ 291 \ 292 "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \ 293 \ 294 "paddsh %[ftmp3], %[ftmp3], %[ff_pw_64] \n\t" \ 295 "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \ 296 "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \ 297 \ 298 MMI_SWC1(%[ftmp1], dst, 0x00) 299 300 301#define PUT_VP8_EPEL4_V4_MMI(src, src1, dst, srcstride) \ 302 MMI_ULWC1(%[ftmp1], src, 0x00) \ 303 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ 304 "pmullh %[ftmp3], %[ftmp2], %[filter2] \n\t" \ 305 \ 306 PTR_SUBU ""#src1", "#src", "#srcstride" \n\t" \ 307 MMI_ULWC1(%[ftmp1], src1, 0x00) \ 308 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ 309 "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \ 310 "psubsh %[ftmp5], %[ftmp3], %[ftmp2] \n\t" \ 311 \ 312 PTR_ADDU ""#src1", "#src", "#srcstride" \n\t" \ 313 MMI_ULWC1(%[ftmp1], src1, 0x00) \ 314 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ 315 "pmullh %[ftmp3], %[ftmp2], %[filter3] \n\t" \ 316 \ 317 PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \ 318 MMI_ULWC1(%[ftmp1], src1, 0x00) \ 319 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ 320 "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \ 321 "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \ 322 \ 323 "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \ 324 \ 325 "paddsh %[ftmp3], %[ftmp3], %[ff_pw_64] \n\t" \ 326 "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \ 327 "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \ 328 \ 329 MMI_SWC1(%[ftmp1], dst, 0x00) 330 331 332#define PUT_VP8_EPEL8_H6_MMI(src, dst) \ 333 MMI_ULDC1(%[ftmp1], src, 0x00) \ 334 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ 335 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ 336 "pmullh %[ftmp5], %[ftmp2], %[filter2] \n\t" \ 337 "pmullh %[ftmp6], %[ftmp3], %[filter2] \n\t" \ 338 \ 339 MMI_ULDC1(%[ftmp1], src, -0x01) \ 340 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ 341 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ 342 "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \ 343 "pmullh %[ftmp3], %[ftmp3], %[filter1] \n\t" \ 344 "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ 345 "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \ 346 \ 347 MMI_ULDC1(%[ftmp1], src, -0x02) \ 348 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ 349 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ 350 "pmullh %[ftmp2], %[ftmp2], %[filter0] \n\t" \ 351 "pmullh %[ftmp3], %[ftmp3], %[filter0] \n\t" \ 352 "paddsh %[ftmp7], %[ftmp5], %[ftmp2] \n\t" \ 353 "paddsh %[ftmp8], %[ftmp6], %[ftmp3] \n\t" \ 354 \ 355 MMI_ULDC1(%[ftmp1], src, 0x01) \ 356 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ 357 
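        /* widen the upper four bytes as well; ftmp0 holds zero here */     \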
"punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ 358 "pmullh %[ftmp5], %[ftmp2], %[filter3] \n\t" \ 359 "pmullh %[ftmp6], %[ftmp3], %[filter3] \n\t" \ 360 \ 361 MMI_ULDC1(%[ftmp1], src, 0x02) \ 362 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ 363 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ 364 "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \ 365 "pmullh %[ftmp3], %[ftmp3], %[filter4] \n\t" \ 366 "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ 367 "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \ 368 \ 369 MMI_ULDC1(%[ftmp1], src, 0x03) \ 370 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ 371 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ 372 "pmullh %[ftmp2], %[ftmp2], %[filter5] \n\t" \ 373 "pmullh %[ftmp3], %[ftmp3], %[filter5] \n\t" \ 374 "paddsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ 375 "paddsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \ 376 \ 377 "paddsh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \ 378 "paddsh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" \ 379 \ 380 "paddsh %[ftmp5], %[ftmp5], %[ff_pw_64] \n\t" \ 381 "paddsh %[ftmp6], %[ftmp6], %[ff_pw_64] \n\t" \ 382 "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \ 383 "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \ 384 "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \ 385 \ 386 MMI_SDC1(%[ftmp1], dst, 0x00) 387 388 389#define PUT_VP8_EPEL8_H4_MMI(src, dst) \ 390 MMI_ULDC1(%[ftmp1], src, 0x00) \ 391 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ 392 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ 393 "pmullh %[ftmp5], %[ftmp2], %[filter2] \n\t" \ 394 "pmullh %[ftmp6], %[ftmp3], %[filter2] \n\t" \ 395 \ 396 MMI_ULDC1(%[ftmp1], src, -0x01) \ 397 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ 398 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ 399 "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \ 400 "pmullh %[ftmp3], %[ftmp3], %[filter1] \n\t" \ 401 "psubsh %[ftmp7], %[ftmp5], %[ftmp2] \n\t" \ 402 "psubsh %[ftmp8], %[ftmp6], %[ftmp3] \n\t" \ 403 \ 404 MMI_ULDC1(%[ftmp1], src, 0x01) \ 405 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ 406 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ 407 "pmullh %[ftmp5], %[ftmp2], %[filter3] \n\t" \ 408 "pmullh %[ftmp6], %[ftmp3], %[filter3] \n\t" \ 409 \ 410 MMI_ULDC1(%[ftmp1], src, 0x02) \ 411 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ 412 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ 413 "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \ 414 "pmullh %[ftmp3], %[ftmp3], %[filter4] \n\t" \ 415 "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ 416 "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \ 417 \ 418 "paddsh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \ 419 "paddsh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" \ 420 \ 421 "paddsh %[ftmp5], %[ftmp5], %[ff_pw_64] \n\t" \ 422 "paddsh %[ftmp6], %[ftmp6], %[ff_pw_64] \n\t" \ 423 "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \ 424 "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \ 425 \ 426 "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \ 427 MMI_SDC1(%[ftmp1], dst, 0x00) 428 429 430#define PUT_VP8_EPEL8_V6_MMI(src, src1, dst, srcstride) \ 431 MMI_ULDC1(%[ftmp1], src, 0x00) \ 432 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ 433 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ 434 "pmullh %[ftmp5], %[ftmp2], %[filter2] \n\t" \ 435 "pmullh %[ftmp6], %[ftmp3], %[filter2] \n\t" \ 436 \ 437 PTR_SUBU ""#src1", "#src", "#srcstride" \n\t" \ 438 MMI_ULDC1(%[ftmp1], src1, 0x00) \ 439 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ 440 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ 441 "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \ 442 "pmullh %[ftmp3], %[ftmp3], %[filter1] \n\t" \ 443 "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ 444 
"psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \ 445 \ 446 PTR_SUBU ""#src1", "#src1", "#srcstride" \n\t" \ 447 MMI_ULDC1(%[ftmp1], src1, 0x00) \ 448 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ 449 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ 450 "pmullh %[ftmp2], %[ftmp2], %[filter0] \n\t" \ 451 "pmullh %[ftmp3], %[ftmp3], %[filter0] \n\t" \ 452 "paddsh %[ftmp7], %[ftmp5], %[ftmp2] \n\t" \ 453 "paddsh %[ftmp8], %[ftmp6], %[ftmp3] \n\t" \ 454 \ 455 PTR_ADDU ""#src1", "#src", "#srcstride" \n\t" \ 456 MMI_ULDC1(%[ftmp1], src1, 0x00) \ 457 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ 458 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ 459 "pmullh %[ftmp5], %[ftmp2], %[filter3] \n\t" \ 460 "pmullh %[ftmp6], %[ftmp3], %[filter3] \n\t" \ 461 \ 462 PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \ 463 MMI_ULDC1(%[ftmp1], src1, 0x00) \ 464 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ 465 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ 466 "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \ 467 "pmullh %[ftmp3], %[ftmp3], %[filter4] \n\t" \ 468 "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ 469 "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \ 470 \ 471 PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \ 472 MMI_ULDC1(%[ftmp1], src1, 0x00) \ 473 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ 474 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ 475 "pmullh %[ftmp2], %[ftmp2], %[filter5] \n\t" \ 476 "pmullh %[ftmp3], %[ftmp3], %[filter5] \n\t" \ 477 "paddsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ 478 "paddsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \ 479 \ 480 "paddsh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \ 481 "paddsh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" \ 482 \ 483 "paddsh %[ftmp5], %[ftmp5], %[ff_pw_64] \n\t" \ 484 "paddsh %[ftmp6], %[ftmp6], %[ff_pw_64] \n\t" \ 485 "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \ 486 "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \ 487 "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \ 488 \ 489 MMI_SDC1(%[ftmp1], dst, 0x00) 490 491 492#define PUT_VP8_EPEL8_V4_MMI(src, src1, dst, srcstride) \ 493 MMI_ULDC1(%[ftmp1], src, 0x00) \ 494 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ 495 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ 496 "pmullh %[ftmp5], %[ftmp2], %[filter2] \n\t" \ 497 "pmullh %[ftmp6], %[ftmp3], %[filter2] \n\t" \ 498 \ 499 PTR_SUBU ""#src1", "#src", "#srcstride" \n\t" \ 500 MMI_ULDC1(%[ftmp1], src1, 0x00) \ 501 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ 502 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ 503 "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \ 504 "pmullh %[ftmp3], %[ftmp3], %[filter1] \n\t" \ 505 "psubsh %[ftmp7], %[ftmp5], %[ftmp2] \n\t" \ 506 "psubsh %[ftmp8], %[ftmp6], %[ftmp3] \n\t" \ 507 \ 508 PTR_ADDU ""#src1", "#src", "#srcstride" \n\t" \ 509 MMI_ULDC1(%[ftmp1], src1, 0x00) \ 510 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ 511 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ 512 "pmullh %[ftmp5], %[ftmp2], %[filter3] \n\t" \ 513 "pmullh %[ftmp6], %[ftmp3], %[filter3] \n\t" \ 514 \ 515 PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \ 516 MMI_ULDC1(%[ftmp1], src1, 0x00) \ 517 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ 518 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ 519 "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \ 520 "pmullh %[ftmp3], %[ftmp3], %[filter4] \n\t" \ 521 "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ 522 "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \ 523 \ 524 "paddsh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \ 525 "paddsh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" \ 526 \ 527 "paddsh %[ftmp5], %[ftmp5], %[ff_pw_64] \n\t" \ 528 "paddsh %[ftmp6], 
%[ftmp6], %[ff_pw_64] \n\t" \ 529 "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \ 530 "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \ 531 "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \ 532 \ 533 MMI_SDC1(%[ftmp1], dst, 0x00) 534 535 536#define PUT_VP8_BILINEAR8_H_MMI(src, dst) \ 537 MMI_ULDC1(%[ftmp1], src, 0x00) \ 538 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ 539 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ 540 "pmullh %[ftmp5], %[ftmp2], %[a] \n\t" \ 541 "pmullh %[ftmp6], %[ftmp3], %[a] \n\t" \ 542 \ 543 MMI_ULDC1(%[ftmp1], src, 0x01) \ 544 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ 545 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ 546 "pmullh %[ftmp2], %[ftmp2], %[b] \n\t" \ 547 "pmullh %[ftmp3], %[ftmp3], %[b] \n\t" \ 548 "paddsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ 549 "paddsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \ 550 \ 551 "paddsh %[ftmp5], %[ftmp5], %[ff_pw_4] \n\t" \ 552 "paddsh %[ftmp6], %[ftmp6], %[ff_pw_4] \n\t" \ 553 "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \ 554 "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \ 555 \ 556 "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \ 557 MMI_SDC1(%[ftmp1], dst, 0x00) 558 559 560#define PUT_VP8_BILINEAR4_H_MMI(src, dst) \ 561 MMI_ULWC1(%[ftmp1], src, 0x00) \ 562 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ 563 "pmullh %[ftmp3], %[ftmp2], %[a] \n\t" \ 564 \ 565 MMI_ULWC1(%[ftmp1], src, 0x01) \ 566 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ 567 "pmullh %[ftmp2], %[ftmp2], %[b] \n\t" \ 568 "paddsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \ 569 \ 570 "paddsh %[ftmp3], %[ftmp3], %[ff_pw_4] \n\t" \ 571 "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \ 572 \ 573 "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \ 574 MMI_SWC1(%[ftmp1], dst, 0x00) 575 576 577#define PUT_VP8_BILINEAR8_V_MMI(src, src1, dst, sstride) \ 578 MMI_ULDC1(%[ftmp1], src, 0x00) \ 579 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ 580 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ 581 "pmullh %[ftmp5], %[ftmp2], %[c] \n\t" \ 582 "pmullh %[ftmp6], %[ftmp3], %[c] \n\t" \ 583 \ 584 PTR_ADDU ""#src1", "#src", "#sstride" \n\t" \ 585 MMI_ULDC1(%[ftmp1], src1, 0x00) \ 586 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ 587 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ 588 "pmullh %[ftmp2], %[ftmp2], %[d] \n\t" \ 589 "pmullh %[ftmp3], %[ftmp3], %[d] \n\t" \ 590 "paddsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ 591 "paddsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \ 592 \ 593 "paddsh %[ftmp5], %[ftmp5], %[ff_pw_4] \n\t" \ 594 "paddsh %[ftmp6], %[ftmp6], %[ff_pw_4] \n\t" \ 595 "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \ 596 "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \ 597 \ 598 "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \ 599 MMI_SDC1(%[ftmp1], dst, 0x00) 600 601 602#define PUT_VP8_BILINEAR4_V_MMI(src, src1, dst, sstride) \ 603 MMI_ULWC1(%[ftmp1], src, 0x00) \ 604 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ 605 "pmullh %[ftmp3], %[ftmp2], %[c] \n\t" \ 606 \ 607 PTR_ADDU ""#src1", "#src", "#sstride" \n\t" \ 608 MMI_ULWC1(%[ftmp1], src1, 0x00) \ 609 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ 610 "pmullh %[ftmp2], %[ftmp2], %[d] \n\t" \ 611 "paddsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \ 612 \ 613 "paddsh %[ftmp3], %[ftmp3], %[ff_pw_4] \n\t" \ 614 "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \ 615 \ 616 "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \ 617 MMI_SWC1(%[ftmp1], dst, 0x00) 618 619 620DECLARE_ALIGNED(8, static const uint64_t, fourtap_subpel_filters[7][6]) = { 621 {0x0000000000000000, 0x0006000600060006, 0x007b007b007b007b, 622 0x000c000c000c000c, 0x0001000100010001, 0x0000000000000000}, 623 624 
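    /* Each 64-bit entry is one 8-bit filter tap replicated into four 16-bit
     * halfwords for pmullh, e.g. 0x006c... is tap value 108; the rows match
     * the scalar subpel_filters[] reference table further down. */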
    {0x0002000200020002, 0x000b000b000b000b, 0x006c006c006c006c,
     0x0024002400240024, 0x0008000800080008, 0x0001000100010001},

    {0x0000000000000000, 0x0009000900090009, 0x005d005d005d005d,
     0x0032003200320032, 0x0006000600060006, 0x0000000000000000},

    {0x0003000300030003, 0x0010001000100010, 0x004d004d004d004d,
     0x004d004d004d004d, 0x0010001000100010, 0x0003000300030003},

    {0x0000000000000000, 0x0006000600060006, 0x0032003200320032,
     0x005d005d005d005d, 0x0009000900090009, 0x0000000000000000},

    {0x0001000100010001, 0x0008000800080008, 0x0024002400240024,
     0x006c006c006c006c, 0x000b000b000b000b, 0x0002000200020002},

    {0x0000000000000000, 0x0001000100010001, 0x000c000c000c000c,
     0x007b007b007b007b, 0x0006000600060006, 0x0000000000000000}
};

#if 0
#define FILTER_6TAP(src, F, stride)                                       \
    cm[(F[2] * src[x + 0 * stride] - F[1] * src[x - 1 * stride] +        \
        F[0] * src[x - 2 * stride] + F[3] * src[x + 1 * stride] -        \
        F[4] * src[x + 2 * stride] + F[5] * src[x + 3 * stride] + 64) >> 7]

#define FILTER_4TAP(src, F, stride)                                       \
    cm[(F[2] * src[x + 0 * stride] - F[1] * src[x - 1 * stride] +        \
        F[3] * src[x + 1 * stride] - F[4] * src[x + 2 * stride] + 64) >> 7]

static const uint8_t subpel_filters[7][6] = {
    { 0,  6, 123,  12,  1, 0 },
    { 2, 11, 108,  36,  8, 1 },
    { 0,  9,  93,  50,  6, 0 },
    { 3, 16,  77,  77, 16, 3 },
    { 0,  6,  50,  93,  9, 0 },
    { 1,  8,  36, 108, 11, 2 },
    { 0,  1,  12, 123,  6, 0 },
};

#define MUL_20091(a) ((((a) * 20091) >> 16) + (a))
#define MUL_35468(a)  (((a) * 35468) >> 16)
#endif

#define clip_int8(n) (cm[(n) + 0x80] - 0x80)
static av_always_inline void vp8_filter_common_is4tap(uint8_t *p,
        ptrdiff_t stride)
{
    int av_unused p1 = p[-2 * stride];
    int av_unused p0 = p[-1 * stride];
    int av_unused q0 = p[ 0 * stride];
    int av_unused q1 = p[ 1 * stride];
    int a, f1, f2;
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;

    a  = 3 * (q0 - p0);
    a += clip_int8(p1 - q1);
    a  = clip_int8(a);

    // We deviate from the spec here with c(a+3) >> 3
    // since that's what libvpx does.
    f1 = FFMIN(a + 4, 127) >> 3;
    f2 = FFMIN(a + 3, 127) >> 3;

    // Despite what the spec says, we do need to clamp here to
    // be bitexact with libvpx.
    p[-1 * stride] = cm[p0 + f2];
    p[ 0 * stride] = cm[q0 - f1];
}

static av_always_inline void vp8_filter_common_isnot4tap(uint8_t *p,
        ptrdiff_t stride)
{
    int av_unused p1 = p[-2 * stride];
    int av_unused p0 = p[-1 * stride];
    int av_unused q0 = p[ 0 * stride];
    int av_unused q1 = p[ 1 * stride];
    int a, f1, f2;
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;

    a = 3 * (q0 - p0);
    a = clip_int8(a);

    // We deviate from the spec here with c(a+3) >> 3
    // since that's what libvpx does.
    f1 = FFMIN(a + 4, 127) >> 3;
    f2 = FFMIN(a + 3, 127) >> 3;

    // Despite what the spec says, we do need to clamp here to
    // be bitexact with libvpx.
    p[-1 * stride] = cm[p0 + f2];
    p[ 0 * stride] = cm[q0 - f1];
    a = (f1 + 1) >> 1;
    p[-2 * stride] = cm[p1 + a];
    p[ 1 * stride] = cm[q1 - a];
}

static av_always_inline int vp8_simple_limit(uint8_t *p, ptrdiff_t stride,
        int flim)
{
    int av_unused p1 = p[-2 * stride];
    int av_unused p0 = p[-1 * stride];
    int av_unused q0 = p[ 0 * stride];
    int av_unused q1 = p[ 1 * stride];

    return 2 * FFABS(p0 - q0) + (FFABS(p1 - q1) >> 1) <= flim;
}

static av_always_inline int hev(uint8_t *p, ptrdiff_t stride, int thresh)
{
    int av_unused p1 = p[-2 * stride];
    int av_unused p0 = p[-1 * stride];
    int av_unused q0 = p[ 0 * stride];
    int av_unused q1 = p[ 1 * stride];

    return FFABS(p1 - p0) > thresh || FFABS(q1 - q0) > thresh;
}

static av_always_inline void filter_mbedge(uint8_t *p, ptrdiff_t stride)
{
    int a0, a1, a2, w;
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;

    int av_unused p2 = p[-3 * stride];
    int av_unused p1 = p[-2 * stride];
    int av_unused p0 = p[-1 * stride];
    int av_unused q0 = p[ 0 * stride];
    int av_unused q1 = p[ 1 * stride];
    int av_unused q2 = p[ 2 * stride];

    w = clip_int8(p1 - q1);
    w = clip_int8(w + 3 * (q0 - p0));

    a0 = (27 * w + 63) >> 7;
    a1 = (18 * w + 63) >> 7;
    a2 = ( 9 * w + 63) >> 7;

    p[-3 * stride] = cm[p2 + a2];
    p[-2 * stride] = cm[p1 + a1];
    p[-1 * stride] = cm[p0 + a0];
    p[ 0 * stride] = cm[q0 - a0];
    p[ 1 * stride] = cm[q1 - a1];
    p[ 2 * stride] = cm[q2 - a2];
}

static av_always_inline int vp8_normal_limit(uint8_t *p, ptrdiff_t stride,
        int E, int I)
{
    int av_unused p3 = p[-4 * stride];
    int av_unused p2 = p[-3 * stride];
    int av_unused p1 = p[-2 * stride];
    int av_unused p0 = p[-1 * stride];
    int av_unused q0 = p[ 0 * stride];
    int av_unused q1 = p[ 1 * stride];
    int av_unused q2 = p[ 2 * stride];
    int av_unused q3 = p[ 3 * stride];

    return vp8_simple_limit(p, stride, E) &&
           FFABS(p3 - p2) <= I && FFABS(p2 - p1) <= I &&
           FFABS(p1 - p0) <= I && FFABS(q3 - q2) <= I &&
           FFABS(q2 - q1) <= I && FFABS(q1 - q0) <= I;
}

static av_always_inline void vp8_v_loop_filter8_mmi(uint8_t *dst,
        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
{
    double ftmp[18];
    uint32_t tmp[1];
    DECLARE_DOUBLE_1;
    DECLARE_DOUBLE_2;
    DECLARE_UINT32_T;
    DECLARE_VAR_ALL64;

    __asm__ volatile(
        /* Get data from dst */
        MMI_ULDC1(%[q0], %[dst], 0x0)
        PTR_SUBU   "%[tmp0],    %[dst],     %[stride]       \n\t"
        MMI_ULDC1(%[p0], %[tmp0], 0x0)
        PTR_SUBU   "%[tmp0],    %[tmp0],    %[stride]       \n\t"
        MMI_ULDC1(%[p1], %[tmp0], 0x0)
        PTR_SUBU   "%[tmp0],    %[tmp0],    %[stride]       \n\t"
        MMI_ULDC1(%[p2], %[tmp0], 0x0)
        PTR_SUBU   "%[tmp0],    %[tmp0],    %[stride]       \n\t"
        MMI_ULDC1(%[p3], %[tmp0], 0x0)
        PTR_ADDU   "%[tmp0],    %[dst],     %[stride]       \n\t"
        MMI_ULDC1(%[q1], %[tmp0], 0x0)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[stride]       \n\t"
        MMI_ULDC1(%[q2], %[tmp0], 0x0)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[stride]       \n\t"
        MMI_ULDC1(%[q3], %[tmp0], 0x0)
        MMI_VP8_LOOP_FILTER
        /* Move to dst */
        MMI_USDC1(%[q0], %[dst], 0x0)
        PTR_SUBU   "%[tmp0],    %[dst],     %[stride]       \n\t"
        MMI_USDC1(%[p0], %[tmp0], 0x0)
        PTR_SUBU   "%[tmp0],    %[tmp0],    %[stride]       \n\t"
        MMI_USDC1(%[p1], %[tmp0], 0x0)
        PTR_SUBU   "%[tmp0],    %[tmp0],    %[stride]       \n\t"
        MMI_USDC1(%[p2], %[tmp0], 0x0)
        PTR_ADDU   "%[tmp0],    %[dst],     %[stride]       \n\t"
        MMI_USDC1(%[q1], %[tmp0], 0x0)
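        /* only p2..q2 are modified by the filter, so p3/q3 are not stored back */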
PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t" 825 MMI_USDC1(%[q2], %[tmp0], 0x0) 826 : RESTRICT_ASM_ALL64 827 [p3]"=&f"(ftmp[0]), [p2]"=&f"(ftmp[1]), 828 [p1]"=&f"(ftmp[2]), [p0]"=&f"(ftmp[3]), 829 [q0]"=&f"(ftmp[4]), [q1]"=&f"(ftmp[5]), 830 [q2]"=&f"(ftmp[6]), [q3]"=&f"(ftmp[7]), 831 [ftmp0]"=&f"(ftmp[8]), [ftmp1]"=&f"(ftmp[9]), 832 [ftmp2]"=&f"(ftmp[10]), [ftmp3]"=&f"(ftmp[11]), 833 [hev]"=&f"(ftmp[12]), [mask]"=&f"(ftmp[13]), 834 [ftmp4]"=&f"(ftmp[14]), [ftmp5]"=&f"(ftmp[15]), 835 [ftmp6]"=&f"(ftmp[16]), [ftmp7]"=&f"(ftmp[17]), 836 [dst]"+&r"(dst), [tmp0]"=&r"(tmp[0]), 837 RESTRICT_ASM_DOUBLE_1, RESTRICT_ASM_DOUBLE_2, 838 RESTRICT_ASM_UINT32_T 839 : [e]"r"((mips_reg)flim_E), [thresh]"r"((mips_reg)hev_thresh), 840 [i]"r"((mips_reg)flim_I), [stride]"r"((mips_reg)stride) 841 : "memory" 842 ); 843} 844 845static av_always_inline void vp8_v_loop_filter8_inner_mmi(uint8_t *dst, 846 ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh) 847{ 848 int i; 849 850 for (i = 0; i < 8; i++) 851 if (vp8_normal_limit(dst + i * 1, stride, flim_E, flim_I)) { 852 int hv = hev(dst + i * 1, stride, hev_thresh); 853 if (hv) 854 vp8_filter_common_is4tap(dst + i * 1, stride); 855 else 856 vp8_filter_common_isnot4tap(dst + i * 1, stride); 857 } 858} 859 860static av_always_inline void vp8_h_loop_filter8_mmi(uint8_t *dst, 861 ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh) 862{ 863 double ftmp[18]; 864 uint32_t tmp[1]; 865 DECLARE_DOUBLE_1; 866 DECLARE_DOUBLE_2; 867 DECLARE_UINT32_T; 868 DECLARE_VAR_ALL64; 869 870 __asm__ volatile( 871 /* Get data from dst */ 872 MMI_ULDC1(%[p3], %[dst], -0x04) 873 PTR_ADDU "%[tmp0], %[dst], %[stride] \n\t" 874 MMI_ULDC1(%[p2], %[tmp0], -0x04) 875 PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t" 876 MMI_ULDC1(%[p1], %[tmp0], -0x04) 877 PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t" 878 MMI_ULDC1(%[p0], %[tmp0], -0x04) 879 PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t" 880 MMI_ULDC1(%[q0], %[tmp0], -0x04) 881 PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t" 882 MMI_ULDC1(%[q1], %[tmp0], -0x04) 883 PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t" 884 MMI_ULDC1(%[q2], %[tmp0], -0x04) 885 PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t" 886 MMI_ULDC1(%[q3], %[tmp0], -0x04) 887 /* Matrix transpose */ 888 TRANSPOSE_8B(%[p3], %[p2], %[p1], %[p0], 889 %[q0], %[q1], %[q2], %[q3], 890 %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4]) 891 MMI_VP8_LOOP_FILTER 892 /* Matrix transpose */ 893 TRANSPOSE_8B(%[p3], %[p2], %[p1], %[p0], 894 %[q0], %[q1], %[q2], %[q3], 895 %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4]) 896 /* Move to dst */ 897 MMI_USDC1(%[p3], %[dst], -0x04) 898 PTR_ADDU "%[dst], %[dst], %[stride] \n\t" 899 MMI_USDC1(%[p2], %[dst], -0x04) 900 PTR_ADDU "%[dst], %[dst], %[stride] \n\t" 901 MMI_USDC1(%[p1], %[dst], -0x04) 902 PTR_ADDU "%[dst], %[dst], %[stride] \n\t" 903 MMI_USDC1(%[p0], %[dst], -0x04) 904 PTR_ADDU "%[dst], %[dst], %[stride] \n\t" 905 MMI_USDC1(%[q0], %[dst], -0x04) 906 PTR_ADDU "%[dst], %[dst], %[stride] \n\t" 907 MMI_USDC1(%[q1], %[dst], -0x04) 908 PTR_ADDU "%[dst], %[dst], %[stride] \n\t" 909 MMI_USDC1(%[q2], %[dst], -0x04) 910 PTR_ADDU "%[dst], %[dst], %[stride] \n\t" 911 MMI_USDC1(%[q3], %[dst], -0x04) 912 : RESTRICT_ASM_ALL64 913 [p3]"=&f"(ftmp[0]), [p2]"=&f"(ftmp[1]), 914 [p1]"=&f"(ftmp[2]), [p0]"=&f"(ftmp[3]), 915 [q0]"=&f"(ftmp[4]), [q1]"=&f"(ftmp[5]), 916 [q2]"=&f"(ftmp[6]), [q3]"=&f"(ftmp[7]), 917 [ftmp0]"=&f"(ftmp[8]), [ftmp1]"=&f"(ftmp[9]), 918 [ftmp2]"=&f"(ftmp[10]), [ftmp3]"=&f"(ftmp[11]), 919 [hev]"=&f"(ftmp[12]), [mask]"=&f"(ftmp[13]), 920 [ftmp4]"=&f"(ftmp[14]), 
[ftmp5]"=&f"(ftmp[15]), 921 [ftmp6]"=&f"(ftmp[16]), [ftmp7]"=&f"(ftmp[17]), 922 [dst]"+&r"(dst), [tmp0]"=&r"(tmp[0]), 923 RESTRICT_ASM_DOUBLE_1, RESTRICT_ASM_DOUBLE_2, 924 RESTRICT_ASM_UINT32_T 925 : [e]"r"((mips_reg)flim_E), [thresh]"r"((mips_reg)hev_thresh), 926 [i]"r"((mips_reg)flim_I), [stride]"r"((mips_reg)stride) 927 : "memory" 928 ); 929} 930 931static av_always_inline void vp8_h_loop_filter8_inner_mmi(uint8_t *dst, 932 ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh) 933{ 934 int i; 935 936 for (i = 0; i < 8; i++) 937 if (vp8_normal_limit(dst + i * stride, 1, flim_E, flim_I)) { 938 int hv = hev(dst + i * stride, 1, hev_thresh); 939 if (hv) 940 vp8_filter_common_is4tap(dst + i * stride, 1); 941 else 942 vp8_filter_common_isnot4tap(dst + i * stride, 1); 943 } 944} 945 946void ff_vp8_luma_dc_wht_mmi(int16_t block[4][4][16], int16_t dc[16]) 947{ 948#if 1 949 double ftmp[8]; 950 DECLARE_VAR_ALL64; 951 952 __asm__ volatile ( 953 MMI_LDC1(%[ftmp0], %[dc], 0x00) 954 MMI_LDC1(%[ftmp1], %[dc], 0x08) 955 MMI_LDC1(%[ftmp2], %[dc], 0x10) 956 MMI_LDC1(%[ftmp3], %[dc], 0x18) 957 "paddsh %[ftmp4], %[ftmp0], %[ftmp3] \n\t" 958 "psubsh %[ftmp5], %[ftmp0], %[ftmp3] \n\t" 959 "paddsh %[ftmp6], %[ftmp1], %[ftmp2] \n\t" 960 "psubsh %[ftmp7], %[ftmp1], %[ftmp2] \n\t" 961 "paddsh %[ftmp0], %[ftmp4], %[ftmp6] \n\t" 962 "paddsh %[ftmp1], %[ftmp5], %[ftmp7] \n\t" 963 "psubsh %[ftmp2], %[ftmp4], %[ftmp6] \n\t" 964 "psubsh %[ftmp3], %[ftmp5], %[ftmp7] \n\t" 965 MMI_SDC1(%[ftmp0], %[dc], 0x00) 966 MMI_SDC1(%[ftmp1], %[dc], 0x08) 967 MMI_SDC1(%[ftmp2], %[dc], 0x10) 968 MMI_SDC1(%[ftmp3], %[dc], 0x18) 969 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), 970 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), 971 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), 972 [ftmp6]"=&f"(ftmp[6]), 973 RESTRICT_ASM_ALL64 974 [ftmp7]"=&f"(ftmp[7]) 975 : [dc]"r"((uint8_t*)dc) 976 : "memory" 977 ); 978 979 block[0][0][0] = (dc[0] + dc[3] + 3 + dc[1] + dc[2]) >> 3; 980 block[0][1][0] = (dc[0] - dc[3] + 3 + dc[1] - dc[2]) >> 3; 981 block[0][2][0] = (dc[0] + dc[3] + 3 - dc[1] - dc[2]) >> 3; 982 block[0][3][0] = (dc[0] - dc[3] + 3 - dc[1] + dc[2]) >> 3; 983 984 block[1][0][0] = (dc[4] + dc[7] + 3 + dc[5] + dc[6]) >> 3; 985 block[1][1][0] = (dc[4] - dc[7] + 3 + dc[5] - dc[6]) >> 3; 986 block[1][2][0] = (dc[4] + dc[7] + 3 - dc[5] - dc[6]) >> 3; 987 block[1][3][0] = (dc[4] - dc[7] + 3 - dc[5] + dc[6]) >> 3; 988 989 block[2][0][0] = (dc[8] + dc[11] + 3 + dc[9] + dc[10]) >> 3; 990 block[2][1][0] = (dc[8] - dc[11] + 3 + dc[9] - dc[10]) >> 3; 991 block[2][2][0] = (dc[8] + dc[11] + 3 - dc[9] - dc[10]) >> 3; 992 block[2][3][0] = (dc[8] - dc[11] + 3 - dc[9] + dc[10]) >> 3; 993 994 block[3][0][0] = (dc[12] + dc[15] + 3 + dc[13] + dc[14]) >> 3; 995 block[3][1][0] = (dc[12] - dc[15] + 3 + dc[13] - dc[14]) >> 3; 996 block[3][2][0] = (dc[12] + dc[15] + 3 - dc[13] - dc[14]) >> 3; 997 block[3][3][0] = (dc[12] - dc[15] + 3 - dc[13] + dc[14]) >> 3; 998 999 __asm__ volatile ( 1000 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 1001 MMI_SDC1(%[ftmp0], %[dc], 0x00) 1002 MMI_SDC1(%[ftmp0], %[dc], 0x08) 1003 MMI_SDC1(%[ftmp0], %[dc], 0x10) 1004 MMI_SDC1(%[ftmp0], %[dc], 0x18) 1005 : RESTRICT_ASM_ALL64 1006 [ftmp0]"=&f"(ftmp[0]) 1007 : [dc]"r"((uint8_t *)dc) 1008 : "memory" 1009 ); 1010#else 1011 int t00, t01, t02, t03, t10, t11, t12, t13, t20, t21, t22, t23, t30, t31, t32, t33; 1012 1013 t00 = dc[0] + dc[12]; 1014 t10 = dc[1] + dc[13]; 1015 t20 = dc[2] + dc[14]; 1016 t30 = dc[3] + dc[15]; 1017 1018 t03 = dc[0] - dc[12]; 1019 t13 = dc[1] - dc[13]; 1020 t23 = 
dc[2] - dc[14]; 1021 t33 = dc[3] - dc[15]; 1022 1023 t01 = dc[4] + dc[ 8]; 1024 t11 = dc[5] + dc[ 9]; 1025 t21 = dc[6] + dc[10]; 1026 t31 = dc[7] + dc[11]; 1027 1028 t02 = dc[4] - dc[ 8]; 1029 t12 = dc[5] - dc[ 9]; 1030 t22 = dc[6] - dc[10]; 1031 t32 = dc[7] - dc[11]; 1032 1033 dc[ 0] = t00 + t01; 1034 dc[ 1] = t10 + t11; 1035 dc[ 2] = t20 + t21; 1036 dc[ 3] = t30 + t31; 1037 1038 dc[ 4] = t03 + t02; 1039 dc[ 5] = t13 + t12; 1040 dc[ 6] = t23 + t22; 1041 dc[ 7] = t33 + t32; 1042 1043 dc[ 8] = t00 - t01; 1044 dc[ 9] = t10 - t11; 1045 dc[10] = t20 - t21; 1046 dc[11] = t30 - t31; 1047 1048 dc[12] = t03 - t02; 1049 dc[13] = t13 - t12; 1050 dc[14] = t23 - t22; 1051 dc[15] = t33 - t32; 1052 1053 block[0][0][0] = (dc[0] + dc[3] + 3 + dc[1] + dc[2]) >> 3; 1054 block[0][1][0] = (dc[0] - dc[3] + 3 + dc[1] - dc[2]) >> 3; 1055 block[0][2][0] = (dc[0] + dc[3] + 3 - dc[1] - dc[2]) >> 3; 1056 block[0][3][0] = (dc[0] - dc[3] + 3 - dc[1] + dc[2]) >> 3; 1057 1058 block[1][0][0] = (dc[4] + dc[7] + 3 + dc[5] + dc[6]) >> 3; 1059 block[1][1][0] = (dc[4] - dc[7] + 3 + dc[5] - dc[6]) >> 3; 1060 block[1][2][0] = (dc[4] + dc[7] + 3 - dc[5] - dc[6]) >> 3; 1061 block[1][3][0] = (dc[4] - dc[7] + 3 - dc[5] + dc[6]) >> 3; 1062 1063 block[2][0][0] = (dc[8] + dc[11] + 3 + dc[9] + dc[10]) >> 3; 1064 block[2][1][0] = (dc[8] - dc[11] + 3 + dc[9] - dc[10]) >> 3; 1065 block[2][2][0] = (dc[8] + dc[11] + 3 - dc[9] - dc[10]) >> 3; 1066 block[2][3][0] = (dc[8] - dc[11] + 3 - dc[9] + dc[10]) >> 3; 1067 1068 block[3][0][0] = (dc[12] + dc[15] + 3 + dc[13] + dc[14]) >> 3; 1069 block[3][1][0] = (dc[12] - dc[15] + 3 + dc[13] - dc[14]) >> 3; 1070 block[3][2][0] = (dc[12] + dc[15] + 3 - dc[13] - dc[14]) >> 3; 1071 block[3][3][0] = (dc[12] - dc[15] + 3 - dc[13] + dc[14]) >> 3; 1072 1073 AV_ZERO64(dc + 0); 1074 AV_ZERO64(dc + 4); 1075 AV_ZERO64(dc + 8); 1076 AV_ZERO64(dc + 12); 1077#endif 1078} 1079 1080void ff_vp8_luma_dc_wht_dc_mmi(int16_t block[4][4][16], int16_t dc[16]) 1081{ 1082 int val = (dc[0] + 3) >> 3; 1083 1084 dc[0] = 0; 1085 1086 block[0][0][0] = val; 1087 block[0][1][0] = val; 1088 block[0][2][0] = val; 1089 block[0][3][0] = val; 1090 block[1][0][0] = val; 1091 block[1][1][0] = val; 1092 block[1][2][0] = val; 1093 block[1][3][0] = val; 1094 block[2][0][0] = val; 1095 block[2][1][0] = val; 1096 block[2][2][0] = val; 1097 block[2][3][0] = val; 1098 block[3][0][0] = val; 1099 block[3][1][0] = val; 1100 block[3][2][0] = val; 1101 block[3][3][0] = val; 1102} 1103 1104void ff_vp8_idct_add_mmi(uint8_t *dst, int16_t block[16], ptrdiff_t stride) 1105{ 1106#if 1 1107 double ftmp[12]; 1108 uint32_t tmp[1]; 1109 union av_intfloat64 ff_ph_4e7b_u; 1110 union av_intfloat64 ff_ph_22a3_u; 1111 DECLARE_VAR_LOW32; 1112 DECLARE_VAR_ALL64; 1113 ff_ph_4e7b_u.i = 0x4e7b4e7b4e7b4e7bULL; 1114 ff_ph_22a3_u.i = 0x22a322a322a322a3ULL; 1115 1116 __asm__ volatile ( 1117 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 1118 MMI_LDC1(%[ftmp1], %[block], 0x00) 1119 MMI_LDC1(%[ftmp2], %[block], 0x08) 1120 MMI_LDC1(%[ftmp3], %[block], 0x10) 1121 MMI_LDC1(%[ftmp4], %[block], 0x18) 1122 1123 "li %[tmp0], 0x02 \n\t" 1124 "mtc1 %[tmp0], %[ftmp11] \n\t" 1125 1126 // block[0...3] + block[8...11] 1127 "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t" 1128 // block[0...3] - block[8...11] 1129 "psubh %[ftmp6], %[ftmp1], %[ftmp3] \n\t" 1130 // MUL_35468(block[12...15]) 1131 "psllh %[ftmp9], %[ftmp4], %[ftmp11] \n\t" 1132 "pmulhh %[ftmp7], %[ftmp9], %[ff_ph_22a3] \n\t" 1133 // MUL_35468(block[4...7]) 1134 "psllh %[ftmp9], %[ftmp2], %[ftmp11] \n\t" 1135 "pmulhh %[ftmp8], %[ftmp9], 
%[ff_ph_22a3] \n\t" 1136 // MUL_20091(block[4...7] 1137 "pmulhh %[ftmp9], %[ftmp2], %[ff_ph_4e7b] \n\t" 1138 "paddh %[ftmp9], %[ftmp9], %[ftmp2] \n\t" 1139 // MUL_20091(block[12...15]) 1140 "pmulhh %[ftmp10], %[ftmp4], %[ff_ph_4e7b] \n\t" 1141 "paddh %[ftmp10], %[ftmp10], %[ftmp4] \n\t" 1142 1143 // tmp[0 4 8 12] 1144 "paddh %[ftmp1], %[ftmp5], %[ftmp7] \n\t" 1145 "paddh %[ftmp1], %[ftmp1], %[ftmp9] \n\t" 1146 // tmp[1 5 9 13] 1147 "paddh %[ftmp2], %[ftmp6], %[ftmp8] \n\t" 1148 "psubh %[ftmp2], %[ftmp2], %[ftmp10] \n\t" 1149 // tmp[2 6 10 14] 1150 "psubh %[ftmp3], %[ftmp6], %[ftmp8] \n\t" 1151 "paddh %[ftmp3], %[ftmp3], %[ftmp10] \n\t" 1152 // tmp[3 7 11 15] 1153 "psubh %[ftmp4], %[ftmp5], %[ftmp7] \n\t" 1154 "psubh %[ftmp4], %[ftmp4], %[ftmp9] \n\t" 1155 1156 MMI_SDC1(%[ftmp0], %[block], 0x00) 1157 MMI_SDC1(%[ftmp0], %[block], 0x08) 1158 MMI_SDC1(%[ftmp0], %[block], 0x10) 1159 MMI_SDC1(%[ftmp0], %[block], 0x18) 1160 1161 TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], 1162 %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8]) 1163 1164 // t[0 4 8 12] 1165 "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t" 1166 // t[1 5 9 13] 1167 "psubh %[ftmp6], %[ftmp1], %[ftmp3] \n\t" 1168 // t[2 6 10 14] 1169 "psllh %[ftmp9], %[ftmp2], %[ftmp11] \n\t" 1170 "pmulhh %[ftmp9], %[ftmp9], %[ff_ph_22a3] \n\t" 1171 "psubh %[ftmp7], %[ftmp9], %[ftmp4] \n\t" 1172 "pmulhh %[ftmp10], %[ftmp4], %[ff_ph_4e7b] \n\t" 1173 "psubh %[ftmp7], %[ftmp7], %[ftmp10] \n\t" 1174 // t[3 7 11 15] 1175 "psllh %[ftmp9], %[ftmp4], %[ftmp11] \n\t" 1176 "pmulhh %[ftmp9], %[ftmp9], %[ff_ph_22a3] \n\t" 1177 "paddh %[ftmp8], %[ftmp9], %[ftmp2] \n\t" 1178 "pmulhh %[ftmp10], %[ftmp2], %[ff_ph_4e7b] \n\t" 1179 "paddh %[ftmp8], %[ftmp8], %[ftmp10] \n\t" 1180 1181 "li %[tmp0], 0x03 \n\t" 1182 "mtc1 %[tmp0], %[ftmp11] \n\t" 1183 "paddh %[ftmp1], %[ftmp5], %[ftmp8] \n\t" 1184 "paddh %[ftmp1], %[ftmp1], %[ff_pw_4] \n\t" 1185 "psrah %[ftmp1], %[ftmp1], %[ftmp11] \n\t" 1186 "paddh %[ftmp2], %[ftmp6], %[ftmp7] \n\t" 1187 "paddh %[ftmp2], %[ftmp2], %[ff_pw_4] \n\t" 1188 "psrah %[ftmp2], %[ftmp2], %[ftmp11] \n\t" 1189 "psubh %[ftmp3], %[ftmp6], %[ftmp7] \n\t" 1190 "paddh %[ftmp3], %[ftmp3], %[ff_pw_4] \n\t" 1191 "psrah %[ftmp3], %[ftmp3], %[ftmp11] \n\t" 1192 "psubh %[ftmp4], %[ftmp5], %[ftmp8] \n\t" 1193 "paddh %[ftmp4], %[ftmp4], %[ff_pw_4] \n\t" 1194 "psrah %[ftmp4], %[ftmp4], %[ftmp11] \n\t" 1195 1196 TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], 1197 %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8]) 1198 1199 MMI_LWC1(%[ftmp5], %[dst0], 0x00) 1200 MMI_LWC1(%[ftmp6], %[dst1], 0x00) 1201 MMI_LWC1(%[ftmp7], %[dst2], 0x00) 1202 MMI_LWC1(%[ftmp8], %[dst3], 0x00) 1203 1204 "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" 1205 "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" 1206 "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" 1207 "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t" 1208 1209 "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" 1210 "paddh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" 1211 "paddh %[ftmp3], %[ftmp3], %[ftmp7] \n\t" 1212 "paddh %[ftmp4], %[ftmp4], %[ftmp8] \n\t" 1213 1214 "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t" 1215 "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" 1216 "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t" 1217 "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t" 1218 1219 MMI_SWC1(%[ftmp1], %[dst0], 0x00) 1220 MMI_SWC1(%[ftmp2], %[dst1], 0x00) 1221 MMI_SWC1(%[ftmp3], %[dst2], 0x00) 1222 MMI_SWC1(%[ftmp4], %[dst3], 0x00) 1223 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), 1224 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), 1225 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), 
          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
          RESTRICT_ASM_LOW32
          RESTRICT_ASM_ALL64
          [tmp0]"=&r"(tmp[0])
        : [dst0]"r"(dst),               [dst1]"r"(dst+stride),
          [dst2]"r"(dst+2*stride),      [dst3]"r"(dst+3*stride),
          [block]"r"(block),            [ff_pw_4]"f"(ff_pw_4.f),
          [ff_ph_4e7b]"f"(ff_ph_4e7b_u.f), [ff_ph_22a3]"f"(ff_ph_22a3_u.f)
        : "memory"
    );
#else
    int i, t0, t1, t2, t3;
    int16_t tmp[16];

    for (i = 0; i < 4; i++) {
        t0 = block[0 + i] + block[8 + i];
        t1 = block[0 + i] - block[8 + i];
        t2 = MUL_35468(block[4 + i]) - MUL_20091(block[12 + i]);
        t3 = MUL_20091(block[4 + i]) + MUL_35468(block[12 + i]);
        block[ 0 + i] = 0;
        block[ 4 + i] = 0;
        block[ 8 + i] = 0;
        block[12 + i] = 0;

        tmp[i * 4 + 0] = t0 + t3;
        tmp[i * 4 + 1] = t1 + t2;
        tmp[i * 4 + 2] = t1 - t2;
        tmp[i * 4 + 3] = t0 - t3;
    }

    for (i = 0; i < 4; i++) {
        t0 = tmp[0 + i] + tmp[8 + i];
        t1 = tmp[0 + i] - tmp[8 + i];
        t2 = MUL_35468(tmp[4 + i]) - MUL_20091(tmp[12 + i]);
        t3 = MUL_20091(tmp[4 + i]) + MUL_35468(tmp[12 + i]);

        dst[0] = av_clip_uint8(dst[0] + ((t0 + t3 + 4) >> 3));
        dst[1] = av_clip_uint8(dst[1] + ((t1 + t2 + 4) >> 3));
        dst[2] = av_clip_uint8(dst[2] + ((t1 - t2 + 4) >> 3));
        dst[3] = av_clip_uint8(dst[3] + ((t0 - t3 + 4) >> 3));
        dst += stride;
    }
#endif
}

void ff_vp8_idct_dc_add_mmi(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
{
#if 1
    int dc = (block[0] + 4) >> 3;
    double ftmp[6];
    DECLARE_VAR_LOW32;

    block[0] = 0;

    __asm__ volatile (
        "pxor       %[ftmp0],   %[ftmp0],   %[ftmp0]        \n\t"
        "mtc1       %[dc],      %[ftmp5]                    \n\t"
        MMI_LWC1(%[ftmp1], %[dst0], 0x00)
        MMI_LWC1(%[ftmp2], %[dst1], 0x00)
        MMI_LWC1(%[ftmp3], %[dst2], 0x00)
        MMI_LWC1(%[ftmp4], %[dst3], 0x00)
        "pshufh     %[ftmp5],   %[ftmp5],   %[ftmp0]        \n\t"
        "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]        \n\t"
        "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]        \n\t"
        "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]        \n\t"
        "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]        \n\t"
        "paddsh     %[ftmp1],   %[ftmp1],   %[ftmp5]        \n\t"
        "paddsh     %[ftmp2],   %[ftmp2],   %[ftmp5]        \n\t"
        "paddsh     %[ftmp3],   %[ftmp3],   %[ftmp5]        \n\t"
        "paddsh     %[ftmp4],   %[ftmp4],   %[ftmp5]        \n\t"
        "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]        \n\t"
        "packushb   %[ftmp2],   %[ftmp2],   %[ftmp0]        \n\t"
        "packushb   %[ftmp3],   %[ftmp3],   %[ftmp0]        \n\t"
        "packushb   %[ftmp4],   %[ftmp4],   %[ftmp0]        \n\t"
        MMI_SWC1(%[ftmp1], %[dst0], 0x00)
        MMI_SWC1(%[ftmp2], %[dst1], 0x00)
        MMI_SWC1(%[ftmp3], %[dst2], 0x00)
        MMI_SWC1(%[ftmp4], %[dst3], 0x00)
        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),
          RESTRICT_ASM_LOW32
          [ftmp5]"=&f"(ftmp[5])
        : [dst0]"r"(dst),               [dst1]"r"(dst+stride),
          [dst2]"r"(dst+2*stride),      [dst3]"r"(dst+3*stride),
          [dc]"r"(dc)
        : "memory"
    );
#else
    int i, dc = (block[0] + 4) >> 3;

    block[0] = 0;

    for (i = 0; i < 4; i++) {
        dst[0] = av_clip_uint8(dst[0] + dc);
        dst[1] = av_clip_uint8(dst[1] + dc);
        dst[2] = av_clip_uint8(dst[2] + dc);
        dst[3] = av_clip_uint8(dst[3] + dc);
        dst += stride;
    }
#endif
}

void ff_vp8_idct_dc_add4y_mmi(uint8_t *dst, int16_t block[4][16],
        ptrdiff_t stride)
{
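    /* DC-only IDCT of four horizontally adjacent 4x4 luma blocks,
     * i.e. one 16-pixel-wide row of blocks at x offsets 0, 4, 8 and 12 */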
    ff_vp8_idct_dc_add_mmi(dst +  0, block[0], stride);
    ff_vp8_idct_dc_add_mmi(dst +  4, block[1], stride);
    ff_vp8_idct_dc_add_mmi(dst +  8, block[2], stride);
    ff_vp8_idct_dc_add_mmi(dst + 12, block[3], stride);
}

void ff_vp8_idct_dc_add4uv_mmi(uint8_t *dst, int16_t block[4][16],
        ptrdiff_t stride)
{
    ff_vp8_idct_dc_add_mmi(dst + stride * 0 + 0, block[0], stride);
    ff_vp8_idct_dc_add_mmi(dst + stride * 0 + 4, block[1], stride);
    ff_vp8_idct_dc_add_mmi(dst + stride * 4 + 0, block[2], stride);
    ff_vp8_idct_dc_add_mmi(dst + stride * 4 + 4, block[3], stride);
}

// loop filter applied to edges between macroblocks
void ff_vp8_v_loop_filter16_mmi(uint8_t *dst, ptrdiff_t stride, int flim_E,
        int flim_I, int hev_thresh)
{
    vp8_v_loop_filter8_mmi(dst, stride, flim_E, flim_I, hev_thresh);
    vp8_v_loop_filter8_mmi(dst + 8, stride, flim_E, flim_I, hev_thresh);
}

void ff_vp8_h_loop_filter16_mmi(uint8_t *dst, ptrdiff_t stride, int flim_E,
        int flim_I, int hev_thresh)
{
    vp8_h_loop_filter8_mmi(dst, stride, flim_E, flim_I, hev_thresh);
    vp8_h_loop_filter8_mmi(dst + 8 * stride, stride, flim_E, flim_I,
                           hev_thresh);
}

void ff_vp8_v_loop_filter8uv_mmi(uint8_t *dstU, uint8_t *dstV, ptrdiff_t stride,
        int flim_E, int flim_I, int hev_thresh)
{
    vp8_v_loop_filter8_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
    vp8_v_loop_filter8_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
}

void ff_vp8_h_loop_filter8uv_mmi(uint8_t *dstU, uint8_t *dstV, ptrdiff_t stride,
        int flim_E, int flim_I, int hev_thresh)
{
    vp8_h_loop_filter8_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
    vp8_h_loop_filter8_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
}

// loop filter applied to inner macroblock edges
void ff_vp8_v_loop_filter16_inner_mmi(uint8_t *dst, ptrdiff_t stride,
        int flim_E, int flim_I, int hev_thresh)
{
    int i;

    for (i = 0; i < 16; i++)
        if (vp8_normal_limit(dst + i * 1, stride, flim_E, flim_I)) {
            int hv = hev(dst + i * 1, stride, hev_thresh);
            if (hv)
                vp8_filter_common_is4tap(dst + i * 1, stride);
            else
                vp8_filter_common_isnot4tap(dst + i * 1, stride);
        }
}

void ff_vp8_h_loop_filter16_inner_mmi(uint8_t *dst, ptrdiff_t stride,
        int flim_E, int flim_I, int hev_thresh)
{
    int i;

    for (i = 0; i < 16; i++)
        if (vp8_normal_limit(dst + i * stride, 1, flim_E, flim_I)) {
            int hv = hev(dst + i * stride, 1, hev_thresh);
            if (hv)
                vp8_filter_common_is4tap(dst + i * stride, 1);
            else
                vp8_filter_common_isnot4tap(dst + i * stride, 1);
        }
}

void ff_vp8_v_loop_filter8uv_inner_mmi(uint8_t *dstU, uint8_t *dstV,
        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
{
    vp8_v_loop_filter8_inner_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
    vp8_v_loop_filter8_inner_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
}

void ff_vp8_h_loop_filter8uv_inner_mmi(uint8_t *dstU, uint8_t *dstV,
        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
{
    vp8_h_loop_filter8_inner_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
    vp8_h_loop_filter8_inner_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
}

void ff_vp8_v_loop_filter_simple_mmi(uint8_t *dst, ptrdiff_t stride, int flim)
{
    int i;

    for (i = 0; i < 16; i++)
        if (vp8_simple_limit(dst + i, stride, flim))
vp8_filter_common_is4tap(dst + i, stride); 1431} 1432 1433void ff_vp8_h_loop_filter_simple_mmi(uint8_t *dst, ptrdiff_t stride, int flim) 1434{ 1435 int i; 1436 1437 for (i = 0; i < 16; i++) 1438 if (vp8_simple_limit(dst + i * stride, 1, flim)) 1439 vp8_filter_common_is4tap(dst + i * stride, 1); 1440} 1441 1442void ff_put_vp8_pixels16_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, 1443 ptrdiff_t srcstride, int h, int x, int y) 1444{ 1445#if 1 1446 double ftmp[2]; 1447 uint64_t tmp[2]; 1448 mips_reg addr[2]; 1449 DECLARE_VAR_ALL64; 1450 1451 __asm__ volatile ( 1452 "1: \n\t" 1453 PTR_ADDU "%[addr0], %[src], %[srcstride] \n\t" 1454 MMI_ULDC1(%[ftmp0], %[src], 0x00) 1455 "ldl %[tmp0], 0x0f(%[src]) \n\t" 1456 "ldr %[tmp0], 0x08(%[src]) \n\t" 1457 MMI_ULDC1(%[ftmp1], %[addr0], 0x00) 1458 "ldl %[tmp1], 0x0f(%[addr0]) \n\t" 1459 "ldr %[tmp1], 0x08(%[addr0]) \n\t" 1460 PTR_ADDU "%[addr1], %[dst], %[dststride] \n\t" 1461 MMI_SDC1(%[ftmp0], %[dst], 0x00) 1462 "sdl %[tmp0], 0x0f(%[dst]) \n\t" 1463 "sdr %[tmp0], 0x08(%[dst]) \n\t" 1464 "addiu %[h], %[h], -0x02 \n\t" 1465 MMI_SDC1(%[ftmp1], %[addr1], 0x00) 1466 PTR_ADDU "%[src], %[addr0], %[srcstride] \n\t" 1467 "sdl %[tmp1], 0x0f(%[addr1]) \n\t" 1468 "sdr %[tmp1], 0x08(%[addr1]) \n\t" 1469 PTR_ADDU "%[dst], %[addr1], %[dststride] \n\t" 1470 "bnez %[h], 1b \n\t" 1471 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), 1472 [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), 1473 RESTRICT_ASM_ALL64 1474 [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), 1475 [dst]"+&r"(dst), [src]"+&r"(src), 1476 [h]"+&r"(h) 1477 : [dststride]"r"((mips_reg)dststride), 1478 [srcstride]"r"((mips_reg)srcstride) 1479 : "memory" 1480 ); 1481#else 1482 int i; 1483 1484 for (i = 0; i < h; i++, dst += dststride, src += srcstride) 1485 memcpy(dst, src, 16); 1486#endif 1487} 1488 1489void ff_put_vp8_pixels8_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, 1490 ptrdiff_t srcstride, int h, int x, int y) 1491{ 1492#if 1 1493 double ftmp[1]; 1494 uint64_t tmp[1]; 1495 mips_reg addr[2]; 1496 DECLARE_VAR_ALL64; 1497 1498 __asm__ volatile ( 1499 "1: \n\t" 1500 PTR_ADDU "%[addr0], %[src], %[srcstride] \n\t" 1501 MMI_ULDC1(%[ftmp0], %[src], 0x00) 1502 "ldl %[tmp0], 0x07(%[addr0]) \n\t" 1503 "ldr %[tmp0], 0x00(%[addr0]) \n\t" 1504 PTR_ADDU "%[addr1], %[dst], %[dststride] \n\t" 1505 MMI_SDC1(%[ftmp0], %[dst], 0x00) 1506 "addiu %[h], %[h], -0x02 \n\t" 1507 "sdl %[tmp0], 0x07(%[addr1]) \n\t" 1508 "sdr %[tmp0], 0x00(%[addr1]) \n\t" 1509 PTR_ADDU "%[src], %[addr0], %[srcstride] \n\t" 1510 PTR_ADDU "%[dst], %[addr1], %[dststride] \n\t" 1511 "bnez %[h], 1b \n\t" 1512 : [ftmp0]"=&f"(ftmp[0]), [tmp0]"=&r"(tmp[0]), 1513 RESTRICT_ASM_ALL64 1514 [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), 1515 [dst]"+&r"(dst), [src]"+&r"(src), 1516 [h]"+&r"(h) 1517 : [dststride]"r"((mips_reg)dststride), 1518 [srcstride]"r"((mips_reg)srcstride) 1519 : "memory" 1520 ); 1521#else 1522 int i; 1523 1524 for (i = 0; i < h; i++, dst += dststride, src += srcstride) 1525 memcpy(dst, src, 8); 1526#endif 1527} 1528 1529void ff_put_vp8_pixels4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, 1530 ptrdiff_t srcstride, int h, int x, int y) 1531{ 1532#if 1 1533 double ftmp[1]; 1534 uint64_t tmp[1]; 1535 mips_reg addr[2]; 1536 DECLARE_VAR_LOW32; 1537 1538 __asm__ volatile ( 1539 "1: \n\t" 1540 PTR_ADDU "%[addr0], %[src], %[srcstride] \n\t" 1541 MMI_LWC1(%[ftmp0], %[src], 0x00) 1542 "lwl %[tmp0], 0x03(%[addr0]) \n\t" 1543 "lwr %[tmp0], 0x00(%[addr0]) \n\t" 1544 PTR_ADDU "%[addr1], %[dst], %[dststride] \n\t" 1545 MMI_SWC1(%[ftmp0], %[dst], 
0x00) 1546 "addiu %[h], %[h], -0x02 \n\t" 1547 "swl %[tmp0], 0x03(%[addr1]) \n\t" 1548 "swr %[tmp0], 0x00(%[addr1]) \n\t" 1549 PTR_ADDU "%[src], %[addr0], %[srcstride] \n\t" 1550 PTR_ADDU "%[dst], %[addr1], %[dststride] \n\t" 1551 "bnez %[h], 1b \n\t" 1552 : [ftmp0]"=&f"(ftmp[0]), [tmp0]"=&r"(tmp[0]), 1553 RESTRICT_ASM_LOW32 1554 [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), 1555 [dst]"+&r"(dst), [src]"+&r"(src), 1556 [h]"+&r"(h) 1557 : [dststride]"r"((mips_reg)dststride), 1558 [srcstride]"r"((mips_reg)srcstride) 1559 : "memory" 1560 ); 1561#else 1562 int i; 1563 1564 for (i = 0; i < h; i++, dst += dststride, src += srcstride) 1565 memcpy(dst, src, 4); 1566#endif 1567} 1568 1569void ff_put_vp8_epel16_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, 1570 ptrdiff_t srcstride, int h, int mx, int my) 1571{ 1572#if 1 1573 const uint64_t *filter = fourtap_subpel_filters[mx - 1]; 1574 double ftmp[9]; 1575 uint32_t tmp[1]; 1576 union av_intfloat64 filter1; 1577 union av_intfloat64 filter2; 1578 union av_intfloat64 filter3; 1579 union av_intfloat64 filter4; 1580 mips_reg src1, dst1; 1581 DECLARE_VAR_ALL64; 1582 filter1.i = filter[1]; 1583 filter2.i = filter[2]; 1584 filter3.i = filter[3]; 1585 filter4.i = filter[4]; 1586 1587 /* 1588 dst[0] = cm[(filter[2] * src[0] - filter[1] * src[-1] + filter[3] * src[1] - filter[4] * src[2] + 64) >> 7]; 1589 dst[1] = cm[(filter[2] * src[1] - filter[1] * src[ 0] + filter[3] * src[2] - filter[4] * src[3] + 64) >> 7]; 1590 dst[2] = cm[(filter[2] * src[2] - filter[1] * src[ 1] + filter[3] * src[3] - filter[4] * src[4] + 64) >> 7]; 1591 dst[3] = cm[(filter[2] * src[3] - filter[1] * src[ 2] + filter[3] * src[4] - filter[4] * src[5] + 64) >> 7]; 1592 dst[4] = cm[(filter[2] * src[4] - filter[1] * src[ 3] + filter[3] * src[5] - filter[4] * src[6] + 64) >> 7]; 1593 dst[5] = cm[(filter[2] * src[5] - filter[1] * src[ 4] + filter[3] * src[6] - filter[4] * src[7] + 64) >> 7]; 1594 dst[6] = cm[(filter[2] * src[6] - filter[1] * src[ 5] + filter[3] * src[7] - filter[4] * src[8] + 64) >> 7]; 1595 dst[7] = cm[(filter[2] * src[7] - filter[1] * src[ 6] + filter[3] * src[8] - filter[4] * src[9] + 64) >> 7]; 1596 1597 dst[ 8] = cm[(filter[2] * src[ 8] - filter[1] * src[ 7] + filter[3] * src[ 9] - filter[4] * src[10] + 64) >> 7]; 1598 dst[ 9] = cm[(filter[2] * src[ 9] - filter[1] * src[ 8] + filter[3] * src[10] - filter[4] * src[11] + 64) >> 7]; 1599 dst[10] = cm[(filter[2] * src[10] - filter[1] * src[ 9] + filter[3] * src[11] - filter[4] * src[12] + 64) >> 7]; 1600 dst[11] = cm[(filter[2] * src[11] - filter[1] * src[10] + filter[3] * src[12] - filter[4] * src[13] + 64) >> 7]; 1601 dst[12] = cm[(filter[2] * src[12] - filter[1] * src[11] + filter[3] * src[13] - filter[4] * src[14] + 64) >> 7]; 1602 dst[13] = cm[(filter[2] * src[13] - filter[1] * src[12] + filter[3] * src[14] - filter[4] * src[15] + 64) >> 7]; 1603 dst[14] = cm[(filter[2] * src[14] - filter[1] * src[13] + filter[3] * src[15] - filter[4] * src[16] + 64) >> 7]; 1604 dst[15] = cm[(filter[2] * src[15] - filter[1] * src[14] + filter[3] * src[16] - filter[4] * src[17] + 64) >> 7]; 1605 */ 1606 __asm__ volatile ( 1607 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 1608 "li %[tmp0], 0x07 \n\t" 1609 "mtc1 %[tmp0], %[ftmp4] \n\t" 1610 1611 "1: \n\t" 1612 // 0 - 7 1613 PUT_VP8_EPEL8_H4_MMI(%[src], %[dst]) 1614 PTR_ADDIU "%[src1], %[src], 0x08 \n\t" 1615 PTR_ADDIU "%[dst1], %[dst], 0x08 \n\t" 1616 // 8 - 15 1617 PUT_VP8_EPEL8_H4_MMI(%[src1], %[dst1]) 1618 1619 "addiu %[h], %[h], -0x01 \n\t" 1620 PTR_ADDU "%[src], %[src], 
%[srcstride] \n\t" 1621 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t" 1622 "bnez %[h], 1b \n\t" 1623 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), 1624 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), 1625 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), 1626 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), 1627 [ftmp8]"=&f"(ftmp[8]), 1628 [tmp0]"=&r"(tmp[0]), 1629 RESTRICT_ASM_ALL64 1630 [dst1]"=&r"(dst1), [src1]"=&r"(src1), 1631 [h]"+&r"(h), 1632 [dst]"+&r"(dst), [src]"+&r"(src) 1633 : [ff_pw_64]"f"(ff_pw_64.f), 1634 [srcstride]"r"((mips_reg)srcstride), 1635 [dststride]"r"((mips_reg)dststride), 1636 [filter1]"f"(filter1.f), [filter2]"f"(filter2.f), 1637 [filter3]"f"(filter3.f), [filter4]"f"(filter4.f) 1638 : "memory" 1639 ); 1640#else 1641 const uint8_t *filter = subpel_filters[mx - 1]; 1642 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP; 1643 int x, y; 1644 1645 for (y = 0; y < h; y++) { 1646 for (x = 0; x < 16; x++) 1647 dst[x] = FILTER_4TAP(src, filter, 1); 1648 dst += dststride; 1649 src += srcstride; 1650 } 1651#endif 1652} 1653 1654void ff_put_vp8_epel8_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, 1655 ptrdiff_t srcstride, int h, int mx, int my) 1656{ 1657#if 1 1658 const uint64_t *filter = fourtap_subpel_filters[mx - 1]; 1659 double ftmp[9]; 1660 uint32_t tmp[1]; 1661 union av_intfloat64 filter1; 1662 union av_intfloat64 filter2; 1663 union av_intfloat64 filter3; 1664 union av_intfloat64 filter4; 1665 DECLARE_VAR_ALL64; 1666 filter1.i = filter[1]; 1667 filter2.i = filter[2]; 1668 filter3.i = filter[3]; 1669 filter4.i = filter[4]; 1670 1671 1672 /* 1673 dst[0] = cm[(filter[2] * src[0] - filter[1] * src[-1] + filter[3] * src[1] - filter[4] * src[2] + 64) >> 7]; 1674 dst[1] = cm[(filter[2] * src[1] - filter[1] * src[ 0] + filter[3] * src[2] - filter[4] * src[3] + 64) >> 7]; 1675 dst[2] = cm[(filter[2] * src[2] - filter[1] * src[ 1] + filter[3] * src[3] - filter[4] * src[4] + 64) >> 7]; 1676 dst[3] = cm[(filter[2] * src[3] - filter[1] * src[ 2] + filter[3] * src[4] - filter[4] * src[5] + 64) >> 7]; 1677 dst[4] = cm[(filter[2] * src[4] - filter[1] * src[ 3] + filter[3] * src[5] - filter[4] * src[6] + 64) >> 7]; 1678 dst[5] = cm[(filter[2] * src[5] - filter[1] * src[ 4] + filter[3] * src[6] - filter[4] * src[7] + 64) >> 7]; 1679 dst[6] = cm[(filter[2] * src[6] - filter[1] * src[ 5] + filter[3] * src[7] - filter[4] * src[8] + 64) >> 7]; 1680 dst[7] = cm[(filter[2] * src[7] - filter[1] * src[ 6] + filter[3] * src[8] - filter[4] * src[9] + 64) >> 7]; 1681 */ 1682 __asm__ volatile ( 1683 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 1684 "li %[tmp0], 0x07 \n\t" 1685 "mtc1 %[tmp0], %[ftmp4] \n\t" 1686 1687 "1: \n\t" 1688 PUT_VP8_EPEL8_H4_MMI(%[src], %[dst]) 1689 1690 "addiu %[h], %[h], -0x01 \n\t" 1691 PTR_ADDU "%[src], %[src], %[srcstride] \n\t" 1692 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t" 1693 "bnez %[h], 1b \n\t" 1694 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), 1695 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), 1696 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), 1697 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), 1698 [ftmp8]"=&f"(ftmp[8]), 1699 [tmp0]"=&r"(tmp[0]), 1700 RESTRICT_ASM_ALL64 1701 [h]"+&r"(h), 1702 [dst]"+&r"(dst), [src]"+&r"(src) 1703 : [ff_pw_64]"f"(ff_pw_64.f), 1704 [srcstride]"r"((mips_reg)srcstride), 1705 [dststride]"r"((mips_reg)dststride), 1706 [filter1]"f"(filter1.f), [filter2]"f"(filter2.f), 1707 [filter3]"f"(filter3.f), [filter4]"f"(filter4.f) 1708 : "memory" 1709 ); 1710#else 1711 const uint8_t *filter = subpel_filters[mx - 1]; 1712 const uint8_t *cm = 
ff_crop_tab + MAX_NEG_CROP; 1713 int x, y; 1714 1715 for (y = 0; y < h; y++) { 1716 for (x = 0; x < 8; x++) 1717 dst[x] = FILTER_4TAP(src, filter, 1); 1718 dst += dststride; 1719 src += srcstride; 1720 } 1721#endif 1722} 1723 1724void ff_put_vp8_epel4_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, 1725 ptrdiff_t srcstride, int h, int mx, int my) 1726{ 1727#if 1 1728 const uint64_t *filter = fourtap_subpel_filters[mx - 1]; 1729 double ftmp[6]; 1730 uint32_t tmp[1]; 1731 union av_intfloat64 filter1; 1732 union av_intfloat64 filter2; 1733 union av_intfloat64 filter3; 1734 union av_intfloat64 filter4; 1735 DECLARE_VAR_LOW32; 1736 filter1.i = filter[1]; 1737 filter2.i = filter[2]; 1738 filter3.i = filter[3]; 1739 filter4.i = filter[4]; 1740 1741 /* 1742 dst[0] = cm[(filter[2] * src[0] - filter[1] * src[-1] + filter[3] * src[1] - filter[4] * src[2] + 64) >> 7]; 1743 dst[1] = cm[(filter[2] * src[1] - filter[1] * src[ 0] + filter[3] * src[2] - filter[4] * src[3] + 64) >> 7]; 1744 dst[2] = cm[(filter[2] * src[2] - filter[1] * src[ 1] + filter[3] * src[3] - filter[4] * src[4] + 64) >> 7]; 1745 dst[3] = cm[(filter[2] * src[3] - filter[1] * src[ 2] + filter[3] * src[4] - filter[4] * src[5] + 64) >> 7]; 1746 */ 1747 __asm__ volatile ( 1748 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 1749 "li %[tmp0], 0x07 \n\t" 1750 "mtc1 %[tmp0], %[ftmp4] \n\t" 1751 1752 "1: \n\t" 1753 PUT_VP8_EPEL4_H4_MMI(%[src], %[dst]) 1754 1755 "addiu %[h], %[h], -0x01 \n\t" 1756 PTR_ADDU "%[src], %[src], %[srcstride] \n\t" 1757 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t" 1758 "bnez %[h], 1b \n\t" 1759 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), 1760 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), 1761 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), 1762 [tmp0]"=&r"(tmp[0]), 1763 RESTRICT_ASM_LOW32 1764 [h]"+&r"(h), 1765 [dst]"+&r"(dst), [src]"+&r"(src) 1766 : [ff_pw_64]"f"(ff_pw_64.f), 1767 [srcstride]"r"((mips_reg)srcstride), 1768 [dststride]"r"((mips_reg)dststride), 1769 [filter1]"f"(filter1.f), [filter2]"f"(filter2.f), 1770 [filter3]"f"(filter3.f), [filter4]"f"(filter4.f) 1771 : "memory" 1772 ); 1773#else 1774 const uint8_t *filter = subpel_filters[mx - 1]; 1775 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP; 1776 int x, y; 1777 1778 for (y = 0; y < h; y++) { 1779 for (x = 0; x < 4; x++) 1780 dst[x] = FILTER_4TAP(src, filter, 1); 1781 dst += dststride; 1782 src += srcstride; 1783 } 1784#endif 1785} 1786 1787void ff_put_vp8_epel16_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, 1788 ptrdiff_t srcstride, int h, int mx, int my) 1789{ 1790#if 1 1791 const uint64_t *filter = fourtap_subpel_filters[mx - 1]; 1792 double ftmp[9]; 1793 uint32_t tmp[1]; 1794 mips_reg src1, dst1; 1795 union av_intfloat64 filter0; 1796 union av_intfloat64 filter1; 1797 union av_intfloat64 filter2; 1798 union av_intfloat64 filter3; 1799 union av_intfloat64 filter4; 1800 union av_intfloat64 filter5; 1801 DECLARE_VAR_ALL64; 1802 filter0.i = filter[0]; 1803 filter1.i = filter[1]; 1804 filter2.i = filter[2]; 1805 filter3.i = filter[3]; 1806 filter4.i = filter[4]; 1807 filter5.i = filter[5]; 1808 1809 /* 1810 dst[ 0] = cm[(filter[2]*src[ 0] - filter[1]*src[-1] + filter[0]*src[-2] + filter[3]*src[ 1] - filter[4]*src[ 2] + filter[5]*src[ 3] + 64) >> 7]; 1811 dst[ 1] = cm[(filter[2]*src[ 1] - filter[1]*src[ 0] + filter[0]*src[-1] + filter[3]*src[ 2] - filter[4]*src[ 3] + filter[5]*src[ 4] + 64) >> 7]; 1812 dst[ 2] = cm[(filter[2]*src[ 2] - filter[1]*src[ 1] + filter[0]*src[ 0] + filter[3]*src[ 3] - filter[4]*src[ 4] + filter[5]*src[ 5] + 64) >> 7]; 
1813 dst[ 3] = cm[(filter[2]*src[ 3] - filter[1]*src[ 2] + filter[0]*src[ 1] + filter[3]*src[ 4] - filter[4]*src[ 5] + filter[5]*src[ 6] + 64) >> 7]; 1814 dst[ 4] = cm[(filter[2]*src[ 4] - filter[1]*src[ 3] + filter[0]*src[ 2] + filter[3]*src[ 5] - filter[4]*src[ 6] + filter[5]*src[ 7] + 64) >> 7]; 1815 dst[ 5] = cm[(filter[2]*src[ 5] - filter[1]*src[ 4] + filter[0]*src[ 3] + filter[3]*src[ 6] - filter[4]*src[ 7] + filter[5]*src[ 8] + 64) >> 7]; 1816 dst[ 6] = cm[(filter[2]*src[ 6] - filter[1]*src[ 5] + filter[0]*src[ 4] + filter[3]*src[ 7] - filter[4]*src[ 8] + filter[5]*src[ 9] + 64) >> 7]; 1817 dst[ 7] = cm[(filter[2]*src[ 7] - filter[1]*src[ 6] + filter[0]*src[ 5] + filter[3]*src[ 8] - filter[4]*src[ 9] + filter[5]*src[10] + 64) >> 7]; 1818 1819 dst[ 8] = cm[(filter[2]*src[ 8] - filter[1]*src[ 7] + filter[0]*src[ 6] + filter[3]*src[ 9] - filter[4]*src[10] + filter[5]*src[11] + 64) >> 7]; 1820 dst[ 9] = cm[(filter[2]*src[ 9] - filter[1]*src[ 8] + filter[0]*src[ 7] + filter[3]*src[10] - filter[4]*src[11] + filter[5]*src[12] + 64) >> 7]; 1821 dst[10] = cm[(filter[2]*src[10] - filter[1]*src[ 9] + filter[0]*src[ 8] + filter[3]*src[11] - filter[4]*src[12] + filter[5]*src[13] + 64) >> 7]; 1822 dst[11] = cm[(filter[2]*src[11] - filter[1]*src[10] + filter[0]*src[ 9] + filter[3]*src[12] - filter[4]*src[13] + filter[5]*src[14] + 64) >> 7]; 1823 dst[12] = cm[(filter[2]*src[12] - filter[1]*src[11] + filter[0]*src[10] + filter[3]*src[13] - filter[4]*src[14] + filter[5]*src[15] + 64) >> 7]; 1824 dst[13] = cm[(filter[2]*src[13] - filter[1]*src[12] + filter[0]*src[11] + filter[3]*src[14] - filter[4]*src[15] + filter[5]*src[16] + 64) >> 7]; 1825 dst[14] = cm[(filter[2]*src[14] - filter[1]*src[13] + filter[0]*src[12] + filter[3]*src[15] - filter[4]*src[16] + filter[5]*src[17] + 64) >> 7]; 1826 dst[15] = cm[(filter[2]*src[15] - filter[1]*src[14] + filter[0]*src[13] + filter[3]*src[16] - filter[4]*src[17] + filter[5]*src[18] + 64) >> 7]; 1827 */ 1828 __asm__ volatile ( 1829 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 1830 "li %[tmp0], 0x07 \n\t" 1831 "mtc1 %[tmp0], %[ftmp4] \n\t" 1832 1833 "1: \n\t" 1834 // 0 - 7 1835 PUT_VP8_EPEL8_H6_MMI(%[src], %[dst]) 1836 PTR_ADDIU "%[src1], %[src], 0x08 \n\t" 1837 PTR_ADDIU "%[dst1], %[dst], 0x08 \n\t" 1838 // 8 - 15 1839 PUT_VP8_EPEL8_H6_MMI(%[src1], %[dst1]) 1840 1841 "addiu %[h], %[h], -0x01 \n\t" 1842 PTR_ADDU "%[src], %[src], %[srcstride] \n\t" 1843 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t" 1844 "bnez %[h], 1b \n\t" 1845 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), 1846 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), 1847 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), 1848 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), 1849 [ftmp8]"=&f"(ftmp[8]), 1850 [tmp0]"=&r"(tmp[0]), 1851 RESTRICT_ASM_ALL64 1852 [dst1]"=&r"(dst1), [src1]"=&r"(src1), 1853 [h]"+&r"(h), 1854 [dst]"+&r"(dst), [src]"+&r"(src) 1855 : [ff_pw_64]"f"(ff_pw_64.f), 1856 [srcstride]"r"((mips_reg)srcstride), 1857 [dststride]"r"((mips_reg)dststride), 1858 [filter0]"f"(filter0.f), [filter1]"f"(filter1.f), 1859 [filter2]"f"(filter2.f), [filter3]"f"(filter3.f), 1860 [filter4]"f"(filter4.f), [filter5]"f"(filter5.f) 1861 : "memory" 1862 ); 1863#else 1864 const uint8_t *filter = subpel_filters[mx - 1]; 1865 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP; 1866 int x, y; 1867 1868 for (y = 0; y < h; y++) { 1869 for (x = 0; x < 16; x++) 1870 dst[x] = FILTER_6TAP(src, filter, 1); 1871 dst += dststride; 1872 src += srcstride; 1873 } 1874#endif 1875} 1876 1877void ff_put_vp8_epel8_h6_mmi(uint8_t *dst, ptrdiff_t 
dststride, uint8_t *src, 1878 ptrdiff_t srcstride, int h, int mx, int my) 1879{ 1880#if 1 1881 const uint64_t *filter = fourtap_subpel_filters[mx - 1]; 1882 double ftmp[9]; 1883 uint32_t tmp[1]; 1884 union av_intfloat64 filter0; 1885 union av_intfloat64 filter1; 1886 union av_intfloat64 filter2; 1887 union av_intfloat64 filter3; 1888 union av_intfloat64 filter4; 1889 union av_intfloat64 filter5; 1890 DECLARE_VAR_ALL64; 1891 filter0.i = filter[0]; 1892 filter1.i = filter[1]; 1893 filter2.i = filter[2]; 1894 filter3.i = filter[3]; 1895 filter4.i = filter[4]; 1896 filter5.i = filter[5]; 1897 1898 /* 1899 dst[0] = cm[(filter[2]*src[0] - filter[1]*src[-1] + filter[0]*src[-2] + filter[3]*src[1] - filter[4]*src[2] + filter[5]*src[ 3] + 64) >> 7]; 1900 dst[1] = cm[(filter[2]*src[1] - filter[1]*src[ 0] + filter[0]*src[-1] + filter[3]*src[2] - filter[4]*src[3] + filter[5]*src[ 4] + 64) >> 7]; 1901 dst[2] = cm[(filter[2]*src[2] - filter[1]*src[ 1] + filter[0]*src[ 0] + filter[3]*src[3] - filter[4]*src[4] + filter[5]*src[ 5] + 64) >> 7]; 1902 dst[3] = cm[(filter[2]*src[3] - filter[1]*src[ 2] + filter[0]*src[ 1] + filter[3]*src[4] - filter[4]*src[5] + filter[5]*src[ 6] + 64) >> 7]; 1903 dst[4] = cm[(filter[2]*src[4] - filter[1]*src[ 3] + filter[0]*src[ 2] + filter[3]*src[5] - filter[4]*src[6] + filter[5]*src[ 7] + 64) >> 7]; 1904 dst[5] = cm[(filter[2]*src[5] - filter[1]*src[ 4] + filter[0]*src[ 3] + filter[3]*src[6] - filter[4]*src[7] + filter[5]*src[ 8] + 64) >> 7]; 1905 dst[6] = cm[(filter[2]*src[6] - filter[1]*src[ 5] + filter[0]*src[ 4] + filter[3]*src[7] - filter[4]*src[8] + filter[5]*src[ 9] + 64) >> 7]; 1906 dst[7] = cm[(filter[2]*src[7] - filter[1]*src[ 6] + filter[0]*src[ 5] + filter[3]*src[8] - filter[4]*src[9] + filter[5]*src[10] + 64) >> 7]; 1907 */ 1908 __asm__ volatile ( 1909 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 1910 "li %[tmp0], 0x07 \n\t" 1911 "mtc1 %[tmp0], %[ftmp4] \n\t" 1912 1913 "1: \n\t" 1914 PUT_VP8_EPEL8_H6_MMI(%[src], %[dst]) 1915 1916 "addiu %[h], %[h], -0x01 \n\t" 1917 PTR_ADDU "%[src], %[src], %[srcstride] \n\t" 1918 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t" 1919 "bnez %[h], 1b \n\t" 1920 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), 1921 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), 1922 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), 1923 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), 1924 [ftmp8]"=&f"(ftmp[8]), 1925 [tmp0]"=&r"(tmp[0]), 1926 RESTRICT_ASM_ALL64 1927 [h]"+&r"(h), 1928 [dst]"+&r"(dst), [src]"+&r"(src) 1929 : [ff_pw_64]"f"(ff_pw_64.f), 1930 [srcstride]"r"((mips_reg)srcstride), 1931 [dststride]"r"((mips_reg)dststride), 1932 [filter0]"f"(filter0.f), [filter1]"f"(filter1.f), 1933 [filter2]"f"(filter2.f), [filter3]"f"(filter3.f), 1934 [filter4]"f"(filter4.f), [filter5]"f"(filter5.f) 1935 : "memory" 1936 ); 1937#else 1938 const uint8_t *filter = subpel_filters[mx - 1]; 1939 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP; 1940 int x, y; 1941 1942 for (y = 0; y < h; y++) { 1943 for (x = 0; x < 8; x++) 1944 dst[x] = FILTER_6TAP(src, filter, 1); 1945 dst += dststride; 1946 src += srcstride; 1947 } 1948#endif 1949} 1950 1951void ff_put_vp8_epel4_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, 1952 ptrdiff_t srcstride, int h, int mx, int my) 1953{ 1954#if 1 1955 const uint64_t *filter = fourtap_subpel_filters[mx - 1]; 1956 double ftmp[6]; 1957 uint32_t tmp[1]; 1958 union av_intfloat64 filter0; 1959 union av_intfloat64 filter1; 1960 union av_intfloat64 filter2; 1961 union av_intfloat64 filter3; 1962 union av_intfloat64 filter4; 1963 union av_intfloat64 
filter5; 1964 DECLARE_VAR_LOW32; 1965 filter0.i = filter[0]; 1966 filter1.i = filter[1]; 1967 filter2.i = filter[2]; 1968 filter3.i = filter[3]; 1969 filter4.i = filter[4]; 1970 filter5.i = filter[5]; 1971 1972 /* 1973 dst[0] = cm[(filter[2]*src[0] - filter[1]*src[-1] + filter[0]*src[-2] + filter[3]*src[1] - filter[4]*src[2] + filter[5]*src[ 3] + 64) >> 7]; 1974 dst[1] = cm[(filter[2]*src[1] - filter[1]*src[ 0] + filter[0]*src[-1] + filter[3]*src[2] - filter[4]*src[3] + filter[5]*src[ 4] + 64) >> 7]; 1975 dst[2] = cm[(filter[2]*src[2] - filter[1]*src[ 1] + filter[0]*src[ 0] + filter[3]*src[3] - filter[4]*src[4] + filter[5]*src[ 5] + 64) >> 7]; 1976 dst[3] = cm[(filter[2]*src[3] - filter[1]*src[ 2] + filter[0]*src[ 1] + filter[3]*src[4] - filter[4]*src[5] + filter[5]*src[ 6] + 64) >> 7]; 1977 */ 1978 __asm__ volatile ( 1979 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 1980 "li %[tmp0], 0x07 \n\t" 1981 "mtc1 %[tmp0], %[ftmp4] \n\t" 1982 1983 "1: \n\t" 1984 PUT_VP8_EPEL4_H6_MMI(%[src], %[dst]) 1985 1986 "addiu %[h], %[h], -0x01 \n\t" 1987 PTR_ADDU "%[src], %[src], %[srcstride] \n\t" 1988 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t" 1989 "bnez %[h], 1b \n\t" 1990 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), 1991 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), 1992 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), 1993 [tmp0]"=&r"(tmp[0]), 1994 RESTRICT_ASM_LOW32 1995 [h]"+&r"(h), 1996 [dst]"+&r"(dst), [src]"+&r"(src) 1997 : [ff_pw_64]"f"(ff_pw_64.f), 1998 [srcstride]"r"((mips_reg)srcstride), 1999 [dststride]"r"((mips_reg)dststride), 2000 [filter0]"f"(filter0.f), [filter1]"f"(filter1.f), 2001 [filter2]"f"(filter2.f), [filter3]"f"(filter3.f), 2002 [filter4]"f"(filter4.f), [filter5]"f"(filter5.f) 2003 : "memory" 2004 ); 2005#else 2006 const uint8_t *filter = subpel_filters[mx - 1]; 2007 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP; 2008 int x, y; 2009 2010 for (y = 0; y < h; y++) { 2011 for (x = 0; x < 4; x++) 2012 dst[x] = FILTER_6TAP(src, filter, 1); 2013 dst += dststride; 2014 src += srcstride; 2015 } 2016#endif 2017} 2018 2019void ff_put_vp8_epel16_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, 2020 ptrdiff_t srcstride, int h, int mx, int my) 2021{ 2022#if 1 2023 const uint64_t *filter = fourtap_subpel_filters[my - 1]; 2024 double ftmp[9]; 2025 uint32_t tmp[1]; 2026 mips_reg src0, src1, dst0; 2027 union av_intfloat64 filter1; 2028 union av_intfloat64 filter2; 2029 union av_intfloat64 filter3; 2030 union av_intfloat64 filter4; 2031 DECLARE_VAR_ALL64; 2032 filter1.i = filter[1]; 2033 filter2.i = filter[2]; 2034 filter3.i = filter[3]; 2035 filter4.i = filter[4]; 2036 2037 /* 2038 dst[0] = cm[(filter[2] * src[0] - filter[1] * src[ -srcstride] + filter[3] * src[ srcstride] - filter[4] * src[ 2*srcstride] + 64) >> 7]; 2039 dst[1] = cm[(filter[2] * src[1] - filter[1] * src[1-srcstride] + filter[3] * src[1+srcstride] - filter[4] * src[1+2*srcstride] + 64) >> 7]; 2040 dst[2] = cm[(filter[2] * src[2] - filter[1] * src[2-srcstride] + filter[3] * src[2+srcstride] - filter[4] * src[2+2*srcstride] + 64) >> 7]; 2041 dst[3] = cm[(filter[2] * src[3] - filter[1] * src[3-srcstride] + filter[3] * src[3+srcstride] - filter[4] * src[3+2*srcstride] + 64) >> 7]; 2042 dst[4] = cm[(filter[2] * src[4] - filter[1] * src[4-srcstride] + filter[3] * src[4+srcstride] - filter[4] * src[4+2*srcstride] + 64) >> 7]; 2043 dst[5] = cm[(filter[2] * src[5] - filter[1] * src[5-srcstride] + filter[3] * src[5+srcstride] - filter[4] * src[5+2*srcstride] + 64) >> 7]; 2044 dst[6] = cm[(filter[2] * src[6] - filter[1] * 
src[6-srcstride] + filter[3] * src[6+srcstride] - filter[4] * src[6+2*srcstride] + 64) >> 7]; 2045 dst[7] = cm[(filter[2] * src[7] - filter[1] * src[7-srcstride] + filter[3] * src[7+srcstride] - filter[4] * src[7+2*srcstride] + 64) >> 7]; 2046 2047 dst[ 8] = cm[(filter[2] * src[ 8] - filter[1] * src[ 8-srcstride] + filter[3] * src[ 8+srcstride] - filter[4] * src[ 8+2*srcstride] + 64) >> 7]; 2048 dst[ 9] = cm[(filter[2] * src[ 9] - filter[1] * src[ 9-srcstride] + filter[3] * src[ 9+srcstride] - filter[4] * src[ 9+2*srcstride] + 64) >> 7]; 2049 dst[10] = cm[(filter[2] * src[10] - filter[1] * src[10-srcstride] + filter[3] * src[10+srcstride] - filter[4] * src[10+2*srcstride] + 64) >> 7]; 2050 dst[11] = cm[(filter[2] * src[11] - filter[1] * src[11-srcstride] + filter[3] * src[11+srcstride] - filter[4] * src[11+2*srcstride] + 64) >> 7]; 2051 dst[12] = cm[(filter[2] * src[12] - filter[1] * src[12-srcstride] + filter[3] * src[12+srcstride] - filter[4] * src[12+2*srcstride] + 64) >> 7]; 2052 dst[13] = cm[(filter[2] * src[13] - filter[1] * src[13-srcstride] + filter[3] * src[13+srcstride] - filter[4] * src[13+2*srcstride] + 64) >> 7]; 2053 dst[14] = cm[(filter[2] * src[14] - filter[1] * src[14-srcstride] + filter[3] * src[14+srcstride] - filter[4] * src[14+2*srcstride] + 64) >> 7]; 2054 dst[15] = cm[(filter[2] * src[15] - filter[1] * src[15-srcstride] + filter[3] * src[15+srcstride] - filter[4] * src[15+2*srcstride] + 64) >> 7]; 2055 */ 2056 __asm__ volatile ( 2057 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 2058 "li %[tmp0], 0x07 \n\t" 2059 "mtc1 %[tmp0], %[ftmp4] \n\t" 2060 2061 "1: \n\t" 2062 // 0 - 7 2063 PUT_VP8_EPEL8_V4_MMI(%[src], %[src1], %[dst], %[srcstride]) 2064 PTR_ADDIU "%[src0], %[src], 0x08 \n\t" 2065 PTR_ADDIU "%[dst0], %[dst], 0x08 \n\t" 2066 // 8 - 15 2067 PUT_VP8_EPEL8_V4_MMI(%[src0], %[src1], %[dst0], %[srcstride]) 2068 2069 "addiu %[h], %[h], -0x01 \n\t" 2070 PTR_ADDU "%[src], %[src], %[srcstride] \n\t" 2071 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t" 2072 "bnez %[h], 1b \n\t" 2073 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), 2074 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), 2075 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), 2076 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), 2077 [ftmp8]"=&f"(ftmp[8]), 2078 [tmp0]"=&r"(tmp[0]), 2079 RESTRICT_ASM_ALL64 2080 [src0]"=&r"(src0), [dst0]"=&r"(dst0), 2081 [src1]"=&r"(src1), 2082 [h]"+&r"(h), 2083 [dst]"+&r"(dst), [src]"+&r"(src) 2084 : [ff_pw_64]"f"(ff_pw_64.f), 2085 [srcstride]"r"((mips_reg)srcstride), 2086 [dststride]"r"((mips_reg)dststride), 2087 [filter1]"f"(filter1.f), [filter2]"f"(filter2.f), 2088 [filter3]"f"(filter3.f), [filter4]"f"(filter4.f) 2089 : "memory" 2090 ); 2091#else 2092 const uint8_t *filter = subpel_filters[my - 1]; 2093 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP; 2094 int x, y; 2095 2096 for (y = 0; y < h; y++) { 2097 for (x = 0; x < 16; x++) 2098 dst[x] = FILTER_4TAP(src, filter, srcstride); 2099 dst += dststride; 2100 src += srcstride; 2101 } 2102#endif 2103} 2104 2105void ff_put_vp8_epel8_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, 2106 ptrdiff_t srcstride, int h, int mx, int my) 2107{ 2108#if 1 2109 const uint64_t *filter = fourtap_subpel_filters[my - 1]; 2110 double ftmp[9]; 2111 uint32_t tmp[1]; 2112 mips_reg src1; 2113 union av_intfloat64 filter1; 2114 union av_intfloat64 filter2; 2115 union av_intfloat64 filter3; 2116 union av_intfloat64 filter4; 2117 DECLARE_VAR_ALL64; 2118 filter1.i = filter[1]; 2119 filter2.i = filter[2]; 2120 filter3.i = filter[3]; 2121 filter4.i = filter[4]; 2122
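    /* Note: filter1..filter4 are the four taps used by the 4-tap vertical case;
       the reference formula in the comment below shows how they weight the rows
       at src - srcstride, src, src + srcstride and src + 2*srcstride, with +64
       rounding and a 7-bit shift. */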
2123 /* 2124 dst[0] = cm[(filter[2] * src[0] - filter[1] * src[ -srcstride] + filter[3] * src[ srcstride] - filter[4] * src[ 2*srcstride] + 64) >> 7]; 2125 dst[1] = cm[(filter[2] * src[1] - filter[1] * src[1-srcstride] + filter[3] * src[1+srcstride] - filter[4] * src[1+2*srcstride] + 64) >> 7]; 2126 dst[2] = cm[(filter[2] * src[2] - filter[1] * src[2-srcstride] + filter[3] * src[2+srcstride] - filter[4] * src[2+2*srcstride] + 64) >> 7]; 2127 dst[3] = cm[(filter[2] * src[3] - filter[1] * src[3-srcstride] + filter[3] * src[3+srcstride] - filter[4] * src[3+2*srcstride] + 64) >> 7]; 2128 dst[4] = cm[(filter[2] * src[4] - filter[1] * src[4-srcstride] + filter[3] * src[4+srcstride] - filter[4] * src[4+2*srcstride] + 64) >> 7]; 2129 dst[5] = cm[(filter[2] * src[5] - filter[1] * src[5-srcstride] + filter[3] * src[5+srcstride] - filter[4] * src[5+2*srcstride] + 64) >> 7]; 2130 dst[6] = cm[(filter[2] * src[6] - filter[1] * src[6-srcstride] + filter[3] * src[6+srcstride] - filter[4] * src[6+2*srcstride] + 64) >> 7]; 2131 dst[7] = cm[(filter[2] * src[7] - filter[1] * src[7-srcstride] + filter[3] * src[7+srcstride] - filter[4] * src[7+2*srcstride] + 64) >> 7]; 2132 */ 2133 __asm__ volatile ( 2134 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 2135 "li %[tmp0], 0x07 \n\t" 2136 "mtc1 %[tmp0], %[ftmp4] \n\t" 2137 2138 "1: \n\t" 2139 PUT_VP8_EPEL8_V4_MMI(%[src], %[src1], %[dst], %[srcstride]) 2140 2141 "addiu %[h], %[h], -0x01 \n\t" 2142 PTR_ADDU "%[src], %[src], %[srcstride] \n\t" 2143 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t" 2144 "bnez %[h], 1b \n\t" 2145 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), 2146 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), 2147 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), 2148 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), 2149 [ftmp8]"=&f"(ftmp[8]), 2150 [tmp0]"=&r"(tmp[0]), 2151 RESTRICT_ASM_ALL64 2152 [src1]"=&r"(src1), 2153 [h]"+&r"(h), 2154 [dst]"+&r"(dst), [src]"+&r"(src) 2155 : [ff_pw_64]"f"(ff_pw_64.f), 2156 [srcstride]"r"((mips_reg)srcstride), 2157 [dststride]"r"((mips_reg)dststride), 2158 [filter1]"f"(filter1.f), [filter2]"f"(filter2.f), 2159 [filter3]"f"(filter3.f), [filter4]"f"(filter4.f) 2160 : "memory" 2161 ); 2162#else 2163 const uint8_t *filter = subpel_filters[my - 1]; 2164 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP; 2165 int x, y; 2166 2167 for (y = 0; y < h; y++) { 2168 for (x = 0; x < 8; x++) 2169 dst[x] = FILTER_4TAP(src, filter, srcstride); 2170 dst += dststride; 2171 src += srcstride; 2172 } 2173#endif 2174} 2175 2176void ff_put_vp8_epel4_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, 2177 ptrdiff_t srcstride, int h, int mx, int my) 2178{ 2179#if 1 2180 const uint64_t *filter = fourtap_subpel_filters[my - 1]; 2181 double ftmp[6]; 2182 uint32_t tmp[1]; 2183 mips_reg src1; 2184 union av_intfloat64 filter1; 2185 union av_intfloat64 filter2; 2186 union av_intfloat64 filter3; 2187 union av_intfloat64 filter4; 2188 DECLARE_VAR_LOW32; 2189 filter1.i = filter[1]; 2190 filter2.i = filter[2]; 2191 filter3.i = filter[3]; 2192 filter4.i = filter[4]; 2193 2194 /* 2195 dst[0] = cm[(filter[2] * src[0] - filter[1] * src[ -srcstride] + filter[3] * src[ srcstride] - filter[4] * src[ 2*srcstride] + 64) >> 7]; 2196 dst[1] = cm[(filter[2] * src[1] - filter[1] * src[1-srcstride] + filter[3] * src[1+srcstride] - filter[4] * src[1+2*srcstride] + 64) >> 7]; 2197 dst[2] = cm[(filter[2] * src[2] - filter[1] * src[2-srcstride] + filter[3] * src[2+srcstride] - filter[4] * src[2+2*srcstride] + 64) >> 7]; 2198 dst[3] = cm[(filter[2] * src[3] - filter[1] * src[3-srcstride] 
+ filter[3] * src[3+srcstride] - filter[4] * src[3+2*srcstride] + 64) >> 7]; 2199 */ 2200 __asm__ volatile ( 2201 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 2202 "li %[tmp0], 0x07 \n\t" 2203 "mtc1 %[tmp0], %[ftmp4] \n\t" 2204 2205 "1: \n\t" 2206 PUT_VP8_EPEL4_V4_MMI(%[src], %[src1], %[dst], %[srcstride]) 2207 2208 "addiu %[h], %[h], -0x01 \n\t" 2209 PTR_ADDU "%[src], %[src], %[srcstride] \n\t" 2210 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t" 2211 "bnez %[h], 1b \n\t" 2212 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), 2213 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), 2214 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), 2215 [tmp0]"=&r"(tmp[0]), 2216 RESTRICT_ASM_LOW32 2217 [src1]"=&r"(src1), 2218 [h]"+&r"(h), 2219 [dst]"+&r"(dst), [src]"+&r"(src) 2220 : [ff_pw_64]"f"(ff_pw_64.f), 2221 [srcstride]"r"((mips_reg)srcstride), 2222 [dststride]"r"((mips_reg)dststride), 2223 [filter1]"f"(filter1.f), [filter2]"f"(filter2.f), 2224 [filter3]"f"(filter3.f), [filter4]"f"(filter4.f) 2225 : "memory" 2226 ); 2227#else 2228 const uint8_t *filter = subpel_filters[my - 1]; 2229 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP; 2230 int x, y; 2231 2232 for (y = 0; y < h; y++) { 2233 for (x = 0; x < 4; x++) 2234 dst[x] = FILTER_4TAP(src, filter, srcstride); 2235 dst += dststride; 2236 src += srcstride; 2237 } 2238#endif 2239} 2240 2241void ff_put_vp8_epel16_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, 2242 ptrdiff_t srcstride, int h, int mx, int my) 2243{ 2244#if 1 2245 const uint64_t *filter = fourtap_subpel_filters[my - 1]; 2246 double ftmp[9]; 2247 uint32_t tmp[1]; 2248 mips_reg src0, src1, dst0; 2249 union av_intfloat64 filter0; 2250 union av_intfloat64 filter1; 2251 union av_intfloat64 filter2; 2252 union av_intfloat64 filter3; 2253 union av_intfloat64 filter4; 2254 union av_intfloat64 filter5; 2255 DECLARE_VAR_ALL64; 2256 filter0.i = filter[0]; 2257 filter1.i = filter[1]; 2258 filter2.i = filter[2]; 2259 filter3.i = filter[3]; 2260 filter4.i = filter[4]; 2261 filter5.i = filter[5]; 2262 2263 /* 2264 dst[0] = cm[(filter[2]*src[0] - filter[1]*src[0-srcstride] + filter[0]*src[0-2*srcstride] + filter[3]*src[0+srcstride] - filter[4]*src[0+2*srcstride] + filter[5]*src[0+3*srcstride] + 64) >> 7]; 2265 dst[1] = cm[(filter[2]*src[1] - filter[1]*src[1-srcstride] + filter[0]*src[1-2*srcstride] + filter[3]*src[1+srcstride] - filter[4]*src[1+2*srcstride] + filter[5]*src[1+3*srcstride] + 64) >> 7]; 2266 dst[2] = cm[(filter[2]*src[2] - filter[1]*src[2-srcstride] + filter[0]*src[2-2*srcstride] + filter[3]*src[2+srcstride] - filter[4]*src[2+2*srcstride] + filter[5]*src[2+3*srcstride] + 64) >> 7]; 2267 dst[3] = cm[(filter[2]*src[3] - filter[1]*src[3-srcstride] + filter[0]*src[3-2*srcstride] + filter[3]*src[3+srcstride] - filter[4]*src[3+2*srcstride] + filter[5]*src[3+3*srcstride] + 64) >> 7]; 2268 dst[4] = cm[(filter[2]*src[4] - filter[1]*src[4-srcstride] + filter[0]*src[4-2*srcstride] + filter[3]*src[4+srcstride] - filter[4]*src[4+2*srcstride] + filter[5]*src[4+3*srcstride] + 64) >> 7]; 2269 dst[5] = cm[(filter[2]*src[5] - filter[1]*src[5-srcstride] + filter[0]*src[5-2*srcstride] + filter[3]*src[5+srcstride] - filter[4]*src[5+2*srcstride] + filter[5]*src[5+3*srcstride] + 64) >> 7]; 2270 dst[6] = cm[(filter[2]*src[6] - filter[1]*src[6-srcstride] + filter[0]*src[6-2*srcstride] + filter[3]*src[6+srcstride] - filter[4]*src[6+2*srcstride] + filter[5]*src[6+3*srcstride] + 64) >> 7]; 2271 dst[7] = cm[(filter[2]*src[7] - filter[1]*src[7-srcstride] + filter[0]*src[7-2*srcstride] + filter[3]*src[7+srcstride] - 
filter[4]*src[7+2*srcstride] + filter[5]*src[7+3*srcstride] + 64) >> 7]; 2272 2273 dst[ 8] = cm[(filter[2]*src[ 8] - filter[1]*src[ 8-srcstride] + filter[0]*src[ 8-2*srcstride] + filter[3]*src[ 8+srcstride] - filter[4]*src[ 8+2*srcstride] + filter[5]*src[ 8+3*srcstride] + 64) >> 7]; 2274 dst[ 9] = cm[(filter[2]*src[ 9] - filter[1]*src[ 9-srcstride] + filter[0]*src[ 9-2*srcstride] + filter[3]*src[ 9+srcstride] - filter[4]*src[ 9+2*srcstride] + filter[5]*src[ 9+3*srcstride] + 64) >> 7]; 2275 dst[10] = cm[(filter[2]*src[10] - filter[1]*src[10-srcstride] + filter[0]*src[10-2*srcstride] + filter[3]*src[10+srcstride] - filter[4]*src[10+2*srcstride] + filter[5]*src[10+3*srcstride] + 64) >> 7]; 2276 dst[11] = cm[(filter[2]*src[11] - filter[1]*src[11-srcstride] + filter[0]*src[11-2*srcstride] + filter[3]*src[11+srcstride] - filter[4]*src[11+2*srcstride] + filter[5]*src[11+3*srcstride] + 64) >> 7]; 2277 dst[12] = cm[(filter[2]*src[12] - filter[1]*src[12-srcstride] + filter[0]*src[12-2*srcstride] + filter[3]*src[12+srcstride] - filter[4]*src[12+2*srcstride] + filter[5]*src[12+3*srcstride] + 64) >> 7]; 2278 dst[13] = cm[(filter[2]*src[13] - filter[1]*src[13-srcstride] + filter[0]*src[13-2*srcstride] + filter[3]*src[13+srcstride] - filter[4]*src[13+2*srcstride] + filter[5]*src[13+3*srcstride] + 64) >> 7]; 2279 dst[14] = cm[(filter[2]*src[14] - filter[1]*src[14-srcstride] + filter[0]*src[14-2*srcstride] + filter[3]*src[14+srcstride] - filter[4]*src[14+2*srcstride] + filter[5]*src[14+3*srcstride] + 64) >> 7]; 2280 dst[15] = cm[(filter[2]*src[15] - filter[1]*src[15-srcstride] + filter[0]*src[15-2*srcstride] + filter[3]*src[15+srcstride] - filter[4]*src[15+2*srcstride] + filter[5]*src[15+3*srcstride] + 64) >> 7]; 2281 */ 2282 __asm__ volatile ( 2283 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 2284 "li %[tmp0], 0x07 \n\t" 2285 "mtc1 %[tmp0], %[ftmp4] \n\t" 2286 2287 "1: \n\t" 2288 // 0 - 7 2289 PUT_VP8_EPEL8_V6_MMI(%[src], %[src1], %[dst], %[srcstride]) 2290 PTR_ADDIU "%[src0], %[src], 0x08 \n\t" 2291 PTR_ADDIU "%[dst0], %[dst], 0x08 \n\t" 2292 // 8 - 15 2293 PUT_VP8_EPEL8_V6_MMI(%[src0], %[src1], %[dst0], %[srcstride]) 2294 2295 "addiu %[h], %[h], -0x01 \n\t" 2296 PTR_ADDU "%[src], %[src], %[srcstride] \n\t" 2297 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t" 2298 "bnez %[h], 1b \n\t" 2299 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), 2300 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), 2301 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), 2302 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), 2303 [ftmp8]"=&f"(ftmp[8]), 2304 [tmp0]"=&r"(tmp[0]), 2305 RESTRICT_ASM_ALL64 2306 [src0]"=&r"(src0), [dst0]"=&r"(dst0), 2307 [src1]"=&r"(src1), 2308 [h]"+&r"(h), 2309 [dst]"+&r"(dst), [src]"+&r"(src) 2310 : [ff_pw_64]"f"(ff_pw_64.f), 2311 [srcstride]"r"((mips_reg)srcstride), 2312 [dststride]"r"((mips_reg)dststride), 2313 [filter0]"f"(filter0.f), [filter1]"f"(filter1.f), 2314 [filter2]"f"(filter2.f), [filter3]"f"(filter3.f), 2315 [filter4]"f"(filter4.f), [filter5]"f"(filter5.f) 2316 : "memory" 2317 ); 2318#else 2319 const uint8_t *filter = subpel_filters[my - 1]; 2320 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP; 2321 int x, y; 2322 2323 for (y = 0; y < h; y++) { 2324 for (x = 0; x < 16; x++) 2325 dst[x] = FILTER_6TAP(src, filter, srcstride); 2326 dst += dststride; 2327 src += srcstride; 2328 } 2329#endif 2330} 2331 2332void ff_put_vp8_epel8_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, 2333 ptrdiff_t srcstride, int h, int mx, int my) 2334{ 2335#if 1 2336 const uint64_t *filter = fourtap_subpel_filters[my - 1]; 2337 
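    /* 6-tap vertical case: all six taps of the selected filter row are used,
       covering the rows from src - 2*srcstride down to src + 3*srcstride
       (see the reference formula in the comment below). */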
double ftmp[9]; 2338 uint32_t tmp[1]; 2339 mips_reg src1; 2340 union av_intfloat64 filter0; 2341 union av_intfloat64 filter1; 2342 union av_intfloat64 filter2; 2343 union av_intfloat64 filter3; 2344 union av_intfloat64 filter4; 2345 union av_intfloat64 filter5; 2346 DECLARE_VAR_ALL64; 2347 filter0.i = filter[0]; 2348 filter1.i = filter[1]; 2349 filter2.i = filter[2]; 2350 filter3.i = filter[3]; 2351 filter4.i = filter[4]; 2352 filter5.i = filter[5]; 2353 2354 /* 2355 dst[0] = cm[(filter[2]*src[0] - filter[1]*src[0-srcstride] + filter[0]*src[0-2*srcstride] + filter[3]*src[0+srcstride] - filter[4]*src[0+2*srcstride] + filter[5]*src[0+3*srcstride] + 64) >> 7]; 2356 dst[1] = cm[(filter[2]*src[1] - filter[1]*src[1-srcstride] + filter[0]*src[1-2*srcstride] + filter[3]*src[1+srcstride] - filter[4]*src[1+2*srcstride] + filter[5]*src[1+3*srcstride] + 64) >> 7]; 2357 dst[2] = cm[(filter[2]*src[2] - filter[1]*src[2-srcstride] + filter[0]*src[2-2*srcstride] + filter[3]*src[2+srcstride] - filter[4]*src[2+2*srcstride] + filter[5]*src[2+3*srcstride] + 64) >> 7]; 2358 dst[3] = cm[(filter[2]*src[3] - filter[1]*src[3-srcstride] + filter[0]*src[3-2*srcstride] + filter[3]*src[3+srcstride] - filter[4]*src[3+2*srcstride] + filter[5]*src[3+3*srcstride] + 64) >> 7]; 2359 dst[4] = cm[(filter[2]*src[4] - filter[1]*src[4-srcstride] + filter[0]*src[4-2*srcstride] + filter[3]*src[4+srcstride] - filter[4]*src[4+2*srcstride] + filter[5]*src[4+3*srcstride] + 64) >> 7]; 2360 dst[5] = cm[(filter[2]*src[5] - filter[1]*src[5-srcstride] + filter[0]*src[5-2*srcstride] + filter[3]*src[5+srcstride] - filter[4]*src[5+2*srcstride] + filter[5]*src[5+3*srcstride] + 64) >> 7]; 2361 dst[6] = cm[(filter[2]*src[6] - filter[1]*src[6-srcstride] + filter[0]*src[6-2*srcstride] + filter[3]*src[6+srcstride] - filter[4]*src[6+2*srcstride] + filter[5]*src[6+3*srcstride] + 64) >> 7]; 2362 dst[7] = cm[(filter[2]*src[7] - filter[1]*src[7-srcstride] + filter[0]*src[7-2*srcstride] + filter[3]*src[7+srcstride] - filter[4]*src[7+2*srcstride] + filter[5]*src[7+3*srcstride] + 64) >> 7]; 2363 */ 2364 __asm__ volatile ( 2365 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 2366 "li %[tmp0], 0x07 \n\t" 2367 "mtc1 %[tmp0], %[ftmp4] \n\t" 2368 2369 "1: \n\t" 2370 PUT_VP8_EPEL8_V6_MMI(%[src], %[src1], %[dst], %[srcstride]) 2371 2372 "addiu %[h], %[h], -0x01 \n\t" 2373 PTR_ADDU "%[src], %[src], %[srcstride] \n\t" 2374 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t" 2375 "bnez %[h], 1b \n\t" 2376 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), 2377 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), 2378 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), 2379 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), 2380 [ftmp8]"=&f"(ftmp[8]), 2381 [tmp0]"=&r"(tmp[0]), 2382 RESTRICT_ASM_ALL64 2383 [src1]"=&r"(src1), 2384 [h]"+&r"(h), 2385 [dst]"+&r"(dst), [src]"+&r"(src) 2386 : [ff_pw_64]"f"(ff_pw_64.f), 2387 [srcstride]"r"((mips_reg)srcstride), 2388 [dststride]"r"((mips_reg)dststride), 2389 [filter0]"f"(filter0.f), [filter1]"f"(filter1.f), 2390 [filter2]"f"(filter2.f), [filter3]"f"(filter3.f), 2391 [filter4]"f"(filter4.f), [filter5]"f"(filter5.f) 2392 : "memory" 2393 ); 2394#else 2395 const uint8_t *filter = subpel_filters[my - 1]; 2396 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP; 2397 int x, y; 2398 2399 for (y = 0; y < h; y++) { 2400 for (x = 0; x < 8; x++) 2401 dst[x] = FILTER_6TAP(src, filter, srcstride); 2402 dst += dststride; 2403 src += srcstride; 2404 } 2405#endif 2406} 2407 2408void ff_put_vp8_epel4_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, 2409 ptrdiff_t srcstride, 
int h, int mx, int my) 2410{ 2411#if 1 2412 const uint64_t *filter = fourtap_subpel_filters[my - 1]; 2413 double ftmp[6]; 2414 uint32_t tmp[1]; 2415 mips_reg src1; 2416 union av_intfloat64 filter0; 2417 union av_intfloat64 filter1; 2418 union av_intfloat64 filter2; 2419 union av_intfloat64 filter3; 2420 union av_intfloat64 filter4; 2421 union av_intfloat64 filter5; 2422 DECLARE_VAR_LOW32; 2423 filter0.i = filter[0]; 2424 filter1.i = filter[1]; 2425 filter2.i = filter[2]; 2426 filter3.i = filter[3]; 2427 filter4.i = filter[4]; 2428 filter5.i = filter[5]; 2429 2430 /* 2431 dst[0] = cm[(filter[2]*src[0] - filter[1]*src[0-srcstride] + filter[0]*src[0-2*srcstride] + filter[3]*src[0+srcstride] - filter[4]*src[0+2*srcstride] + filter[5]*src[0+3*srcstride] + 64) >> 7]; 2432 dst[1] = cm[(filter[2]*src[1] - filter[1]*src[1-srcstride] + filter[0]*src[1-2*srcstride] + filter[3]*src[1+srcstride] - filter[4]*src[1+2*srcstride] + filter[5]*src[1+3*srcstride] + 64) >> 7]; 2433 dst[2] = cm[(filter[2]*src[2] - filter[1]*src[2-srcstride] + filter[0]*src[2-2*srcstride] + filter[3]*src[2+srcstride] - filter[4]*src[2+2*srcstride] + filter[5]*src[2+3*srcstride] + 64) >> 7]; 2434 dst[3] = cm[(filter[2]*src[3] - filter[1]*src[3-srcstride] + filter[0]*src[3-2*srcstride] + filter[3]*src[3+srcstride] - filter[4]*src[3+2*srcstride] + filter[5]*src[3+3*srcstride] + 64) >> 7]; 2435 */ 2436 __asm__ volatile ( 2437 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 2438 "li %[tmp0], 0x07 \n\t" 2439 "mtc1 %[tmp0], %[ftmp4] \n\t" 2440 2441 "1: \n\t" 2442 PUT_VP8_EPEL4_V6_MMI(%[src], %[src1], %[dst], %[srcstride]) 2443 2444 "addiu %[h], %[h], -0x01 \n\t" 2445 PTR_ADDU "%[src], %[src], %[srcstride] \n\t" 2446 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t" 2447 "bnez %[h], 1b \n\t" 2448 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), 2449 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), 2450 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), 2451 [tmp0]"=&r"(tmp[0]), 2452 RESTRICT_ASM_LOW32 2453 [src1]"=&r"(src1), 2454 [h]"+&r"(h), 2455 [dst]"+&r"(dst), [src]"+&r"(src) 2456 : [ff_pw_64]"f"(ff_pw_64.f), 2457 [srcstride]"r"((mips_reg)srcstride), 2458 [dststride]"r"((mips_reg)dststride), 2459 [filter0]"f"(filter0.f), [filter1]"f"(filter1.f), 2460 [filter2]"f"(filter2.f), [filter3]"f"(filter3.f), 2461 [filter4]"f"(filter4.f), [filter5]"f"(filter5.f) 2462 : "memory" 2463 ); 2464#else 2465 const uint8_t *filter = subpel_filters[my - 1]; 2466 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP; 2467 int x, y; 2468 2469 for (y = 0; y < h; y++) { 2470 for (x = 0; x < 4; x++) 2471 dst[x] = FILTER_6TAP(src, filter, srcstride); 2472 dst += dststride; 2473 src += srcstride; 2474 } 2475#endif 2476} 2477 2478void ff_put_vp8_epel16_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, 2479 ptrdiff_t srcstride, int h, int mx, int my) 2480{ 2481#if 1 2482 DECLARE_ALIGNED(8, uint8_t, tmp_array[560]); 2483 uint8_t *tmp = tmp_array; 2484 2485 src -= srcstride; 2486 ff_put_vp8_epel16_h4_mmi(tmp, 16, src, srcstride, h + 3, mx, my); 2487 tmp = tmp_array + 16; 2488 ff_put_vp8_epel16_v4_mmi(dst, dststride, tmp, 16, h, mx, my); 2489#else 2490 const uint8_t *filter = subpel_filters[mx - 1]; 2491 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP; 2492 int x, y; 2493 uint8_t tmp_array[560]; 2494 uint8_t *tmp = tmp_array; 2495 2496 src -= srcstride; 2497 2498 for (y = 0; y < h + 3; y++) { 2499 for (x = 0; x < 16; x++) 2500 tmp[x] = FILTER_4TAP(src, filter, 1); 2501 tmp += 16; 2502 src += srcstride; 2503 } 2504 2505 tmp = tmp_array + 16; 2506 filter = subpel_filters[my - 1]; 2507 
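    /* Second pass: 4-tap vertical filter over the 16-wide intermediate rows.
       tmp_array + 16 skips the extra top row produced by the first pass,
       which was needed because src was rewound by one line before the
       horizontal filtering. */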
2508 for (y = 0; y < h; y++) { 2509 for (x = 0; x < 16; x++) 2510 dst[x] = FILTER_4TAP(tmp, filter, 16); 2511 dst += dststride; 2512 tmp += 16; 2513 } 2514#endif 2515} 2516 2517void ff_put_vp8_epel8_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, 2518 ptrdiff_t srcstride, int h, int mx, int my) 2519{ 2520#if 1 2521 DECLARE_ALIGNED(8, uint8_t, tmp_array[152]); 2522 uint8_t *tmp = tmp_array; 2523 2524 src -= srcstride; 2525 ff_put_vp8_epel8_h4_mmi(tmp, 8, src, srcstride, h + 3, mx, my); 2526 tmp = tmp_array + 8; 2527 ff_put_vp8_epel8_v4_mmi(dst, dststride, tmp, 8, h, mx, my); 2528#else 2529 const uint8_t *filter = subpel_filters[mx - 1]; 2530 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP; 2531 int x, y; 2532 uint8_t tmp_array[152]; 2533 uint8_t *tmp = tmp_array; 2534 2535 src -= srcstride; 2536 2537 for (y = 0; y < h + 3; y++) { 2538 for (x = 0; x < 8; x++) 2539 tmp[x] = FILTER_4TAP(src, filter, 1); 2540 tmp += 8; 2541 src += srcstride; 2542 } 2543 2544 tmp = tmp_array + 8; 2545 filter = subpel_filters[my - 1]; 2546 2547 for (y = 0; y < h; y++) { 2548 for (x = 0; x < 8; x++) 2549 dst[x] = FILTER_4TAP(tmp, filter, 8); 2550 dst += dststride; 2551 tmp += 8; 2552 } 2553#endif 2554} 2555 2556void ff_put_vp8_epel4_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, 2557 ptrdiff_t srcstride, int h, int mx, int my) 2558{ 2559#if 1 2560 DECLARE_ALIGNED(4, uint8_t, tmp_array[44]); 2561 uint8_t *tmp = tmp_array; 2562 2563 src -= srcstride; 2564 ff_put_vp8_epel4_h4_mmi(tmp, 4, src, srcstride, h + 3, mx, my); 2565 tmp = tmp_array + 4; 2566 ff_put_vp8_epel4_v4_mmi(dst, dststride, tmp, 4, h, mx, my); 2567#else 2568 const uint8_t *filter = subpel_filters[mx - 1]; 2569 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP; 2570 int x, y; 2571 uint8_t tmp_array[44]; 2572 uint8_t *tmp = tmp_array; 2573 2574 src -= srcstride; 2575 2576 for (y = 0; y < h + 3; y++) { 2577 for (x = 0; x < 4; x++) 2578 tmp[x] = FILTER_4TAP(src, filter, 1); 2579 tmp += 4; 2580 src += srcstride; 2581 } 2582 tmp = tmp_array + 4; 2583 filter = subpel_filters[my - 1]; 2584 2585 for (y = 0; y < h; y++) { 2586 for (x = 0; x < 4; x++) 2587 dst[x] = FILTER_4TAP(tmp, filter, 4); 2588 dst += dststride; 2589 tmp += 4; 2590 } 2591#endif 2592} 2593 2594void ff_put_vp8_epel16_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, 2595 ptrdiff_t srcstride, int h, int mx, int my) 2596{ 2597#if 1 2598 DECLARE_ALIGNED(8, uint8_t, tmp_array[592]); 2599 uint8_t *tmp = tmp_array; 2600 2601 src -= 2 * srcstride; 2602 ff_put_vp8_epel16_h4_mmi(tmp, 16, src, srcstride, h + 5, mx, my); 2603 tmp = tmp_array + 32; 2604 ff_put_vp8_epel16_v6_mmi(dst, dststride, tmp, 16, h, mx, my); 2605#else 2606 const uint8_t *filter = subpel_filters[mx - 1]; 2607 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP; 2608 int x, y; 2609 uint8_t tmp_array[592]; 2610 uint8_t *tmp = tmp_array; 2611 2612 src -= 2 * srcstride; 2613 2614 for (y = 0; y < h + 5; y++) { 2615 for (x = 0; x < 16; x++) 2616 tmp[x] = FILTER_4TAP(src, filter, 1); 2617 tmp += 16; 2618 src += srcstride; 2619 } 2620 2621 tmp = tmp_array + 32; 2622 filter = subpel_filters[my - 1]; 2623 2624 for (y = 0; y < h; y++) { 2625 for (x = 0; x < 16; x++) 2626 dst[x] = FILTER_6TAP(tmp, filter, 16); 2627 dst += dststride; 2628 tmp += 16; 2629 } 2630#endif 2631} 2632 2633void ff_put_vp8_epel8_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, 2634 ptrdiff_t srcstride, int h, int mx, int my) 2635{ 2636#if 1 2637 DECLARE_ALIGNED(8, uint8_t, tmp_array[168]); 2638 uint8_t *tmp = tmp_array; 2639 2640 src -= 2 * 
srcstride; 2641 ff_put_vp8_epel8_h4_mmi(tmp, 8, src, srcstride, h + 5, mx, my); 2642 tmp = tmp_array + 16; 2643 ff_put_vp8_epel8_v6_mmi(dst, dststride, tmp, 8, h, mx, my); 2644#else 2645 const uint8_t *filter = subpel_filters[mx - 1]; 2646 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP; 2647 int x, y; 2648 uint8_t tmp_array[168]; 2649 uint8_t *tmp = tmp_array; 2650 2651 src -= 2 * srcstride; 2652 2653 for (y = 0; y < h + 5; y++) { 2654 for (x = 0; x < 8; x++) 2655 tmp[x] = FILTER_4TAP(src, filter, 1); 2656 tmp += 8; 2657 src += srcstride; 2658 } 2659 2660 tmp = tmp_array + 16; 2661 filter = subpel_filters[my - 1]; 2662 2663 for (y = 0; y < h; y++) { 2664 for (x = 0; x < 8; x++) 2665 dst[x] = FILTER_6TAP(tmp, filter, 8); 2666 dst += dststride; 2667 tmp += 8; 2668 } 2669#endif 2670} 2671 2672void ff_put_vp8_epel4_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, 2673 ptrdiff_t srcstride, int h, int mx, int my) 2674{ 2675#if 1 2676 DECLARE_ALIGNED(4, uint8_t, tmp_array[52]); 2677 uint8_t *tmp = tmp_array; 2678 2679 src -= 2 * srcstride; 2680 ff_put_vp8_epel4_h4_mmi(tmp, 4, src, srcstride, h + 5, mx, my); 2681 tmp = tmp_array + 8; 2682 ff_put_vp8_epel4_v6_mmi(dst, dststride, tmp, 4, h, mx, my); 2683#else 2684 const uint8_t *filter = subpel_filters[mx - 1]; 2685 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP; 2686 int x, y; 2687 uint8_t tmp_array[52]; 2688 uint8_t *tmp = tmp_array; 2689 2690 src -= 2 * srcstride; 2691 2692 for (y = 0; y < h + 5; y++) { 2693 for (x = 0; x < 4; x++) 2694 tmp[x] = FILTER_4TAP(src, filter, 1); 2695 tmp += 4; 2696 src += srcstride; 2697 } 2698 2699 tmp = tmp_array + 8; 2700 filter = subpel_filters[my - 1]; 2701 2702 for (y = 0; y < h; y++) { 2703 for (x = 0; x < 4; x++) 2704 dst[x] = FILTER_6TAP(tmp, filter, 4); 2705 dst += dststride; 2706 tmp += 4; 2707 } 2708#endif 2709} 2710 2711void ff_put_vp8_epel16_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, 2712 ptrdiff_t srcstride, int h, int mx, int my) 2713{ 2714#if 1 2715 DECLARE_ALIGNED(8, uint8_t, tmp_array[560]); 2716 uint8_t *tmp = tmp_array; 2717 2718 src -= srcstride; 2719 ff_put_vp8_epel16_h6_mmi(tmp, 16, src, srcstride, h + 3, mx, my); 2720 tmp = tmp_array + 16; 2721 ff_put_vp8_epel16_v4_mmi(dst, dststride, tmp, 16, h, mx, my); 2722#else 2723 const uint8_t *filter = subpel_filters[mx - 1]; 2724 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP; 2725 int x, y; 2726 uint8_t tmp_array[560]; 2727 uint8_t *tmp = tmp_array; 2728 2729 src -= srcstride; 2730 2731 for (y = 0; y < h + 3; y++) { 2732 for (x = 0; x < 16; x++) 2733 tmp[x] = FILTER_6TAP(src, filter, 1); 2734 tmp += 16; 2735 src += srcstride; 2736 } 2737 2738 tmp = tmp_array + 16; 2739 filter = subpel_filters[my - 1]; 2740 2741 for (y = 0; y < h; y++) { 2742 for (x = 0; x < 16; x++) 2743 dst[x] = FILTER_4TAP(tmp, filter, 16); 2744 dst += dststride; 2745 tmp += 16; 2746 } 2747#endif 2748} 2749 2750void ff_put_vp8_epel8_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, 2751 ptrdiff_t srcstride, int h, int mx, int my) 2752{ 2753#if 1 2754 DECLARE_ALIGNED(8, uint8_t, tmp_array[152]); 2755 uint8_t *tmp = tmp_array; 2756 2757 src -= srcstride; 2758 ff_put_vp8_epel8_h6_mmi(tmp, 8, src, srcstride, h + 3, mx, my); 2759 tmp = tmp_array + 8; 2760 ff_put_vp8_epel8_v4_mmi(dst, dststride, tmp, 8, h, mx, my); 2761#else 2762 const uint8_t *filter = subpel_filters[mx - 1]; 2763 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP; 2764 int x, y; 2765 uint8_t tmp_array[152]; 2766 uint8_t *tmp = tmp_array; 2767 2768 src -= srcstride; 2769 2770 for (y = 0; y < 
h + 3; y++) { 2771 for (x = 0; x < 8; x++) 2772 tmp[x] = FILTER_6TAP(src, filter, 1); 2773 tmp += 8; 2774 src += srcstride; 2775 } 2776 2777 tmp = tmp_array + 8; 2778 filter = subpel_filters[my - 1]; 2779 2780 for (y = 0; y < h; y++) { 2781 for (x = 0; x < 8; x++) 2782 dst[x] = FILTER_4TAP(tmp, filter, 8); 2783 dst += dststride; 2784 tmp += 8; 2785 } 2786#endif 2787} 2788 2789void ff_put_vp8_epel4_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, 2790 ptrdiff_t srcstride, int h, int mx, int my) 2791{ 2792#if 1 2793 DECLARE_ALIGNED(4, uint8_t, tmp_array[44]); 2794 uint8_t *tmp = tmp_array; 2795 2796 src -= srcstride; 2797 ff_put_vp8_epel4_h6_mmi(tmp, 4, src, srcstride, h + 3, mx, my); 2798 tmp = tmp_array + 4; 2799 ff_put_vp8_epel4_v4_mmi(dst, dststride, tmp, 4, h, mx, my); 2800#else 2801 const uint8_t *filter = subpel_filters[mx - 1]; 2802 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP; 2803 int x, y; 2804 uint8_t tmp_array[44]; 2805 uint8_t *tmp = tmp_array; 2806 2807 src -= srcstride; 2808 2809 for (y = 0; y < h + 3; y++) { 2810 for (x = 0; x < 4; x++) 2811 tmp[x] = FILTER_6TAP(src, filter, 1); 2812 tmp += 4; 2813 src += srcstride; 2814 } 2815 2816 tmp = tmp_array + 4; 2817 filter = subpel_filters[my - 1]; 2818 2819 for (y = 0; y < h; y++) { 2820 for (x = 0; x < 4; x++) 2821 dst[x] = FILTER_4TAP(tmp, filter, 4); 2822 dst += dststride; 2823 tmp += 4; 2824 } 2825#endif 2826} 2827 2828void ff_put_vp8_epel16_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, 2829 ptrdiff_t srcstride, int h, int mx, int my) 2830{ 2831#if 1 2832 DECLARE_ALIGNED(8, uint8_t, tmp_array[592]); 2833 uint8_t *tmp = tmp_array; 2834 2835 src -= 2 * srcstride; 2836 ff_put_vp8_epel16_h6_mmi(tmp, 16, src, srcstride, h + 5, mx, my); 2837 tmp = tmp_array + 32; 2838 ff_put_vp8_epel16_v6_mmi(dst, dststride, tmp, 16, h, mx, my); 2839#else 2840 const uint8_t *filter = subpel_filters[mx - 1]; 2841 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP; 2842 int x, y; 2843 uint8_t tmp_array[592]; 2844 uint8_t *tmp = tmp_array; 2845 2846 src -= 2 * srcstride; 2847 2848 for (y = 0; y < h + 5; y++) { 2849 for (x = 0; x < 16; x++) 2850 tmp[x] = FILTER_6TAP(src, filter, 1); 2851 tmp += 16; 2852 src += srcstride; 2853 } 2854 2855 tmp = tmp_array + 32; 2856 filter = subpel_filters[my - 1]; 2857 2858 for (y = 0; y < h; y++) { 2859 for (x = 0; x < 16; x++) 2860 dst[x] = FILTER_6TAP(tmp, filter, 16); 2861 dst += dststride; 2862 tmp += 16; 2863 } 2864#endif 2865} 2866 2867void ff_put_vp8_epel8_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, 2868 ptrdiff_t srcstride, int h, int mx, int my) 2869{ 2870#if 1 2871 DECLARE_ALIGNED(8, uint8_t, tmp_array[168]); 2872 uint8_t *tmp = tmp_array; 2873 2874 src -= 2 * srcstride; 2875 ff_put_vp8_epel8_h6_mmi(tmp, 8, src, srcstride, h + 5, mx, my); 2876 tmp = tmp_array + 16; 2877 ff_put_vp8_epel8_v6_mmi(dst, dststride, tmp, 8, h, mx, my); 2878#else 2879 const uint8_t *filter = subpel_filters[mx - 1]; 2880 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP; 2881 int x, y; 2882 uint8_t tmp_array[168]; 2883 uint8_t *tmp = tmp_array; 2884 2885 src -= 2 * srcstride; 2886 2887 for (y = 0; y < h + 5; y++) { 2888 for (x = 0; x < 8; x++) 2889 tmp[x] = FILTER_6TAP(src, filter, 1); 2890 tmp += 8; 2891 src += srcstride; 2892 } 2893 2894 tmp = tmp_array + 16; 2895 filter = subpel_filters[my - 1]; 2896 2897 for (y = 0; y < h; y++) { 2898 for (x = 0; x < 8; x++) 2899 dst[x] = FILTER_6TAP(tmp, filter, 8); 2900 dst += dststride; 2901 tmp += 8; 2902 } 2903#endif 2904} 2905 2906void 
ff_put_vp8_epel4_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, 2907 ptrdiff_t srcstride, int h, int mx, int my) 2908{ 2909#if 1 2910 DECLARE_ALIGNED(4, uint8_t, tmp_array[52]); 2911 uint8_t *tmp = tmp_array; 2912 2913 src -= 2 * srcstride; 2914 ff_put_vp8_epel4_h6_mmi(tmp, 4, src, srcstride, h + 5, mx, my); 2915 tmp = tmp_array + 8; 2916 ff_put_vp8_epel4_v6_mmi(dst, dststride, tmp, 4, h, mx, my); 2917#else 2918 const uint8_t *filter = subpel_filters[mx - 1]; 2919 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP; 2920 int x, y; 2921 uint8_t tmp_array[52]; 2922 uint8_t *tmp = tmp_array; 2923 2924 src -= 2 * srcstride; 2925 2926 for (y = 0; y < h + 5; y++) { 2927 for (x = 0; x < 4; x++) 2928 tmp[x] = FILTER_6TAP(src, filter, 1); 2929 tmp += 4; 2930 src += srcstride; 2931 } 2932 2933 tmp = tmp_array + 8; 2934 filter = subpel_filters[my - 1]; 2935 2936 for (y = 0; y < h; y++) { 2937 for (x = 0; x < 4; x++) 2938 dst[x] = FILTER_6TAP(tmp, filter, 4); 2939 dst += dststride; 2940 tmp += 4; 2941 } 2942#endif 2943} 2944 2945void ff_put_vp8_bilinear16_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src, 2946 ptrdiff_t sstride, int h, int mx, int my) 2947{ 2948#if 1 2949 union mmi_intfloat64 a, b; 2950 double ftmp[7]; 2951 uint32_t tmp[1]; 2952 mips_reg dst0, src0; 2953 DECLARE_VAR_ALL64; 2954 a.i = 8 - mx; 2955 b.i = mx; 2956 2957 /* 2958 dst[0] = (a * src[0] + b * src[1] + 4) >> 3; 2959 dst[1] = (a * src[1] + b * src[2] + 4) >> 3; 2960 dst[2] = (a * src[2] + b * src[3] + 4) >> 3; 2961 dst[3] = (a * src[3] + b * src[4] + 4) >> 3; 2962 dst[4] = (a * src[4] + b * src[5] + 4) >> 3; 2963 dst[5] = (a * src[5] + b * src[6] + 4) >> 3; 2964 dst[6] = (a * src[6] + b * src[7] + 4) >> 3; 2965 dst[7] = (a * src[7] + b * src[8] + 4) >> 3; 2966 2967 dst[ 8] = (a * src[ 8] + b * src[ 9] + 4) >> 3; 2968 dst[ 9] = (a * src[ 9] + b * src[10] + 4) >> 3; 2969 dst[10] = (a * src[10] + b * src[11] + 4) >> 3; 2970 dst[11] = (a * src[11] + b * src[12] + 4) >> 3; 2971 dst[12] = (a * src[12] + b * src[13] + 4) >> 3; 2972 dst[13] = (a * src[13] + b * src[14] + 4) >> 3; 2973 dst[14] = (a * src[14] + b * src[15] + 4) >> 3; 2974 dst[15] = (a * src[15] + b * src[16] + 4) >> 3; 2975 */ 2976 __asm__ volatile ( 2977 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 2978 "li %[tmp0], 0x03 \n\t" 2979 "mtc1 %[tmp0], %[ftmp4] \n\t" 2980 "pshufh %[a], %[a], %[ftmp0] \n\t" 2981 "pshufh %[b], %[b], %[ftmp0] \n\t" 2982 2983 "1: \n\t" 2984 // 0 - 7 2985 PUT_VP8_BILINEAR8_H_MMI(%[src], %[dst]) 2986 PTR_ADDIU "%[src0], %[src], 0x08 \n\t" 2987 PTR_ADDIU "%[dst0], %[dst], 0x08 \n\t" 2988 // 8 - 15 2989 PUT_VP8_BILINEAR8_H_MMI(%[src0], %[dst0]) 2990 2991 "addiu %[h], %[h], -0x01 \n\t" 2992 PTR_ADDU "%[src], %[src], %[sstride] \n\t" 2993 PTR_ADDU "%[dst], %[dst], %[dstride] \n\t" 2994 "bnez %[h], 1b \n\t" 2995 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), 2996 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), 2997 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), 2998 [ftmp6]"=&f"(ftmp[6]), 2999 [tmp0]"=&r"(tmp[0]), 3000 RESTRICT_ASM_ALL64 3001 [dst0]"=&r"(dst0), [src0]"=&r"(src0), 3002 [h]"+&r"(h), 3003 [dst]"+&r"(dst), [src]"+&r"(src), 3004 [a]"+&f"(a.f), [b]"+&f"(b.f) 3005 : [sstride]"r"((mips_reg)sstride), 3006 [dstride]"r"((mips_reg)dstride), 3007 [ff_pw_4]"f"(ff_pw_4.f) 3008 : "memory" 3009 ); 3010#else 3011 int a = 8 - mx, b = mx; 3012 int x, y; 3013 3014 for (y = 0; y < h; y++) { 3015 for (x = 0; x < 16; x++) 3016 dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3; 3017 dst += dstride; 3018 src += sstride; 3019 } 3020#endif 3021} 3022 3023void 
ff_put_vp8_bilinear16_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src, 3024 ptrdiff_t sstride, int h, int mx, int my) 3025{ 3026#if 1 3027 union mmi_intfloat64 c, d; 3028 double ftmp[7]; 3029 uint32_t tmp[1]; 3030 mips_reg src0, src1, dst0; 3031 DECLARE_VAR_ALL64; 3032 c.i = 8 - my; 3033 d.i = my; 3034 3035 /* 3036 dst[0] = (c * src[0] + d * src[ sstride] + 4) >> 3; 3037 dst[1] = (c * src[1] + d * src[1 + sstride] + 4) >> 3; 3038 dst[2] = (c * src[2] + d * src[2 + sstride] + 4) >> 3; 3039 dst[3] = (c * src[3] + d * src[3 + sstride] + 4) >> 3; 3040 dst[4] = (c * src[4] + d * src[4 + sstride] + 4) >> 3; 3041 dst[5] = (c * src[5] + d * src[5 + sstride] + 4) >> 3; 3042 dst[6] = (c * src[6] + d * src[6 + sstride] + 4) >> 3; 3043 dst[7] = (c * src[7] + d * src[7 + sstride] + 4) >> 3; 3044 */ 3045 __asm__ volatile ( 3046 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 3047 "li %[tmp0], 0x03 \n\t" 3048 "mtc1 %[tmp0], %[ftmp4] \n\t" 3049 "pshufh %[c], %[c], %[ftmp0] \n\t" 3050 "pshufh %[d], %[d], %[ftmp0] \n\t" 3051 3052 "1: \n\t" 3053 // 0 - 7 3054 PUT_VP8_BILINEAR8_V_MMI(%[src], %[src1], %[dst], %[sstride]) 3055 PTR_ADDIU "%[src0], %[src], 0x08 \n\t" 3056 PTR_ADDIU "%[dst0], %[dst], 0x08 \n\t" 3057 // 8 - 15 3058 PUT_VP8_BILINEAR8_V_MMI(%[src0], %[src1], %[dst0], %[sstride]) 3059 3060 "addiu %[h], %[h], -0x01 \n\t" 3061 PTR_ADDU "%[src], %[src], %[sstride] \n\t" 3062 PTR_ADDU "%[dst], %[dst], %[dstride] \n\t" 3063 "bnez %[h], 1b \n\t" 3064 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), 3065 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), 3066 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), 3067 [ftmp6]"=&f"(ftmp[6]), 3068 [tmp0]"=&r"(tmp[0]), 3069 RESTRICT_ASM_ALL64 3070 [src0]"=&r"(src0), [dst0]"=&r"(dst0), 3071 [src1]"=&r"(src1), 3072 [h]"+&r"(h), 3073 [dst]"+&r"(dst), [src]"+&r"(src), 3074 [c]"+&f"(c.f), [d]"+&f"(d.f) 3075 : [sstride]"r"((mips_reg)sstride), 3076 [dstride]"r"((mips_reg)dstride), 3077 [ff_pw_4]"f"(ff_pw_4.f) 3078 : "memory" 3079 ); 3080#else 3081 int c = 8 - my, d = my; 3082 int x, y; 3083 3084 for (y = 0; y < h; y++) { 3085 for (x = 0; x < 16; x++) 3086 dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3; 3087 dst += dstride; 3088 src += sstride; 3089 } 3090#endif 3091} 3092 3093void ff_put_vp8_bilinear16_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src, 3094 ptrdiff_t sstride, int h, int mx, int my) 3095{ 3096#if 1 3097 DECLARE_ALIGNED(8, uint8_t, tmp_array[528]); 3098 uint8_t *tmp = tmp_array; 3099 3100 ff_put_vp8_bilinear16_h_mmi(tmp, 16, src, sstride, h + 1, mx, my); 3101 ff_put_vp8_bilinear16_v_mmi(dst, dstride, tmp, 16, h, mx, my); 3102#else 3103 int a = 8 - mx, b = mx; 3104 int c = 8 - my, d = my; 3105 int x, y; 3106 uint8_t tmp_array[528]; 3107 uint8_t *tmp = tmp_array; 3108 3109 for (y = 0; y < h + 1; y++) { 3110 for (x = 0; x < 16; x++) 3111 tmp[x] = (a * src[x] + b * src[x + 1] + 4) >> 3; 3112 tmp += 16; 3113 src += sstride; 3114 } 3115 3116 tmp = tmp_array; 3117 3118 for (y = 0; y < h; y++) { 3119 for (x = 0; x < 16; x++) 3120 dst[x] = (c * tmp[x] + d * tmp[x + 16] + 4) >> 3; 3121 dst += dstride; 3122 tmp += 16; 3123 } 3124#endif 3125} 3126 3127void ff_put_vp8_bilinear8_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src, 3128 ptrdiff_t sstride, int h, int mx, int my) 3129{ 3130#if 1 3131 union mmi_intfloat64 a, b; 3132 double ftmp[7]; 3133 uint32_t tmp[1]; 3134 DECLARE_VAR_ALL64; 3135 a.i = 8 - mx; 3136 b.i = mx; 3137 3138 /* 3139 dst[0] = (a * src[0] + b * src[1] + 4) >> 3; 3140 dst[1] = (a * src[1] + b * src[2] + 4) >> 3; 3141 dst[2] = (a * src[2] + b * src[3] + 
4) >> 3; 3142 dst[3] = (a * src[3] + b * src[4] + 4) >> 3; 3143 dst[4] = (a * src[4] + b * src[5] + 4) >> 3; 3144 dst[5] = (a * src[5] + b * src[6] + 4) >> 3; 3145 dst[6] = (a * src[6] + b * src[7] + 4) >> 3; 3146 dst[7] = (a * src[7] + b * src[8] + 4) >> 3; 3147 */ 3148 __asm__ volatile ( 3149 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 3150 "li %[tmp0], 0x03 \n\t" 3151 "mtc1 %[tmp0], %[ftmp4] \n\t" 3152 "pshufh %[a], %[a], %[ftmp0] \n\t" 3153 "pshufh %[b], %[b], %[ftmp0] \n\t" 3154 3155 "1: \n\t" 3156 PUT_VP8_BILINEAR8_H_MMI(%[src], %[dst]) 3157 3158 "addiu %[h], %[h], -0x01 \n\t" 3159 PTR_ADDU "%[src], %[src], %[sstride] \n\t" 3160 PTR_ADDU "%[dst], %[dst], %[dstride] \n\t" 3161 "bnez %[h], 1b \n\t" 3162 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), 3163 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), 3164 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), 3165 [ftmp6]"=&f"(ftmp[6]), 3166 [tmp0]"=&r"(tmp[0]), 3167 RESTRICT_ASM_ALL64 3168 [h]"+&r"(h), 3169 [dst]"+&r"(dst), [src]"+&r"(src), 3170 [a]"+&f"(a.f), [b]"+&f"(b.f) 3171 : [sstride]"r"((mips_reg)sstride), 3172 [dstride]"r"((mips_reg)dstride), 3173 [ff_pw_4]"f"(ff_pw_4.f) 3174 : "memory" 3175 ); 3176#else 3177 int a = 8 - mx, b = mx; 3178 int x, y; 3179 3180 for (y = 0; y < h; y++) { 3181 for (x = 0; x < 8; x++) 3182 dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3; 3183 dst += dstride; 3184 src += sstride; 3185 } 3186#endif 3187} 3188 3189void ff_put_vp8_bilinear8_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src, 3190 ptrdiff_t sstride, int h, int mx, int my) 3191{ 3192#if 1 3193 union mmi_intfloat64 c, d; 3194 double ftmp[7]; 3195 uint32_t tmp[1]; 3196 mips_reg src1; 3197 DECLARE_VAR_ALL64; 3198 c.i = 8 - my; 3199 d.i = my; 3200 3201 /* 3202 dst[0] = (c * src[0] + d * src[ sstride] + 4) >> 3; 3203 dst[1] = (c * src[1] + d * src[1 + sstride] + 4) >> 3; 3204 dst[2] = (c * src[2] + d * src[2 + sstride] + 4) >> 3; 3205 dst[3] = (c * src[3] + d * src[3 + sstride] + 4) >> 3; 3206 dst[4] = (c * src[4] + d * src[4 + sstride] + 4) >> 3; 3207 dst[5] = (c * src[5] + d * src[5 + sstride] + 4) >> 3; 3208 dst[6] = (c * src[6] + d * src[6 + sstride] + 4) >> 3; 3209 dst[7] = (c * src[7] + d * src[7 + sstride] + 4) >> 3; 3210 */ 3211 __asm__ volatile ( 3212 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 3213 "li %[tmp0], 0x03 \n\t" 3214 "mtc1 %[tmp0], %[ftmp4] \n\t" 3215 "pshufh %[c], %[c], %[ftmp0] \n\t" 3216 "pshufh %[d], %[d], %[ftmp0] \n\t" 3217 3218 "1: \n\t" 3219 PUT_VP8_BILINEAR8_V_MMI(%[src], %[src1], %[dst], %[sstride]) 3220 3221 "addiu %[h], %[h], -0x01 \n\t" 3222 PTR_ADDU "%[src], %[src], %[sstride] \n\t" 3223 PTR_ADDU "%[dst], %[dst], %[dstride] \n\t" 3224 "bnez %[h], 1b \n\t" 3225 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), 3226 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), 3227 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), 3228 [ftmp6]"=&f"(ftmp[6]), 3229 [tmp0]"=&r"(tmp[0]), 3230 RESTRICT_ASM_ALL64 3231 [src1]"=&r"(src1), 3232 [h]"+&r"(h), 3233 [dst]"+&r"(dst), [src]"+&r"(src), 3234 [c]"+&f"(c.f), [d]"+&f"(d.f) 3235 : [sstride]"r"((mips_reg)sstride), 3236 [dstride]"r"((mips_reg)dstride), 3237 [ff_pw_4]"f"(ff_pw_4.f) 3238 : "memory" 3239 ); 3240#else 3241 int c = 8 - my, d = my; 3242 int x, y; 3243 3244 for (y = 0; y < h; y++) { 3245 for (x = 0; x < 8; x++) 3246 dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3; 3247 dst += dstride; 3248 src += sstride; 3249 } 3250#endif 3251} 3252 3253void ff_put_vp8_bilinear8_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src, 3254 ptrdiff_t sstride, int h, int mx, int my) 3255{ 3256#if 1 3257 
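    /* Two-pass bilinear: the horizontal pass below writes h + 1 rows of 8 pixels
       into tmp_array, then the vertical pass blends each row with the next one. */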

void ff_put_vp8_bilinear8_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
                                 ptrdiff_t sstride, int h, int mx, int my)
{
#if 1
    DECLARE_ALIGNED(8, uint8_t, tmp_array[136]);
    uint8_t *tmp = tmp_array;

    ff_put_vp8_bilinear8_h_mmi(tmp, 8, src, sstride, h + 1, mx, my);
    ff_put_vp8_bilinear8_v_mmi(dst, dstride, tmp, 8, h, mx, my);
#else
    int a = 8 - mx, b = mx;
    int c = 8 - my, d = my;
    int x, y;
    uint8_t tmp_array[136];
    uint8_t *tmp = tmp_array;

    for (y = 0; y < h + 1; y++) {
        for (x = 0; x < 8; x++)
            tmp[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
        tmp += 8;
        src += sstride;
    }

    tmp = tmp_array;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            dst[x] = (c * tmp[x] + d * tmp[x + 8] + 4) >> 3;
        dst += dstride;
        tmp += 8;
    }
#endif
}

void ff_put_vp8_bilinear4_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
                                ptrdiff_t sstride, int h, int mx, int my)
{
#if 1
    union mmi_intfloat64 a, b;
    double ftmp[5];
    uint32_t tmp[1];
    DECLARE_VAR_LOW32;
    DECLARE_VAR_ALL64;
    a.i = 8 - mx;
    b.i = mx;

    /*
    dst[0] = (a * src[0] + b * src[1] + 4) >> 3;
    dst[1] = (a * src[1] + b * src[2] + 4) >> 3;
    dst[2] = (a * src[2] + b * src[3] + 4) >> 3;
    dst[3] = (a * src[3] + b * src[4] + 4) >> 3;
    */
    __asm__ volatile (
        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]        \n\t"
        "li         %[tmp0],    0x03                            \n\t"
        "mtc1       %[tmp0],    %[ftmp4]                        \n\t"
        "pshufh     %[a],       %[a],           %[ftmp0]        \n\t"
        "pshufh     %[b],       %[b],           %[ftmp0]        \n\t"

        "1:                                                     \n\t"
        PUT_VP8_BILINEAR4_H_MMI(%[src], %[dst])

        "addiu      %[h],       %[h],           -0x01           \n\t"
        PTR_ADDU   "%[src],     %[src],         %[sstride]      \n\t"
        PTR_ADDU   "%[dst],     %[dst],         %[dstride]      \n\t"
        "bnez       %[h],       1b                              \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_LOW32
          RESTRICT_ASM_ALL64
          [h]"+&r"(h),
          [dst]"+&r"(dst),                  [src]"+&r"(src),
          [a]"+&f"(a.f),                    [b]"+&f"(b.f)
        : [sstride]"r"((mips_reg)sstride),
          [dstride]"r"((mips_reg)dstride),
          [ff_pw_4]"f"(ff_pw_4.f)
        : "memory"
    );
#else
    int a = 8 - mx, b = mx;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 4; x++)
            dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
        dst += dstride;
        src += sstride;
    }
#endif
}
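
/* The 4-pixel-wide kernels (ff_put_vp8_bilinear4_h_mmi above and the
 * vertical variant below) declare the LOW32 asm helpers next to the usual
 * ALL64 ones, presumably because the PUT_VP8_BILINEAR4_*_MMI macros mix
 * 32-bit and 64-bit memory accesses for rows that are only four bytes wide. */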

void ff_put_vp8_bilinear4_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
                                ptrdiff_t sstride, int h, int mx, int my)
{
#if 1
    union mmi_intfloat64 c, d;
    double ftmp[7];
    uint32_t tmp[1];
    mips_reg src1;
    DECLARE_VAR_LOW32;
    DECLARE_VAR_ALL64;
    c.i = 8 - my;
    d.i = my;

    /*
    dst[0] = (c * src[0] + d * src[    sstride] + 4) >> 3;
    dst[1] = (c * src[1] + d * src[1 + sstride] + 4) >> 3;
    dst[2] = (c * src[2] + d * src[2 + sstride] + 4) >> 3;
    dst[3] = (c * src[3] + d * src[3 + sstride] + 4) >> 3;
    */
    __asm__ volatile (
        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]        \n\t"
        "li         %[tmp0],    0x03                            \n\t"
        "mtc1       %[tmp0],    %[ftmp4]                        \n\t"
        "pshufh     %[c],       %[c],           %[ftmp0]        \n\t"
        "pshufh     %[d],       %[d],           %[ftmp0]        \n\t"

        "1:                                                     \n\t"
        PUT_VP8_BILINEAR4_V_MMI(%[src], %[src1], %[dst], %[sstride])

        "addiu      %[h],       %[h],           -0x01           \n\t"
        PTR_ADDU   "%[src],     %[src],         %[sstride]      \n\t"
        PTR_ADDU   "%[dst],     %[dst],         %[dstride]      \n\t"
        "bnez       %[h],       1b                              \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_LOW32
          RESTRICT_ASM_ALL64
          [src1]"=&r"(src1),
          [h]"+&r"(h),
          [dst]"+&r"(dst),                  [src]"+&r"(src),
          [c]"+&f"(c.f),                    [d]"+&f"(d.f)
        : [sstride]"r"((mips_reg)sstride),
          [dstride]"r"((mips_reg)dstride),
          [ff_pw_4]"f"(ff_pw_4.f)
        : "memory"
    );
#else
    int c = 8 - my, d = my;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 4; x++)
            dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3;
        dst += dstride;
        src += sstride;
    }
#endif
}

void ff_put_vp8_bilinear4_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
                                 ptrdiff_t sstride, int h, int mx, int my)
{
#if 1
    DECLARE_ALIGNED(4, uint8_t, tmp_array[36]);
    uint8_t *tmp = tmp_array;

    ff_put_vp8_bilinear4_h_mmi(tmp, 4, src, sstride, h + 1, mx, my);
    ff_put_vp8_bilinear4_v_mmi(dst, dstride, tmp, 4, h, mx, my);
#else
    int a = 8 - mx, b = mx;
    int c = 8 - my, d = my;
    int x, y;
    uint8_t tmp_array[36];
    uint8_t *tmp = tmp_array;

    for (y = 0; y < h + 1; y++) {
        for (x = 0; x < 4; x++)
            tmp[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
        tmp += 4;
        src += sstride;
    }

    tmp = tmp_array;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 4; x++)
            dst[x] = (c * tmp[x] + d * tmp[x + 4] + 4) >> 3;
        dst += dstride;
        tmp += 4;
    }
#endif
}
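
/* Each MMI path above keeps its plain-C counterpart in the #else branch of
 * the "#if 1" guard; switching the guard to "#if 0" builds the reference
 * loops instead, which spell out the exact arithmetic and are handy when
 * validating the assembly. */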