1/* 2 * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at> 3 * 4 * This file is part of FFmpeg. 5 * 6 * FFmpeg is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU Lesser General Public 8 * License as published by the Free Software Foundation; either 9 * version 2.1 of the License, or (at your option) any later version. 10 * 11 * FFmpeg is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * Lesser General Public License for more details. 15 * 16 * You should have received a copy of the GNU Lesser General Public 17 * License along with FFmpeg; if not, write to the Free Software 18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 19 */ 20 21#include <stdint.h> 22 23#include "libavutil/x86/asm.h" 24#include "libswscale/swscale_internal.h" 25 26#undef REAL_MOVNTQ 27#undef MOVNTQ 28#undef MOVNTQ2 29#undef PREFETCH 30 31 32#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t" 33#define MOVNTQ2 "movntq " 34#define MOVNTQ(a,b) REAL_MOVNTQ(a,b) 35 36#define YSCALEYUV2PACKEDX_UV \ 37 __asm__ volatile(\ 38 "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"\ 39 ".p2align 4 \n\t"\ 40 "nop \n\t"\ 41 "1: \n\t"\ 42 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"FF_REG_d" \n\t"\ 43 "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ 44 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\ 45 "movq %%mm3, %%mm4 \n\t"\ 46 ".p2align 4 \n\t"\ 47 "2: \n\t"\ 48 "movq 8(%%"FF_REG_d"), %%mm0 \n\t" /* filterCoeff */\ 49 "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm2 \n\t" /* UsrcData */\ 50 "add %6, %%"FF_REG_S" \n\t" \ 51 "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm5 \n\t" /* VsrcData */\ 52 "add $16, %%"FF_REG_d" \n\t"\ 53 "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ 54 "pmulhw %%mm0, %%mm2 \n\t"\ 55 "pmulhw %%mm0, %%mm5 \n\t"\ 56 "paddw %%mm2, %%mm3 \n\t"\ 57 "paddw %%mm5, %%mm4 \n\t"\ 58 "test %%"FF_REG_S", %%"FF_REG_S" \n\t"\ 59 
" jnz 2b \n\t"\ 60 61#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \ 62 "lea "offset"(%0), %%"FF_REG_d" \n\t"\ 63 "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ 64 "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\ 65 "movq "#dst1", "#dst2" \n\t"\ 66 ".p2align 4 \n\t"\ 67 "2: \n\t"\ 68 "movq 8(%%"FF_REG_d"), "#coeff" \n\t" /* filterCoeff */\ 69 "movq (%%"FF_REG_S", %%"FF_REG_a", 2), "#src1" \n\t" /* Y1srcData */\ 70 "movq 8(%%"FF_REG_S", %%"FF_REG_a", 2), "#src2" \n\t" /* Y2srcData */\ 71 "add $16, %%"FF_REG_d" \n\t"\ 72 "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ 73 "pmulhw "#coeff", "#src1" \n\t"\ 74 "pmulhw "#coeff", "#src2" \n\t"\ 75 "paddw "#src1", "#dst1" \n\t"\ 76 "paddw "#src2", "#dst2" \n\t"\ 77 "test %%"FF_REG_S", %%"FF_REG_S" \n\t"\ 78 " jnz 2b \n\t"\ 79 80#define YSCALEYUV2PACKEDX \ 81 YSCALEYUV2PACKEDX_UV \ 82 YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \ 83 84#define YSCALEYUV2PACKEDX_END \ 85 :: "r" (&c->redDither), \ 86 "m" (dummy), "m" (dummy), "m" (dummy),\ 87 "r" (dest), "m" (dstW_reg), "m"(uv_off) \ 88 NAMED_CONSTRAINTS_ADD(bF8,bFC) \ 89 : "%"FF_REG_a, "%"FF_REG_d, "%"FF_REG_S \ 90 ); 91 92#define YSCALEYUV2PACKEDX_ACCURATE_UV \ 93 __asm__ volatile(\ 94 "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"\ 95 ".p2align 4 \n\t"\ 96 "nop \n\t"\ 97 "1: \n\t"\ 98 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"FF_REG_d" \n\t"\ 99 "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ 100 "pxor %%mm4, %%mm4 \n\t"\ 101 "pxor %%mm5, %%mm5 \n\t"\ 102 "pxor %%mm6, %%mm6 \n\t"\ 103 "pxor %%mm7, %%mm7 \n\t"\ 104 ".p2align 4 \n\t"\ 105 "2: \n\t"\ 106 "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm0 \n\t" /* UsrcData */\ 107 "add %6, %%"FF_REG_S" \n\t" \ 108 "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm2 \n\t" /* VsrcData */\ 109 "mov "STR(APCK_PTR2)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ 110 "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm1 \n\t" /* UsrcData */\ 111 "movq %%mm0, %%mm3 \n\t"\ 112 "punpcklwd %%mm1, %%mm0 \n\t"\ 113 "punpckhwd %%mm1, %%mm3 \n\t"\ 114 "movq 
"STR(APCK_COEF)"(%%"FF_REG_d"),%%mm1 \n\t" /* filterCoeff */\ 115 "pmaddwd %%mm1, %%mm0 \n\t"\ 116 "pmaddwd %%mm1, %%mm3 \n\t"\ 117 "paddd %%mm0, %%mm4 \n\t"\ 118 "paddd %%mm3, %%mm5 \n\t"\ 119 "add %6, %%"FF_REG_S" \n\t" \ 120 "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm3 \n\t" /* VsrcData */\ 121 "mov "STR(APCK_SIZE)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ 122 "add $"STR(APCK_SIZE)", %%"FF_REG_d" \n\t"\ 123 "test %%"FF_REG_S", %%"FF_REG_S" \n\t"\ 124 "movq %%mm2, %%mm0 \n\t"\ 125 "punpcklwd %%mm3, %%mm2 \n\t"\ 126 "punpckhwd %%mm3, %%mm0 \n\t"\ 127 "pmaddwd %%mm1, %%mm2 \n\t"\ 128 "pmaddwd %%mm1, %%mm0 \n\t"\ 129 "paddd %%mm2, %%mm6 \n\t"\ 130 "paddd %%mm0, %%mm7 \n\t"\ 131 " jnz 2b \n\t"\ 132 "psrad $16, %%mm4 \n\t"\ 133 "psrad $16, %%mm5 \n\t"\ 134 "psrad $16, %%mm6 \n\t"\ 135 "psrad $16, %%mm7 \n\t"\ 136 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\ 137 "packssdw %%mm5, %%mm4 \n\t"\ 138 "packssdw %%mm7, %%mm6 \n\t"\ 139 "paddw %%mm0, %%mm4 \n\t"\ 140 "paddw %%mm0, %%mm6 \n\t"\ 141 "movq %%mm4, "U_TEMP"(%0) \n\t"\ 142 "movq %%mm6, "V_TEMP"(%0) \n\t"\ 143 144#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \ 145 "lea "offset"(%0), %%"FF_REG_d" \n\t"\ 146 "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ 147 "pxor %%mm1, %%mm1 \n\t"\ 148 "pxor %%mm5, %%mm5 \n\t"\ 149 "pxor %%mm7, %%mm7 \n\t"\ 150 "pxor %%mm6, %%mm6 \n\t"\ 151 ".p2align 4 \n\t"\ 152 "2: \n\t"\ 153 "movq (%%"FF_REG_S", %%"FF_REG_a", 2), %%mm0 \n\t" /* Y1srcData */\ 154 "movq 8(%%"FF_REG_S", %%"FF_REG_a", 2), %%mm2 \n\t" /* Y2srcData */\ 155 "mov "STR(APCK_PTR2)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ 156 "movq (%%"FF_REG_S", %%"FF_REG_a", 2), %%mm4 \n\t" /* Y1srcData */\ 157 "movq %%mm0, %%mm3 \n\t"\ 158 "punpcklwd %%mm4, %%mm0 \n\t"\ 159 "punpckhwd %%mm4, %%mm3 \n\t"\ 160 "movq "STR(APCK_COEF)"(%%"FF_REG_d"), %%mm4 \n\t" /* filterCoeff */\ 161 "pmaddwd %%mm4, %%mm0 \n\t"\ 162 "pmaddwd %%mm4, %%mm3 \n\t"\ 163 "paddd %%mm0, %%mm1 \n\t"\ 164 "paddd %%mm3, %%mm5 \n\t"\ 165 "movq 8(%%"FF_REG_S", %%"FF_REG_a", 2), %%mm3 \n\t" /* 
Y2srcData */\ 166 "mov "STR(APCK_SIZE)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ 167 "add $"STR(APCK_SIZE)", %%"FF_REG_d" \n\t"\ 168 "test %%"FF_REG_S", %%"FF_REG_S" \n\t"\ 169 "movq %%mm2, %%mm0 \n\t"\ 170 "punpcklwd %%mm3, %%mm2 \n\t"\ 171 "punpckhwd %%mm3, %%mm0 \n\t"\ 172 "pmaddwd %%mm4, %%mm2 \n\t"\ 173 "pmaddwd %%mm4, %%mm0 \n\t"\ 174 "paddd %%mm2, %%mm7 \n\t"\ 175 "paddd %%mm0, %%mm6 \n\t"\ 176 " jnz 2b \n\t"\ 177 "psrad $16, %%mm1 \n\t"\ 178 "psrad $16, %%mm5 \n\t"\ 179 "psrad $16, %%mm7 \n\t"\ 180 "psrad $16, %%mm6 \n\t"\ 181 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\ 182 "packssdw %%mm5, %%mm1 \n\t"\ 183 "packssdw %%mm6, %%mm7 \n\t"\ 184 "paddw %%mm0, %%mm1 \n\t"\ 185 "paddw %%mm0, %%mm7 \n\t"\ 186 "movq "U_TEMP"(%0), %%mm3 \n\t"\ 187 "movq "V_TEMP"(%0), %%mm4 \n\t"\ 188 189#define YSCALEYUV2PACKEDX_ACCURATE \ 190 YSCALEYUV2PACKEDX_ACCURATE_UV \ 191 YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET) 192 193#define YSCALEYUV2RGBX \ 194 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\ 195 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\ 196 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ 197 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ 198 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\ 199 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\ 200 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ 201 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\ 202 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\ 203 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\ 204 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\ 205 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\ 206 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\ 207 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ 208 "paddw %%mm3, %%mm4 \n\t"\ 209 "movq %%mm2, %%mm0 \n\t"\ 210 "movq %%mm5, %%mm6 \n\t"\ 211 "movq %%mm4, %%mm3 \n\t"\ 212 "punpcklwd %%mm2, %%mm2 \n\t"\ 213 "punpcklwd %%mm5, %%mm5 \n\t"\ 214 "punpcklwd %%mm4, %%mm4 \n\t"\ 215 "paddw %%mm1, %%mm2 \n\t"\ 216 "paddw %%mm1, %%mm5 \n\t"\ 217 "paddw %%mm1, %%mm4 \n\t"\ 218 "punpckhwd %%mm0, %%mm0 \n\t"\ 219 "punpckhwd %%mm6, %%mm6 \n\t"\ 220 
"punpckhwd %%mm3, %%mm3 \n\t"\ 221 "paddw %%mm7, %%mm0 \n\t"\ 222 "paddw %%mm7, %%mm6 \n\t"\ 223 "paddw %%mm7, %%mm3 \n\t"\ 224 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ 225 "packuswb %%mm0, %%mm2 \n\t"\ 226 "packuswb %%mm6, %%mm5 \n\t"\ 227 "packuswb %%mm3, %%mm4 \n\t"\ 228 229#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \ 230 "movq "#b", "#q2" \n\t" /* B */\ 231 "movq "#r", "#t" \n\t" /* R */\ 232 "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\ 233 "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\ 234 "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\ 235 "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\ 236 "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\ 237 "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\ 238 "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\ 239 "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\ 240 "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\ 241 "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\ 242\ 243 MOVNTQ( q0, (dst, index, 4))\ 244 MOVNTQ( b, 8(dst, index, 4))\ 245 MOVNTQ( q2, 16(dst, index, 4))\ 246 MOVNTQ( q3, 24(dst, index, 4))\ 247\ 248 "add $8, "#index" \n\t"\ 249 "cmp "dstw", "#index" \n\t"\ 250 " jb 1b \n\t" 251#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) 252 253static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter, 254 const int16_t **lumSrc, int lumFilterSize, 255 const int16_t *chrFilter, const int16_t **chrUSrc, 256 const int16_t **chrVSrc, 257 int chrFilterSize, const int16_t **alpSrc, 258 uint8_t *dest, int dstW, int dstY) 259{ 260 x86_reg dummy=0; 261 x86_reg dstW_reg = dstW; 262 x86_reg uv_off = c->uv_offx2; 263 264 if (CONFIG_SWSCALE_ALPHA && c->needAlpha) { 265 YSCALEYUV2PACKEDX_ACCURATE 266 YSCALEYUV2RGBX 267 "movq %%mm2, "U_TEMP"(%0) \n\t" 268 "movq %%mm4, "V_TEMP"(%0) \n\t" 269 "movq %%mm5, "Y_TEMP"(%0) \n\t" 270 YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET) 271 "movq "Y_TEMP"(%0), %%mm5 \n\t" 272 "psraw $3, 
%%mm1 \n\t" 273 "psraw $3, %%mm7 \n\t" 274 "packuswb %%mm7, %%mm1 \n\t" 275 WRITEBGR32(%4, "%5", %%FF_REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6) 276 YSCALEYUV2PACKEDX_END 277 } else { 278 YSCALEYUV2PACKEDX_ACCURATE 279 YSCALEYUV2RGBX 280 "pcmpeqd %%mm7, %%mm7 \n\t" 281 WRITEBGR32(%4, "%5", %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) 282 YSCALEYUV2PACKEDX_END 283 } 284} 285 286static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter, 287 const int16_t **lumSrc, int lumFilterSize, 288 const int16_t *chrFilter, const int16_t **chrUSrc, 289 const int16_t **chrVSrc, 290 int chrFilterSize, const int16_t **alpSrc, 291 uint8_t *dest, int dstW, int dstY) 292{ 293 x86_reg dummy=0; 294 x86_reg dstW_reg = dstW; 295 x86_reg uv_off = c->uv_offx2; 296 297 if (CONFIG_SWSCALE_ALPHA && c->needAlpha) { 298 YSCALEYUV2PACKEDX 299 YSCALEYUV2RGBX 300 YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7) 301 "psraw $3, %%mm1 \n\t" 302 "psraw $3, %%mm7 \n\t" 303 "packuswb %%mm7, %%mm1 \n\t" 304 WRITEBGR32(%4, "%5", %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) 305 YSCALEYUV2PACKEDX_END 306 } else { 307 YSCALEYUV2PACKEDX 308 YSCALEYUV2RGBX 309 "pcmpeqd %%mm7, %%mm7 \n\t" 310 WRITEBGR32(%4, "%5", %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) 311 YSCALEYUV2PACKEDX_END 312 } 313} 314 315static void RENAME(yuv2bgr32_X)(SwsContext *c, const int16_t *lumFilter, 316 const int16_t **lumSrc, int lumFilterSize, 317 const int16_t *chrFilter, const int16_t **chrUSrc, 318 const int16_t **chrVSrc, 319 int chrFilterSize, const int16_t **alpSrc, 320 uint8_t *dest, int dstW, int dstY) 321{ 322 x86_reg dummy=0; 323 x86_reg dstW_reg = dstW; 324 x86_reg uv_off = c->uv_offx2; 325 326 if (CONFIG_SWSCALE_ALPHA && c->needAlpha) { 327 YSCALEYUV2PACKEDX 328 YSCALEYUV2RGBX 329 YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7) 330 "psraw $3, %%mm1 \n\t" 331 "psraw $3, 
%%mm7 \n\t" 332 "packuswb %%mm7, %%mm1 \n\t" 333 WRITEBGR32(%4, "%5", %%FF_REGa, %%mm5, %%mm4, %%mm2, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) 334 YSCALEYUV2PACKEDX_END 335 } else { 336 YSCALEYUV2PACKEDX 337 YSCALEYUV2RGBX 338 "pcmpeqd %%mm7, %%mm7 \n\t" 339 WRITEBGR32(%4, "%5", %%FF_REGa, %%mm5, %%mm4, %%mm2, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) 340 YSCALEYUV2PACKEDX_END 341 } 342} 343 344#define REAL_WRITERGB16(dst, dstw, index) \ 345 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\ 346 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\ 347 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\ 348 "psrlq $3, %%mm2 \n\t"\ 349\ 350 "movq %%mm2, %%mm1 \n\t"\ 351 "movq %%mm4, %%mm3 \n\t"\ 352\ 353 "punpcklbw %%mm7, %%mm3 \n\t"\ 354 "punpcklbw %%mm5, %%mm2 \n\t"\ 355 "punpckhbw %%mm7, %%mm4 \n\t"\ 356 "punpckhbw %%mm5, %%mm1 \n\t"\ 357\ 358 "psllq $3, %%mm3 \n\t"\ 359 "psllq $3, %%mm4 \n\t"\ 360\ 361 "por %%mm3, %%mm2 \n\t"\ 362 "por %%mm4, %%mm1 \n\t"\ 363\ 364 MOVNTQ(%%mm2, (dst, index, 2))\ 365 MOVNTQ(%%mm1, 8(dst, index, 2))\ 366\ 367 "add $8, "#index" \n\t"\ 368 "cmp "dstw", "#index" \n\t"\ 369 " jb 1b \n\t" 370#define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index) 371 372static void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter, 373 const int16_t **lumSrc, int lumFilterSize, 374 const int16_t *chrFilter, const int16_t **chrUSrc, 375 const int16_t **chrVSrc, 376 int chrFilterSize, const int16_t **alpSrc, 377 uint8_t *dest, int dstW, int dstY) 378{ 379 x86_reg dummy=0; 380 x86_reg dstW_reg = dstW; 381 x86_reg uv_off = c->uv_offx2; 382 383 YSCALEYUV2PACKEDX_ACCURATE 384 YSCALEYUV2RGBX 385 "pxor %%mm7, %%mm7 \n\t" 386 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 387#ifdef DITHER1XBPP 388 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t" 389 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t" 390 "paddusb "RED_DITHER"(%0), %%mm5\n\t" 391#endif 392 WRITERGB16(%4, "%5", %%FF_REGa) 393 YSCALEYUV2PACKEDX_END 394} 395 396static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter, 397 
const int16_t **lumSrc, int lumFilterSize, 398 const int16_t *chrFilter, const int16_t **chrUSrc, 399 const int16_t **chrVSrc, 400 int chrFilterSize, const int16_t **alpSrc, 401 uint8_t *dest, int dstW, int dstY) 402{ 403 x86_reg dummy=0; 404 x86_reg dstW_reg = dstW; 405 x86_reg uv_off = c->uv_offx2; 406 407 YSCALEYUV2PACKEDX 408 YSCALEYUV2RGBX 409 "pxor %%mm7, %%mm7 \n\t" 410 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 411#ifdef DITHER1XBPP 412 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t" 413 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t" 414 "paddusb "RED_DITHER"(%0), %%mm5 \n\t" 415#endif 416 WRITERGB16(%4, "%5", %%FF_REGa) 417 YSCALEYUV2PACKEDX_END 418} 419 420#define REAL_WRITERGB15(dst, dstw, index) \ 421 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\ 422 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\ 423 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\ 424 "psrlq $3, %%mm2 \n\t"\ 425 "psrlq $1, %%mm5 \n\t"\ 426\ 427 "movq %%mm2, %%mm1 \n\t"\ 428 "movq %%mm4, %%mm3 \n\t"\ 429\ 430 "punpcklbw %%mm7, %%mm3 \n\t"\ 431 "punpcklbw %%mm5, %%mm2 \n\t"\ 432 "punpckhbw %%mm7, %%mm4 \n\t"\ 433 "punpckhbw %%mm5, %%mm1 \n\t"\ 434\ 435 "psllq $2, %%mm3 \n\t"\ 436 "psllq $2, %%mm4 \n\t"\ 437\ 438 "por %%mm3, %%mm2 \n\t"\ 439 "por %%mm4, %%mm1 \n\t"\ 440\ 441 MOVNTQ(%%mm2, (dst, index, 2))\ 442 MOVNTQ(%%mm1, 8(dst, index, 2))\ 443\ 444 "add $8, "#index" \n\t"\ 445 "cmp "dstw", "#index" \n\t"\ 446 " jb 1b \n\t" 447#define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index) 448 449static void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter, 450 const int16_t **lumSrc, int lumFilterSize, 451 const int16_t *chrFilter, const int16_t **chrUSrc, 452 const int16_t **chrVSrc, 453 int chrFilterSize, const int16_t **alpSrc, 454 uint8_t *dest, int dstW, int dstY) 455{ 456 x86_reg dummy=0; 457 x86_reg dstW_reg = dstW; 458 x86_reg uv_off = c->uv_offx2; 459 460 YSCALEYUV2PACKEDX_ACCURATE 461 YSCALEYUV2RGBX 462 "pxor %%mm7, %%mm7 \n\t" 463 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 464#ifdef 
DITHER1XBPP 465 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t" 466 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t" 467 "paddusb "RED_DITHER"(%0), %%mm5\n\t" 468#endif 469 WRITERGB15(%4, "%5", %%FF_REGa) 470 YSCALEYUV2PACKEDX_END 471} 472 473static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter, 474 const int16_t **lumSrc, int lumFilterSize, 475 const int16_t *chrFilter, const int16_t **chrUSrc, 476 const int16_t **chrVSrc, 477 int chrFilterSize, const int16_t **alpSrc, 478 uint8_t *dest, int dstW, int dstY) 479{ 480 x86_reg dummy=0; 481 x86_reg dstW_reg = dstW; 482 x86_reg uv_off = c->uv_offx2; 483 484 YSCALEYUV2PACKEDX 485 YSCALEYUV2RGBX 486 "pxor %%mm7, %%mm7 \n\t" 487 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 488#ifdef DITHER1XBPP 489 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t" 490 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t" 491 "paddusb "RED_DITHER"(%0), %%mm5 \n\t" 492#endif 493 WRITERGB15(%4, "%5", %%FF_REGa) 494 YSCALEYUV2PACKEDX_END 495} 496 497#define WRITEBGR24MMX(dst, dstw, index) \ 498 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\ 499 "movq %%mm2, %%mm1 \n\t" /* B */\ 500 "movq %%mm5, %%mm6 \n\t" /* R */\ 501 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\ 502 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\ 503 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\ 504 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\ 505 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\ 506 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\ 507 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\ 508 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\ 509 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\ 510 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\ 511\ 512 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\ 513 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\ 514 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\ 515 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\ 516\ 517 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\ 518 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\ 519 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\ 520 "psllq $40, %%mm3 \n\t" /* 
RGB00000 3 */\ 521\ 522 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\ 523 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\ 524 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\ 525 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\ 526\ 527 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\ 528 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\ 529 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\ 530 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\ 531 MOVNTQ(%%mm0, (dst))\ 532\ 533 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\ 534 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\ 535 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\ 536 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\ 537 MOVNTQ(%%mm6, 8(dst))\ 538\ 539 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\ 540 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\ 541 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\ 542 MOVNTQ(%%mm5, 16(dst))\ 543\ 544 "add $24, "#dst" \n\t"\ 545\ 546 "add $8, "#index" \n\t"\ 547 "cmp "dstw", "#index" \n\t"\ 548 " jb 1b \n\t" 549 550#define WRITEBGR24MMXEXT(dst, dstw, index) \ 551 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\ 552 "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\ 553 "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\ 554 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\ 555 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\ 556 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\ 557\ 558 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\ 559 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\ 560 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\ 561\ 562 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\ 563 "por %%mm1, %%mm6 \n\t"\ 564 "por %%mm3, %%mm6 \n\t"\ 565 MOVNTQ(%%mm6, (dst))\ 566\ 567 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\ 568 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\ 569 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\ 570 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\ 571\ 572 "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\ 573 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\ 574 "pand %%mm0, 
%%mm6 \n\t" /* R4 R3 R2 */\ 575\ 576 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\ 577 "por %%mm3, %%mm6 \n\t"\ 578 MOVNTQ(%%mm6, 8(dst))\ 579\ 580 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\ 581 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\ 582 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\ 583\ 584 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\ 585 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\ 586 "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\ 587\ 588 "por %%mm1, %%mm3 \n\t"\ 589 "por %%mm3, %%mm6 \n\t"\ 590 MOVNTQ(%%mm6, 16(dst))\ 591\ 592 "add $24, "#dst" \n\t"\ 593\ 594 "add $8, "#index" \n\t"\ 595 "cmp "dstw", "#index" \n\t"\ 596 " jb 1b \n\t" 597 598#undef WRITEBGR24 599#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMXEXT(dst, dstw, index) 600 601#if HAVE_6REGS 602static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter, 603 const int16_t **lumSrc, int lumFilterSize, 604 const int16_t *chrFilter, const int16_t **chrUSrc, 605 const int16_t **chrVSrc, 606 int chrFilterSize, const int16_t **alpSrc, 607 uint8_t *dest, int dstW, int dstY) 608{ 609 x86_reg dummy=0; 610 x86_reg dstW_reg = dstW; 611 x86_reg uv_off = c->uv_offx2; 612 613 YSCALEYUV2PACKEDX_ACCURATE 614 YSCALEYUV2RGBX 615 "pxor %%mm7, %%mm7 \n\t" 616 "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_c"\n\t" //FIXME optimize 617 "add %4, %%"FF_REG_c" \n\t" 618 WRITEBGR24(%%FF_REGc, "%5", %%FF_REGa) 619 :: "r" (&c->redDither), 620 "m" (dummy), "m" (dummy), "m" (dummy), 621 "r" (dest), "m" (dstW_reg), "m"(uv_off) 622 NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B) 623 : "%"FF_REG_a, "%"FF_REG_c, "%"FF_REG_d, "%"FF_REG_S 624 ); 625} 626 627static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter, 628 const int16_t **lumSrc, int lumFilterSize, 629 const int16_t *chrFilter, const int16_t **chrUSrc, 630 const int16_t **chrVSrc, 631 int chrFilterSize, const int16_t **alpSrc, 632 uint8_t *dest, int dstW, int dstY) 633{ 634 x86_reg 
dummy=0; 635 x86_reg dstW_reg = dstW; 636 x86_reg uv_off = c->uv_offx2; 637 638 YSCALEYUV2PACKEDX 639 YSCALEYUV2RGBX 640 "pxor %%mm7, %%mm7 \n\t" 641 "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_c" \n\t" //FIXME optimize 642 "add %4, %%"FF_REG_c" \n\t" 643 WRITEBGR24(%%FF_REGc, "%5", %%FF_REGa) 644 :: "r" (&c->redDither), 645 "m" (dummy), "m" (dummy), "m" (dummy), 646 "r" (dest), "m" (dstW_reg), "m"(uv_off) 647 NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B) 648 : "%"FF_REG_a, "%"FF_REG_c, "%"FF_REG_d, "%"FF_REG_S 649 ); 650} 651#endif /* HAVE_6REGS */ 652 653#define REAL_WRITEYUY2(dst, dstw, index) \ 654 "packuswb %%mm3, %%mm3 \n\t"\ 655 "packuswb %%mm4, %%mm4 \n\t"\ 656 "packuswb %%mm7, %%mm1 \n\t"\ 657 "punpcklbw %%mm4, %%mm3 \n\t"\ 658 "movq %%mm1, %%mm7 \n\t"\ 659 "punpcklbw %%mm3, %%mm1 \n\t"\ 660 "punpckhbw %%mm3, %%mm7 \n\t"\ 661\ 662 MOVNTQ(%%mm1, (dst, index, 2))\ 663 MOVNTQ(%%mm7, 8(dst, index, 2))\ 664\ 665 "add $8, "#index" \n\t"\ 666 "cmp "dstw", "#index" \n\t"\ 667 " jb 1b \n\t" 668#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index) 669 670static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter, 671 const int16_t **lumSrc, int lumFilterSize, 672 const int16_t *chrFilter, const int16_t **chrUSrc, 673 const int16_t **chrVSrc, 674 int chrFilterSize, const int16_t **alpSrc, 675 uint8_t *dest, int dstW, int dstY) 676{ 677 x86_reg dummy=0; 678 x86_reg dstW_reg = dstW; 679 x86_reg uv_off = c->uv_offx2; 680 681 YSCALEYUV2PACKEDX_ACCURATE 682 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 683 "psraw $3, %%mm3 \n\t" 684 "psraw $3, %%mm4 \n\t" 685 "psraw $3, %%mm1 \n\t" 686 "psraw $3, %%mm7 \n\t" 687 WRITEYUY2(%4, "%5", %%FF_REGa) 688 YSCALEYUV2PACKEDX_END 689} 690 691static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter, 692 const int16_t **lumSrc, int lumFilterSize, 693 const int16_t *chrFilter, const int16_t **chrUSrc, 694 const int16_t **chrVSrc, 695 int chrFilterSize, const int16_t **alpSrc, 696 uint8_t 
*dest, int dstW, int dstY) 697{ 698 x86_reg dummy=0; 699 x86_reg dstW_reg = dstW; 700 x86_reg uv_off = c->uv_offx2; 701 702 YSCALEYUV2PACKEDX 703 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 704 "psraw $3, %%mm3 \n\t" 705 "psraw $3, %%mm4 \n\t" 706 "psraw $3, %%mm1 \n\t" 707 "psraw $3, %%mm7 \n\t" 708 WRITEYUY2(%4, "%5", %%FF_REGa) 709 YSCALEYUV2PACKEDX_END 710} 711 712#define REAL_YSCALEYUV2RGB_UV(index, c) \ 713 "xor "#index", "#index" \n\t"\ 714 ".p2align 4 \n\t"\ 715 "1: \n\t"\ 716 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ 717 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ 718 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 719 "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ 720 "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ 721 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 722 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\ 723 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ 724 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\ 725 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\ 726 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\ 727 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\ 728 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\ 729 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\ 730 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\ 731 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\ 732 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\ 733 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ 734 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ 735 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\ 736 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\ 737 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ 738 739#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \ 740 "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\ 741 "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\ 742 "movq 
8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\ 743 "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\ 744 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\ 745 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\ 746 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ 747 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ 748 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ 749 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ 750 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ 751 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ 752 753#define REAL_YSCALEYUV2RGB_COEFF(c) \ 754 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\ 755 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\ 756 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\ 757 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\ 758 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ 759 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ 760 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ 761 "paddw %%mm3, %%mm4 \n\t"\ 762 "movq %%mm2, %%mm0 \n\t"\ 763 "movq %%mm5, %%mm6 \n\t"\ 764 "movq %%mm4, %%mm3 \n\t"\ 765 "punpcklwd %%mm2, %%mm2 \n\t"\ 766 "punpcklwd %%mm5, %%mm5 \n\t"\ 767 "punpcklwd %%mm4, %%mm4 \n\t"\ 768 "paddw %%mm1, %%mm2 \n\t"\ 769 "paddw %%mm1, %%mm5 \n\t"\ 770 "paddw %%mm1, %%mm4 \n\t"\ 771 "punpckhwd %%mm0, %%mm0 \n\t"\ 772 "punpckhwd %%mm6, %%mm6 \n\t"\ 773 "punpckhwd %%mm3, %%mm3 \n\t"\ 774 "paddw %%mm7, %%mm0 \n\t"\ 775 "paddw %%mm7, %%mm6 \n\t"\ 776 "paddw %%mm7, %%mm3 \n\t"\ 777 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ 778 "packuswb %%mm0, %%mm2 \n\t"\ 779 "packuswb %%mm6, %%mm5 \n\t"\ 780 "packuswb %%mm3, %%mm4 \n\t"\ 781 782#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) 783 784#define YSCALEYUV2RGB(index, c) \ 785 REAL_YSCALEYUV2RGB_UV(index, c) \ 786 REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \ 787 
REAL_YSCALEYUV2RGB_COEFF(c) 788 789/** 790 * vertical bilinear scale YV12 to RGB 791 */ 792static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2], 793 const int16_t *ubuf[2], const int16_t *vbuf[2], 794 const int16_t *abuf[2], uint8_t *dest, 795 int dstW, int yalpha, int uvalpha, int y) 796{ 797 const int16_t *buf0 = buf[0], *buf1 = buf[1], 798 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1]; 799 800 if (CONFIG_SWSCALE_ALPHA && c->needAlpha) { 801 const int16_t *abuf0 = abuf[0], *abuf1 = abuf[1]; 802#if ARCH_X86_64 803 __asm__ volatile( 804 YSCALEYUV2RGB(%%r8, %5) 805 YSCALEYUV2RGB_YA(%%r8, %5, %6, %7) 806 "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/ 807 "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/ 808 "packuswb %%mm7, %%mm1 \n\t" 809 WRITEBGR32(%4, DSTW_OFFSET"(%5)", %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) 810 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "r" (dest), 811 "a" (&c->redDither), 812 "r" (abuf0), "r" (abuf1) 813 : "%r8" 814 ); 815#else 816 c->u_temp=(intptr_t)abuf0; 817 c->v_temp=(intptr_t)abuf1; 818 __asm__ volatile( 819 "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" 820 "mov %4, %%"FF_REG_b" \n\t" 821 "push %%"FF_REG_BP" \n\t" 822 YSCALEYUV2RGB(%%FF_REGBP, %5) 823 "push %0 \n\t" 824 "push %1 \n\t" 825 "mov "U_TEMP"(%5), %0 \n\t" 826 "mov "V_TEMP"(%5), %1 \n\t" 827 YSCALEYUV2RGB_YA(%%FF_REGBP, %5, %0, %1) 828 "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/ 829 "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/ 830 "packuswb %%mm7, %%mm1 \n\t" 831 "pop %1 \n\t" 832 "pop %0 \n\t" 833 WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) 834 "pop %%"FF_REG_BP" \n\t" 835 "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" 836 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), 837 "a" (&c->redDither) 838 ); 839#endif 840 } else { 841 __asm__ volatile( 842 "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" 843 "mov %4, %%"FF_REG_b" \n\t" 844 "push 
%%"FF_REG_BP" \n\t"
            YSCALEYUV2RGB(%%FF_REGBP, %5)
            "pcmpeqd %%mm7, %%mm7 \n\t"
            WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
            "pop %%"FF_REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    }
}

/**
 * Two-line packed output through the WRITEBGR24 store macro.
 *
 * Only c, buf[0..1], ubuf[0..1] and dest are referenced by this body; vbuf,
 * abuf, dstW, yalpha, uvalpha and y are not used directly.
 * NOTE(review): the blend weights and the output width are presumably read
 * from the context through the *_OFFSET displacements applied to %5 by
 * YSCALEYUV2RGB / WRITEBGR24 (both defined outside this chunk) - confirm.
 *
 * asm operands: %0 = buf0 (c register), %1 = buf1 (d register),
 * %2 = ubuf0 (S register), %3 = ubuf1 (D register), %4 = dest (memory),
 * %5 = &c->redDither (a register).
 */
static void RENAME(yuv2bgr24_2)(SwsContext *c, const int16_t *buf[2],
                                const int16_t *ubuf[2], const int16_t *vbuf[2],
                                const int16_t *abuf[2], uint8_t *dest,
                                int dstW, int yalpha, int uvalpha, int y)
{
    const int16_t *buf0 = buf[0], *buf1 = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];

    __asm__ volatile(
        /* The statement has no clobber list, so REG_b and REG_BP are saved
         * and restored by hand: REG_b at ESP_OFFSET relative to %5, REG_BP
         * on the stack. */
        "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"FF_REG_b" \n\t"               /* REG_b = dest */
        "push %%"FF_REG_BP" \n\t"
        YSCALEYUV2RGB(%%FF_REGBP, %5)             /* REG_BP is the index register */
        "pxor %%mm7, %%mm7 \n\t"                  /* mm7 = 0 */
        WRITEBGR24(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
        "pop %%"FF_REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" /* restore REG_b */
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
           "a" (&c->redDither)
           NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
    );
}

/**
 * Two-line packed output through the WRITERGB15 store macro.
 *
 * Same operand layout and register save/restore scheme as the other *_2
 * writers in this file: %0 = buf0, %1 = buf1, %2 = ubuf0, %3 = ubuf1,
 * %4 = dest, %5 = &c->redDither. vbuf, abuf, dstW, yalpha, uvalpha and y are
 * not referenced directly by this body.
 *
 * When DITHER1XBPP is defined, the three dither values at BLUE_DITHER,
 * GREEN_DITHER and RED_DITHER (relative to %5) are added with unsigned byte
 * saturation to mm2/mm4/mm5 before the store macro runs.
 */
static void RENAME(yuv2rgb555_2)(SwsContext *c, const int16_t *buf[2],
                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
                                 const int16_t *abuf[2], uint8_t *dest,
                                 int dstW, int yalpha, int uvalpha, int y)
{
    const int16_t *buf0 = buf[0], *buf1 = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];

    __asm__ volatile(
        "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" /* save REG_b by hand */
        "mov %4, %%"FF_REG_b" \n\t"               /* REG_b = dest */
        "push %%"FF_REG_BP" \n\t"
        YSCALEYUV2RGB(%%FF_REGBP, %5)
        "pxor %%mm7, %%mm7 \n\t"
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
        "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
        "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
        "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
        WRITERGB15(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
        "pop %%"FF_REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" /* restore REG_b */
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
           "a" (&c->redDither)
           NAMED_CONSTRAINTS_ADD(bF8)
    );
}

/**
 * Two-line packed output through the WRITERGB16 store macro.
 *
 * Identical in structure to the WRITERGB15 variant; only the store macro and
 * the named constants handed to NAMED_CONSTRAINTS_ADD (bF8 and bFC) differ.
 * Operands: %0 = buf0, %1 = buf1, %2 = ubuf0, %3 = ubuf1, %4 = dest,
 * %5 = &c->redDither. vbuf, abuf, dstW, yalpha, uvalpha and y are not
 * referenced directly by this body.
 */
static void RENAME(yuv2rgb565_2)(SwsContext *c, const int16_t *buf[2],
                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
                                 const int16_t *abuf[2], uint8_t *dest,
                                 int dstW, int yalpha, int uvalpha, int y)
{
    const int16_t *buf0 = buf[0], *buf1 = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];

    __asm__ volatile(
        "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" /* save REG_b by hand */
        "mov %4, %%"FF_REG_b" \n\t"               /* REG_b = dest */
        "push %%"FF_REG_BP" \n\t"
        YSCALEYUV2RGB(%%FF_REGBP, %5)
        "pxor %%mm7, %%mm7 \n\t"
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
        "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
        "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
        "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
        WRITERGB16(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
        "pop %%"FF_REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" /* restore REG_b */
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
           "a" (&c->redDither)
           NAMED_CONSTRAINTS_ADD(bF8,bFC)
    );
}

/*
 * Loop head for the two-line packed (non-RGB) writers: blends two chroma
 * lines and two luma lines with the coefficients stored in the context.
 *
 * index: register used as byte index into the chroma lines (zeroed here).
 * c:     register holding the base address for the *_OFFSET displacements.
 * Expects asm operands %0/%1 = luma line 0/1 and %2/%3 = chroma line 0/1,
 * which is how the callers in this file bind them.
 *
 * Before the loop, the 64-bit coefficient words at CHR_MMX_FILTER_OFFSET+8
 * and LUM_MMX_FILTER_OFFSET+8 are shifted right arithmetically by 3 and
 * written back, i.e. the macro modifies those values in memory.
 *
 * On exit (per iteration): mm3 / mm4 hold the blended chroma read at
 * index / index+UV_OFF_BYTE, mm1 / mm7 hold the blended first / next four
 * luma samples. mm0, mm2, mm5 and mm6 are used as scratch.
 * NOTE(review): label "1:" opens the loop but the backward branch is not in
 * this macro; presumably the WRITE* macro that follows provides it - confirm.
 */
#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
    "psraw $3, %%mm0 \n\t"\
    "psraw $3, %%mm1 \n\t"\
    "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[index] */\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[index] */\
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[index + uv_off] */\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[index + uv_off] */\
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[index] - uvbuf1[index] */\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[index + uv_off] - uvbuf1[index + uv_off] */\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* ((uvbuf0 - uvbuf1)[index] * chroma coeff) >> 16 */\
    "pmulhw %%mm0, %%mm5 \n\t" /* ((uvbuf0 - uvbuf1)[index + uv_off] * chroma coeff) >> 16 */\
    "psraw $7, %%mm3 \n\t" /* uvbuf1[index] >> 7 */\
    "psraw $7, %%mm4 \n\t" /* uvbuf1[index + uv_off] >> 7 */\
    "paddw %%mm2, %%mm3 \n\t" /* mm3 = (uvbuf1 >> 7) + scaled difference */\
    "paddw %%mm5, %%mm4 \n\t" /* mm4 = same for the samples at index + uv_off */\
    "movq (%0, "#index", 2), %%mm0 \n\t" /* buf0, first 4 samples */\
    "movq (%1, "#index", 2), %%mm1 \n\t" /* buf1, first 4 samples */\
    "movq 8(%0, "#index", 2), %%mm6 \n\t" /* buf0, next 4 samples */\
    "movq 8(%1, "#index", 2), %%mm7 \n\t" /* buf1, next 4 samples */\
    "psubw %%mm1, %%mm0 \n\t" /* buf0 - buf1 (first 4) */\
    "psubw %%mm7, %%mm6 \n\t" /* buf0 - buf1 (next 4) */\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* ((buf0 - buf1) * luma coeff) >> 16 */\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* ((buf0 - buf1) * luma coeff) >> 16 */\
    "psraw $7, %%mm1 \n\t" /* buf1 >> 7 (first 4) */\
    "psraw $7, %%mm7 \n\t" /* buf1 >> 7 (next 4) */\
    "paddw %%mm0, %%mm1 \n\t" /* mm1 = (buf1 >> 7) + scaled difference (first 4) */\
    "paddw %%mm6, %%mm7 \n\t" /* mm7 = (buf1 >> 7) + scaled difference (next 4) */\

/* Extra expansion level so that macro arguments are expanded before "#"
 * stringification in REAL_YSCALEYUV2PACKED. */
#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)

/**
 * Two-line packed output through the WRITEYUY2 store macro.
 *
 * Operands: %0 = buf0, %1 = buf1, %2 = ubuf0, %3 = ubuf1, %4 = dest,
 * %5 = &c->redDither. REG_b is saved at ESP_OFFSET relative to %5 and REG_BP
 * on the stack, because the asm statement declares no clobbers.
 * vbuf, abuf, dstW, yalpha, uvalpha and y are not referenced directly here.
 * Note that YSCALEYUV2PACKED rewrites the coefficient words at
 * CHR_MMX_FILTER_OFFSET+8 / LUM_MMX_FILTER_OFFSET+8 (relative to %5) in place.
 */
static void RENAME(yuv2yuyv422_2)(SwsContext *c, const int16_t *buf[2],
                                  const int16_t *ubuf[2], const int16_t *vbuf[2],
                                  const int16_t *abuf[2], uint8_t *dest,
                                  int dstW, int yalpha, int uvalpha, int y)
{
    const int16_t *buf0 = buf[0], *buf1 = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];

    __asm__ volatile(
        "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" /* save REG_b by hand */
        "mov %4, %%"FF_REG_b" \n\t"               /* REG_b = dest */
        "push %%"FF_REG_BP" \n\t"
        YSCALEYUV2PACKED(%%FF_REGBP, %5)
        WRITEYUY2(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
        "pop %%"FF_REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" /* restore REG_b */
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
           "a" (&c->redDither)
    );
}

/*
 * Loop head for the single-line RGB writers, using one chroma line only.
 *
 * index: register used as byte index into the chroma line (zeroed here).
 * c:     register holding the base address for the *_OFFSET / *_COEFF
 *        displacements.
 * Reads asm operand %0 (luma line) and %2 (chroma line); %1 and %3 are not
 * touched by this macro.
 *
 * On exit (per iteration) mm2 = B, mm4 = G, mm5 = R as 8 packed unsigned
 * saturated bytes each; mm0, mm1, mm3, mm6 and mm7 are used as scratch.
 * NOTE(review): label "1:" opens the loop but the backward branch is not in
 * this macro; presumably the WRITE* macro that follows provides it - confirm.
 */
#define REAL_YSCALEYUV2RGB1(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[index] */\
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[index + uv_off] */\
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "psraw $4, %%mm3 \n\t" /* uvbuf0[index] >> 4 */\
    "psraw $4, %%mm4 \n\t" /* uvbuf0[index + uv_off] >> 4 */\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /* buf0, first 4 samples */\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /* buf0, next 4 samples */\
    "psraw $4, %%mm1 \n\t" /* buf0 >> 4 (first 4) */\
    "psraw $4, %%mm7 \n\t" /* buf0 >> 4 (next 4) */\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t" /* mm4 = ug + vg */\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t" /* duplicate the low chroma words to pair with Y1 */\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t" /* duplicate the high chroma words to pair with Y2 */\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm2=B1, mm0=B2, mm4=G1, mm3=G2, mm5=R1, mm6=R2 (1: built from Y1, 2: built from Y2) */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

/* Extra expansion level so that macro arguments are expanded before "#"
 * stringification in REAL_YSCALEYUV2RGB1. */
#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)

// do vertical chrominance interpolation
/*
 * Same as REAL_YSCALEYUV2RGB1, except that chroma is the sum of the two
 * chroma lines (%2 and %3) shifted right logically by 5 instead of a single
 * line shifted right arithmetically by 4. Luma still comes from %0 only.
 *
 * On exit (per iteration) mm2 = B, mm4 = G, mm5 = R as packed bytes.
 */
#define REAL_YSCALEYUV2RGB1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[index] */\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[index] */\
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[index + uv_off] */\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[index + uv_off] */\
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[index] + uvbuf1[index] */\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[index + uv_off] + uvbuf1[index + uv_off] */\
    "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
    "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /* buf0, first 4 samples */\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /* buf0, next 4 samples */\
    "psraw $4, %%mm1 \n\t" /* buf0 >> 4 (first 4) */\
    "psraw $4, %%mm7 \n\t" /* buf0 >> 4 (next 4) */\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t" /* mm4 = ug + vg */\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t" /* duplicate the low chroma words to pair with Y1 */\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t" /* duplicate the high chroma words to pair with Y2 */\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm2=B1, mm0=B2, mm4=G1, mm3=G2, mm5=R1, mm6=R2 (1: built from Y1, 2: built from Y2) */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

/* Extra expansion level so that macro arguments are expanded before "#"
 * stringification in REAL_YSCALEYUV2RGB1b. */
#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)

/*
 * Load 8 alpha samples from asm operand %1 (addressed as %1 + 2*index),
 * shift each right arithmetically by 7 and pack them to unsigned saturated
 * bytes in mm7. mm1 is used as scratch.
 */
#define REAL_YSCALEYUV2RGB1_ALPHA(index) \
    "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\
    "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
    "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\
    "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
    "packuswb %%mm1, %%mm7 \n\t"
/* Extra expansion level so that the argument is expanded before "#"
 * stringification in REAL_YSCALEYUV2RGB1_ALPHA. */
#define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)

/**
 * YV12 to RGB without scaling or interpolating
 *
 * Single-luma-line writer built on WRITEBGR32. Four asm variants are chosen
 * at run time:
 *  - uvalpha < 2048: YSCALEYUV2RGB1, which reads chroma from ubuf[0] only;
 *    otherwise YSCALEYUV2RGB1b, which sums ubuf[0] and ubuf[1].
 *  - CONFIG_SWSCALE_ALPHA && c->needAlpha: operand %1 is abuf0 and the alpha
 *    byte lane (mm7) is produced by YSCALEYUV2RGB1_ALPHA; otherwise mm7 is
 *    set to all ones with pcmpeqd and operand %1 is buf1 (an alias of buf0).
 *
 * Operands: %0 = buf0, %1 = abuf0 or buf1, %2 = ubuf0, %3 = ubuf1,
 * %4 = dest, %5 = &c->redDither. vbuf, dstW and y are not referenced
 * directly by this body.
 */
static void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0,
                                const int16_t *ubuf[2], const int16_t *vbuf[2],
                                const int16_t *abuf0, uint8_t *dest,
                                int dstW, int uvalpha, int y)
{
    const int16_t *ubuf0 = ubuf[0];
    const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1

    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        const int16_t *ubuf1 = ubuf[0];
        if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
            __asm__ volatile(
                "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" /* save REG_b by hand */
                "mov %4, %%"FF_REG_b" \n\t"               /* REG_b = dest */
                "push %%"FF_REG_BP" \n\t"
                YSCALEYUV2RGB1(%%FF_REGBP, %5)
                YSCALEYUV2RGB1_ALPHA(%%FF_REGBP)
                WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                "pop %%"FF_REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" /* restore REG_b */
                :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
                   "a" (&c->redDither)
            );
        } else {
            __asm__ volatile(
                "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" /* save REG_b by hand */
                "mov %4, %%"FF_REG_b" \n\t"               /* REG_b = dest */
                "push %%"FF_REG_BP" \n\t"
                YSCALEYUV2RGB1(%%FF_REGBP, %5)
                "pcmpeqd %%mm7, %%mm7 \n\t"               /* mm7 = all ones */
                WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                "pop %%"FF_REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" /* restore REG_b */
                :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
                   "a" (&c->redDither)
            );
        }
    } else {
        const int16_t *ubuf1 = ubuf[1];
        if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
            __asm__ volatile(
                "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" /* save REG_b by hand */
                "mov %4, %%"FF_REG_b" \n\t"               /* REG_b = dest */
                "push %%"FF_REG_BP" \n\t"
                YSCALEYUV2RGB1b(%%FF_REGBP, %5)
                YSCALEYUV2RGB1_ALPHA(%%FF_REGBP)
                WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                "pop %%"FF_REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" /* restore REG_b */
                :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
                   "a" (&c->redDither)
            );
        } else {
            __asm__ volatile(
                "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" /* save REG_b by hand */
                "mov %4, %%"FF_REG_b" \n\t"               /* REG_b = dest */
                "push %%"FF_REG_BP" \n\t"
                YSCALEYUV2RGB1b(%%FF_REGBP, %5)
                "pcmpeqd %%mm7, %%mm7 \n\t"               /* mm7 = all ones */
                WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                "pop %%"FF_REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" /* restore REG_b */
                :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
                   "a" (&c->redDither)
            );
        }
    }
}

/**
 * Single-luma-line writer built on WRITEBGR24.
 *
 * uvalpha < 2048 selects YSCALEYUV2RGB1 (chroma from ubuf[0] only);
 * otherwise YSCALEYUV2RGB1b (sum of ubuf[0] and ubuf[1]) is used.
 * Operands: %0 = buf0, %1 = buf1 (alias of buf0), %2 = ubuf0, %3 = ubuf1,
 * %4 = dest, %5 = &c->redDither. vbuf, abuf0, dstW and y are not referenced
 * directly by this body.
 */
static void RENAME(yuv2bgr24_1)(SwsContext *c, const int16_t *buf0,
                                const int16_t *ubuf[2], const int16_t *vbuf[2],
                                const int16_t *abuf0, uint8_t *dest,
                                int dstW, int uvalpha, int y)
{
    const int16_t *ubuf0 = ubuf[0];
    const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1

    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        const int16_t *ubuf1 = ubuf[0];
        __asm__ volatile(
            "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" /* save REG_b by hand */
            "mov %4, %%"FF_REG_b" \n\t"               /* REG_b = dest */
            "push %%"FF_REG_BP" \n\t"
            YSCALEYUV2RGB1(%%FF_REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"                  /* mm7 = 0 */
            WRITEBGR24(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
            "pop %%"FF_REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" /* restore REG_b */
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
               NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
        );
    } else {
        const int16_t *ubuf1 = ubuf[1];
        __asm__ volatile(
            "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" /* save REG_b by hand */
            "mov %4, %%"FF_REG_b" \n\t"               /* REG_b = dest */
            "push %%"FF_REG_BP" \n\t"
            YSCALEYUV2RGB1b(%%FF_REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"                  /* mm7 = 0 */
            WRITEBGR24(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
            "pop %%"FF_REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" /* restore REG_b */
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
               NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
        );
    }
}

/**
 * Single-luma-line writer built on WRITERGB15.
 *
 * uvalpha < 2048 selects YSCALEYUV2RGB1 (chroma from ubuf[0] only);
 * otherwise YSCALEYUV2RGB1b (sum of ubuf[0] and ubuf[1]) is used. When
 * DITHER1XBPP is defined the dither values at BLUE_DITHER / GREEN_DITHER /
 * RED_DITHER (relative to %5) are added with unsigned byte saturation first.
 * Operands: %0 = buf0, %1 = buf1 (alias of buf0), %2 = ubuf0, %3 = ubuf1,
 * %4 = dest, %5 = &c->redDither. vbuf, abuf0, dstW and y are not referenced
 * directly by this body.
 */
static void RENAME(yuv2rgb555_1)(SwsContext *c, const int16_t *buf0,
                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
                                 const int16_t *abuf0, uint8_t *dest,
                                 int dstW, int uvalpha, int y)
{
    const int16_t *ubuf0 = ubuf[0];
    const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1

    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        const int16_t *ubuf1 = ubuf[0];
        __asm__ volatile(
            "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" /* save REG_b by hand */
            "mov %4, %%"FF_REG_b" \n\t"               /* REG_b = dest */
            "push %%"FF_REG_BP" \n\t"
            YSCALEYUV2RGB1(%%FF_REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
            WRITERGB15(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
            "pop %%"FF_REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" /* restore REG_b */
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
               NAMED_CONSTRAINTS_ADD(bF8)
        );
    } else {
        const int16_t *ubuf1 = ubuf[1];
        __asm__ volatile(
            "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" /* save REG_b by hand */
            "mov %4, %%"FF_REG_b" \n\t"               /* REG_b = dest */
            "push %%"FF_REG_BP" \n\t"
            YSCALEYUV2RGB1b(%%FF_REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
            WRITERGB15(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
            "pop %%"FF_REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" /* restore REG_b */
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
               NAMED_CONSTRAINTS_ADD(bF8)
        );
    }
}

/**
 * Single-luma-line writer built on WRITERGB16.
 *
 * Structure is identical to the WRITERGB15 single-line writer; only the
 * store macro and the named constants (bF8 and bFC) differ.
 * Operands: %0 = buf0, %1 = buf1 (alias of buf0), %2 = ubuf0, %3 = ubuf1,
 * %4 = dest, %5 = &c->redDither. vbuf, abuf0, dstW and y are not referenced
 * directly by this body.
 */
static void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0,
                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
                                 const int16_t *abuf0, uint8_t *dest,
                                 int dstW, int uvalpha, int y)
{
    const int16_t *ubuf0 = ubuf[0];
    const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1

    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        const int16_t *ubuf1 = ubuf[0];
        __asm__ volatile(
            "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" /* save REG_b by hand */
            "mov %4, %%"FF_REG_b" \n\t"               /* REG_b = dest */
            "push %%"FF_REG_BP" \n\t"
            YSCALEYUV2RGB1(%%FF_REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
            WRITERGB16(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
            "pop %%"FF_REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" /* restore REG_b */
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
               NAMED_CONSTRAINTS_ADD(bF8,bFC)
        );
    } else {
        const int16_t *ubuf1 = ubuf[1];
        __asm__ volatile(
            "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" /* save REG_b by hand */
            "mov %4, %%"FF_REG_b" \n\t"               /* REG_b = dest */
            "push %%"FF_REG_BP" \n\t"
            YSCALEYUV2RGB1b(%%FF_REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
            WRITERGB16(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
            "pop %%"FF_REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" /* restore REG_b */
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
               NAMED_CONSTRAINTS_ADD(bF8,bFC)
        );
    }
}

/*
 * Loop head for the single-line packed (non-RGB) writers, one chroma line.
 *
 * Zeroes index, then per iteration loads chroma from %2 at index and at
 * index + UV_OFF_BYTE(c) into mm3 / mm4 and eight luma samples from %0 into
 * mm1 / mm7, shifting every value right arithmetically by 7.
 * %1 and %3 are not touched by this macro.
 */
#define REAL_YSCALEYUV2PACKED1(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[index] */\
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[index + uv_off] */\
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "psraw $7, %%mm3 \n\t" \
    "psraw $7, %%mm4 \n\t" \
    "movq (%0, "#index", 2), %%mm1 \n\t" /* buf0, first 4 samples */\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /* buf0, next 4 samples */\
    "psraw $7, %%mm1 \n\t" \
    "psraw $7, %%mm7 \n\t" \

/* Extra expansion level so that macro arguments are expanded before "#"
 * stringification in REAL_YSCALEYUV2PACKED1. */
#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)

/*
 * Same as REAL_YSCALEYUV2PACKED1, except that chroma is the sum of the two
 * chroma lines (%2 and %3) shifted right logically by 8. Luma still comes
 * from %0 only and is shifted right arithmetically by 7.
 * Results: mm3 / mm4 = chroma, mm1 / mm7 = luma; mm2 and mm5 are scratch.
 */
#define REAL_YSCALEYUV2PACKED1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[index] */\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[index] */\
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[index + uv_off] */\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[index + uv_off] */\
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[index] + uvbuf1[index] */\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[index + uv_off] + uvbuf1[index + uv_off] */\
    "psrlw $8, %%mm3 \n\t" \
    "psrlw $8, %%mm4 \n\t" \
    "movq (%0, "#index", 2), %%mm1 \n\t" /* buf0, first 4 samples */\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /* buf0, next 4 samples */\
    "psraw $7, %%mm1 \n\t" \
    "psraw $7, %%mm7 \n\t"
/* Extra expansion level so that macro arguments are expanded before "#"
 * stringification in REAL_YSCALEYUV2PACKED1b. */
#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)

/**
 * Single-luma-line writer built on WRITEYUY2.
 *
 * uvalpha < 2048 selects YSCALEYUV2PACKED1 (chroma from ubuf[0] only);
 * otherwise YSCALEYUV2PACKED1b (sum of ubuf[0] and ubuf[1]) is used.
 * Operands: %0 = buf0, %1 = buf1 (alias of buf0), %2 = ubuf0, %3 = ubuf1,
 * %4 = dest, %5 = &c->redDither. vbuf, abuf0, dstW and y are not referenced
 * directly by this body.
 */
static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0,
                                  const int16_t *ubuf[2], const int16_t *vbuf[2],
                                  const int16_t *abuf0, uint8_t *dest,
                                  int dstW, int uvalpha, int y)
{
    const int16_t *ubuf0 = ubuf[0];
    const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1

    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        const int16_t *ubuf1 = ubuf[0];
        __asm__ volatile(
            "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" /* save REG_b by hand */
            "mov %4, %%"FF_REG_b" \n\t"               /* REG_b = dest */
            "push %%"FF_REG_BP" \n\t"
            YSCALEYUV2PACKED1(%%FF_REGBP, %5)
            WRITEYUY2(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
            "pop %%"FF_REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" /* restore REG_b */
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    } else {
        const int16_t *ubuf1 = ubuf[1];
        __asm__ volatile(
            "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" /* save REG_b by hand */
            "mov %4, %%"FF_REG_b" \n\t"               /* REG_b = dest */
            "push %%"FF_REG_BP" \n\t"
            YSCALEYUV2PACKED1b(%%FF_REGBP, %5)
            WRITEYUY2(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
            "pop %%"FF_REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" /* restore REG_b */
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    }
}
/**
 * Install the template's function pointers into the scaler context.
 *
 * - c->use_mmx_vfilter is cleared, and set to 1 only when the destination
 *   format passes the checks below and SWS_ACCURATE_RND is not requested.
 * - The packed writers are installed only when the destination format is not
 *   16-bit-per-sample, not N-bit-per-sample, not semi-planar YUV, not
 *   GRAYF32 (BE or LE), and SWS_BITEXACT is not set.
 * - Without SWS_FULL_CHR_H_INT, yuv2packedX is chosen from the *_X_ar set
 *   (SWS_ACCURATE_RND) or the *_X set (otherwise), and yuv2packed1 /
 *   yuv2packed2 from the *_1 / *_2 writers. Formats without a matching case
 *   leave the existing pointers untouched.
 * - For 8-bit input and at most 14-bit output, the fast horizontal scaler
 *   hooks are either set to the MMXEXT versions or cleared.
 */
static av_cold void RENAME(sws_init_swscale)(SwsContext *c)
{
    enum AVPixelFormat dstFormat = c->dstFormat;

    c->use_mmx_vfilter= 0;
    if (!is16BPS(dstFormat) && !isNBPS(dstFormat) && !isSemiPlanarYUV(dstFormat)
        && dstFormat != AV_PIX_FMT_GRAYF32BE && dstFormat != AV_PIX_FMT_GRAYF32LE
        && !(c->flags & SWS_BITEXACT)) {
        if (c->flags & SWS_ACCURATE_RND) {
            if (!(c->flags & SWS_FULL_CHR_H_INT)) {
                /* Note: this table has no AV_PIX_FMT_BGR32 entry, unlike the
                 * non-accurate table below. */
                switch (c->dstFormat) {
                case AV_PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X_ar); break;
#if HAVE_6REGS
                case AV_PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X_ar); break;
#endif
                case AV_PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X_ar); break;
                case AV_PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X_ar); break;
                case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X_ar); break;
                default: break;
                }
            }
        } else {
            c->use_mmx_vfilter= 1;
            if (!(c->flags & SWS_FULL_CHR_H_INT)) {
                switch (c->dstFormat) {
                case AV_PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X); break;
                case AV_PIX_FMT_BGR32: c->yuv2packedX = RENAME(yuv2bgr32_X); break;
#if HAVE_6REGS
                case AV_PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X); break;
#endif
                case AV_PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X); break;
                case AV_PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X); break;
                case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X); break;
                default: break;
                }
            }
        }
        /* The one- and two-line writers do not depend on SWS_ACCURATE_RND.
         * Unlike the BGR24 yuv2packedX entries above, the BGR24 case here is
         * not guarded by HAVE_6REGS. */
        if (!(c->flags & SWS_FULL_CHR_H_INT)) {
            switch (c->dstFormat) {
            case AV_PIX_FMT_RGB32:
                c->yuv2packed1 = RENAME(yuv2rgb32_1);
                c->yuv2packed2 = RENAME(yuv2rgb32_2);
                break;
            case AV_PIX_FMT_BGR24:
                c->yuv2packed1 = RENAME(yuv2bgr24_1);
                c->yuv2packed2 = RENAME(yuv2bgr24_2);
                break;
            case AV_PIX_FMT_RGB555:
                c->yuv2packed1 = RENAME(yuv2rgb555_1);
                c->yuv2packed2 = RENAME(yuv2rgb555_2);
                break;
            case AV_PIX_FMT_RGB565:
                c->yuv2packed1 = RENAME(yuv2rgb565_1);
                c->yuv2packed2 = RENAME(yuv2rgb565_2);
                break;
            case AV_PIX_FMT_YUYV422:
                c->yuv2packed1 = RENAME(yuv2yuyv422_1);
                c->yuv2packed2 = RENAME(yuv2yuyv422_2);
                break;
            default:
                break;
            }
        }
    }

    if (c->srcBpc == 8 && c->dstBpc <= 14) {
        // Use the MMXEXT fast bilinear horizontal scalers when SWS_FAST_BILINEAR
        // is requested and c->canMMXEXTBeUsed is set; otherwise clear both hooks.
        // NOTE(review): presumably a NULL hook makes the caller fall back to
        // another horizontal scaler - confirm against the call site.
        if (c->flags & SWS_FAST_BILINEAR && c->canMMXEXTBeUsed) {
            c->hyscale_fast = ff_hyscale_fast_mmxext;
            c->hcscale_fast = ff_hcscale_fast_mmxext;
        } else {
            c->hyscale_fast = NULL;
            c->hcscale_fast = NULL;
        }
    }
}