1cabdff1aSopenharmony_ci/* 2cabdff1aSopenharmony_ci * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at> 3cabdff1aSopenharmony_ci * 4cabdff1aSopenharmony_ci * This file is part of FFmpeg. 5cabdff1aSopenharmony_ci * 6cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or 7cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public 8cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either 9cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version. 10cabdff1aSopenharmony_ci * 11cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful, 12cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of 13cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14cabdff1aSopenharmony_ci * Lesser General Public License for more details. 15cabdff1aSopenharmony_ci * 16cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public 17cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software 18cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 19cabdff1aSopenharmony_ci */ 20cabdff1aSopenharmony_ci 21cabdff1aSopenharmony_ci#include <stdint.h> 22cabdff1aSopenharmony_ci 23cabdff1aSopenharmony_ci#include "libavutil/x86/asm.h" 24cabdff1aSopenharmony_ci#include "libswscale/swscale_internal.h" 25cabdff1aSopenharmony_ci 26cabdff1aSopenharmony_ci#undef REAL_MOVNTQ 27cabdff1aSopenharmony_ci#undef MOVNTQ 28cabdff1aSopenharmony_ci#undef MOVNTQ2 29cabdff1aSopenharmony_ci#undef PREFETCH 30cabdff1aSopenharmony_ci 31cabdff1aSopenharmony_ci 32cabdff1aSopenharmony_ci#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t" 33cabdff1aSopenharmony_ci#define MOVNTQ2 "movntq " 34cabdff1aSopenharmony_ci#define MOVNTQ(a,b) REAL_MOVNTQ(a,b) 35cabdff1aSopenharmony_ci 
36cabdff1aSopenharmony_ci#define YSCALEYUV2PACKEDX_UV \ 37cabdff1aSopenharmony_ci __asm__ volatile(\ 38cabdff1aSopenharmony_ci "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"\ 39cabdff1aSopenharmony_ci ".p2align 4 \n\t"\ 40cabdff1aSopenharmony_ci "nop \n\t"\ 41cabdff1aSopenharmony_ci "1: \n\t"\ 42cabdff1aSopenharmony_ci "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"FF_REG_d" \n\t"\ 43cabdff1aSopenharmony_ci "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ 44cabdff1aSopenharmony_ci "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\ 45cabdff1aSopenharmony_ci "movq %%mm3, %%mm4 \n\t"\ 46cabdff1aSopenharmony_ci ".p2align 4 \n\t"\ 47cabdff1aSopenharmony_ci "2: \n\t"\ 48cabdff1aSopenharmony_ci "movq 8(%%"FF_REG_d"), %%mm0 \n\t" /* filterCoeff */\ 49cabdff1aSopenharmony_ci "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm2 \n\t" /* UsrcData */\ 50cabdff1aSopenharmony_ci "add %6, %%"FF_REG_S" \n\t" \ 51cabdff1aSopenharmony_ci "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm5 \n\t" /* VsrcData */\ 52cabdff1aSopenharmony_ci "add $16, %%"FF_REG_d" \n\t"\ 53cabdff1aSopenharmony_ci "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ 54cabdff1aSopenharmony_ci "pmulhw %%mm0, %%mm2 \n\t"\ 55cabdff1aSopenharmony_ci "pmulhw %%mm0, %%mm5 \n\t"\ 56cabdff1aSopenharmony_ci "paddw %%mm2, %%mm3 \n\t"\ 57cabdff1aSopenharmony_ci "paddw %%mm5, %%mm4 \n\t"\ 58cabdff1aSopenharmony_ci "test %%"FF_REG_S", %%"FF_REG_S" \n\t"\ 59cabdff1aSopenharmony_ci " jnz 2b \n\t"\ 60cabdff1aSopenharmony_ci 61cabdff1aSopenharmony_ci#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \ 62cabdff1aSopenharmony_ci "lea "offset"(%0), %%"FF_REG_d" \n\t"\ 63cabdff1aSopenharmony_ci "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ 64cabdff1aSopenharmony_ci "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\ 65cabdff1aSopenharmony_ci "movq "#dst1", "#dst2" \n\t"\ 66cabdff1aSopenharmony_ci ".p2align 4 \n\t"\ 67cabdff1aSopenharmony_ci "2: \n\t"\ 68cabdff1aSopenharmony_ci "movq 8(%%"FF_REG_d"), "#coeff" \n\t" /* filterCoeff */\ 69cabdff1aSopenharmony_ci "movq (%%"FF_REG_S", %%"FF_REG_a", 2), 
"#src1" \n\t" /* Y1srcData */\ 70cabdff1aSopenharmony_ci "movq 8(%%"FF_REG_S", %%"FF_REG_a", 2), "#src2" \n\t" /* Y2srcData */\ 71cabdff1aSopenharmony_ci "add $16, %%"FF_REG_d" \n\t"\ 72cabdff1aSopenharmony_ci "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ 73cabdff1aSopenharmony_ci "pmulhw "#coeff", "#src1" \n\t"\ 74cabdff1aSopenharmony_ci "pmulhw "#coeff", "#src2" \n\t"\ 75cabdff1aSopenharmony_ci "paddw "#src1", "#dst1" \n\t"\ 76cabdff1aSopenharmony_ci "paddw "#src2", "#dst2" \n\t"\ 77cabdff1aSopenharmony_ci "test %%"FF_REG_S", %%"FF_REG_S" \n\t"\ 78cabdff1aSopenharmony_ci " jnz 2b \n\t"\ 79cabdff1aSopenharmony_ci 80cabdff1aSopenharmony_ci#define YSCALEYUV2PACKEDX \ 81cabdff1aSopenharmony_ci YSCALEYUV2PACKEDX_UV \ 82cabdff1aSopenharmony_ci YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \ 83cabdff1aSopenharmony_ci 84cabdff1aSopenharmony_ci#define YSCALEYUV2PACKEDX_END \ 85cabdff1aSopenharmony_ci :: "r" (&c->redDither), \ 86cabdff1aSopenharmony_ci "m" (dummy), "m" (dummy), "m" (dummy),\ 87cabdff1aSopenharmony_ci "r" (dest), "m" (dstW_reg), "m"(uv_off) \ 88cabdff1aSopenharmony_ci NAMED_CONSTRAINTS_ADD(bF8,bFC) \ 89cabdff1aSopenharmony_ci : "%"FF_REG_a, "%"FF_REG_d, "%"FF_REG_S \ 90cabdff1aSopenharmony_ci ); 91cabdff1aSopenharmony_ci 92cabdff1aSopenharmony_ci#define YSCALEYUV2PACKEDX_ACCURATE_UV \ 93cabdff1aSopenharmony_ci __asm__ volatile(\ 94cabdff1aSopenharmony_ci "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"\ 95cabdff1aSopenharmony_ci ".p2align 4 \n\t"\ 96cabdff1aSopenharmony_ci "nop \n\t"\ 97cabdff1aSopenharmony_ci "1: \n\t"\ 98cabdff1aSopenharmony_ci "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"FF_REG_d" \n\t"\ 99cabdff1aSopenharmony_ci "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ 100cabdff1aSopenharmony_ci "pxor %%mm4, %%mm4 \n\t"\ 101cabdff1aSopenharmony_ci "pxor %%mm5, %%mm5 \n\t"\ 102cabdff1aSopenharmony_ci "pxor %%mm6, %%mm6 \n\t"\ 103cabdff1aSopenharmony_ci "pxor %%mm7, %%mm7 \n\t"\ 104cabdff1aSopenharmony_ci ".p2align 4 \n\t"\ 105cabdff1aSopenharmony_ci 
"2: \n\t"\ 106cabdff1aSopenharmony_ci "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm0 \n\t" /* UsrcData */\ 107cabdff1aSopenharmony_ci "add %6, %%"FF_REG_S" \n\t" \ 108cabdff1aSopenharmony_ci "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm2 \n\t" /* VsrcData */\ 109cabdff1aSopenharmony_ci "mov "STR(APCK_PTR2)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ 110cabdff1aSopenharmony_ci "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm1 \n\t" /* UsrcData */\ 111cabdff1aSopenharmony_ci "movq %%mm0, %%mm3 \n\t"\ 112cabdff1aSopenharmony_ci "punpcklwd %%mm1, %%mm0 \n\t"\ 113cabdff1aSopenharmony_ci "punpckhwd %%mm1, %%mm3 \n\t"\ 114cabdff1aSopenharmony_ci "movq "STR(APCK_COEF)"(%%"FF_REG_d"),%%mm1 \n\t" /* filterCoeff */\ 115cabdff1aSopenharmony_ci "pmaddwd %%mm1, %%mm0 \n\t"\ 116cabdff1aSopenharmony_ci "pmaddwd %%mm1, %%mm3 \n\t"\ 117cabdff1aSopenharmony_ci "paddd %%mm0, %%mm4 \n\t"\ 118cabdff1aSopenharmony_ci "paddd %%mm3, %%mm5 \n\t"\ 119cabdff1aSopenharmony_ci "add %6, %%"FF_REG_S" \n\t" \ 120cabdff1aSopenharmony_ci "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm3 \n\t" /* VsrcData */\ 121cabdff1aSopenharmony_ci "mov "STR(APCK_SIZE)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ 122cabdff1aSopenharmony_ci "add $"STR(APCK_SIZE)", %%"FF_REG_d" \n\t"\ 123cabdff1aSopenharmony_ci "test %%"FF_REG_S", %%"FF_REG_S" \n\t"\ 124cabdff1aSopenharmony_ci "movq %%mm2, %%mm0 \n\t"\ 125cabdff1aSopenharmony_ci "punpcklwd %%mm3, %%mm2 \n\t"\ 126cabdff1aSopenharmony_ci "punpckhwd %%mm3, %%mm0 \n\t"\ 127cabdff1aSopenharmony_ci "pmaddwd %%mm1, %%mm2 \n\t"\ 128cabdff1aSopenharmony_ci "pmaddwd %%mm1, %%mm0 \n\t"\ 129cabdff1aSopenharmony_ci "paddd %%mm2, %%mm6 \n\t"\ 130cabdff1aSopenharmony_ci "paddd %%mm0, %%mm7 \n\t"\ 131cabdff1aSopenharmony_ci " jnz 2b \n\t"\ 132cabdff1aSopenharmony_ci "psrad $16, %%mm4 \n\t"\ 133cabdff1aSopenharmony_ci "psrad $16, %%mm5 \n\t"\ 134cabdff1aSopenharmony_ci "psrad $16, %%mm6 \n\t"\ 135cabdff1aSopenharmony_ci "psrad $16, %%mm7 \n\t"\ 136cabdff1aSopenharmony_ci "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\ 
137cabdff1aSopenharmony_ci "packssdw %%mm5, %%mm4 \n\t"\ 138cabdff1aSopenharmony_ci "packssdw %%mm7, %%mm6 \n\t"\ 139cabdff1aSopenharmony_ci "paddw %%mm0, %%mm4 \n\t"\ 140cabdff1aSopenharmony_ci "paddw %%mm0, %%mm6 \n\t"\ 141cabdff1aSopenharmony_ci "movq %%mm4, "U_TEMP"(%0) \n\t"\ 142cabdff1aSopenharmony_ci "movq %%mm6, "V_TEMP"(%0) \n\t"\ 143cabdff1aSopenharmony_ci 144cabdff1aSopenharmony_ci#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \ 145cabdff1aSopenharmony_ci "lea "offset"(%0), %%"FF_REG_d" \n\t"\ 146cabdff1aSopenharmony_ci "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ 147cabdff1aSopenharmony_ci "pxor %%mm1, %%mm1 \n\t"\ 148cabdff1aSopenharmony_ci "pxor %%mm5, %%mm5 \n\t"\ 149cabdff1aSopenharmony_ci "pxor %%mm7, %%mm7 \n\t"\ 150cabdff1aSopenharmony_ci "pxor %%mm6, %%mm6 \n\t"\ 151cabdff1aSopenharmony_ci ".p2align 4 \n\t"\ 152cabdff1aSopenharmony_ci "2: \n\t"\ 153cabdff1aSopenharmony_ci "movq (%%"FF_REG_S", %%"FF_REG_a", 2), %%mm0 \n\t" /* Y1srcData */\ 154cabdff1aSopenharmony_ci "movq 8(%%"FF_REG_S", %%"FF_REG_a", 2), %%mm2 \n\t" /* Y2srcData */\ 155cabdff1aSopenharmony_ci "mov "STR(APCK_PTR2)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ 156cabdff1aSopenharmony_ci "movq (%%"FF_REG_S", %%"FF_REG_a", 2), %%mm4 \n\t" /* Y1srcData */\ 157cabdff1aSopenharmony_ci "movq %%mm0, %%mm3 \n\t"\ 158cabdff1aSopenharmony_ci "punpcklwd %%mm4, %%mm0 \n\t"\ 159cabdff1aSopenharmony_ci "punpckhwd %%mm4, %%mm3 \n\t"\ 160cabdff1aSopenharmony_ci "movq "STR(APCK_COEF)"(%%"FF_REG_d"), %%mm4 \n\t" /* filterCoeff */\ 161cabdff1aSopenharmony_ci "pmaddwd %%mm4, %%mm0 \n\t"\ 162cabdff1aSopenharmony_ci "pmaddwd %%mm4, %%mm3 \n\t"\ 163cabdff1aSopenharmony_ci "paddd %%mm0, %%mm1 \n\t"\ 164cabdff1aSopenharmony_ci "paddd %%mm3, %%mm5 \n\t"\ 165cabdff1aSopenharmony_ci "movq 8(%%"FF_REG_S", %%"FF_REG_a", 2), %%mm3 \n\t" /* Y2srcData */\ 166cabdff1aSopenharmony_ci "mov "STR(APCK_SIZE)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ 167cabdff1aSopenharmony_ci "add $"STR(APCK_SIZE)", %%"FF_REG_d" \n\t"\ 
168cabdff1aSopenharmony_ci "test %%"FF_REG_S", %%"FF_REG_S" \n\t"\ 169cabdff1aSopenharmony_ci "movq %%mm2, %%mm0 \n\t"\ 170cabdff1aSopenharmony_ci "punpcklwd %%mm3, %%mm2 \n\t"\ 171cabdff1aSopenharmony_ci "punpckhwd %%mm3, %%mm0 \n\t"\ 172cabdff1aSopenharmony_ci "pmaddwd %%mm4, %%mm2 \n\t"\ 173cabdff1aSopenharmony_ci "pmaddwd %%mm4, %%mm0 \n\t"\ 174cabdff1aSopenharmony_ci "paddd %%mm2, %%mm7 \n\t"\ 175cabdff1aSopenharmony_ci "paddd %%mm0, %%mm6 \n\t"\ 176cabdff1aSopenharmony_ci " jnz 2b \n\t"\ 177cabdff1aSopenharmony_ci "psrad $16, %%mm1 \n\t"\ 178cabdff1aSopenharmony_ci "psrad $16, %%mm5 \n\t"\ 179cabdff1aSopenharmony_ci "psrad $16, %%mm7 \n\t"\ 180cabdff1aSopenharmony_ci "psrad $16, %%mm6 \n\t"\ 181cabdff1aSopenharmony_ci "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\ 182cabdff1aSopenharmony_ci "packssdw %%mm5, %%mm1 \n\t"\ 183cabdff1aSopenharmony_ci "packssdw %%mm6, %%mm7 \n\t"\ 184cabdff1aSopenharmony_ci "paddw %%mm0, %%mm1 \n\t"\ 185cabdff1aSopenharmony_ci "paddw %%mm0, %%mm7 \n\t"\ 186cabdff1aSopenharmony_ci "movq "U_TEMP"(%0), %%mm3 \n\t"\ 187cabdff1aSopenharmony_ci "movq "V_TEMP"(%0), %%mm4 \n\t"\ 188cabdff1aSopenharmony_ci 189cabdff1aSopenharmony_ci#define YSCALEYUV2PACKEDX_ACCURATE \ 190cabdff1aSopenharmony_ci YSCALEYUV2PACKEDX_ACCURATE_UV \ 191cabdff1aSopenharmony_ci YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET) 192cabdff1aSopenharmony_ci 193cabdff1aSopenharmony_ci#define YSCALEYUV2RGBX \ 194cabdff1aSopenharmony_ci "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\ 195cabdff1aSopenharmony_ci "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\ 196cabdff1aSopenharmony_ci "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ 197cabdff1aSopenharmony_ci "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ 198cabdff1aSopenharmony_ci "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\ 199cabdff1aSopenharmony_ci "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\ 200cabdff1aSopenharmony_ci /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ 201cabdff1aSopenharmony_ci "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\ 
202cabdff1aSopenharmony_ci "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\ 203cabdff1aSopenharmony_ci "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\ 204cabdff1aSopenharmony_ci "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\ 205cabdff1aSopenharmony_ci "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\ 206cabdff1aSopenharmony_ci "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\ 207cabdff1aSopenharmony_ci /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ 208cabdff1aSopenharmony_ci "paddw %%mm3, %%mm4 \n\t"\ 209cabdff1aSopenharmony_ci "movq %%mm2, %%mm0 \n\t"\ 210cabdff1aSopenharmony_ci "movq %%mm5, %%mm6 \n\t"\ 211cabdff1aSopenharmony_ci "movq %%mm4, %%mm3 \n\t"\ 212cabdff1aSopenharmony_ci "punpcklwd %%mm2, %%mm2 \n\t"\ 213cabdff1aSopenharmony_ci "punpcklwd %%mm5, %%mm5 \n\t"\ 214cabdff1aSopenharmony_ci "punpcklwd %%mm4, %%mm4 \n\t"\ 215cabdff1aSopenharmony_ci "paddw %%mm1, %%mm2 \n\t"\ 216cabdff1aSopenharmony_ci "paddw %%mm1, %%mm5 \n\t"\ 217cabdff1aSopenharmony_ci "paddw %%mm1, %%mm4 \n\t"\ 218cabdff1aSopenharmony_ci "punpckhwd %%mm0, %%mm0 \n\t"\ 219cabdff1aSopenharmony_ci "punpckhwd %%mm6, %%mm6 \n\t"\ 220cabdff1aSopenharmony_ci "punpckhwd %%mm3, %%mm3 \n\t"\ 221cabdff1aSopenharmony_ci "paddw %%mm7, %%mm0 \n\t"\ 222cabdff1aSopenharmony_ci "paddw %%mm7, %%mm6 \n\t"\ 223cabdff1aSopenharmony_ci "paddw %%mm7, %%mm3 \n\t"\ 224cabdff1aSopenharmony_ci /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ 225cabdff1aSopenharmony_ci "packuswb %%mm0, %%mm2 \n\t"\ 226cabdff1aSopenharmony_ci "packuswb %%mm6, %%mm5 \n\t"\ 227cabdff1aSopenharmony_ci "packuswb %%mm3, %%mm4 \n\t"\ 228cabdff1aSopenharmony_ci 229cabdff1aSopenharmony_ci#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \ 230cabdff1aSopenharmony_ci "movq "#b", "#q2" \n\t" /* B */\ 231cabdff1aSopenharmony_ci "movq "#r", "#t" \n\t" /* R */\ 232cabdff1aSopenharmony_ci "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\ 233cabdff1aSopenharmony_ci "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\ 234cabdff1aSopenharmony_ci "punpckhbw "#g", "#q2" 
\n\t" /* GBGBGBGB 2 */\ 235cabdff1aSopenharmony_ci "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\ 236cabdff1aSopenharmony_ci "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\ 237cabdff1aSopenharmony_ci "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\ 238cabdff1aSopenharmony_ci "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\ 239cabdff1aSopenharmony_ci "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\ 240cabdff1aSopenharmony_ci "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\ 241cabdff1aSopenharmony_ci "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\ 242cabdff1aSopenharmony_ci\ 243cabdff1aSopenharmony_ci MOVNTQ( q0, (dst, index, 4))\ 244cabdff1aSopenharmony_ci MOVNTQ( b, 8(dst, index, 4))\ 245cabdff1aSopenharmony_ci MOVNTQ( q2, 16(dst, index, 4))\ 246cabdff1aSopenharmony_ci MOVNTQ( q3, 24(dst, index, 4))\ 247cabdff1aSopenharmony_ci\ 248cabdff1aSopenharmony_ci "add $8, "#index" \n\t"\ 249cabdff1aSopenharmony_ci "cmp "dstw", "#index" \n\t"\ 250cabdff1aSopenharmony_ci " jb 1b \n\t" 251cabdff1aSopenharmony_ci#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) 252cabdff1aSopenharmony_ci 253cabdff1aSopenharmony_cistatic void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter, 254cabdff1aSopenharmony_ci const int16_t **lumSrc, int lumFilterSize, 255cabdff1aSopenharmony_ci const int16_t *chrFilter, const int16_t **chrUSrc, 256cabdff1aSopenharmony_ci const int16_t **chrVSrc, 257cabdff1aSopenharmony_ci int chrFilterSize, const int16_t **alpSrc, 258cabdff1aSopenharmony_ci uint8_t *dest, int dstW, int dstY) 259cabdff1aSopenharmony_ci{ 260cabdff1aSopenharmony_ci x86_reg dummy=0; 261cabdff1aSopenharmony_ci x86_reg dstW_reg = dstW; 262cabdff1aSopenharmony_ci x86_reg uv_off = c->uv_offx2; 263cabdff1aSopenharmony_ci 264cabdff1aSopenharmony_ci if (CONFIG_SWSCALE_ALPHA && c->needAlpha) { 265cabdff1aSopenharmony_ci YSCALEYUV2PACKEDX_ACCURATE 266cabdff1aSopenharmony_ci YSCALEYUV2RGBX 267cabdff1aSopenharmony_ci "movq %%mm2, 
"U_TEMP"(%0) \n\t" 268cabdff1aSopenharmony_ci "movq %%mm4, "V_TEMP"(%0) \n\t" 269cabdff1aSopenharmony_ci "movq %%mm5, "Y_TEMP"(%0) \n\t" 270cabdff1aSopenharmony_ci YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET) 271cabdff1aSopenharmony_ci "movq "Y_TEMP"(%0), %%mm5 \n\t" 272cabdff1aSopenharmony_ci "psraw $3, %%mm1 \n\t" 273cabdff1aSopenharmony_ci "psraw $3, %%mm7 \n\t" 274cabdff1aSopenharmony_ci "packuswb %%mm7, %%mm1 \n\t" 275cabdff1aSopenharmony_ci WRITEBGR32(%4, "%5", %%FF_REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6) 276cabdff1aSopenharmony_ci YSCALEYUV2PACKEDX_END 277cabdff1aSopenharmony_ci } else { 278cabdff1aSopenharmony_ci YSCALEYUV2PACKEDX_ACCURATE 279cabdff1aSopenharmony_ci YSCALEYUV2RGBX 280cabdff1aSopenharmony_ci "pcmpeqd %%mm7, %%mm7 \n\t" 281cabdff1aSopenharmony_ci WRITEBGR32(%4, "%5", %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) 282cabdff1aSopenharmony_ci YSCALEYUV2PACKEDX_END 283cabdff1aSopenharmony_ci } 284cabdff1aSopenharmony_ci} 285cabdff1aSopenharmony_ci 286cabdff1aSopenharmony_cistatic void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter, 287cabdff1aSopenharmony_ci const int16_t **lumSrc, int lumFilterSize, 288cabdff1aSopenharmony_ci const int16_t *chrFilter, const int16_t **chrUSrc, 289cabdff1aSopenharmony_ci const int16_t **chrVSrc, 290cabdff1aSopenharmony_ci int chrFilterSize, const int16_t **alpSrc, 291cabdff1aSopenharmony_ci uint8_t *dest, int dstW, int dstY) 292cabdff1aSopenharmony_ci{ 293cabdff1aSopenharmony_ci x86_reg dummy=0; 294cabdff1aSopenharmony_ci x86_reg dstW_reg = dstW; 295cabdff1aSopenharmony_ci x86_reg uv_off = c->uv_offx2; 296cabdff1aSopenharmony_ci 297cabdff1aSopenharmony_ci if (CONFIG_SWSCALE_ALPHA && c->needAlpha) { 298cabdff1aSopenharmony_ci YSCALEYUV2PACKEDX 299cabdff1aSopenharmony_ci YSCALEYUV2RGBX 300cabdff1aSopenharmony_ci YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7) 301cabdff1aSopenharmony_ci "psraw $3, %%mm1 \n\t" 
302cabdff1aSopenharmony_ci "psraw $3, %%mm7 \n\t" 303cabdff1aSopenharmony_ci "packuswb %%mm7, %%mm1 \n\t" 304cabdff1aSopenharmony_ci WRITEBGR32(%4, "%5", %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) 305cabdff1aSopenharmony_ci YSCALEYUV2PACKEDX_END 306cabdff1aSopenharmony_ci } else { 307cabdff1aSopenharmony_ci YSCALEYUV2PACKEDX 308cabdff1aSopenharmony_ci YSCALEYUV2RGBX 309cabdff1aSopenharmony_ci "pcmpeqd %%mm7, %%mm7 \n\t" 310cabdff1aSopenharmony_ci WRITEBGR32(%4, "%5", %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) 311cabdff1aSopenharmony_ci YSCALEYUV2PACKEDX_END 312cabdff1aSopenharmony_ci } 313cabdff1aSopenharmony_ci} 314cabdff1aSopenharmony_ci 315cabdff1aSopenharmony_cistatic void RENAME(yuv2bgr32_X)(SwsContext *c, const int16_t *lumFilter, 316cabdff1aSopenharmony_ci const int16_t **lumSrc, int lumFilterSize, 317cabdff1aSopenharmony_ci const int16_t *chrFilter, const int16_t **chrUSrc, 318cabdff1aSopenharmony_ci const int16_t **chrVSrc, 319cabdff1aSopenharmony_ci int chrFilterSize, const int16_t **alpSrc, 320cabdff1aSopenharmony_ci uint8_t *dest, int dstW, int dstY) 321cabdff1aSopenharmony_ci{ 322cabdff1aSopenharmony_ci x86_reg dummy=0; 323cabdff1aSopenharmony_ci x86_reg dstW_reg = dstW; 324cabdff1aSopenharmony_ci x86_reg uv_off = c->uv_offx2; 325cabdff1aSopenharmony_ci 326cabdff1aSopenharmony_ci if (CONFIG_SWSCALE_ALPHA && c->needAlpha) { 327cabdff1aSopenharmony_ci YSCALEYUV2PACKEDX 328cabdff1aSopenharmony_ci YSCALEYUV2RGBX 329cabdff1aSopenharmony_ci YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7) 330cabdff1aSopenharmony_ci "psraw $3, %%mm1 \n\t" 331cabdff1aSopenharmony_ci "psraw $3, %%mm7 \n\t" 332cabdff1aSopenharmony_ci "packuswb %%mm7, %%mm1 \n\t" 333cabdff1aSopenharmony_ci WRITEBGR32(%4, "%5", %%FF_REGa, %%mm5, %%mm4, %%mm2, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) 334cabdff1aSopenharmony_ci YSCALEYUV2PACKEDX_END 335cabdff1aSopenharmony_ci } else { 336cabdff1aSopenharmony_ci YSCALEYUV2PACKEDX 
337cabdff1aSopenharmony_ci YSCALEYUV2RGBX 338cabdff1aSopenharmony_ci "pcmpeqd %%mm7, %%mm7 \n\t" 339cabdff1aSopenharmony_ci WRITEBGR32(%4, "%5", %%FF_REGa, %%mm5, %%mm4, %%mm2, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) 340cabdff1aSopenharmony_ci YSCALEYUV2PACKEDX_END 341cabdff1aSopenharmony_ci } 342cabdff1aSopenharmony_ci} 343cabdff1aSopenharmony_ci 344cabdff1aSopenharmony_ci#define REAL_WRITERGB16(dst, dstw, index) \ 345cabdff1aSopenharmony_ci "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\ 346cabdff1aSopenharmony_ci "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\ 347cabdff1aSopenharmony_ci "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\ 348cabdff1aSopenharmony_ci "psrlq $3, %%mm2 \n\t"\ 349cabdff1aSopenharmony_ci\ 350cabdff1aSopenharmony_ci "movq %%mm2, %%mm1 \n\t"\ 351cabdff1aSopenharmony_ci "movq %%mm4, %%mm3 \n\t"\ 352cabdff1aSopenharmony_ci\ 353cabdff1aSopenharmony_ci "punpcklbw %%mm7, %%mm3 \n\t"\ 354cabdff1aSopenharmony_ci "punpcklbw %%mm5, %%mm2 \n\t"\ 355cabdff1aSopenharmony_ci "punpckhbw %%mm7, %%mm4 \n\t"\ 356cabdff1aSopenharmony_ci "punpckhbw %%mm5, %%mm1 \n\t"\ 357cabdff1aSopenharmony_ci\ 358cabdff1aSopenharmony_ci "psllq $3, %%mm3 \n\t"\ 359cabdff1aSopenharmony_ci "psllq $3, %%mm4 \n\t"\ 360cabdff1aSopenharmony_ci\ 361cabdff1aSopenharmony_ci "por %%mm3, %%mm2 \n\t"\ 362cabdff1aSopenharmony_ci "por %%mm4, %%mm1 \n\t"\ 363cabdff1aSopenharmony_ci\ 364cabdff1aSopenharmony_ci MOVNTQ(%%mm2, (dst, index, 2))\ 365cabdff1aSopenharmony_ci MOVNTQ(%%mm1, 8(dst, index, 2))\ 366cabdff1aSopenharmony_ci\ 367cabdff1aSopenharmony_ci "add $8, "#index" \n\t"\ 368cabdff1aSopenharmony_ci "cmp "dstw", "#index" \n\t"\ 369cabdff1aSopenharmony_ci " jb 1b \n\t" 370cabdff1aSopenharmony_ci#define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index) 371cabdff1aSopenharmony_ci 372cabdff1aSopenharmony_cistatic void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter, 373cabdff1aSopenharmony_ci const int16_t **lumSrc, int lumFilterSize, 374cabdff1aSopenharmony_ci const int16_t 
*chrFilter, const int16_t **chrUSrc, 375cabdff1aSopenharmony_ci const int16_t **chrVSrc, 376cabdff1aSopenharmony_ci int chrFilterSize, const int16_t **alpSrc, 377cabdff1aSopenharmony_ci uint8_t *dest, int dstW, int dstY) 378cabdff1aSopenharmony_ci{ 379cabdff1aSopenharmony_ci x86_reg dummy=0; 380cabdff1aSopenharmony_ci x86_reg dstW_reg = dstW; 381cabdff1aSopenharmony_ci x86_reg uv_off = c->uv_offx2; 382cabdff1aSopenharmony_ci 383cabdff1aSopenharmony_ci YSCALEYUV2PACKEDX_ACCURATE 384cabdff1aSopenharmony_ci YSCALEYUV2RGBX 385cabdff1aSopenharmony_ci "pxor %%mm7, %%mm7 \n\t" 386cabdff1aSopenharmony_ci /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 387cabdff1aSopenharmony_ci#ifdef DITHER1XBPP 388cabdff1aSopenharmony_ci "paddusb "BLUE_DITHER"(%0), %%mm2\n\t" 389cabdff1aSopenharmony_ci "paddusb "GREEN_DITHER"(%0), %%mm4\n\t" 390cabdff1aSopenharmony_ci "paddusb "RED_DITHER"(%0), %%mm5\n\t" 391cabdff1aSopenharmony_ci#endif 392cabdff1aSopenharmony_ci WRITERGB16(%4, "%5", %%FF_REGa) 393cabdff1aSopenharmony_ci YSCALEYUV2PACKEDX_END 394cabdff1aSopenharmony_ci} 395cabdff1aSopenharmony_ci 396cabdff1aSopenharmony_cistatic void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter, 397cabdff1aSopenharmony_ci const int16_t **lumSrc, int lumFilterSize, 398cabdff1aSopenharmony_ci const int16_t *chrFilter, const int16_t **chrUSrc, 399cabdff1aSopenharmony_ci const int16_t **chrVSrc, 400cabdff1aSopenharmony_ci int chrFilterSize, const int16_t **alpSrc, 401cabdff1aSopenharmony_ci uint8_t *dest, int dstW, int dstY) 402cabdff1aSopenharmony_ci{ 403cabdff1aSopenharmony_ci x86_reg dummy=0; 404cabdff1aSopenharmony_ci x86_reg dstW_reg = dstW; 405cabdff1aSopenharmony_ci x86_reg uv_off = c->uv_offx2; 406cabdff1aSopenharmony_ci 407cabdff1aSopenharmony_ci YSCALEYUV2PACKEDX 408cabdff1aSopenharmony_ci YSCALEYUV2RGBX 409cabdff1aSopenharmony_ci "pxor %%mm7, %%mm7 \n\t" 410cabdff1aSopenharmony_ci /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 411cabdff1aSopenharmony_ci#ifdef DITHER1XBPP 412cabdff1aSopenharmony_ci 
"paddusb "BLUE_DITHER"(%0), %%mm2 \n\t" 413cabdff1aSopenharmony_ci "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t" 414cabdff1aSopenharmony_ci "paddusb "RED_DITHER"(%0), %%mm5 \n\t" 415cabdff1aSopenharmony_ci#endif 416cabdff1aSopenharmony_ci WRITERGB16(%4, "%5", %%FF_REGa) 417cabdff1aSopenharmony_ci YSCALEYUV2PACKEDX_END 418cabdff1aSopenharmony_ci} 419cabdff1aSopenharmony_ci 420cabdff1aSopenharmony_ci#define REAL_WRITERGB15(dst, dstw, index) \ 421cabdff1aSopenharmony_ci "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\ 422cabdff1aSopenharmony_ci "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\ 423cabdff1aSopenharmony_ci "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\ 424cabdff1aSopenharmony_ci "psrlq $3, %%mm2 \n\t"\ 425cabdff1aSopenharmony_ci "psrlq $1, %%mm5 \n\t"\ 426cabdff1aSopenharmony_ci\ 427cabdff1aSopenharmony_ci "movq %%mm2, %%mm1 \n\t"\ 428cabdff1aSopenharmony_ci "movq %%mm4, %%mm3 \n\t"\ 429cabdff1aSopenharmony_ci\ 430cabdff1aSopenharmony_ci "punpcklbw %%mm7, %%mm3 \n\t"\ 431cabdff1aSopenharmony_ci "punpcklbw %%mm5, %%mm2 \n\t"\ 432cabdff1aSopenharmony_ci "punpckhbw %%mm7, %%mm4 \n\t"\ 433cabdff1aSopenharmony_ci "punpckhbw %%mm5, %%mm1 \n\t"\ 434cabdff1aSopenharmony_ci\ 435cabdff1aSopenharmony_ci "psllq $2, %%mm3 \n\t"\ 436cabdff1aSopenharmony_ci "psllq $2, %%mm4 \n\t"\ 437cabdff1aSopenharmony_ci\ 438cabdff1aSopenharmony_ci "por %%mm3, %%mm2 \n\t"\ 439cabdff1aSopenharmony_ci "por %%mm4, %%mm1 \n\t"\ 440cabdff1aSopenharmony_ci\ 441cabdff1aSopenharmony_ci MOVNTQ(%%mm2, (dst, index, 2))\ 442cabdff1aSopenharmony_ci MOVNTQ(%%mm1, 8(dst, index, 2))\ 443cabdff1aSopenharmony_ci\ 444cabdff1aSopenharmony_ci "add $8, "#index" \n\t"\ 445cabdff1aSopenharmony_ci "cmp "dstw", "#index" \n\t"\ 446cabdff1aSopenharmony_ci " jb 1b \n\t" 447cabdff1aSopenharmony_ci#define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index) 448cabdff1aSopenharmony_ci 449cabdff1aSopenharmony_cistatic void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter, 450cabdff1aSopenharmony_ci const int16_t 
**lumSrc, int lumFilterSize, 451cabdff1aSopenharmony_ci const int16_t *chrFilter, const int16_t **chrUSrc, 452cabdff1aSopenharmony_ci const int16_t **chrVSrc, 453cabdff1aSopenharmony_ci int chrFilterSize, const int16_t **alpSrc, 454cabdff1aSopenharmony_ci uint8_t *dest, int dstW, int dstY) 455cabdff1aSopenharmony_ci{ 456cabdff1aSopenharmony_ci x86_reg dummy=0; 457cabdff1aSopenharmony_ci x86_reg dstW_reg = dstW; 458cabdff1aSopenharmony_ci x86_reg uv_off = c->uv_offx2; 459cabdff1aSopenharmony_ci 460cabdff1aSopenharmony_ci YSCALEYUV2PACKEDX_ACCURATE 461cabdff1aSopenharmony_ci YSCALEYUV2RGBX 462cabdff1aSopenharmony_ci "pxor %%mm7, %%mm7 \n\t" 463cabdff1aSopenharmony_ci /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 464cabdff1aSopenharmony_ci#ifdef DITHER1XBPP 465cabdff1aSopenharmony_ci "paddusb "BLUE_DITHER"(%0), %%mm2\n\t" 466cabdff1aSopenharmony_ci "paddusb "GREEN_DITHER"(%0), %%mm4\n\t" 467cabdff1aSopenharmony_ci "paddusb "RED_DITHER"(%0), %%mm5\n\t" 468cabdff1aSopenharmony_ci#endif 469cabdff1aSopenharmony_ci WRITERGB15(%4, "%5", %%FF_REGa) 470cabdff1aSopenharmony_ci YSCALEYUV2PACKEDX_END 471cabdff1aSopenharmony_ci} 472cabdff1aSopenharmony_ci 473cabdff1aSopenharmony_cistatic void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter, 474cabdff1aSopenharmony_ci const int16_t **lumSrc, int lumFilterSize, 475cabdff1aSopenharmony_ci const int16_t *chrFilter, const int16_t **chrUSrc, 476cabdff1aSopenharmony_ci const int16_t **chrVSrc, 477cabdff1aSopenharmony_ci int chrFilterSize, const int16_t **alpSrc, 478cabdff1aSopenharmony_ci uint8_t *dest, int dstW, int dstY) 479cabdff1aSopenharmony_ci{ 480cabdff1aSopenharmony_ci x86_reg dummy=0; 481cabdff1aSopenharmony_ci x86_reg dstW_reg = dstW; 482cabdff1aSopenharmony_ci x86_reg uv_off = c->uv_offx2; 483cabdff1aSopenharmony_ci 484cabdff1aSopenharmony_ci YSCALEYUV2PACKEDX 485cabdff1aSopenharmony_ci YSCALEYUV2RGBX 486cabdff1aSopenharmony_ci "pxor %%mm7, %%mm7 \n\t" 487cabdff1aSopenharmony_ci /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 
488cabdff1aSopenharmony_ci#ifdef DITHER1XBPP 489cabdff1aSopenharmony_ci "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t" 490cabdff1aSopenharmony_ci "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t" 491cabdff1aSopenharmony_ci "paddusb "RED_DITHER"(%0), %%mm5 \n\t" 492cabdff1aSopenharmony_ci#endif 493cabdff1aSopenharmony_ci WRITERGB15(%4, "%5", %%FF_REGa) 494cabdff1aSopenharmony_ci YSCALEYUV2PACKEDX_END 495cabdff1aSopenharmony_ci} 496cabdff1aSopenharmony_ci 497cabdff1aSopenharmony_ci#define WRITEBGR24MMX(dst, dstw, index) \ 498cabdff1aSopenharmony_ci /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\ 499cabdff1aSopenharmony_ci "movq %%mm2, %%mm1 \n\t" /* B */\ 500cabdff1aSopenharmony_ci "movq %%mm5, %%mm6 \n\t" /* R */\ 501cabdff1aSopenharmony_ci "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\ 502cabdff1aSopenharmony_ci "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\ 503cabdff1aSopenharmony_ci "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\ 504cabdff1aSopenharmony_ci "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\ 505cabdff1aSopenharmony_ci "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\ 506cabdff1aSopenharmony_ci "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\ 507cabdff1aSopenharmony_ci "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\ 508cabdff1aSopenharmony_ci "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\ 509cabdff1aSopenharmony_ci "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\ 510cabdff1aSopenharmony_ci "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\ 511cabdff1aSopenharmony_ci\ 512cabdff1aSopenharmony_ci "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\ 513cabdff1aSopenharmony_ci "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\ 514cabdff1aSopenharmony_ci "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\ 515cabdff1aSopenharmony_ci "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\ 516cabdff1aSopenharmony_ci\ 517cabdff1aSopenharmony_ci "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\ 518cabdff1aSopenharmony_ci "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\ 519cabdff1aSopenharmony_ci "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\ 
520cabdff1aSopenharmony_ci "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\ 521cabdff1aSopenharmony_ci\ 522cabdff1aSopenharmony_ci "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\ 523cabdff1aSopenharmony_ci "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\ 524cabdff1aSopenharmony_ci "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\ 525cabdff1aSopenharmony_ci "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\ 526cabdff1aSopenharmony_ci\ 527cabdff1aSopenharmony_ci "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\ 528cabdff1aSopenharmony_ci "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\ 529cabdff1aSopenharmony_ci "psllq $40, %%mm2 \n\t" /* GB000000 1 */\ 530cabdff1aSopenharmony_ci "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\ 531cabdff1aSopenharmony_ci MOVNTQ(%%mm0, (dst))\ 532cabdff1aSopenharmony_ci\ 533cabdff1aSopenharmony_ci "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\ 534cabdff1aSopenharmony_ci "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\ 535cabdff1aSopenharmony_ci "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\ 536cabdff1aSopenharmony_ci "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\ 537cabdff1aSopenharmony_ci MOVNTQ(%%mm6, 8(dst))\ 538cabdff1aSopenharmony_ci\ 539cabdff1aSopenharmony_ci "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\ 540cabdff1aSopenharmony_ci "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\ 541cabdff1aSopenharmony_ci "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\ 542cabdff1aSopenharmony_ci MOVNTQ(%%mm5, 16(dst))\ 543cabdff1aSopenharmony_ci\ 544cabdff1aSopenharmony_ci "add $24, "#dst" \n\t"\ 545cabdff1aSopenharmony_ci\ 546cabdff1aSopenharmony_ci "add $8, "#index" \n\t"\ 547cabdff1aSopenharmony_ci "cmp "dstw", "#index" \n\t"\ 548cabdff1aSopenharmony_ci " jb 1b \n\t" 549cabdff1aSopenharmony_ci

/*
 * WRITEBGR24MMXEXT(dst, dstw, index)
 *
 * Packs eight pixels held as separate B/G/R byte vectors into 24 bytes and
 * stores them with three MOVNTQ writes at dst, dst+8 and dst+16.
 * Afterwards dst is advanced by 24, index by 8, and control jumps back to
 * the local label "1" while index is (unsigned) below dstw.
 *
 * In:      mm2 = B, mm4 = G, mm5 = R (mm7 is overwritten immediately)
 * Clobber: mm0, mm1, mm3, mm6, mm7; mm4 is shifted right by one byte.
 */
#define WRITEBGR24MMXEXT(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
\
    "pand %%mm0, %%mm1 \n\t" /* keep B2 B1 B0 */\
    "pand %%mm0, %%mm3 \n\t" /* keep G2 G1 G0 */\
    "pand %%mm7, %%mm6 \n\t" /* keep R1 R0 */\
\
    "psllq $8, %%mm3 \n\t" /* move G2 G1 G0 up one byte */\
    "por %%mm1, %%mm6 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
\
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* keep B5 B4 B3 */\
    "pand %%mm7, %%mm3 \n\t" /* keep G4 G3 */\
    "pand %%mm0, %%mm6 \n\t" /* keep R4 R3 R2 */\
\
    "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6  B7 B6 B7 B6 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
\
    "pand %%mm7, %%mm1 \n\t" /* keep B7 B6 */\
    "pand %%mm0, %%mm3 \n\t" /* keep G7 G6 G5 */\
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* keep R7 R6 R5 */\
\
    "por %%mm1, %%mm3 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "dstw", "#index" \n\t"\
    " jb 1b \n\t"

/* From here on WRITEBGR24 resolves to the MMXEXT variant above. */
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMXEXT(dst, dstw, index)

#if HAVE_6REGS
/**
 * Multi-tap vertical scale to packed 24-bit output, "accurate rounding"
 * flavour (uses YSCALEYUV2PACKEDX_ACCURATE for the filtering stage).
 *
 * The asm statement is opened by YSCALEYUV2PACKEDX_ACCURATE; this function
 * appends the YUV->RGB step, computes the output pointer as dest + 3*index
 * and emits the pixels with WRITEBGR24.  Operand %0 is &c->redDither,
 * %4 is dest, %5 is dstW_reg and %6 is uv_off.  The filter/source pointer
 * parameters are not referenced directly in this function body.
 */
static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                   const int16_t **lumSrc, int lumFilterSize,
                                   const int16_t *chrFilter, const int16_t **chrUSrc,
                                   const int16_t **chrVSrc,
                                   int chrFilterSize, const int16_t **alpSrc,
                                   uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy    = 0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off   = c->uv_offx2;

    YSCALEYUV2PACKEDX_ACCURATE
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* c = 3 * a (three output bytes per pixel index); FIXME optimize */
    "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_c"\n\t"
    "add %4, %%"FF_REG_c" \n\t"
    WRITEBGR24(%%FF_REGc, "%5", %%FF_REGa)
    :: "r" (&c->redDither),
       "m" (dummy), "m" (dummy), "m" (dummy),
       "r" (dest), "m" (dstW_reg), "m"(uv_off)
       NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
    : "%"FF_REG_a, "%"FF_REG_c, "%"FF_REG_d, "%"FF_REG_S
    );
}

/**
 * Multi-tap vertical scale to packed 24-bit output.
 *
 * Same structure as the _ar variant, but the filtering stage is
 * YSCALEYUV2PACKEDX.  Operand numbering is identical: %0 = &c->redDither,
 * %4 = dest, %5 = dstW_reg, %6 = uv_off.
 */
static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter,
                                const int16_t **lumSrc, int lumFilterSize,
                                const int16_t *chrFilter, const int16_t **chrUSrc,
                                const int16_t **chrVSrc,
                                int chrFilterSize, const int16_t **alpSrc,
                                uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy    = 0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off   = c->uv_offx2;

    YSCALEYUV2PACKEDX
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* c = 3 * a (three output bytes per pixel index); FIXME optimize */
    "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_c" \n\t"
    "add %4, %%"FF_REG_c" \n\t"
    WRITEBGR24(%%FF_REGc, "%5", %%FF_REGa)
    :: "r" (&c->redDither),
       "m" (dummy), "m" (dummy), "m" (dummy),
       "r" (dest), "m" (dstW_reg), "m"(uv_off)
       NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
    : "%"FF_REG_a, "%"FF_REG_c, "%"FF_REG_d, "%"FF_REG_S
    );
}
#endif /* HAVE_6REGS */

/*
 * REAL_WRITEYUY2(dst, dstw, index)
 *
 * Packs the 16-bit vectors to bytes, interleaves them as
 * mm1[0] mm3[0] mm1[1] mm4[0] ... and stores 16 bytes at dst + 2*index
 * with two MOVNTQ writes.  Then index += 8 and the code loops to label "1"
 * while index is (unsigned) below dstw.
 *
 * In:      mm1, mm7 = the two sample vectors that land on even bytes,
 *          mm3, mm4 = the two vectors that alternate on odd bytes.
 * Clobber: mm1, mm3, mm4, mm7.
 */
#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb %%mm3, %%mm3 \n\t"\
    "packuswb %%mm4, %%mm4 \n\t"\
    "packuswb %%mm7, %%mm1 \n\t"\
    "punpcklbw %%mm4, %%mm3 \n\t"\
    "movq %%mm1, %%mm7 \n\t"\
    "punpcklbw %%mm3, %%mm1 \n\t"\
    "punpckhbw %%mm3, %%mm7 \n\t"\
\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)

/**
 * Multi-tap vertical scale to packed YUYV 4:2:2, "accurate rounding" flavour.
 *
 * YSCALEYUV2PACKEDX_ACCURATE opens the asm statement and
 * YSCALEYUV2PACKEDX_END closes it (both are defined elsewhere in this file;
 * the locals dummy, dstW_reg and uv_off are presumably consumed by the
 * END macro's operand list -- confirm there).
 */
static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                     const int16_t **lumSrc, int lumFilterSize,
                                     const int16_t *chrFilter, const int16_t **chrUSrc,
                                     const int16_t **chrVSrc,
                                     int chrFilterSize, const int16_t **alpSrc,
                                     uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy    = 0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off   = c->uv_offx2;

    YSCALEYUV2PACKEDX_ACCURATE
    /* mm1/mm7 and mm3/mm4 are the vectors WRITEYUY2 packs; drop 3 bits each */
    "psraw $3, %%mm3 \n\t"
    "psraw $3, %%mm4 \n\t"
    "psraw $3, %%mm1 \n\t"
    "psraw $3, %%mm7 \n\t"
    WRITEYUY2(%4, "%5", %%FF_REGa)
    YSCALEYUV2PACKEDX_END
}

/**
 * Multi-tap vertical scale to packed YUYV 4:2:2.
 *
 * Identical to the _ar variant except that the filtering stage is
 * YSCALEYUV2PACKEDX.
 */
static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter,
                                  const int16_t **lumSrc, int lumFilterSize,
                                  const int16_t *chrFilter, const int16_t **chrUSrc,
                                  const int16_t **chrVSrc,
                                  int chrFilterSize, const int16_t **alpSrc,
                                  uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy    = 0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off   = c->uv_offx2;

    YSCALEYUV2PACKEDX
    /* mm1/mm7 and mm3/mm4 are the vectors WRITEYUY2 packs; drop 3 bits each */
    "psraw $3, %%mm3 \n\t"
    "psraw $3, %%mm4 \n\t"
    "psraw $3, %%mm1 \n\t"
    "psraw $3, %%mm7 \n\t"
    WRITEYUY2(%4, "%5", %%FF_REGa)
    YSCALEYUV2PACKEDX_END
}

/*
 * REAL_YSCALEYUV2RGB_UV(index, c)
 *
 * Loop head (zeroes index, defines label "1") plus the chroma half of the
 * two-line blend.  Reads four chroma words from each of the buffers in
 * operands %2 and %3 at byte offset index and again at
 * index + UV_OFF_BYTE(c), blends each pair with the coefficient at
 * CHR_MMX_FILTER_OFFSET+8(c), removes the U/V offsets and pre-multiplies the
 * green contributions.
 *
 * Out: mm2 = U - offset, mm5 = V - offset, mm3 = ug, mm4 = vg.
 */
#define REAL_YSCALEYUV2RGB_UV(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[index] */\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[index] */\
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[index + uv offset] */\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[index + uv offset] */\
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0 - uvbuf1 (first plane) */\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0 - uvbuf1 (second plane) */\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0 - uvbuf1) * coeff >> 16 */\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0 - uvbuf1) * coeff >> 16 */\
    "psraw $4, %%mm3 \n\t" /* uvbuf1 >> 4 (first plane) */\
    "psraw $4, %%mm4 \n\t" /* uvbuf1 >> 4 (second plane) */\
    "paddw %%mm2, %%mm3 \n\t" /* blended first plane */\
    "paddw %%mm5, %%mm4 \n\t" /* blended second plane */\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8 */\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8 */\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8 */\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8 */\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */

/*
 * REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
 *
 * Blends eight 16-bit samples from the two line buffers b1 and b2 (read at
 * byte offset 2*index and 2*index+8) using the coefficient at
 * LUM_MMX_FILTER_OFFSET+8(c).
 *
 * Out:     mm1 = blended samples 0..3, mm7 = blended samples 4..7.
 * Clobber: mm0, mm6.
 */
#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
    "movq ("#b1", "#index", 2), %%mm0 \n\t" /* b1[index] */\
    "movq ("#b2", "#index", 2), %%mm1 \n\t" /* b2[index] */\
    "movq 8("#b1", "#index", 2), %%mm6 \n\t" /* b1[index+4] */\
    "movq 8("#b2", "#index", 2), %%mm7 \n\t" /* b2[index+4] */\
    "psubw %%mm1, %%mm0 \n\t" /* b1 - b2 */\
    "psubw %%mm7, %%mm6 \n\t" /* b1 - b2 */\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (b1 - b2) * coeff >> 16 */\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (b1 - b2) * coeff >> 16 */\
    "psraw $4, %%mm1 \n\t" /* b2 >> 4 */\
    "psraw $4, %%mm7 \n\t" /* b2 >> 4 */\
    "paddw %%mm0, %%mm1 \n\t" /* blended samples 0..3 */\
    "paddw %%mm6, %%mm7 \n\t" /* blended samples 4..7 */

/*
 * REAL_YSCALEYUV2RGB_COEFF(c)
 *
 * Colour-matrix step: finishes the blue/red chroma products, removes the luma
 * offset and scales luma, then adds luma to every chroma term and packs the
 * results to unsigned bytes.
 *
 * In:  mm1/mm7 = luma, mm2 = U term, mm5 = V term, mm3 = ug, mm4 = vg.
 * Out: mm2 = B, mm4 = G, mm5 = R (eight bytes each).
 * Clobber: mm0, mm1, mm3, mm6, mm7.
 */
#define REAL_YSCALEYUV2RGB_COEFF(c) \
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16) */\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16) */\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* low/high word halves: B = mm2/mm0, G = mm4/mm3, R = mm5/mm6 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"

#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)

/* Full two-line blend + colour conversion: luma is read from operands %0/%1. */
#define YSCALEYUV2RGB(index, c) \
    REAL_YSCALEYUV2RGB_UV(index, c) \
    REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
    REAL_YSCALEYUV2RGB_COEFF(c)

/**
 * vertical bilinear scale YV12 to RGB
 *
 * Blends the two luma lines buf[0]/buf[1] and the two chroma lines
 * ubuf[0]/ubuf[1] and writes 32-bit pixels through WRITEBGR32.  When alpha is
 * enabled (CONFIG_SWSCALE_ALPHA && c->needAlpha) the alpha lines abuf[0] and
 * abuf[1] are blended with the same macro as luma; otherwise the alpha byte
 * vector is set to all ones.
 *
 * All context fields are addressed relative to &c->redDither (operand %5).
 * vbuf, dstW, yalpha, uvalpha and y are not referenced directly here.
 *
 * NOTE(review): none of these asm statements lists a "memory" clobber even
 * though they store through dest and into the context -- confirm that this
 * is intentional.
 */
static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2],
                                const int16_t *ubuf[2], const int16_t *vbuf[2],
                                const int16_t *abuf[2], uint8_t *dest,
                                int dstW, int yalpha, int uvalpha, int y)
{
    const int16_t *buf0  = buf[0];
    const int16_t *buf1  = buf[1];
    const int16_t *ubuf0 = ubuf[0];
    const int16_t *ubuf1 = ubuf[1];

    if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
        const int16_t *abuf0 = abuf[0];
        const int16_t *abuf1 = abuf[1];
#if ARCH_X86_64
        /* Enough registers: r8 is the loop index, alpha lines are %6/%7. */
        __asm__ volatile(
            YSCALEYUV2RGB(%%r8, %5)
            YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
            "psraw $3, %%mm1 \n\t" /* blended alpha: >>4 in the blend, >>3 here */
            "psraw $3, %%mm7 \n\t" /* blended alpha: >>4 in the blend, >>3 here */
            "packuswb %%mm7, %%mm1 \n\t"
            WRITEBGR32(%4, DSTW_OFFSET"(%5)", %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "r" (dest),
               "a" (&c->redDither),
               "r" (abuf0), "r" (abuf1)
            : "%r8"
        );
#else
        /* Not enough registers: park the alpha pointers in the context and
         * temporarily swap them into %0/%1 inside the loop. */
        c->u_temp = (intptr_t)abuf0;
        c->v_temp = (intptr_t)abuf1;
        __asm__ volatile(
            "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" /* save the b register in the context */
            "mov %4, %%"FF_REG_b" \n\t"               /* b = dest */
            "push %%"FF_REG_BP" \n\t"                 /* BP becomes the loop index */
            YSCALEYUV2RGB(%%FF_REGBP, %5)
            "push %0 \n\t"
            "push %1 \n\t"
            "mov "U_TEMP"(%5), %0 \n\t"
            "mov "V_TEMP"(%5), %1 \n\t"
            YSCALEYUV2RGB_YA(%%FF_REGBP, %5, %0, %1)
            "psraw $3, %%mm1 \n\t" /* blended alpha: >>4 in the blend, >>3 here */
            "psraw $3, %%mm7 \n\t" /* blended alpha: >>4 in the blend, >>3 here */
            "packuswb %%mm7, %%mm1 \n\t"
            "pop %1 \n\t"
            "pop %0 \n\t"
            WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
            "pop %%"FF_REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" /* restore the b register */
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
#endif
    } else {
        __asm__ volatile(
            "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" /* save the b register in the context */
            "mov %4, %%"FF_REG_b" \n\t"               /* b = dest */
            "push %%"FF_REG_BP" \n\t"                 /* BP becomes the loop index */
            YSCALEYUV2RGB(%%FF_REGBP, %5)
            "pcmpeqd %%mm7, %%mm7 \n\t"               /* alpha bytes = 0xFF */
            WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
            "pop %%"FF_REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" /* restore the b register */
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    }
}

/**
 * Two-line vertical blend to packed 24-bit output.
 *
 * Same register protocol as yuv2rgb32_2's non-alpha path: the b register is
 * saved in the context and reused for dest, BP is pushed and reused as the
 * loop index, and both are restored before the statement ends.
 */
static void RENAME(yuv2bgr24_2)(SwsContext *c, const int16_t *buf[2],
                                const int16_t *ubuf[2], const int16_t *vbuf[2],
                                const int16_t *abuf[2], uint8_t *dest,
                                int dstW, int yalpha, int uvalpha, int y)
{
    const int16_t *buf0  = buf[0];
    const int16_t *buf1  = buf[1];
    const int16_t *ubuf0 = ubuf[0];
    const int16_t *ubuf1 = ubuf[1];

    __asm__ volatile(
        "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"FF_REG_b" \n\t"
        "push %%"FF_REG_BP" \n\t"
        YSCALEYUV2RGB(%%FF_REGBP, %5)
        "pxor %%mm7, %%mm7 \n\t"
        WRITEBGR24(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
        "pop %%"FF_REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
           "a" (&c->redDither)
           NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
    );
}

/**
 * Two-line vertical blend, output written by WRITERGB15.
 *
 * When DITHER1XBPP is defined, the per-channel dither vectors stored in the
 * context are added (unsigned saturating) before the pixels are packed.
 */
static void RENAME(yuv2rgb555_2)(SwsContext *c, const int16_t *buf[2],
                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
                                 const int16_t *abuf[2], uint8_t *dest,
                                 int dstW, int yalpha, int uvalpha, int y)
{
    const int16_t *buf0  = buf[0];
    const int16_t *buf1  = buf[1];
    const int16_t *ubuf0 = ubuf[0];
    const int16_t *ubuf1 = ubuf[1];

    __asm__ volatile(
        "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"FF_REG_b" \n\t"
        "push %%"FF_REG_BP" \n\t"
        YSCALEYUV2RGB(%%FF_REGBP, %5)
        "pxor %%mm7, %%mm7 \n\t"
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
        "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
        "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
        "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
        WRITERGB15(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
        "pop %%"FF_REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
           "a" (&c->redDither)
           NAMED_CONSTRAINTS_ADD(bF8)
    );
}

/**
 * Two-line vertical blend, output written by WRITERGB16.
 *
 * Structure is identical to yuv2rgb555_2; only the writer macro and the
 * named mask constants differ.
 */
static void RENAME(yuv2rgb565_2)(SwsContext *c, const int16_t *buf[2],
                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
                                 const int16_t *abuf[2], uint8_t *dest,
                                 int dstW, int yalpha, int uvalpha, int y)
{
    const int16_t *buf0  = buf[0];
    const int16_t *buf1  = buf[1];
    const int16_t *ubuf0 = ubuf[0];
    const int16_t *ubuf1 = ubuf[1];

    __asm__ volatile(
        "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"FF_REG_b" \n\t"
        "push %%"FF_REG_BP" \n\t"
        YSCALEYUV2RGB(%%FF_REGBP, %5)
        "pxor %%mm7, %%mm7 \n\t"
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
        "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
        "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
        "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
        WRITERGB16(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
        "pop %%"FF_REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
           "a" (&c->redDither)
           NAMED_CONSTRAINTS_ADD(bF8,bFC)
    );
}

/*
 * REAL_YSCALEYUV2PACKED(index, c)
 *
 * Two-line blend for packed YUV output (no colour conversion).  Before the
 * loop the chroma and luma blend coefficients are shifted right by 3 and
 * written back to the context; inside the loop the samples are blended the
 * same way as in YSCALEYUV2RGB but with a >>7 instead of >>4 on the second
 * line.
 *
 * NOTE(review): the coefficients are modified in place, so this presumably
 * relies on the caller refreshing them before every invocation -- confirm.
 *
 * Out: mm1/mm7 = blended luma 0..3 / 4..7, mm3/mm4 = the two blended chroma
 *      vectors.  Clobber: mm0, mm2, mm5, mm6.
 */
#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
    "psraw $3, %%mm0 \n\t"\
    "psraw $3, %%mm1 \n\t"\
    "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[index] */\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[index] */\
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[index + uv offset] */\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[index + uv offset] */\
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0 - uvbuf1 (first plane) */\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0 - uvbuf1 (second plane) */\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0 - uvbuf1) * coeff >> 16 */\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0 - uvbuf1) * coeff >> 16 */\
    "psraw $7, %%mm3 \n\t" /* uvbuf1 >> 7 (first plane) */\
    "psraw $7, %%mm4 \n\t" /* uvbuf1 >> 7 (second plane) */\
    "paddw %%mm2, %%mm3 \n\t" /* blended first plane */\
    "paddw %%mm5, %%mm4 \n\t" /* blended second plane */\
    "movq (%0, "#index", 2), %%mm0 \n\t" /* buf0[index] */\
    "movq (%1, "#index", 2), %%mm1 \n\t" /* buf1[index] */\
    "movq 8(%0, "#index", 2), %%mm6 \n\t" /* buf0[index+4] */\
    "movq 8(%1, "#index", 2), %%mm7 \n\t" /* buf1[index+4] */\
    "psubw %%mm1, %%mm0 \n\t" /* buf0 - buf1 */\
    "psubw %%mm7, %%mm6 \n\t" /* buf0 - buf1 */\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0 - buf1) * coeff >> 16 */\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0 - buf1) * coeff >> 16 */\
    "psraw $7, %%mm1 \n\t" /* buf1 >> 7 */\
    "psraw $7, %%mm7 \n\t" /* buf1 >> 7 */\
    "paddw %%mm0, %%mm1 \n\t" /* blended luma 0..3 */\
    "paddw %%mm6, %%mm7 \n\t" /* blended luma 4..7 */
975cabdff1aSopenharmony_ci#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c) 976cabdff1aSopenharmony_ci 977cabdff1aSopenharmony_cistatic void RENAME(yuv2yuyv422_2)(SwsContext *c, const int16_t *buf[2], 978cabdff1aSopenharmony_ci const int16_t *ubuf[2], const int16_t *vbuf[2], 979cabdff1aSopenharmony_ci const int16_t *abuf[2], uint8_t *dest, 980cabdff1aSopenharmony_ci int dstW, int yalpha, int uvalpha, int y) 981cabdff1aSopenharmony_ci{ 982cabdff1aSopenharmony_ci const int16_t *buf0 = buf[0], *buf1 = buf[1], 983cabdff1aSopenharmony_ci *ubuf0 = ubuf[0], *ubuf1 = ubuf[1]; 984cabdff1aSopenharmony_ci 985cabdff1aSopenharmony_ci __asm__ volatile( 986cabdff1aSopenharmony_ci "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" 987cabdff1aSopenharmony_ci "mov %4, %%"FF_REG_b" \n\t" 988cabdff1aSopenharmony_ci "push %%"FF_REG_BP" \n\t" 989cabdff1aSopenharmony_ci YSCALEYUV2PACKED(%%FF_REGBP, %5) 990cabdff1aSopenharmony_ci WRITEYUY2(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP) 991cabdff1aSopenharmony_ci "pop %%"FF_REG_BP" \n\t" 992cabdff1aSopenharmony_ci "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" 993cabdff1aSopenharmony_ci :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), 994cabdff1aSopenharmony_ci "a" (&c->redDither) 995cabdff1aSopenharmony_ci ); 996cabdff1aSopenharmony_ci} 997cabdff1aSopenharmony_ci 998cabdff1aSopenharmony_ci#define REAL_YSCALEYUV2RGB1(index, c) \ 999cabdff1aSopenharmony_ci "xor "#index", "#index" \n\t"\ 1000cabdff1aSopenharmony_ci ".p2align 4 \n\t"\ 1001cabdff1aSopenharmony_ci "1: \n\t"\ 1002cabdff1aSopenharmony_ci "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\ 1003cabdff1aSopenharmony_ci "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 1004cabdff1aSopenharmony_ci "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ 1005cabdff1aSopenharmony_ci "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 1006cabdff1aSopenharmony_ci "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\ 1007cabdff1aSopenharmony_ci "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] 
- uvbuf1[eax+2048] >>4*/\ 1008cabdff1aSopenharmony_ci "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\ 1009cabdff1aSopenharmony_ci "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\ 1010cabdff1aSopenharmony_ci "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ 1011cabdff1aSopenharmony_ci "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ 1012cabdff1aSopenharmony_ci "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\ 1013cabdff1aSopenharmony_ci "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\ 1014cabdff1aSopenharmony_ci /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ 1015cabdff1aSopenharmony_ci "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ 1016cabdff1aSopenharmony_ci "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\ 1017cabdff1aSopenharmony_ci "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ 1018cabdff1aSopenharmony_ci "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ 1019cabdff1aSopenharmony_ci "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\ 1020cabdff1aSopenharmony_ci "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\ 1021cabdff1aSopenharmony_ci "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\ 1022cabdff1aSopenharmony_ci "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\ 1023cabdff1aSopenharmony_ci "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ 1024cabdff1aSopenharmony_ci "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ 1025cabdff1aSopenharmony_ci /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ 1026cabdff1aSopenharmony_ci "paddw %%mm3, %%mm4 \n\t"\ 1027cabdff1aSopenharmony_ci "movq %%mm2, %%mm0 \n\t"\ 1028cabdff1aSopenharmony_ci "movq %%mm5, %%mm6 \n\t"\ 1029cabdff1aSopenharmony_ci "movq %%mm4, %%mm3 \n\t"\ 1030cabdff1aSopenharmony_ci "punpcklwd %%mm2, %%mm2 \n\t"\ 1031cabdff1aSopenharmony_ci "punpcklwd %%mm5, %%mm5 \n\t"\ 1032cabdff1aSopenharmony_ci "punpcklwd %%mm4, %%mm4 \n\t"\ 1033cabdff1aSopenharmony_ci "paddw %%mm1, %%mm2 \n\t"\ 1034cabdff1aSopenharmony_ci "paddw %%mm1, %%mm5 \n\t"\ 1035cabdff1aSopenharmony_ci "paddw %%mm1, %%mm4 \n\t"\ 1036cabdff1aSopenharmony_ci "punpckhwd %%mm0, %%mm0 \n\t"\ 
1037cabdff1aSopenharmony_ci "punpckhwd %%mm6, %%mm6 \n\t"\ 1038cabdff1aSopenharmony_ci "punpckhwd %%mm3, %%mm3 \n\t"\ 1039cabdff1aSopenharmony_ci "paddw %%mm7, %%mm0 \n\t"\ 1040cabdff1aSopenharmony_ci "paddw %%mm7, %%mm6 \n\t"\ 1041cabdff1aSopenharmony_ci "paddw %%mm7, %%mm3 \n\t"\ 1042cabdff1aSopenharmony_ci /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ 1043cabdff1aSopenharmony_ci "packuswb %%mm0, %%mm2 \n\t"\ 1044cabdff1aSopenharmony_ci "packuswb %%mm6, %%mm5 \n\t"\ 1045cabdff1aSopenharmony_ci "packuswb %%mm3, %%mm4 \n\t"\ 1046cabdff1aSopenharmony_ci 1047cabdff1aSopenharmony_ci#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c) 1048cabdff1aSopenharmony_ci 1049cabdff1aSopenharmony_ci// do vertical chrominance interpolation 1050cabdff1aSopenharmony_ci#define REAL_YSCALEYUV2RGB1b(index, c) \ 1051cabdff1aSopenharmony_ci "xor "#index", "#index" \n\t"\ 1052cabdff1aSopenharmony_ci ".p2align 4 \n\t"\ 1053cabdff1aSopenharmony_ci "1: \n\t"\ 1054cabdff1aSopenharmony_ci "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ 1055cabdff1aSopenharmony_ci "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ 1056cabdff1aSopenharmony_ci "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 1057cabdff1aSopenharmony_ci "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ 1058cabdff1aSopenharmony_ci "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ 1059cabdff1aSopenharmony_ci "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 1060cabdff1aSopenharmony_ci "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\ 1061cabdff1aSopenharmony_ci "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\ 1062cabdff1aSopenharmony_ci "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\ 1063cabdff1aSopenharmony_ci "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\ 1064cabdff1aSopenharmony_ci "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\ 1065cabdff1aSopenharmony_ci "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\ 1066cabdff1aSopenharmony_ci "movq %%mm3, %%mm2 \n\t" /* 
(U-128)8*/\ 1067cabdff1aSopenharmony_ci "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ 1068cabdff1aSopenharmony_ci "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\ 1069cabdff1aSopenharmony_ci "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\ 1070cabdff1aSopenharmony_ci /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ 1071cabdff1aSopenharmony_ci "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ 1072cabdff1aSopenharmony_ci "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\ 1073cabdff1aSopenharmony_ci "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ 1074cabdff1aSopenharmony_ci "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ 1075cabdff1aSopenharmony_ci "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\ 1076cabdff1aSopenharmony_ci "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\ 1077cabdff1aSopenharmony_ci "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\ 1078cabdff1aSopenharmony_ci "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\ 1079cabdff1aSopenharmony_ci "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ 1080cabdff1aSopenharmony_ci "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ 1081cabdff1aSopenharmony_ci /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ 1082cabdff1aSopenharmony_ci "paddw %%mm3, %%mm4 \n\t"\ 1083cabdff1aSopenharmony_ci "movq %%mm2, %%mm0 \n\t"\ 1084cabdff1aSopenharmony_ci "movq %%mm5, %%mm6 \n\t"\ 1085cabdff1aSopenharmony_ci "movq %%mm4, %%mm3 \n\t"\ 1086cabdff1aSopenharmony_ci "punpcklwd %%mm2, %%mm2 \n\t"\ 1087cabdff1aSopenharmony_ci "punpcklwd %%mm5, %%mm5 \n\t"\ 1088cabdff1aSopenharmony_ci "punpcklwd %%mm4, %%mm4 \n\t"\ 1089cabdff1aSopenharmony_ci "paddw %%mm1, %%mm2 \n\t"\ 1090cabdff1aSopenharmony_ci "paddw %%mm1, %%mm5 \n\t"\ 1091cabdff1aSopenharmony_ci "paddw %%mm1, %%mm4 \n\t"\ 1092cabdff1aSopenharmony_ci "punpckhwd %%mm0, %%mm0 \n\t"\ 1093cabdff1aSopenharmony_ci "punpckhwd %%mm6, %%mm6 \n\t"\ 1094cabdff1aSopenharmony_ci "punpckhwd %%mm3, %%mm3 \n\t"\ 1095cabdff1aSopenharmony_ci "paddw %%mm7, %%mm0 \n\t"\ 1096cabdff1aSopenharmony_ci "paddw %%mm7, %%mm6 \n\t"\ 1097cabdff1aSopenharmony_ci 
"paddw %%mm7, %%mm3 \n\t"\ 1098cabdff1aSopenharmony_ci /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ 1099cabdff1aSopenharmony_ci "packuswb %%mm0, %%mm2 \n\t"\ 1100cabdff1aSopenharmony_ci "packuswb %%mm6, %%mm5 \n\t"\ 1101cabdff1aSopenharmony_ci "packuswb %%mm3, %%mm4 \n\t"\ 1102cabdff1aSopenharmony_ci 1103cabdff1aSopenharmony_ci#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c) 1104cabdff1aSopenharmony_ci 1105cabdff1aSopenharmony_ci#define REAL_YSCALEYUV2RGB1_ALPHA(index) \ 1106cabdff1aSopenharmony_ci "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\ 1107cabdff1aSopenharmony_ci "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\ 1108cabdff1aSopenharmony_ci "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\ 1109cabdff1aSopenharmony_ci "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\ 1110cabdff1aSopenharmony_ci "packuswb %%mm1, %%mm7 \n\t" 1111cabdff1aSopenharmony_ci#define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index) 1112cabdff1aSopenharmony_ci 1113cabdff1aSopenharmony_ci/** 1114cabdff1aSopenharmony_ci * YV12 to RGB without scaling or interpolating 1115cabdff1aSopenharmony_ci */ 1116cabdff1aSopenharmony_cistatic void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0, 1117cabdff1aSopenharmony_ci const int16_t *ubuf[2], const int16_t *vbuf[2], 1118cabdff1aSopenharmony_ci const int16_t *abuf0, uint8_t *dest, 1119cabdff1aSopenharmony_ci int dstW, int uvalpha, int y) 1120cabdff1aSopenharmony_ci{ 1121cabdff1aSopenharmony_ci const int16_t *ubuf0 = ubuf[0]; 1122cabdff1aSopenharmony_ci const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1 1123cabdff1aSopenharmony_ci 1124cabdff1aSopenharmony_ci if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster 1125cabdff1aSopenharmony_ci const int16_t *ubuf1 = ubuf[0]; 1126cabdff1aSopenharmony_ci if (CONFIG_SWSCALE_ALPHA && c->needAlpha) { 1127cabdff1aSopenharmony_ci __asm__ volatile( 1128cabdff1aSopenharmony_ci "mov 
%%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" 1129cabdff1aSopenharmony_ci "mov %4, %%"FF_REG_b" \n\t" 1130cabdff1aSopenharmony_ci "push %%"FF_REG_BP" \n\t" 1131cabdff1aSopenharmony_ci YSCALEYUV2RGB1(%%FF_REGBP, %5) 1132cabdff1aSopenharmony_ci YSCALEYUV2RGB1_ALPHA(%%FF_REGBP) 1133cabdff1aSopenharmony_ci WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) 1134cabdff1aSopenharmony_ci "pop %%"FF_REG_BP" \n\t" 1135cabdff1aSopenharmony_ci "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" 1136cabdff1aSopenharmony_ci :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest), 1137cabdff1aSopenharmony_ci "a" (&c->redDither) 1138cabdff1aSopenharmony_ci ); 1139cabdff1aSopenharmony_ci } else { 1140cabdff1aSopenharmony_ci __asm__ volatile( 1141cabdff1aSopenharmony_ci "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" 1142cabdff1aSopenharmony_ci "mov %4, %%"FF_REG_b" \n\t" 1143cabdff1aSopenharmony_ci "push %%"FF_REG_BP" \n\t" 1144cabdff1aSopenharmony_ci YSCALEYUV2RGB1(%%FF_REGBP, %5) 1145cabdff1aSopenharmony_ci "pcmpeqd %%mm7, %%mm7 \n\t" 1146cabdff1aSopenharmony_ci WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) 1147cabdff1aSopenharmony_ci "pop %%"FF_REG_BP" \n\t" 1148cabdff1aSopenharmony_ci "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" 1149cabdff1aSopenharmony_ci :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), 1150cabdff1aSopenharmony_ci "a" (&c->redDither) 1151cabdff1aSopenharmony_ci ); 1152cabdff1aSopenharmony_ci } 1153cabdff1aSopenharmony_ci } else { 1154cabdff1aSopenharmony_ci const int16_t *ubuf1 = ubuf[1]; 1155cabdff1aSopenharmony_ci if (CONFIG_SWSCALE_ALPHA && c->needAlpha) { 1156cabdff1aSopenharmony_ci __asm__ volatile( 1157cabdff1aSopenharmony_ci "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" 1158cabdff1aSopenharmony_ci "mov %4, %%"FF_REG_b" \n\t" 1159cabdff1aSopenharmony_ci "push %%"FF_REG_BP" \n\t" 1160cabdff1aSopenharmony_ci YSCALEYUV2RGB1b(%%FF_REGBP, %5) 
1161cabdff1aSopenharmony_ci YSCALEYUV2RGB1_ALPHA(%%FF_REGBP) 1162cabdff1aSopenharmony_ci WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) 1163cabdff1aSopenharmony_ci "pop %%"FF_REG_BP" \n\t" 1164cabdff1aSopenharmony_ci "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" 1165cabdff1aSopenharmony_ci :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest), 1166cabdff1aSopenharmony_ci "a" (&c->redDither) 1167cabdff1aSopenharmony_ci ); 1168cabdff1aSopenharmony_ci } else { 1169cabdff1aSopenharmony_ci __asm__ volatile( 1170cabdff1aSopenharmony_ci "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" 1171cabdff1aSopenharmony_ci "mov %4, %%"FF_REG_b" \n\t" 1172cabdff1aSopenharmony_ci "push %%"FF_REG_BP" \n\t" 1173cabdff1aSopenharmony_ci YSCALEYUV2RGB1b(%%FF_REGBP, %5) 1174cabdff1aSopenharmony_ci "pcmpeqd %%mm7, %%mm7 \n\t" 1175cabdff1aSopenharmony_ci WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) 1176cabdff1aSopenharmony_ci "pop %%"FF_REG_BP" \n\t" 1177cabdff1aSopenharmony_ci "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" 1178cabdff1aSopenharmony_ci :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), 1179cabdff1aSopenharmony_ci "a" (&c->redDither) 1180cabdff1aSopenharmony_ci ); 1181cabdff1aSopenharmony_ci } 1182cabdff1aSopenharmony_ci } 1183cabdff1aSopenharmony_ci} 1184cabdff1aSopenharmony_ci 1185cabdff1aSopenharmony_cistatic void RENAME(yuv2bgr24_1)(SwsContext *c, const int16_t *buf0, 1186cabdff1aSopenharmony_ci const int16_t *ubuf[2], const int16_t *vbuf[2], 1187cabdff1aSopenharmony_ci const int16_t *abuf0, uint8_t *dest, 1188cabdff1aSopenharmony_ci int dstW, int uvalpha, int y) 1189cabdff1aSopenharmony_ci{ 1190cabdff1aSopenharmony_ci const int16_t *ubuf0 = ubuf[0]; 1191cabdff1aSopenharmony_ci const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1 1192cabdff1aSopenharmony_ci 1193cabdff1aSopenharmony_ci if (uvalpha < 2048) { // note this is not correct 
(shifts chrominance by 0.5 pixels) but it is a bit faster 1194cabdff1aSopenharmony_ci const int16_t *ubuf1 = ubuf[0]; 1195cabdff1aSopenharmony_ci __asm__ volatile( 1196cabdff1aSopenharmony_ci "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" 1197cabdff1aSopenharmony_ci "mov %4, %%"FF_REG_b" \n\t" 1198cabdff1aSopenharmony_ci "push %%"FF_REG_BP" \n\t" 1199cabdff1aSopenharmony_ci YSCALEYUV2RGB1(%%FF_REGBP, %5) 1200cabdff1aSopenharmony_ci "pxor %%mm7, %%mm7 \n\t" 1201cabdff1aSopenharmony_ci WRITEBGR24(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP) 1202cabdff1aSopenharmony_ci "pop %%"FF_REG_BP" \n\t" 1203cabdff1aSopenharmony_ci "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" 1204cabdff1aSopenharmony_ci :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), 1205cabdff1aSopenharmony_ci "a" (&c->redDither) 1206cabdff1aSopenharmony_ci NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B) 1207cabdff1aSopenharmony_ci ); 1208cabdff1aSopenharmony_ci } else { 1209cabdff1aSopenharmony_ci const int16_t *ubuf1 = ubuf[1]; 1210cabdff1aSopenharmony_ci __asm__ volatile( 1211cabdff1aSopenharmony_ci "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" 1212cabdff1aSopenharmony_ci "mov %4, %%"FF_REG_b" \n\t" 1213cabdff1aSopenharmony_ci "push %%"FF_REG_BP" \n\t" 1214cabdff1aSopenharmony_ci YSCALEYUV2RGB1b(%%FF_REGBP, %5) 1215cabdff1aSopenharmony_ci "pxor %%mm7, %%mm7 \n\t" 1216cabdff1aSopenharmony_ci WRITEBGR24(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP) 1217cabdff1aSopenharmony_ci "pop %%"FF_REG_BP" \n\t" 1218cabdff1aSopenharmony_ci "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" 1219cabdff1aSopenharmony_ci :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), 1220cabdff1aSopenharmony_ci "a" (&c->redDither) 1221cabdff1aSopenharmony_ci NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B) 1222cabdff1aSopenharmony_ci ); 1223cabdff1aSopenharmony_ci } 1224cabdff1aSopenharmony_ci} 1225cabdff1aSopenharmony_ci 1226cabdff1aSopenharmony_cistatic void RENAME(yuv2rgb555_1)(SwsContext *c, const int16_t *buf0, 
1227cabdff1aSopenharmony_ci const int16_t *ubuf[2], const int16_t *vbuf[2], 1228cabdff1aSopenharmony_ci const int16_t *abuf0, uint8_t *dest, 1229cabdff1aSopenharmony_ci int dstW, int uvalpha, int y) 1230cabdff1aSopenharmony_ci{ 1231cabdff1aSopenharmony_ci const int16_t *ubuf0 = ubuf[0]; 1232cabdff1aSopenharmony_ci const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1 1233cabdff1aSopenharmony_ci 1234cabdff1aSopenharmony_ci if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster 1235cabdff1aSopenharmony_ci const int16_t *ubuf1 = ubuf[0]; 1236cabdff1aSopenharmony_ci __asm__ volatile( 1237cabdff1aSopenharmony_ci "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" 1238cabdff1aSopenharmony_ci "mov %4, %%"FF_REG_b" \n\t" 1239cabdff1aSopenharmony_ci "push %%"FF_REG_BP" \n\t" 1240cabdff1aSopenharmony_ci YSCALEYUV2RGB1(%%FF_REGBP, %5) 1241cabdff1aSopenharmony_ci "pxor %%mm7, %%mm7 \n\t" 1242cabdff1aSopenharmony_ci /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 1243cabdff1aSopenharmony_ci#ifdef DITHER1XBPP 1244cabdff1aSopenharmony_ci "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" 1245cabdff1aSopenharmony_ci "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" 1246cabdff1aSopenharmony_ci "paddusb "RED_DITHER"(%5), %%mm5 \n\t" 1247cabdff1aSopenharmony_ci#endif 1248cabdff1aSopenharmony_ci WRITERGB15(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP) 1249cabdff1aSopenharmony_ci "pop %%"FF_REG_BP" \n\t" 1250cabdff1aSopenharmony_ci "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" 1251cabdff1aSopenharmony_ci :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), 1252cabdff1aSopenharmony_ci "a" (&c->redDither) 1253cabdff1aSopenharmony_ci NAMED_CONSTRAINTS_ADD(bF8) 1254cabdff1aSopenharmony_ci ); 1255cabdff1aSopenharmony_ci } else { 1256cabdff1aSopenharmony_ci const int16_t *ubuf1 = ubuf[1]; 1257cabdff1aSopenharmony_ci __asm__ volatile( 1258cabdff1aSopenharmony_ci "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" 1259cabdff1aSopenharmony_ci "mov %4, %%"FF_REG_b" \n\t" 
1260cabdff1aSopenharmony_ci "push %%"FF_REG_BP" \n\t" 1261cabdff1aSopenharmony_ci YSCALEYUV2RGB1b(%%FF_REGBP, %5) 1262cabdff1aSopenharmony_ci "pxor %%mm7, %%mm7 \n\t" 1263cabdff1aSopenharmony_ci /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 1264cabdff1aSopenharmony_ci#ifdef DITHER1XBPP 1265cabdff1aSopenharmony_ci "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" 1266cabdff1aSopenharmony_ci "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" 1267cabdff1aSopenharmony_ci "paddusb "RED_DITHER"(%5), %%mm5 \n\t" 1268cabdff1aSopenharmony_ci#endif 1269cabdff1aSopenharmony_ci WRITERGB15(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP) 1270cabdff1aSopenharmony_ci "pop %%"FF_REG_BP" \n\t" 1271cabdff1aSopenharmony_ci "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" 1272cabdff1aSopenharmony_ci :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), 1273cabdff1aSopenharmony_ci "a" (&c->redDither) 1274cabdff1aSopenharmony_ci NAMED_CONSTRAINTS_ADD(bF8) 1275cabdff1aSopenharmony_ci ); 1276cabdff1aSopenharmony_ci } 1277cabdff1aSopenharmony_ci} 1278cabdff1aSopenharmony_ci 1279cabdff1aSopenharmony_cistatic void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0, 1280cabdff1aSopenharmony_ci const int16_t *ubuf[2], const int16_t *vbuf[2], 1281cabdff1aSopenharmony_ci const int16_t *abuf0, uint8_t *dest, 1282cabdff1aSopenharmony_ci int dstW, int uvalpha, int y) 1283cabdff1aSopenharmony_ci{ 1284cabdff1aSopenharmony_ci const int16_t *ubuf0 = ubuf[0]; 1285cabdff1aSopenharmony_ci const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1 1286cabdff1aSopenharmony_ci 1287cabdff1aSopenharmony_ci if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster 1288cabdff1aSopenharmony_ci const int16_t *ubuf1 = ubuf[0]; 1289cabdff1aSopenharmony_ci __asm__ volatile( 1290cabdff1aSopenharmony_ci "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" 1291cabdff1aSopenharmony_ci "mov %4, %%"FF_REG_b" \n\t" 1292cabdff1aSopenharmony_ci "push %%"FF_REG_BP" \n\t" 1293cabdff1aSopenharmony_ci 
YSCALEYUV2RGB1(%%FF_REGBP, %5) 1294cabdff1aSopenharmony_ci "pxor %%mm7, %%mm7 \n\t" 1295cabdff1aSopenharmony_ci /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 1296cabdff1aSopenharmony_ci#ifdef DITHER1XBPP 1297cabdff1aSopenharmony_ci "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" 1298cabdff1aSopenharmony_ci "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" 1299cabdff1aSopenharmony_ci "paddusb "RED_DITHER"(%5), %%mm5 \n\t" 1300cabdff1aSopenharmony_ci#endif 1301cabdff1aSopenharmony_ci WRITERGB16(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP) 1302cabdff1aSopenharmony_ci "pop %%"FF_REG_BP" \n\t" 1303cabdff1aSopenharmony_ci "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" 1304cabdff1aSopenharmony_ci :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), 1305cabdff1aSopenharmony_ci "a" (&c->redDither) 1306cabdff1aSopenharmony_ci NAMED_CONSTRAINTS_ADD(bF8,bFC) 1307cabdff1aSopenharmony_ci ); 1308cabdff1aSopenharmony_ci } else { 1309cabdff1aSopenharmony_ci const int16_t *ubuf1 = ubuf[1]; 1310cabdff1aSopenharmony_ci __asm__ volatile( 1311cabdff1aSopenharmony_ci "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" 1312cabdff1aSopenharmony_ci "mov %4, %%"FF_REG_b" \n\t" 1313cabdff1aSopenharmony_ci "push %%"FF_REG_BP" \n\t" 1314cabdff1aSopenharmony_ci YSCALEYUV2RGB1b(%%FF_REGBP, %5) 1315cabdff1aSopenharmony_ci "pxor %%mm7, %%mm7 \n\t" 1316cabdff1aSopenharmony_ci /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 1317cabdff1aSopenharmony_ci#ifdef DITHER1XBPP 1318cabdff1aSopenharmony_ci "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" 1319cabdff1aSopenharmony_ci "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" 1320cabdff1aSopenharmony_ci "paddusb "RED_DITHER"(%5), %%mm5 \n\t" 1321cabdff1aSopenharmony_ci#endif 1322cabdff1aSopenharmony_ci WRITERGB16(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP) 1323cabdff1aSopenharmony_ci "pop %%"FF_REG_BP" \n\t" 1324cabdff1aSopenharmony_ci "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" 1325cabdff1aSopenharmony_ci :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), 1326cabdff1aSopenharmony_ci "a" 
(&c->redDither) 1327cabdff1aSopenharmony_ci NAMED_CONSTRAINTS_ADD(bF8,bFC) 1328cabdff1aSopenharmony_ci ); 1329cabdff1aSopenharmony_ci } 1330cabdff1aSopenharmony_ci} 1331cabdff1aSopenharmony_ci 1332cabdff1aSopenharmony_ci#define REAL_YSCALEYUV2PACKED1(index, c) \ 1333cabdff1aSopenharmony_ci "xor "#index", "#index" \n\t"\ 1334cabdff1aSopenharmony_ci ".p2align 4 \n\t"\ 1335cabdff1aSopenharmony_ci "1: \n\t"\ 1336cabdff1aSopenharmony_ci "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\ 1337cabdff1aSopenharmony_ci "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 1338cabdff1aSopenharmony_ci "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ 1339cabdff1aSopenharmony_ci "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 1340cabdff1aSopenharmony_ci "psraw $7, %%mm3 \n\t" \ 1341cabdff1aSopenharmony_ci "psraw $7, %%mm4 \n\t" \ 1342cabdff1aSopenharmony_ci "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ 1343cabdff1aSopenharmony_ci "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\ 1344cabdff1aSopenharmony_ci "psraw $7, %%mm1 \n\t" \ 1345cabdff1aSopenharmony_ci "psraw $7, %%mm7 \n\t" \ 1346cabdff1aSopenharmony_ci 1347cabdff1aSopenharmony_ci#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c) 1348cabdff1aSopenharmony_ci 1349cabdff1aSopenharmony_ci#define REAL_YSCALEYUV2PACKED1b(index, c) \ 1350cabdff1aSopenharmony_ci "xor "#index", "#index" \n\t"\ 1351cabdff1aSopenharmony_ci ".p2align 4 \n\t"\ 1352cabdff1aSopenharmony_ci "1: \n\t"\ 1353cabdff1aSopenharmony_ci "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ 1354cabdff1aSopenharmony_ci "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ 1355cabdff1aSopenharmony_ci "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 1356cabdff1aSopenharmony_ci "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ 1357cabdff1aSopenharmony_ci "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ 1358cabdff1aSopenharmony_ci "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 1359cabdff1aSopenharmony_ci "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + 
uvbuf1[eax]*/\ 1360cabdff1aSopenharmony_ci "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\ 1361cabdff1aSopenharmony_ci "psrlw $8, %%mm3 \n\t" \ 1362cabdff1aSopenharmony_ci "psrlw $8, %%mm4 \n\t" \ 1363cabdff1aSopenharmony_ci "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ 1364cabdff1aSopenharmony_ci "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\ 1365cabdff1aSopenharmony_ci "psraw $7, %%mm1 \n\t" \ 1366cabdff1aSopenharmony_ci "psraw $7, %%mm7 \n\t" 1367cabdff1aSopenharmony_ci#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c) 1368cabdff1aSopenharmony_ci 1369cabdff1aSopenharmony_cistatic void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0, 1370cabdff1aSopenharmony_ci const int16_t *ubuf[2], const int16_t *vbuf[2], 1371cabdff1aSopenharmony_ci const int16_t *abuf0, uint8_t *dest, 1372cabdff1aSopenharmony_ci int dstW, int uvalpha, int y) 1373cabdff1aSopenharmony_ci{ 1374cabdff1aSopenharmony_ci const int16_t *ubuf0 = ubuf[0]; 1375cabdff1aSopenharmony_ci const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1 1376cabdff1aSopenharmony_ci 1377cabdff1aSopenharmony_ci if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster 1378cabdff1aSopenharmony_ci const int16_t *ubuf1 = ubuf[0]; 1379cabdff1aSopenharmony_ci __asm__ volatile( 1380cabdff1aSopenharmony_ci "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" 1381cabdff1aSopenharmony_ci "mov %4, %%"FF_REG_b" \n\t" 1382cabdff1aSopenharmony_ci "push %%"FF_REG_BP" \n\t" 1383cabdff1aSopenharmony_ci YSCALEYUV2PACKED1(%%FF_REGBP, %5) 1384cabdff1aSopenharmony_ci WRITEYUY2(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP) 1385cabdff1aSopenharmony_ci "pop %%"FF_REG_BP" \n\t" 1386cabdff1aSopenharmony_ci "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" 1387cabdff1aSopenharmony_ci :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), 1388cabdff1aSopenharmony_ci "a" (&c->redDither) 1389cabdff1aSopenharmony_ci ); 1390cabdff1aSopenharmony_ci } else 
{ 1391cabdff1aSopenharmony_ci const int16_t *ubuf1 = ubuf[1]; 1392cabdff1aSopenharmony_ci __asm__ volatile( 1393cabdff1aSopenharmony_ci "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" 1394cabdff1aSopenharmony_ci "mov %4, %%"FF_REG_b" \n\t" 1395cabdff1aSopenharmony_ci "push %%"FF_REG_BP" \n\t" 1396cabdff1aSopenharmony_ci YSCALEYUV2PACKED1b(%%FF_REGBP, %5) 1397cabdff1aSopenharmony_ci WRITEYUY2(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP) 1398cabdff1aSopenharmony_ci "pop %%"FF_REG_BP" \n\t" 1399cabdff1aSopenharmony_ci "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" 1400cabdff1aSopenharmony_ci :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), 1401cabdff1aSopenharmony_ci "a" (&c->redDither) 1402cabdff1aSopenharmony_ci ); 1403cabdff1aSopenharmony_ci } 1404cabdff1aSopenharmony_ci} 1405cabdff1aSopenharmony_cistatic av_cold void RENAME(sws_init_swscale)(SwsContext *c) 1406cabdff1aSopenharmony_ci{ 1407cabdff1aSopenharmony_ci enum AVPixelFormat dstFormat = c->dstFormat; 1408cabdff1aSopenharmony_ci 1409cabdff1aSopenharmony_ci c->use_mmx_vfilter= 0; 1410cabdff1aSopenharmony_ci if (!is16BPS(dstFormat) && !isNBPS(dstFormat) && !isSemiPlanarYUV(dstFormat) 1411cabdff1aSopenharmony_ci && dstFormat != AV_PIX_FMT_GRAYF32BE && dstFormat != AV_PIX_FMT_GRAYF32LE 1412cabdff1aSopenharmony_ci && !(c->flags & SWS_BITEXACT)) { 1413cabdff1aSopenharmony_ci if (c->flags & SWS_ACCURATE_RND) { 1414cabdff1aSopenharmony_ci if (!(c->flags & SWS_FULL_CHR_H_INT)) { 1415cabdff1aSopenharmony_ci switch (c->dstFormat) { 1416cabdff1aSopenharmony_ci case AV_PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X_ar); break; 1417cabdff1aSopenharmony_ci#if HAVE_6REGS 1418cabdff1aSopenharmony_ci case AV_PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X_ar); break; 1419cabdff1aSopenharmony_ci#endif 1420cabdff1aSopenharmony_ci case AV_PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X_ar); break; 1421cabdff1aSopenharmony_ci case AV_PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X_ar); break; 
1422cabdff1aSopenharmony_ci case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X_ar); break; 1423cabdff1aSopenharmony_ci default: break; 1424cabdff1aSopenharmony_ci } 1425cabdff1aSopenharmony_ci } 1426cabdff1aSopenharmony_ci } else { 1427cabdff1aSopenharmony_ci c->use_mmx_vfilter= 1; 1428cabdff1aSopenharmony_ci if (!(c->flags & SWS_FULL_CHR_H_INT)) { 1429cabdff1aSopenharmony_ci switch (c->dstFormat) { 1430cabdff1aSopenharmony_ci case AV_PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X); break; 1431cabdff1aSopenharmony_ci case AV_PIX_FMT_BGR32: c->yuv2packedX = RENAME(yuv2bgr32_X); break; 1432cabdff1aSopenharmony_ci#if HAVE_6REGS 1433cabdff1aSopenharmony_ci case AV_PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X); break; 1434cabdff1aSopenharmony_ci#endif 1435cabdff1aSopenharmony_ci case AV_PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X); break; 1436cabdff1aSopenharmony_ci case AV_PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X); break; 1437cabdff1aSopenharmony_ci case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X); break; 1438cabdff1aSopenharmony_ci default: break; 1439cabdff1aSopenharmony_ci } 1440cabdff1aSopenharmony_ci } 1441cabdff1aSopenharmony_ci } 1442cabdff1aSopenharmony_ci if (!(c->flags & SWS_FULL_CHR_H_INT)) { 1443cabdff1aSopenharmony_ci switch (c->dstFormat) { 1444cabdff1aSopenharmony_ci case AV_PIX_FMT_RGB32: 1445cabdff1aSopenharmony_ci c->yuv2packed1 = RENAME(yuv2rgb32_1); 1446cabdff1aSopenharmony_ci c->yuv2packed2 = RENAME(yuv2rgb32_2); 1447cabdff1aSopenharmony_ci break; 1448cabdff1aSopenharmony_ci case AV_PIX_FMT_BGR24: 1449cabdff1aSopenharmony_ci c->yuv2packed1 = RENAME(yuv2bgr24_1); 1450cabdff1aSopenharmony_ci c->yuv2packed2 = RENAME(yuv2bgr24_2); 1451cabdff1aSopenharmony_ci break; 1452cabdff1aSopenharmony_ci case AV_PIX_FMT_RGB555: 1453cabdff1aSopenharmony_ci c->yuv2packed1 = RENAME(yuv2rgb555_1); 1454cabdff1aSopenharmony_ci c->yuv2packed2 = RENAME(yuv2rgb555_2); 1455cabdff1aSopenharmony_ci break; 
1456cabdff1aSopenharmony_ci case AV_PIX_FMT_RGB565: 1457cabdff1aSopenharmony_ci c->yuv2packed1 = RENAME(yuv2rgb565_1); 1458cabdff1aSopenharmony_ci c->yuv2packed2 = RENAME(yuv2rgb565_2); 1459cabdff1aSopenharmony_ci break; 1460cabdff1aSopenharmony_ci case AV_PIX_FMT_YUYV422: 1461cabdff1aSopenharmony_ci c->yuv2packed1 = RENAME(yuv2yuyv422_1); 1462cabdff1aSopenharmony_ci c->yuv2packed2 = RENAME(yuv2yuyv422_2); 1463cabdff1aSopenharmony_ci break; 1464cabdff1aSopenharmony_ci default: 1465cabdff1aSopenharmony_ci break; 1466cabdff1aSopenharmony_ci } 1467cabdff1aSopenharmony_ci } 1468cabdff1aSopenharmony_ci } 1469cabdff1aSopenharmony_ci 1470cabdff1aSopenharmony_ci if (c->srcBpc == 8 && c->dstBpc <= 14) { 1471cabdff1aSopenharmony_ci // Use the new MMX scaler if the MMXEXT one can't be used (it is faster than the x86 ASM one). 1472cabdff1aSopenharmony_ci if (c->flags & SWS_FAST_BILINEAR && c->canMMXEXTBeUsed) { 1473cabdff1aSopenharmony_ci c->hyscale_fast = ff_hyscale_fast_mmxext; 1474cabdff1aSopenharmony_ci c->hcscale_fast = ff_hcscale_fast_mmxext; 1475cabdff1aSopenharmony_ci } else { 1476cabdff1aSopenharmony_ci c->hyscale_fast = NULL; 1477cabdff1aSopenharmony_ci c->hcscale_fast = NULL; 1478cabdff1aSopenharmony_ci } 1479cabdff1aSopenharmony_ci } 1480cabdff1aSopenharmony_ci} 1481