1cabdff1aSopenharmony_ci/* 2cabdff1aSopenharmony_ci * Optimized for ia32 CPUs by Nick Kurshev <nickols_k@mail.ru> 3cabdff1aSopenharmony_ci * H.263, MPEG-1, MPEG-2 dequantizer & draw_edges by Michael Niedermayer <michaelni@gmx.at> 4cabdff1aSopenharmony_ci * 5cabdff1aSopenharmony_ci * This file is part of FFmpeg. 6cabdff1aSopenharmony_ci * 7cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or 8cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public 9cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either 10cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version. 11cabdff1aSopenharmony_ci * 12cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful, 13cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of 14cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15cabdff1aSopenharmony_ci * Lesser General Public License for more details. 16cabdff1aSopenharmony_ci * 17cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public 18cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software 19cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 20cabdff1aSopenharmony_ci */ 21cabdff1aSopenharmony_ci 22cabdff1aSopenharmony_ci#include "libavutil/attributes.h" 23cabdff1aSopenharmony_ci#include "libavutil/cpu.h" 24cabdff1aSopenharmony_ci#include "libavutil/x86/asm.h" 25cabdff1aSopenharmony_ci#include "libavutil/x86/cpu.h" 26cabdff1aSopenharmony_ci#include "libavcodec/avcodec.h" 27cabdff1aSopenharmony_ci#include "libavcodec/mpegvideo.h" 28cabdff1aSopenharmony_ci#include "libavcodec/mpegvideodata.h" 29cabdff1aSopenharmony_ci 30cabdff1aSopenharmony_ci#if HAVE_MMX_INLINE 31cabdff1aSopenharmony_ci 32cabdff1aSopenharmony_cistatic void dct_unquantize_h263_intra_mmx(MpegEncContext *s, 33cabdff1aSopenharmony_ci int16_t *block, int n, int qscale) 34cabdff1aSopenharmony_ci{ 35cabdff1aSopenharmony_ci x86_reg level, qmul, qadd, nCoeffs; 36cabdff1aSopenharmony_ci 37cabdff1aSopenharmony_ci qmul = qscale << 1; 38cabdff1aSopenharmony_ci 39cabdff1aSopenharmony_ci av_assert2(s->block_last_index[n]>=0 || s->h263_aic); 40cabdff1aSopenharmony_ci 41cabdff1aSopenharmony_ci if (!s->h263_aic) { 42cabdff1aSopenharmony_ci if (n < 4) 43cabdff1aSopenharmony_ci level = block[0] * s->y_dc_scale; 44cabdff1aSopenharmony_ci else 45cabdff1aSopenharmony_ci level = block[0] * s->c_dc_scale; 46cabdff1aSopenharmony_ci qadd = (qscale - 1) | 1; 47cabdff1aSopenharmony_ci }else{ 48cabdff1aSopenharmony_ci qadd = 0; 49cabdff1aSopenharmony_ci level= block[0]; 50cabdff1aSopenharmony_ci } 51cabdff1aSopenharmony_ci if(s->ac_pred) 52cabdff1aSopenharmony_ci nCoeffs=63; 53cabdff1aSopenharmony_ci else 54cabdff1aSopenharmony_ci nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; 55cabdff1aSopenharmony_ci 56cabdff1aSopenharmony_ci__asm__ volatile( 57cabdff1aSopenharmony_ci "movd %1, %%mm6 \n\t" //qmul 58cabdff1aSopenharmony_ci "packssdw %%mm6, %%mm6 \n\t" 59cabdff1aSopenharmony_ci "packssdw %%mm6, %%mm6 \n\t" 60cabdff1aSopenharmony_ci "movd %2, %%mm5 \n\t" //qadd 61cabdff1aSopenharmony_ci "pxor %%mm7, %%mm7 \n\t" 62cabdff1aSopenharmony_ci "packssdw %%mm5, %%mm5 \n\t" 63cabdff1aSopenharmony_ci "packssdw %%mm5, %%mm5 \n\t" 64cabdff1aSopenharmony_ci "psubw %%mm5, %%mm7 \n\t" 65cabdff1aSopenharmony_ci "pxor %%mm4, %%mm4 \n\t" 66cabdff1aSopenharmony_ci ".p2align 4 \n\t" 67cabdff1aSopenharmony_ci "1: \n\t" 68cabdff1aSopenharmony_ci "movq (%0, %3), %%mm0 \n\t" 69cabdff1aSopenharmony_ci "movq 8(%0, %3), %%mm1 \n\t" 70cabdff1aSopenharmony_ci 71cabdff1aSopenharmony_ci "pmullw %%mm6, %%mm0 \n\t" 72cabdff1aSopenharmony_ci "pmullw %%mm6, %%mm1 \n\t" 73cabdff1aSopenharmony_ci 74cabdff1aSopenharmony_ci "movq (%0, %3), %%mm2 \n\t" 75cabdff1aSopenharmony_ci "movq 8(%0, %3), %%mm3 \n\t" 76cabdff1aSopenharmony_ci 77cabdff1aSopenharmony_ci "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 78cabdff1aSopenharmony_ci "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 79cabdff1aSopenharmony_ci 80cabdff1aSopenharmony_ci "pxor %%mm2, %%mm0 \n\t" 81cabdff1aSopenharmony_ci "pxor %%mm3, %%mm1 \n\t" 82cabdff1aSopenharmony_ci 83cabdff1aSopenharmony_ci "paddw %%mm7, %%mm0 \n\t" 84cabdff1aSopenharmony_ci "paddw %%mm7, %%mm1 \n\t" 85cabdff1aSopenharmony_ci 86cabdff1aSopenharmony_ci "pxor %%mm0, %%mm2 \n\t" 87cabdff1aSopenharmony_ci "pxor %%mm1, %%mm3 \n\t" 88cabdff1aSopenharmony_ci 89cabdff1aSopenharmony_ci "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0 90cabdff1aSopenharmony_ci "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0 91cabdff1aSopenharmony_ci 92cabdff1aSopenharmony_ci "pandn %%mm2, %%mm0 \n\t" 93cabdff1aSopenharmony_ci "pandn %%mm3, %%mm1 \n\t" 94cabdff1aSopenharmony_ci 95cabdff1aSopenharmony_ci "movq %%mm0, (%0, %3) \n\t" 96cabdff1aSopenharmony_ci "movq %%mm1, 8(%0, %3) \n\t" 97cabdff1aSopenharmony_ci 98cabdff1aSopenharmony_ci "add $16, %3 \n\t" 99cabdff1aSopenharmony_ci "jng 1b \n\t" 100cabdff1aSopenharmony_ci ::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs)) 101cabdff1aSopenharmony_ci : "memory" 102cabdff1aSopenharmony_ci ); 103cabdff1aSopenharmony_ci block[0]= level; 104cabdff1aSopenharmony_ci} 105cabdff1aSopenharmony_ci 106cabdff1aSopenharmony_ci 107cabdff1aSopenharmony_cistatic void dct_unquantize_h263_inter_mmx(MpegEncContext *s, 108cabdff1aSopenharmony_ci int16_t *block, int n, int qscale) 109cabdff1aSopenharmony_ci{ 110cabdff1aSopenharmony_ci x86_reg qmul, qadd, nCoeffs; 111cabdff1aSopenharmony_ci 112cabdff1aSopenharmony_ci qmul = qscale << 1; 113cabdff1aSopenharmony_ci qadd = (qscale - 1) | 1; 114cabdff1aSopenharmony_ci 115cabdff1aSopenharmony_ci av_assert2(s->block_last_index[n]>=0 || s->h263_aic); 116cabdff1aSopenharmony_ci 117cabdff1aSopenharmony_ci nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; 118cabdff1aSopenharmony_ci 119cabdff1aSopenharmony_ci__asm__ volatile( 120cabdff1aSopenharmony_ci "movd %1, %%mm6 \n\t" //qmul 121cabdff1aSopenharmony_ci "packssdw %%mm6, %%mm6 \n\t" 122cabdff1aSopenharmony_ci "packssdw %%mm6, %%mm6 \n\t" 123cabdff1aSopenharmony_ci "movd %2, %%mm5 \n\t" //qadd 124cabdff1aSopenharmony_ci "pxor %%mm7, %%mm7 \n\t" 125cabdff1aSopenharmony_ci "packssdw %%mm5, %%mm5 \n\t" 126cabdff1aSopenharmony_ci "packssdw %%mm5, %%mm5 \n\t" 127cabdff1aSopenharmony_ci "psubw %%mm5, %%mm7 \n\t" 128cabdff1aSopenharmony_ci "pxor %%mm4, %%mm4 \n\t" 129cabdff1aSopenharmony_ci ".p2align 4 \n\t" 130cabdff1aSopenharmony_ci "1: \n\t" 131cabdff1aSopenharmony_ci "movq (%0, %3), %%mm0 \n\t" 132cabdff1aSopenharmony_ci "movq 8(%0, %3), %%mm1 \n\t" 133cabdff1aSopenharmony_ci 134cabdff1aSopenharmony_ci "pmullw %%mm6, %%mm0 \n\t" 135cabdff1aSopenharmony_ci "pmullw %%mm6, %%mm1 \n\t" 136cabdff1aSopenharmony_ci 137cabdff1aSopenharmony_ci "movq (%0, %3), %%mm2 \n\t" 138cabdff1aSopenharmony_ci "movq 8(%0, %3), %%mm3 \n\t" 139cabdff1aSopenharmony_ci 140cabdff1aSopenharmony_ci "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 141cabdff1aSopenharmony_ci "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 142cabdff1aSopenharmony_ci 143cabdff1aSopenharmony_ci "pxor %%mm2, %%mm0 \n\t" 144cabdff1aSopenharmony_ci "pxor %%mm3, %%mm1 \n\t" 145cabdff1aSopenharmony_ci 146cabdff1aSopenharmony_ci "paddw %%mm7, %%mm0 \n\t" 147cabdff1aSopenharmony_ci "paddw %%mm7, %%mm1 \n\t" 148cabdff1aSopenharmony_ci 149cabdff1aSopenharmony_ci "pxor %%mm0, %%mm2 \n\t" 150cabdff1aSopenharmony_ci "pxor %%mm1, %%mm3 \n\t" 151cabdff1aSopenharmony_ci 152cabdff1aSopenharmony_ci "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0 153cabdff1aSopenharmony_ci "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0 154cabdff1aSopenharmony_ci 155cabdff1aSopenharmony_ci "pandn %%mm2, %%mm0 \n\t" 156cabdff1aSopenharmony_ci "pandn %%mm3, %%mm1 \n\t" 157cabdff1aSopenharmony_ci 158cabdff1aSopenharmony_ci "movq %%mm0, (%0, %3) \n\t" 159cabdff1aSopenharmony_ci "movq %%mm1, 8(%0, %3) \n\t" 160cabdff1aSopenharmony_ci 161cabdff1aSopenharmony_ci "add $16, %3 \n\t" 162cabdff1aSopenharmony_ci "jng 1b \n\t" 163cabdff1aSopenharmony_ci ::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs)) 164cabdff1aSopenharmony_ci : "memory" 165cabdff1aSopenharmony_ci ); 166cabdff1aSopenharmony_ci} 167cabdff1aSopenharmony_ci 168cabdff1aSopenharmony_cistatic void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s, 169cabdff1aSopenharmony_ci int16_t *block, int n, int qscale) 170cabdff1aSopenharmony_ci{ 171cabdff1aSopenharmony_ci x86_reg nCoeffs; 172cabdff1aSopenharmony_ci const uint16_t *quant_matrix; 173cabdff1aSopenharmony_ci int block0; 174cabdff1aSopenharmony_ci 175cabdff1aSopenharmony_ci av_assert2(s->block_last_index[n]>=0); 176cabdff1aSopenharmony_ci 177cabdff1aSopenharmony_ci nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1; 178cabdff1aSopenharmony_ci 179cabdff1aSopenharmony_ci if (n < 4) 180cabdff1aSopenharmony_ci block0 = block[0] * s->y_dc_scale; 181cabdff1aSopenharmony_ci else 182cabdff1aSopenharmony_ci block0 = block[0] * s->c_dc_scale; 183cabdff1aSopenharmony_ci /* XXX: only MPEG-1 */ 184cabdff1aSopenharmony_ci quant_matrix = s->intra_matrix; 185cabdff1aSopenharmony_ci__asm__ volatile( 186cabdff1aSopenharmony_ci "pcmpeqw %%mm7, %%mm7 \n\t" 187cabdff1aSopenharmony_ci "psrlw $15, %%mm7 \n\t" 188cabdff1aSopenharmony_ci "movd %2, %%mm6 \n\t" 189cabdff1aSopenharmony_ci "packssdw %%mm6, %%mm6 \n\t" 190cabdff1aSopenharmony_ci "packssdw %%mm6, %%mm6 \n\t" 191cabdff1aSopenharmony_ci "mov %3, %%"FF_REG_a" \n\t" 192cabdff1aSopenharmony_ci ".p2align 4 \n\t" 193cabdff1aSopenharmony_ci "1: \n\t" 194cabdff1aSopenharmony_ci "movq (%0, %%"FF_REG_a"), %%mm0 \n\t" 195cabdff1aSopenharmony_ci "movq 8(%0, %%"FF_REG_a"), %%mm1\n\t" 196cabdff1aSopenharmony_ci "movq (%1, %%"FF_REG_a"), %%mm4 \n\t" 197cabdff1aSopenharmony_ci "movq 8(%1, %%"FF_REG_a"), %%mm5\n\t" 198cabdff1aSopenharmony_ci "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] 199cabdff1aSopenharmony_ci "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] 200cabdff1aSopenharmony_ci "pxor %%mm2, %%mm2 \n\t" 201cabdff1aSopenharmony_ci "pxor %%mm3, %%mm3 \n\t" 202cabdff1aSopenharmony_ci "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 203cabdff1aSopenharmony_ci "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 204cabdff1aSopenharmony_ci "pxor %%mm2, %%mm0 \n\t" 205cabdff1aSopenharmony_ci "pxor %%mm3, %%mm1 \n\t" 206cabdff1aSopenharmony_ci "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) 207cabdff1aSopenharmony_ci "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) 208cabdff1aSopenharmony_ci "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q 209cabdff1aSopenharmony_ci "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q 210cabdff1aSopenharmony_ci "pxor %%mm4, %%mm4 \n\t" 211cabdff1aSopenharmony_ci "pxor %%mm5, %%mm5 \n\t" // FIXME slow 212cabdff1aSopenharmony_ci "pcmpeqw (%0, %%"FF_REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 213cabdff1aSopenharmony_ci "pcmpeqw 8(%0, %%"FF_REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 214cabdff1aSopenharmony_ci "psraw $3, %%mm0 \n\t" 215cabdff1aSopenharmony_ci "psraw $3, %%mm1 \n\t" 216cabdff1aSopenharmony_ci "psubw %%mm7, %%mm0 \n\t" 217cabdff1aSopenharmony_ci "psubw %%mm7, %%mm1 \n\t" 218cabdff1aSopenharmony_ci "por %%mm7, %%mm0 \n\t" 219cabdff1aSopenharmony_ci "por %%mm7, %%mm1 \n\t" 220cabdff1aSopenharmony_ci "pxor %%mm2, %%mm0 \n\t" 221cabdff1aSopenharmony_ci "pxor %%mm3, %%mm1 \n\t" 222cabdff1aSopenharmony_ci "psubw %%mm2, %%mm0 \n\t" 223cabdff1aSopenharmony_ci "psubw %%mm3, %%mm1 \n\t" 224cabdff1aSopenharmony_ci "pandn %%mm0, %%mm4 \n\t" 225cabdff1aSopenharmony_ci "pandn %%mm1, %%mm5 \n\t" 226cabdff1aSopenharmony_ci "movq %%mm4, (%0, %%"FF_REG_a") \n\t" 227cabdff1aSopenharmony_ci "movq %%mm5, 8(%0, %%"FF_REG_a")\n\t" 228cabdff1aSopenharmony_ci 229cabdff1aSopenharmony_ci "add $16, %%"FF_REG_a" \n\t" 230cabdff1aSopenharmony_ci "js 1b \n\t" 231cabdff1aSopenharmony_ci ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs) 232cabdff1aSopenharmony_ci : "%"FF_REG_a, "memory" 233cabdff1aSopenharmony_ci ); 234cabdff1aSopenharmony_ci block[0]= block0; 235cabdff1aSopenharmony_ci} 236cabdff1aSopenharmony_ci 237cabdff1aSopenharmony_cistatic void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s, 238cabdff1aSopenharmony_ci int16_t *block, int n, int qscale) 239cabdff1aSopenharmony_ci{ 240cabdff1aSopenharmony_ci x86_reg nCoeffs; 241cabdff1aSopenharmony_ci const uint16_t *quant_matrix; 242cabdff1aSopenharmony_ci 243cabdff1aSopenharmony_ci av_assert2(s->block_last_index[n]>=0); 244cabdff1aSopenharmony_ci 245cabdff1aSopenharmony_ci nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1; 246cabdff1aSopenharmony_ci 247cabdff1aSopenharmony_ci quant_matrix = s->inter_matrix; 248cabdff1aSopenharmony_ci__asm__ volatile( 249cabdff1aSopenharmony_ci "pcmpeqw %%mm7, %%mm7 \n\t" 250cabdff1aSopenharmony_ci "psrlw $15, %%mm7 \n\t" 251cabdff1aSopenharmony_ci "movd %2, %%mm6 \n\t" 252cabdff1aSopenharmony_ci "packssdw %%mm6, %%mm6 \n\t" 253cabdff1aSopenharmony_ci "packssdw %%mm6, %%mm6 \n\t" 254cabdff1aSopenharmony_ci "mov %3, %%"FF_REG_a" \n\t" 255cabdff1aSopenharmony_ci ".p2align 4 \n\t" 256cabdff1aSopenharmony_ci "1: \n\t" 257cabdff1aSopenharmony_ci "movq (%0, %%"FF_REG_a"), %%mm0 \n\t" 258cabdff1aSopenharmony_ci "movq 8(%0, %%"FF_REG_a"), %%mm1\n\t" 259cabdff1aSopenharmony_ci "movq (%1, %%"FF_REG_a"), %%mm4 \n\t" 260cabdff1aSopenharmony_ci "movq 8(%1, %%"FF_REG_a"), %%mm5\n\t" 261cabdff1aSopenharmony_ci "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] 262cabdff1aSopenharmony_ci "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] 263cabdff1aSopenharmony_ci "pxor %%mm2, %%mm2 \n\t" 264cabdff1aSopenharmony_ci "pxor %%mm3, %%mm3 \n\t" 265cabdff1aSopenharmony_ci "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 266cabdff1aSopenharmony_ci "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 267cabdff1aSopenharmony_ci "pxor %%mm2, %%mm0 \n\t" 268cabdff1aSopenharmony_ci "pxor %%mm3, %%mm1 \n\t" 269cabdff1aSopenharmony_ci "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) 270cabdff1aSopenharmony_ci "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) 271cabdff1aSopenharmony_ci "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2 272cabdff1aSopenharmony_ci "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2 273cabdff1aSopenharmony_ci "paddw %%mm7, %%mm0 \n\t" // abs(block[i])*2 + 1 274cabdff1aSopenharmony_ci "paddw %%mm7, %%mm1 \n\t" // abs(block[i])*2 + 1 275cabdff1aSopenharmony_ci "pmullw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q 276cabdff1aSopenharmony_ci "pmullw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q 277cabdff1aSopenharmony_ci "pxor %%mm4, %%mm4 \n\t" 278cabdff1aSopenharmony_ci "pxor %%mm5, %%mm5 \n\t" // FIXME slow 279cabdff1aSopenharmony_ci "pcmpeqw (%0, %%"FF_REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 280cabdff1aSopenharmony_ci "pcmpeqw 8(%0, %%"FF_REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 281cabdff1aSopenharmony_ci "psraw $4, %%mm0 \n\t" 282cabdff1aSopenharmony_ci "psraw $4, %%mm1 \n\t" 283cabdff1aSopenharmony_ci "psubw %%mm7, %%mm0 \n\t" 284cabdff1aSopenharmony_ci "psubw %%mm7, %%mm1 \n\t" 285cabdff1aSopenharmony_ci "por %%mm7, %%mm0 \n\t" 286cabdff1aSopenharmony_ci "por %%mm7, %%mm1 \n\t" 287cabdff1aSopenharmony_ci "pxor %%mm2, %%mm0 \n\t" 288cabdff1aSopenharmony_ci "pxor %%mm3, %%mm1 \n\t" 289cabdff1aSopenharmony_ci "psubw %%mm2, %%mm0 \n\t" 290cabdff1aSopenharmony_ci "psubw %%mm3, %%mm1 \n\t" 291cabdff1aSopenharmony_ci "pandn %%mm0, %%mm4 \n\t" 292cabdff1aSopenharmony_ci "pandn %%mm1, %%mm5 \n\t" 293cabdff1aSopenharmony_ci "movq %%mm4, (%0, %%"FF_REG_a") \n\t" 294cabdff1aSopenharmony_ci "movq %%mm5, 8(%0, %%"FF_REG_a")\n\t" 295cabdff1aSopenharmony_ci 296cabdff1aSopenharmony_ci "add $16, %%"FF_REG_a" \n\t" 297cabdff1aSopenharmony_ci "js 1b \n\t" 298cabdff1aSopenharmony_ci ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs) 299cabdff1aSopenharmony_ci : "%"FF_REG_a, "memory" 300cabdff1aSopenharmony_ci ); 301cabdff1aSopenharmony_ci} 302cabdff1aSopenharmony_ci 303cabdff1aSopenharmony_cistatic void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s, 304cabdff1aSopenharmony_ci int16_t *block, int n, int qscale) 305cabdff1aSopenharmony_ci{ 306cabdff1aSopenharmony_ci x86_reg nCoeffs; 307cabdff1aSopenharmony_ci const uint16_t *quant_matrix; 308cabdff1aSopenharmony_ci int block0; 309cabdff1aSopenharmony_ci 310cabdff1aSopenharmony_ci av_assert2(s->block_last_index[n]>=0); 311cabdff1aSopenharmony_ci 312cabdff1aSopenharmony_ci if (s->q_scale_type) qscale = ff_mpeg2_non_linear_qscale[qscale]; 313cabdff1aSopenharmony_ci else qscale <<= 1; 314cabdff1aSopenharmony_ci 315cabdff1aSopenharmony_ci if(s->alternate_scan) nCoeffs= 63; //FIXME 316cabdff1aSopenharmony_ci else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; 317cabdff1aSopenharmony_ci 318cabdff1aSopenharmony_ci if (n < 4) 319cabdff1aSopenharmony_ci block0 = block[0] * s->y_dc_scale; 320cabdff1aSopenharmony_ci else 321cabdff1aSopenharmony_ci block0 = block[0] * s->c_dc_scale; 322cabdff1aSopenharmony_ci quant_matrix = s->intra_matrix; 323cabdff1aSopenharmony_ci__asm__ volatile( 324cabdff1aSopenharmony_ci "pcmpeqw %%mm7, %%mm7 \n\t" 325cabdff1aSopenharmony_ci "psrlw $15, %%mm7 \n\t" 326cabdff1aSopenharmony_ci "movd %2, %%mm6 \n\t" 327cabdff1aSopenharmony_ci "packssdw %%mm6, %%mm6 \n\t" 328cabdff1aSopenharmony_ci "packssdw %%mm6, %%mm6 \n\t" 329cabdff1aSopenharmony_ci "mov %3, %%"FF_REG_a" \n\t" 330cabdff1aSopenharmony_ci ".p2align 4 \n\t" 331cabdff1aSopenharmony_ci "1: \n\t" 332cabdff1aSopenharmony_ci "movq (%0, %%"FF_REG_a"), %%mm0 \n\t" 333cabdff1aSopenharmony_ci "movq 8(%0, %%"FF_REG_a"), %%mm1\n\t" 334cabdff1aSopenharmony_ci "movq (%1, %%"FF_REG_a"), %%mm4 \n\t" 335cabdff1aSopenharmony_ci "movq 8(%1, %%"FF_REG_a"), %%mm5\n\t" 336cabdff1aSopenharmony_ci "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] 337cabdff1aSopenharmony_ci "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] 338cabdff1aSopenharmony_ci "pxor %%mm2, %%mm2 \n\t" 339cabdff1aSopenharmony_ci "pxor %%mm3, %%mm3 \n\t" 340cabdff1aSopenharmony_ci "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 341cabdff1aSopenharmony_ci "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 342cabdff1aSopenharmony_ci "pxor %%mm2, %%mm0 \n\t" 343cabdff1aSopenharmony_ci "pxor %%mm3, %%mm1 \n\t" 344cabdff1aSopenharmony_ci "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) 345cabdff1aSopenharmony_ci "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) 346cabdff1aSopenharmony_ci "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q 347cabdff1aSopenharmony_ci "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q 348cabdff1aSopenharmony_ci "pxor %%mm4, %%mm4 \n\t" 349cabdff1aSopenharmony_ci "pxor %%mm5, %%mm5 \n\t" // FIXME slow 350cabdff1aSopenharmony_ci "pcmpeqw (%0, %%"FF_REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 351cabdff1aSopenharmony_ci "pcmpeqw 8(%0, %%"FF_REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 352cabdff1aSopenharmony_ci "psraw $4, %%mm0 \n\t" 353cabdff1aSopenharmony_ci "psraw $4, %%mm1 \n\t" 354cabdff1aSopenharmony_ci "pxor %%mm2, %%mm0 \n\t" 355cabdff1aSopenharmony_ci "pxor %%mm3, %%mm1 \n\t" 356cabdff1aSopenharmony_ci "psubw %%mm2, %%mm0 \n\t" 357cabdff1aSopenharmony_ci "psubw %%mm3, %%mm1 \n\t" 358cabdff1aSopenharmony_ci "pandn %%mm0, %%mm4 \n\t" 359cabdff1aSopenharmony_ci "pandn %%mm1, %%mm5 \n\t" 360cabdff1aSopenharmony_ci "movq %%mm4, (%0, %%"FF_REG_a") \n\t" 361cabdff1aSopenharmony_ci "movq %%mm5, 8(%0, %%"FF_REG_a")\n\t" 362cabdff1aSopenharmony_ci 363cabdff1aSopenharmony_ci "add $16, %%"FF_REG_a" \n\t" 364cabdff1aSopenharmony_ci "jng 1b \n\t" 365cabdff1aSopenharmony_ci ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs) 366cabdff1aSopenharmony_ci : "%"FF_REG_a, "memory" 367cabdff1aSopenharmony_ci ); 368cabdff1aSopenharmony_ci block[0]= block0; 369cabdff1aSopenharmony_ci //Note, we do not do mismatch control for intra as errors cannot accumulate 370cabdff1aSopenharmony_ci} 371cabdff1aSopenharmony_ci 372cabdff1aSopenharmony_cistatic void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s, 373cabdff1aSopenharmony_ci int16_t *block, int n, int qscale) 374cabdff1aSopenharmony_ci{ 375cabdff1aSopenharmony_ci x86_reg nCoeffs; 376cabdff1aSopenharmony_ci const uint16_t *quant_matrix; 377cabdff1aSopenharmony_ci 378cabdff1aSopenharmony_ci av_assert2(s->block_last_index[n]>=0); 379cabdff1aSopenharmony_ci 380cabdff1aSopenharmony_ci if (s->q_scale_type) qscale = ff_mpeg2_non_linear_qscale[qscale]; 381cabdff1aSopenharmony_ci else qscale <<= 1; 382cabdff1aSopenharmony_ci 383cabdff1aSopenharmony_ci if(s->alternate_scan) nCoeffs= 63; //FIXME 384cabdff1aSopenharmony_ci else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; 385cabdff1aSopenharmony_ci 386cabdff1aSopenharmony_ci quant_matrix = s->inter_matrix; 387cabdff1aSopenharmony_ci__asm__ volatile( 388cabdff1aSopenharmony_ci "pcmpeqw %%mm7, %%mm7 \n\t" 389cabdff1aSopenharmony_ci "psrlq $48, %%mm7 \n\t" 390cabdff1aSopenharmony_ci "movd %2, %%mm6 \n\t" 391cabdff1aSopenharmony_ci "packssdw %%mm6, %%mm6 \n\t" 392cabdff1aSopenharmony_ci "packssdw %%mm6, %%mm6 \n\t" 393cabdff1aSopenharmony_ci "mov %3, %%"FF_REG_a" \n\t" 394cabdff1aSopenharmony_ci ".p2align 4 \n\t" 395cabdff1aSopenharmony_ci "1: \n\t" 396cabdff1aSopenharmony_ci "movq (%0, %%"FF_REG_a"), %%mm0 \n\t" 397cabdff1aSopenharmony_ci "movq 8(%0, %%"FF_REG_a"), %%mm1\n\t" 398cabdff1aSopenharmony_ci "movq (%1, %%"FF_REG_a"), %%mm4 \n\t" 399cabdff1aSopenharmony_ci "movq 8(%1, %%"FF_REG_a"), %%mm5\n\t" 400cabdff1aSopenharmony_ci "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] 401cabdff1aSopenharmony_ci "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] 402cabdff1aSopenharmony_ci "pxor %%mm2, %%mm2 \n\t" 403cabdff1aSopenharmony_ci "pxor %%mm3, %%mm3 \n\t" 404cabdff1aSopenharmony_ci "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 405cabdff1aSopenharmony_ci "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 406cabdff1aSopenharmony_ci "pxor %%mm2, %%mm0 \n\t" 407cabdff1aSopenharmony_ci "pxor %%mm3, %%mm1 \n\t" 408cabdff1aSopenharmony_ci "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) 409cabdff1aSopenharmony_ci "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) 410cabdff1aSopenharmony_ci "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2 411cabdff1aSopenharmony_ci "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2 412cabdff1aSopenharmony_ci "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*2*q 413cabdff1aSopenharmony_ci "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*2*q 414cabdff1aSopenharmony_ci "paddw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q 415cabdff1aSopenharmony_ci "paddw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q 416cabdff1aSopenharmony_ci "pxor %%mm4, %%mm4 \n\t" 417cabdff1aSopenharmony_ci "pxor %%mm5, %%mm5 \n\t" // FIXME slow 418cabdff1aSopenharmony_ci "pcmpeqw (%0, %%"FF_REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 419cabdff1aSopenharmony_ci "pcmpeqw 8(%0, %%"FF_REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 420cabdff1aSopenharmony_ci "psrlw $5, %%mm0 \n\t" 421cabdff1aSopenharmony_ci "psrlw $5, %%mm1 \n\t" 422cabdff1aSopenharmony_ci "pxor %%mm2, %%mm0 \n\t" 423cabdff1aSopenharmony_ci "pxor %%mm3, %%mm1 \n\t" 424cabdff1aSopenharmony_ci "psubw %%mm2, %%mm0 \n\t" 425cabdff1aSopenharmony_ci "psubw %%mm3, %%mm1 \n\t" 426cabdff1aSopenharmony_ci "pandn %%mm0, %%mm4 \n\t" 427cabdff1aSopenharmony_ci "pandn %%mm1, %%mm5 \n\t" 428cabdff1aSopenharmony_ci "pxor %%mm4, %%mm7 \n\t" 429cabdff1aSopenharmony_ci "pxor %%mm5, %%mm7 \n\t" 430cabdff1aSopenharmony_ci "movq %%mm4, (%0, %%"FF_REG_a") \n\t" 431cabdff1aSopenharmony_ci "movq %%mm5, 8(%0, %%"FF_REG_a")\n\t" 432cabdff1aSopenharmony_ci 433cabdff1aSopenharmony_ci "add $16, %%"FF_REG_a" \n\t" 434cabdff1aSopenharmony_ci "jng 1b \n\t" 435cabdff1aSopenharmony_ci "movd 124(%0, %3), %%mm0 \n\t" 436cabdff1aSopenharmony_ci "movq %%mm7, %%mm6 \n\t" 437cabdff1aSopenharmony_ci "psrlq $32, %%mm7 \n\t" 438cabdff1aSopenharmony_ci "pxor %%mm6, %%mm7 \n\t" 439cabdff1aSopenharmony_ci "movq %%mm7, %%mm6 \n\t" 440cabdff1aSopenharmony_ci "psrlq $16, %%mm7 \n\t" 441cabdff1aSopenharmony_ci "pxor %%mm6, %%mm7 \n\t" 442cabdff1aSopenharmony_ci "pslld $31, %%mm7 \n\t" 443cabdff1aSopenharmony_ci "psrlq $15, %%mm7 \n\t" 444cabdff1aSopenharmony_ci "pxor %%mm7, %%mm0 \n\t" 445cabdff1aSopenharmony_ci "movd %%mm0, 124(%0, %3) \n\t" 446cabdff1aSopenharmony_ci 447cabdff1aSopenharmony_ci ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "r" (-2*nCoeffs) 448cabdff1aSopenharmony_ci : "%"FF_REG_a, "memory" 449cabdff1aSopenharmony_ci ); 450cabdff1aSopenharmony_ci} 451cabdff1aSopenharmony_ci 452cabdff1aSopenharmony_ci#endif /* HAVE_MMX_INLINE */ 453cabdff1aSopenharmony_ci 454cabdff1aSopenharmony_ciav_cold void ff_mpv_common_init_x86(MpegEncContext *s) 455cabdff1aSopenharmony_ci{ 456cabdff1aSopenharmony_ci#if HAVE_MMX_INLINE 457cabdff1aSopenharmony_ci int cpu_flags = av_get_cpu_flags(); 458cabdff1aSopenharmony_ci 459cabdff1aSopenharmony_ci if (INLINE_MMX(cpu_flags)) { 460cabdff1aSopenharmony_ci s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_mmx; 461cabdff1aSopenharmony_ci s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_mmx; 462cabdff1aSopenharmony_ci s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_mmx; 463cabdff1aSopenharmony_ci s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_mmx; 464cabdff1aSopenharmony_ci if (!(s->avctx->flags & AV_CODEC_FLAG_BITEXACT)) 465cabdff1aSopenharmony_ci s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx; 466cabdff1aSopenharmony_ci s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx; 467cabdff1aSopenharmony_ci } 468cabdff1aSopenharmony_ci#endif /* HAVE_MMX_INLINE */ 469cabdff1aSopenharmony_ci} 470