;
; jidctflt.asm - floating-point IDCT (SSE & SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains a floating-point implementation of the inverse DCT
; (Discrete Cosine Transform). The following code is based directly on
; the IJG's original jidctflt.c; see the jidctflt.c for more details.
%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
; Two-operand helpers built on shufps.  They combine the low (or high) pair
; of 32-bit lanes of %1 with the low (or high) pair of lanes of %2, leaving
; the result in %1.  Lane numbering in the comments: %1 holds lanes 0..3,
; %2 holds lanes 4..7.

%macro unpcklps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
    shufps      %1, %2, 0x44
%endmacro

%macro unpckhps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
    shufps      %1, %2, 0xEE
%endmacro

; --------------------------------------------------------------------------
    SECTION     SEG_CONST

    alignz      32
    GLOBAL_DATA(jconst_idct_float_sse2)

; Constant table for jsimd_idct_float_sse2.  Every entry is one full xmmword
; (4 floats or 16 bytes) so it can be used directly as a packed memory
; operand; the function addresses the entries through GOTOFF(ebx, ...).

EXTN(jconst_idct_float_sse2):

; IDCT multipliers, each replicated across the four lanes.
PD_1_414        times 4  dd  1.414213562373095048801689
PD_1_847        times 4  dd  1.847759065022573512256366
PD_1_082        times 4  dd  1.082392200292393968799446
PD_M2_613       times 4  dd -2.613125929752753055713286
; Added to the pass-2 results before the low 16 bits of each lane are
; extracted as the rounded, descaled sample.
PD_RNDINT_MAGIC times 4  dd  100663296.0  ; (float)(0x00C00000 << 3)
; Byte-wise bias added to the packed signed samples in pass 2.
PB_CENTERJSAMP  times 16 db  CENTERJSAMPLE

    alignz      32

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32
;
; Perform dequantization and inverse DCT on one block of coefficients.
;
; GLOBAL(void)
; jsimd_idct_float_sse2(void *dct_table, JCOEFPTR coef_block,
;                       JSAMPARRAY output_buf, JDIMENSION output_col)
;
; Arguments are read from the stack, relative to the stack pointer value
; captured right after "push ebp" (offsets +8 .. +20, see the %defines
; below).
;
; Structure:
;   Pass 1 reads the coefficient block four columns at a time, multiplies by
;          the dct_table entries, runs the column IDCT and stores the
;          transposed floats into the on-stack workspace.
;   Pass 2 reads the workspace four rows at a time, runs the row IDCT,
;          converts to bytes, adds CENTERJSAMPLE and writes 8 samples to
;          each of four output rows at column output_col.
;
; Register roles:
;   Pass 1: edx = quantptr, esi = inptr, edi = wsptr, ecx = loop counter
;   Pass 2: esi = wsptr, edi = output_buf (array of row pointers),
;           eax = output_col, ecx = loop counter
;   ebx   : GOT address (borrowed as a row pointer inside pushpic/poppic)
;
; ebx, esi, edi and ebp are saved and restored.  eax, ecx, edx and
; xmm0-xmm7 are overwritten.
;
; Stack frame (ebp is aligned to SIZEOF_XMMWORD):
;   [ebp]       pre-alignment stack pointer (points at the saved ebp)
;   wk(0..1)    two xmmword scratch slots just below ebp
;   workspace   DCTSIZE2 FAST_FLOATs below wk(0)
;

%define dct_table(b)   (b) + 8          ; void *dct_table
%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
%define output_col(b)  (b) + 20         ; JDIMENSION output_col

%define original_ebp  ebp + 0
%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
                                        ; xmmword wk[WK_NUM]
%define WK_NUM        2
%define workspace     wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
                                        ; FAST_FLOAT workspace[DCTSIZE2]

    align       32
    GLOBAL_FUNCTION(jsimd_idct_float_sse2)

EXTN(jsimd_idct_float_sse2):
    push        ebp
    mov         eax, esp                    ; eax = original ebp
                                            ; (i.e. esp after the push; it
                                            ; points at the saved ebp and is
                                            ; the base for the argument
                                            ; offsets)
    sub         esp, byte 4
    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [esp], eax                  ; keep it for pass 2 / epilogue
    mov         ebp, esp                    ; ebp = aligned ebp
    lea         esp, [workspace]            ; reserve wk[] and workspace[]
    push        ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    get_GOT     ebx                     ; get GOT address

    ; ---- Pass 1: process columns from input, store into work array.

    ; eax is not reloaded here: the code relies on it still holding the value
    ; set in the prologue.  NOTE(review): this assumes get_GOT (defined
    ; elsewhere) leaves eax untouched -- confirm against jsimdext.inc.
;   mov         eax, [original_ebp]
    mov         edx, POINTER [dct_table(eax)]    ; quantptr
    mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
    lea         edi, [workspace]                 ; FAST_FLOAT *wsptr
    mov         ecx, DCTSIZE/4                   ; ctr
    alignx      16, 7
.columnloop:
%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
    ; Shortcut: if rows 1..7 of these four columns are all zero, only the
    ; DC row contributes.  Rows 1 and 2 are probed first with a cheap
    ; integer test before OR-ing all seven rows together.
    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
    jnz         near .columnDCT

    movq        xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
    movq        xmm2, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
    movq        xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
    movq        xmm4, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
    movq        xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
    movq        xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
    movq        xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
    por         xmm1, xmm2
    por         xmm3, xmm4
    por         xmm5, xmm6
    por         xmm1, xmm3
    por         xmm5, xmm7
    por         xmm1, xmm5
    packsswb    xmm1, xmm1              ; squeeze the four words into 32 bits
    movd        eax, xmm1
    test        eax, eax
    jnz         short .columnDCT

    ; -- AC terms all zero

    movq        xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]

    punpcklwd   xmm0, xmm0                  ; xmm0=(00 00 01 01 02 02 03 03)
    psrad       xmm0, (DWORD_BIT-WORD_BIT)  ; xmm0=in0=(00 01 02 03)
    cvtdq2ps    xmm0, xmm0                  ; xmm0=in0=(00 01 02 03)

    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

    movaps      xmm1, xmm0
    movaps      xmm2, xmm0
    movaps      xmm3, xmm0

    shufps      xmm0, xmm0, 0x00            ; xmm0=(00 00 00 00)
    shufps      xmm1, xmm1, 0x55            ; xmm1=(01 01 01 01)
    shufps      xmm2, xmm2, 0xAA            ; xmm2=(02 02 02 02)
    shufps      xmm3, xmm3, 0xFF            ; xmm3=(03 03 03 03)

    movaps      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
    movaps      XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
    movaps      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
    movaps      XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
    movaps      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
    movaps      XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
    movaps      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
    movaps      XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
    jmp         near .nextcolumn
    alignx      16, 7
%endif
.columnDCT:

    ; -- Even part

    movq        xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
    movq        xmm1, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
    movq        xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
    movq        xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]

    ; Sign-extend each 16-bit coefficient to 32 bits (duplicate the word,
    ; then arithmetic-shift right), convert to float and dequantize.
    punpcklwd   xmm0, xmm0                  ; xmm0=(00 00 01 01 02 02 03 03)
    punpcklwd   xmm1, xmm1                  ; xmm1=(20 20 21 21 22 22 23 23)
    psrad       xmm0, (DWORD_BIT-WORD_BIT)  ; xmm0=in0=(00 01 02 03)
    psrad       xmm1, (DWORD_BIT-WORD_BIT)  ; xmm1=in2=(20 21 22 23)
    cvtdq2ps    xmm0, xmm0                  ; xmm0=in0=(00 01 02 03)
    cvtdq2ps    xmm1, xmm1                  ; xmm1=in2=(20 21 22 23)

    punpcklwd   xmm2, xmm2                  ; xmm2=(40 40 41 41 42 42 43 43)
    punpcklwd   xmm3, xmm3                  ; xmm3=(60 60 61 61 62 62 63 63)
    psrad       xmm2, (DWORD_BIT-WORD_BIT)  ; xmm2=in4=(40 41 42 43)
    psrad       xmm3, (DWORD_BIT-WORD_BIT)  ; xmm3=in6=(60 61 62 63)
    cvtdq2ps    xmm2, xmm2                  ; xmm2=in4=(40 41 42 43)
    cvtdq2ps    xmm3, xmm3                  ; xmm3=in6=(60 61 62 63)

    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    mulps       xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    mulps       xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    mulps       xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

    movaps      xmm4, xmm0
    movaps      xmm5, xmm1
    subps       xmm0, xmm2              ; xmm0=tmp11
    subps       xmm1, xmm3
    addps       xmm4, xmm2              ; xmm4=tmp10
    addps       xmm5, xmm3              ; xmm5=tmp13

    mulps       xmm1, [GOTOFF(ebx,PD_1_414)]
    subps       xmm1, xmm5              ; xmm1=tmp12

    movaps      xmm6, xmm4
    movaps      xmm7, xmm0
    subps       xmm4, xmm5              ; xmm4=tmp3
    subps       xmm0, xmm1              ; xmm0=tmp2
    addps       xmm6, xmm5              ; xmm6=tmp0
    addps       xmm7, xmm1              ; xmm7=tmp1

    ; All eight xmm registers are needed for the odd part, so tmp2/tmp3 are
    ; parked in the wk[] scratch slots.
    movaps      XMMWORD [wk(1)], xmm4   ; tmp3
    movaps      XMMWORD [wk(0)], xmm0   ; tmp2

    ; -- Odd part

    movq        xmm2, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
    movq        xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
    movq        xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
    movq        xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]

    punpcklwd   xmm2, xmm2                  ; xmm2=(10 10 11 11 12 12 13 13)
    punpcklwd   xmm3, xmm3                  ; xmm3=(30 30 31 31 32 32 33 33)
    psrad       xmm2, (DWORD_BIT-WORD_BIT)  ; xmm2=in1=(10 11 12 13)
    psrad       xmm3, (DWORD_BIT-WORD_BIT)  ; xmm3=in3=(30 31 32 33)
    cvtdq2ps    xmm2, xmm2                  ; xmm2=in1=(10 11 12 13)
    cvtdq2ps    xmm3, xmm3                  ; xmm3=in3=(30 31 32 33)

    punpcklwd   xmm5, xmm5                  ; xmm5=(50 50 51 51 52 52 53 53)
    punpcklwd   xmm1, xmm1                  ; xmm1=(70 70 71 71 72 72 73 73)
    psrad       xmm5, (DWORD_BIT-WORD_BIT)  ; xmm5=in5=(50 51 52 53)
    psrad       xmm1, (DWORD_BIT-WORD_BIT)  ; xmm1=in7=(70 71 72 73)
    cvtdq2ps    xmm5, xmm5                  ; xmm5=in5=(50 51 52 53)
    cvtdq2ps    xmm1, xmm1                  ; xmm1=in7=(70 71 72 73)

    mulps       xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    mulps       xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    mulps       xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    mulps       xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

    movaps      xmm4, xmm2
    movaps      xmm0, xmm5
    addps       xmm2, xmm1              ; xmm2=z11
    addps       xmm5, xmm3              ; xmm5=z13
    subps       xmm4, xmm1              ; xmm4=z12
    subps       xmm0, xmm3              ; xmm0=z10

    movaps      xmm1, xmm2
    subps       xmm2, xmm5
    addps       xmm1, xmm5              ; xmm1=tmp7

    mulps       xmm2, [GOTOFF(ebx,PD_1_414)]  ; xmm2=tmp11

    movaps      xmm3, xmm0
    addps       xmm0, xmm4
    mulps       xmm0, [GOTOFF(ebx,PD_1_847)]   ; xmm0=z5
    mulps       xmm3, [GOTOFF(ebx,PD_M2_613)]  ; xmm3=(z10 * -2.613125930)
    mulps       xmm4, [GOTOFF(ebx,PD_1_082)]   ; xmm4=(z12 * 1.082392200)
    addps       xmm3, xmm0              ; xmm3=tmp12
    subps       xmm4, xmm0              ; xmm4=tmp10

    ; -- Final output stage

    subps       xmm3, xmm1              ; xmm3=tmp6
    movaps      xmm5, xmm6
    movaps      xmm0, xmm7
    addps       xmm6, xmm1              ; xmm6=data0=(00 01 02 03)
    addps       xmm7, xmm3              ; xmm7=data1=(10 11 12 13)
    subps       xmm5, xmm1              ; xmm5=data7=(70 71 72 73)
    subps       xmm0, xmm3              ; xmm0=data6=(60 61 62 63)
    subps       xmm2, xmm3              ; xmm2=tmp5

    movaps      xmm1, xmm6              ; transpose coefficients(phase 1)
    unpcklps    xmm6, xmm7              ; xmm6=(00 10 01 11)
    unpckhps    xmm1, xmm7              ; xmm1=(02 12 03 13)
    movaps      xmm3, xmm0              ; transpose coefficients(phase 1)
    unpcklps    xmm0, xmm5              ; xmm0=(60 70 61 71)
    unpckhps    xmm3, xmm5              ; xmm3=(62 72 63 73)

    ; Swap: pull tmp2/tmp3 back out of wk[] and park the half-transposed
    ; rows 6/7 there instead.
    movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
    movaps      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp3

    movaps      XMMWORD [wk(0)], xmm0   ; wk(0)=(60 70 61 71)
    movaps      XMMWORD [wk(1)], xmm3   ; wk(1)=(62 72 63 73)

    addps       xmm4, xmm2              ; xmm4=tmp4
    movaps      xmm0, xmm7
    movaps      xmm3, xmm5
    addps       xmm7, xmm2              ; xmm7=data2=(20 21 22 23)
    addps       xmm5, xmm4              ; xmm5=data4=(40 41 42 43)
    subps       xmm0, xmm2              ; xmm0=data5=(50 51 52 53)
    subps       xmm3, xmm4              ; xmm3=data3=(30 31 32 33)

    movaps      xmm2, xmm7              ; transpose coefficients(phase 1)
    unpcklps    xmm7, xmm3              ; xmm7=(20 30 21 31)
    unpckhps    xmm2, xmm3              ; xmm2=(22 32 23 33)
    movaps      xmm4, xmm5              ; transpose coefficients(phase 1)
    unpcklps    xmm5, xmm0              ; xmm5=(40 50 41 51)
    unpckhps    xmm4, xmm0              ; xmm4=(42 52 43 53)

    movaps      xmm3, xmm6              ; transpose coefficients(phase 2)
    unpcklps2   xmm6, xmm7              ; xmm6=(00 10 20 30)
    unpckhps2   xmm3, xmm7              ; xmm3=(01 11 21 31)
    movaps      xmm0, xmm1              ; transpose coefficients(phase 2)
    unpcklps2   xmm1, xmm2              ; xmm1=(02 12 22 32)
    unpckhps2   xmm0, xmm2              ; xmm0=(03 13 23 33)

    movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=(60 70 61 71)
    movaps      xmm2, XMMWORD [wk(1)]   ; xmm2=(62 72 63 73)

    movaps      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
    movaps      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
    movaps      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
    movaps      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0

    movaps      xmm6, xmm5              ; transpose coefficients(phase 2)
    unpcklps2   xmm5, xmm7              ; xmm5=(40 50 60 70)
    unpckhps2   xmm6, xmm7              ; xmm6=(41 51 61 71)
    movaps      xmm3, xmm4              ; transpose coefficients(phase 2)
    unpcklps2   xmm4, xmm2              ; xmm4=(42 52 62 72)
    unpckhps2   xmm3, xmm2              ; xmm3=(43 53 63 73)

    movaps      XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
    movaps      XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
    movaps      XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
    movaps      XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3

.nextcolumn:
    ; Advance by four columns of input / quant table and by four transposed
    ; rows of workspace.
    add         esi, byte 4*SIZEOF_JCOEF               ; coef_block
    add         edx, byte 4*SIZEOF_FLOAT_MULT_TYPE     ; quantptr
    add         edi,      4*DCTSIZE*SIZEOF_FAST_FLOAT  ; wsptr
    dec         ecx                                    ; ctr
    jnz         near .columnloop

    ; -- Prefetch the next coefficient block

    ; esi has advanced by 8 coefficients over the two iterations, so
    ; esi + (DCTSIZE2-8)*SIZEOF_JCOEF is the first byte past this block.
    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]

    ; ---- Pass 2: process rows from work array, store into output array.

    mov         eax, [original_ebp]                ; reload argument base
    lea         esi, [workspace]                   ; FAST_FLOAT *wsptr
    mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
    mov         eax, JDIMENSION [output_col(eax)]  ; eax = output_col from here
    mov         ecx, DCTSIZE/4                     ; ctr
    alignx      16, 7
.rowloop:

    ; -- Even part

    movaps      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
    movaps      xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
    movaps      xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
    movaps      xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]

    movaps      xmm4, xmm0
    movaps      xmm5, xmm1
    subps       xmm0, xmm2              ; xmm0=tmp11
    subps       xmm1, xmm3
    addps       xmm4, xmm2              ; xmm4=tmp10
    addps       xmm5, xmm3              ; xmm5=tmp13

    mulps       xmm1, [GOTOFF(ebx,PD_1_414)]
    subps       xmm1, xmm5              ; xmm1=tmp12

    movaps      xmm6, xmm4
    movaps      xmm7, xmm0
    subps       xmm4, xmm5              ; xmm4=tmp3
    subps       xmm0, xmm1              ; xmm0=tmp2
    addps       xmm6, xmm5              ; xmm6=tmp0
    addps       xmm7, xmm1              ; xmm7=tmp1

    movaps      XMMWORD [wk(1)], xmm4   ; tmp3
    movaps      XMMWORD [wk(0)], xmm0   ; tmp2

    ; -- Odd part

    movaps      xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
    movaps      xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
    movaps      xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
    movaps      xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]

    movaps      xmm4, xmm2
    movaps      xmm0, xmm5
    addps       xmm2, xmm1              ; xmm2=z11
    addps       xmm5, xmm3              ; xmm5=z13
    subps       xmm4, xmm1              ; xmm4=z12
    subps       xmm0, xmm3              ; xmm0=z10

    movaps      xmm1, xmm2
    subps       xmm2, xmm5
    addps       xmm1, xmm5              ; xmm1=tmp7

    mulps       xmm2, [GOTOFF(ebx,PD_1_414)]  ; xmm2=tmp11

    movaps      xmm3, xmm0
    addps       xmm0, xmm4
    mulps       xmm0, [GOTOFF(ebx,PD_1_847)]   ; xmm0=z5
    mulps       xmm3, [GOTOFF(ebx,PD_M2_613)]  ; xmm3=(z10 * -2.613125930)
    mulps       xmm4, [GOTOFF(ebx,PD_1_082)]   ; xmm4=(z12 * 1.082392200)
    addps       xmm3, xmm0              ; xmm3=tmp12
    subps       xmm4, xmm0              ; xmm4=tmp10

    ; -- Final output stage

    subps       xmm3, xmm1              ; xmm3=tmp6
    movaps      xmm5, xmm6
    movaps      xmm0, xmm7
    addps       xmm6, xmm1              ; xmm6=data0=(00 10 20 30)
    addps       xmm7, xmm3              ; xmm7=data1=(01 11 21 31)
    subps       xmm5, xmm1              ; xmm5=data7=(07 17 27 37)
    subps       xmm0, xmm3              ; xmm0=data6=(06 16 26 36)
    subps       xmm2, xmm3              ; xmm2=tmp5

    ; Float -> integer: add the magic constant, then treat the register as
    ; integer data and keep only the low 16 bits of every lane.  Even-numbered
    ; outputs are masked in place, odd-numbered ones are shifted into the
    ; high word, and the pairs are OR-ed together.
    movaps      xmm1, [GOTOFF(ebx,PD_RNDINT_MAGIC)]  ; xmm1=[PD_RNDINT_MAGIC]
    pcmpeqd     xmm3, xmm3
    psrld       xmm3, WORD_BIT          ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}

    addps       xmm6, xmm1  ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
    addps       xmm7, xmm1  ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
    addps       xmm0, xmm1  ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
    addps       xmm5, xmm1  ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)

    pand        xmm6, xmm3              ; xmm6=(00 -- 10 -- 20 -- 30 --)
    pslld       xmm7, WORD_BIT          ; xmm7=(-- 01 -- 11 -- 21 -- 31)
    pand        xmm0, xmm3              ; xmm0=(06 -- 16 -- 26 -- 36 --)
    pslld       xmm5, WORD_BIT          ; xmm5=(-- 07 -- 17 -- 27 -- 37)
    por         xmm6, xmm7              ; xmm6=(00 01 10 11 20 21 30 31)
    por         xmm0, xmm5              ; xmm0=(06 07 16 17 26 27 36 37)

    movaps      xmm1, XMMWORD [wk(0)]   ; xmm1=tmp2
    movaps      xmm3, XMMWORD [wk(1)]   ; xmm3=tmp3

    addps       xmm4, xmm2              ; xmm4=tmp4
    movaps      xmm7, xmm1
    movaps      xmm5, xmm3
    addps       xmm1, xmm2              ; xmm1=data2=(02 12 22 32)
    addps       xmm3, xmm4              ; xmm3=data4=(04 14 24 34)
    subps       xmm7, xmm2              ; xmm7=data5=(05 15 25 35)
    subps       xmm5, xmm4              ; xmm5=data3=(03 13 23 33)

    movaps      xmm2, [GOTOFF(ebx,PD_RNDINT_MAGIC)]  ; xmm2=[PD_RNDINT_MAGIC]
    pcmpeqd     xmm4, xmm4
    psrld       xmm4, WORD_BIT          ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}

    addps       xmm3, xmm2  ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
    addps       xmm7, xmm2  ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
    addps       xmm1, xmm2  ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
    addps       xmm5, xmm2  ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)

    pand        xmm3, xmm4              ; xmm3=(04 -- 14 -- 24 -- 34 --)
    pslld       xmm7, WORD_BIT          ; xmm7=(-- 05 -- 15 -- 25 -- 35)
    pand        xmm1, xmm4              ; xmm1=(02 -- 12 -- 22 -- 32 --)
    pslld       xmm5, WORD_BIT          ; xmm5=(-- 03 -- 13 -- 23 -- 33)
    por         xmm3, xmm7              ; xmm3=(04 05 14 15 24 25 34 35)
    por         xmm1, xmm5              ; xmm1=(02 03 12 13 22 23 32 33)

    movdqa      xmm2, [GOTOFF(ebx,PB_CENTERJSAMP)]  ; xmm2=[PB_CENTERJSAMP]

    ; Saturate the words to signed bytes, then add the CENTERJSAMPLE bias
    ; to every byte.
    packsswb    xmm6, xmm3        ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
    packsswb    xmm1, xmm0        ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
    paddb       xmm6, xmm2
    paddb       xmm1, xmm2

    movdqa      xmm4, xmm6        ; transpose coefficients(phase 2)
    punpcklwd   xmm6, xmm1        ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
    punpckhwd   xmm4, xmm1        ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)

    movdqa      xmm7, xmm6        ; transpose coefficients(phase 3)
    punpckldq   xmm6, xmm4        ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
    punpckhdq   xmm7, xmm4        ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)

    ; Swap the 64-bit halves so rows 1 and 3 sit in the low quadword, where
    ; movq can store them.
    pshufd      xmm5, xmm6, 0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
    pshufd      xmm3, xmm7, 0x4E  ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)

    ; ebx is borrowed as a second row pointer while the four rows are stored.
    pushpic     ebx                     ; save GOT address

    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
    mov         ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
    movq        XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7
    mov         edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
    mov         ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
    movq        XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3

    poppic      ebx                     ; restore GOT address

    add         esi, byte 4*SIZEOF_FAST_FLOAT  ; wsptr
    add         edi, byte 4*SIZEOF_JSAMPROW
    dec         ecx                            ; ctr
    jnz         near .rowloop

    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    pop         ebx
    mov         esp, ebp                ; esp <- aligned ebp
    pop         esp                     ; esp <- original ebp
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32