;
; jquant.asm - sample data conversion and quantization (SSE & MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32
;
; Load data into workspace, applying unsigned->signed conversion
;
; GLOBAL(void)
; jsimd_convsamp_float_sse(JSAMPARRAY sample_data, JDIMENSION start_col,
;                          FAST_FLOAT *workspace);
;
; ABI:    32-bit, all three arguments are read from the stack via ebp.
; Saved:  ebp, ebx, esi, edi (pushed/popped below).
; Clobb:  eax, ecx, edx, mm0-mm7, xmm0-xmm7, flags.
;         The MMX state is released with emms before returning.
;
; Register roles inside the loop:
;   esi = cursor into the sample_data row-pointer array (2 rows / iteration)
;   eax = start_col, used as the index into each sample row
;   edi = write cursor into workspace (2 rows of floats / iteration)
;   ecx = remaining iterations (starts at DCTSIZE/2)
;   ebx = pointer to the first row of the pair, edx = pointer to the second
;   mm7 = 0x80 replicated in every byte (the sample centering constant)
;
; Each iteration loads 8 samples from each of two rows, subtracts 0x80 from
; every byte, widens the bytes to sign-extended dwords, converts them to
; single-precision floats and stores 2 x 8 floats.
;
; NOTE: the results are written with movaps, which requires a 16-byte
; aligned address, so workspace must be 16-byte aligned.
;
; In the lane diagrams below, '*' marks a lane whose content is stale /
; don't-care.  Those lanes come from the previous content of the destination
; register and are discarded by the following arithmetic right shift, which
; keeps only the top byte of each dword (shift count DWORD_BIT-BYTE_BIT,
; presumably 32-8; the constants are defined in the include files).
;

%define sample_data  ebp + 8            ; JSAMPARRAY sample_data
%define start_col    ebp + 12           ; JDIMENSION start_col
%define workspace    ebp + 16           ; FAST_FLOAT *workspace

    align       32
    GLOBAL_FUNCTION(jsimd_convsamp_float_sse)

EXTN(jsimd_convsamp_float_sse):
    push        ebp
    mov         ebp, esp
    push        ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    ; Build the 0x80-per-byte constant without touching memory:
    ; all-ones words -> 0xFF80 words -> signed-saturating pack -> 0x80 bytes.
    pcmpeqw     mm7, mm7
    psllw       mm7, 7
    packsswb    mm7, mm7                ; mm7 = PB_CENTERJSAMPLE (0x808080..)

    mov         esi, JSAMPARRAY [sample_data]  ; (JSAMPROW *)
    mov         eax, JDIMENSION [start_col]
    mov         edi, POINTER [workspace]       ; (FAST_FLOAT *)
    mov         ecx, DCTSIZE/2
    alignx      16, 7
.convloop:
    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)

    movq        mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
    movq        mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]

    ; Byte-wise subtract of 0x80: unsigned sample -> signed sample.
    psubb       mm0, mm7                ; mm0=(01234567)
    psubb       mm1, mm7                ; mm1=(89ABCDEF)

    ; Widen bytes to words; the sample lands in the HIGH byte of each word.
    punpcklbw   mm2, mm0                ; mm2=(*0*1*2*3)
    punpckhbw   mm0, mm0                ; mm0=(*4*5*6*7)
    punpcklbw   mm3, mm1                ; mm3=(*8*9*A*B)
    punpckhbw   mm1, mm1                ; mm1=(*C*D*E*F)

    ; Widen words to dwords; the sample lands in the TOP byte of each dword.
    punpcklwd   mm4, mm2                ; mm4=(***0***1)
    punpckhwd   mm2, mm2                ; mm2=(***2***3)
    punpcklwd   mm5, mm0                ; mm5=(***4***5)
    punpckhwd   mm0, mm0                ; mm0=(***6***7)

    ; Arithmetic shift sign-extends the sample and drops the stale lanes,
    ; then each pair of dwords is converted to two floats.
    psrad       mm4, (DWORD_BIT-BYTE_BIT)  ; mm4=(01)
    psrad       mm2, (DWORD_BIT-BYTE_BIT)  ; mm2=(23)
    cvtpi2ps    xmm0, mm4                  ; xmm0=(01**)
    cvtpi2ps    xmm1, mm2                  ; xmm1=(23**)
    psrad       mm5, (DWORD_BIT-BYTE_BIT)  ; mm5=(45)
    psrad       mm0, (DWORD_BIT-BYTE_BIT)  ; mm0=(67)
    cvtpi2ps    xmm2, mm5                  ; xmm2=(45**)
    cvtpi2ps    xmm3, mm0                  ; xmm3=(67**)

    ; Same widening / conversion for the second row of the pair.
    punpcklwd   mm6, mm3                ; mm6=(***8***9)
    punpckhwd   mm3, mm3                ; mm3=(***A***B)
    punpcklwd   mm4, mm1                ; mm4=(***C***D)
    punpckhwd   mm1, mm1                ; mm1=(***E***F)

    psrad       mm6, (DWORD_BIT-BYTE_BIT)  ; mm6=(89)
    psrad       mm3, (DWORD_BIT-BYTE_BIT)  ; mm3=(AB)
    cvtpi2ps    xmm4, mm6                  ; xmm4=(89**)
    cvtpi2ps    xmm5, mm3                  ; xmm5=(AB**)
    psrad       mm4, (DWORD_BIT-BYTE_BIT)  ; mm4=(CD)
    psrad       mm1, (DWORD_BIT-BYTE_BIT)  ; mm1=(EF)
    cvtpi2ps    xmm6, mm4                  ; xmm6=(CD**)
    cvtpi2ps    xmm7, mm1                  ; xmm7=(EF**)

    ; Merge the two-float halves into full four-float vectors.
    movlhps     xmm0, xmm1              ; xmm0=(0123)
    movlhps     xmm2, xmm3              ; xmm2=(4567)
    movlhps     xmm4, xmm5              ; xmm4=(89AB)
    movlhps     xmm6, xmm7              ; xmm6=(CDEF)

    movaps      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
    movaps      XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm2
    movaps      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm4
    movaps      XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6

    ; Advance by two input rows and two rows of DCTSIZE floats.
    add         esi, byte 2*SIZEOF_JSAMPROW
    add         edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
    dec         ecx
    jnz         near .convloop

    emms                                ; empty MMX state

    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    pop         ebx
    pop         ebp
    ret

; --------------------------------------------------------------------------
;
; Quantize/descale the coefficients, and store into coef_block
;
; GLOBAL(void)
; jsimd_quantize_float_sse(JCOEFPTR coef_block, FAST_FLOAT *divisors,
;                          FAST_FLOAT *workspace);
;
; ABI:    32-bit, all three arguments are read from the stack via ebp.
; Saved:  ebp, esi, edi (pushed/popped below).
; Clobb:  eax, edx, mm0-mm7, xmm0-xmm7, flags.
;         The MMX state is released with emms before returning.
;
; Register roles inside the loop:
;   esi = read cursor into workspace
;   edx = read cursor into divisors
;   edi = write cursor into coef_block
;   eax = remaining iterations (starts at DCTSIZE2/16; 16 values / iteration)
;
; Each iteration multiplies 16 workspace floats element-wise by the matching
; entries of the divisors table, converts the products to 32-bit integers
; (cvtps2pi, rounding per the current MXCSR rounding mode) and packs them to
; 16-bit values with signed saturation (packssdw).
; NOTE(review): the code MULTIPLIES by the table entries, so the table
; presumably holds reciprocal quantization values - confirm against the
; code that builds it.
;
; NOTE: workspace is read with movaps and divisors is used as a mulps memory
; operand; both require 16-byte aligned addresses.
;

%define coef_block  ebp + 8             ; JCOEFPTR coef_block
%define divisors    ebp + 12            ; FAST_FLOAT *divisors
%define workspace   ebp + 16            ; FAST_FLOAT *workspace

    align       32
    GLOBAL_FUNCTION(jsimd_quantize_float_sse)

EXTN(jsimd_quantize_float_sse):
    push        ebp
    mov         ebp, esp
;   push        ebx                     ; unused
;   push        ecx                     ; unused
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    mov         esi, POINTER [workspace]
    mov         edx, POINTER [divisors]
    mov         edi, JCOEFPTR [coef_block]
    mov         eax, DCTSIZE2/16
    alignx      16, 7
.quantloop:
    movaps      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
    movaps      xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
    mulps       xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
    movaps      xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
    movaps      xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
    mulps       xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
    mulps       xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]

    ; cvtps2pi only converts the low two floats, so copy each upper pair
    ; down into a scratch register first.
    movhlps     xmm4, xmm0
    movhlps     xmm5, xmm1

    cvtps2pi    mm0, xmm0
    cvtps2pi    mm1, xmm1
    cvtps2pi    mm4, xmm4
    cvtps2pi    mm5, xmm5

    movhlps     xmm6, xmm2
    movhlps     xmm7, xmm3

    cvtps2pi    mm2, xmm2
    cvtps2pi    mm3, xmm3
    cvtps2pi    mm6, xmm6
    cvtps2pi    mm7, xmm7

    ; Narrow 2+2 dwords to 4 words (signed saturation), low pair first.
    packssdw    mm0, mm4
    packssdw    mm1, mm5
    packssdw    mm2, mm6
    packssdw    mm3, mm7

    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm2
    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3

    ; Advance all three cursors by the 16 elements just processed.
    add         esi, byte 16*SIZEOF_FAST_FLOAT
    add         edx, byte 16*SIZEOF_FAST_FLOAT
    add         edi, byte 16*SIZEOF_JCOEF
    dec         eax
    jnz         short .quantloop

    emms                                ; empty MMX state

    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; unused
;   pop         ebx                     ; unused
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32