1cb93a386Sopenharmony_ci;
2cb93a386Sopenharmony_ci; jquant.asm - sample data conversion and quantization (SSE & MMX)
3cb93a386Sopenharmony_ci;
4cb93a386Sopenharmony_ci; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5cb93a386Sopenharmony_ci; Copyright (C) 2016, D. R. Commander.
6cb93a386Sopenharmony_ci;
7cb93a386Sopenharmony_ci; Based on the x86 SIMD extension for IJG JPEG library
8cb93a386Sopenharmony_ci; Copyright (C) 1999-2006, MIYASAKA Masaru.
9cb93a386Sopenharmony_ci; For conditions of distribution and use, see copyright notice in jsimdext.inc
10cb93a386Sopenharmony_ci;
11cb93a386Sopenharmony_ci; This file should be assembled with NASM (Netwide Assembler); it
12cb93a386Sopenharmony_ci; can *not* be assembled with Microsoft's MASM or any compatible
13cb93a386Sopenharmony_ci; assembler (including Borland's Turbo Assembler).
14cb93a386Sopenharmony_ci; NASM is available from http://nasm.sourceforge.net/ or
15cb93a386Sopenharmony_ci; http://sourceforge.net/project/showfiles.php?group_id=6208
16cb93a386Sopenharmony_ci
17cb93a386Sopenharmony_ci%include "jsimdext.inc"
18cb93a386Sopenharmony_ci%include "jdct.inc"
19cb93a386Sopenharmony_ci
20cb93a386Sopenharmony_ci; --------------------------------------------------------------------------
21cb93a386Sopenharmony_ci    SECTION     SEG_TEXT
22cb93a386Sopenharmony_ci    BITS        32
23cb93a386Sopenharmony_ci;
24cb93a386Sopenharmony_ci; Load data into workspace, applying unsigned->signed conversion
25cb93a386Sopenharmony_ci;
26cb93a386Sopenharmony_ci; GLOBAL(void)
27cb93a386Sopenharmony_ci; jsimd_convsamp_float_sse(JSAMPARRAY sample_data, JDIMENSION start_col,
28cb93a386Sopenharmony_ci;                          FAST_FLOAT *workspace);
29cb93a386Sopenharmony_ci;
; Each loop iteration handles two sample rows: 8 bytes are read from each
; row starting at column start_col, 0x80 is subtracted from every byte, and
; the resulting signed values are converted to single-precision floats and
; stored to workspace.  The loop runs DCTSIZE/2 times.
;
; In:    [ebp+8] = sample_data, [ebp+12] = start_col, [ebp+16] = workspace
;        (stack arguments, addressed through ebp after the prologue)
; Out:   none (void); workspace is filled
; Saved: ebx, esi, edi, ebp
; Clobb: eax, ecx, edx, mm0-mm7, xmm0-xmm7, flags
; Note:  workspace is written with movaps, which faults on addresses that
;        are not 16-byte aligned.
;
30cb93a386Sopenharmony_ci
31cb93a386Sopenharmony_ci%define sample_data  ebp + 8            ; JSAMPARRAY sample_data
32cb93a386Sopenharmony_ci%define start_col    ebp + 12           ; JDIMENSION start_col
33cb93a386Sopenharmony_ci%define workspace    ebp + 16           ; FAST_FLOAT *workspace
34cb93a386Sopenharmony_ci
35cb93a386Sopenharmony_ci    align       32
36cb93a386Sopenharmony_ci    GLOBAL_FUNCTION(jsimd_convsamp_float_sse)
37cb93a386Sopenharmony_ci
38cb93a386Sopenharmony_ciEXTN(jsimd_convsamp_float_sse):
39cb93a386Sopenharmony_ci    push        ebp
40cb93a386Sopenharmony_ci    mov         ebp, esp
41cb93a386Sopenharmony_ci    push        ebx
42cb93a386Sopenharmony_ci;   push        ecx                     ; need not be preserved
43cb93a386Sopenharmony_ci;   push        edx                     ; need not be preserved
44cb93a386Sopenharmony_ci    push        esi
45cb93a386Sopenharmony_ci    push        edi
46cb93a386Sopenharmony_ci
    ; Build the per-byte bias constant in a register instead of loading it
    ; from memory.
47cb93a386Sopenharmony_ci    pcmpeqw     mm7, mm7                ; every word = 0xFFFF
48cb93a386Sopenharmony_ci    psllw       mm7, 7                  ; every word = 0xFF80 (-128)
49cb93a386Sopenharmony_ci    packsswb    mm7, mm7                ; mm7 = PB_CENTERJSAMPLE (0x808080..)
50cb93a386Sopenharmony_ci
51cb93a386Sopenharmony_ci    mov         esi, JSAMPARRAY [sample_data]  ; (JSAMPROW *)
52cb93a386Sopenharmony_ci    mov         eax, JDIMENSION [start_col]
53cb93a386Sopenharmony_ci    mov         edi, POINTER [workspace]       ; (FAST_FLOAT *)
54cb93a386Sopenharmony_ci    mov         ecx, DCTSIZE/2          ; loop counter: two rows per pass
55cb93a386Sopenharmony_ci    alignx      16, 7
56cb93a386Sopenharmony_ci.convloop:
57cb93a386Sopenharmony_ci    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
58cb93a386Sopenharmony_ci    mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
59cb93a386Sopenharmony_ci
    ; Load 8 samples from each of the two rows, starting at start_col.
60cb93a386Sopenharmony_ci    movq        mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
61cb93a386Sopenharmony_ci    movq        mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]
62cb93a386Sopenharmony_ci
    ; Subtract 0x80 from every byte (the unsigned->signed conversion).
63cb93a386Sopenharmony_ci    psubb       mm0, mm7                ; mm0=(01234567)
64cb93a386Sopenharmony_ci    psubb       mm1, mm7                ; mm1=(89ABCDEF)
65cb93a386Sopenharmony_ci
    ; Widen bytes to words, then words to dwords.  In the diagrams '*' is a
    ; don't-care byte: each sample ends up in the most significant byte of
    ; its element, and the lower bytes hold stale register contents or a
    ; copy of the sample.  The arithmetic shifts below discard them.
66cb93a386Sopenharmony_ci    punpcklbw   mm2, mm0                ; mm2=(*0*1*2*3)
67cb93a386Sopenharmony_ci    punpckhbw   mm0, mm0                ; mm0=(*4*5*6*7)
68cb93a386Sopenharmony_ci    punpcklbw   mm3, mm1                ; mm3=(*8*9*A*B)
69cb93a386Sopenharmony_ci    punpckhbw   mm1, mm1                ; mm1=(*C*D*E*F)
70cb93a386Sopenharmony_ci
71cb93a386Sopenharmony_ci    punpcklwd   mm4, mm2                ; mm4=(***0***1)
72cb93a386Sopenharmony_ci    punpckhwd   mm2, mm2                ; mm2=(***2***3)
73cb93a386Sopenharmony_ci    punpcklwd   mm5, mm0                ; mm5=(***4***5)
74cb93a386Sopenharmony_ci    punpckhwd   mm0, mm0                ; mm0=(***6***7)
75cb93a386Sopenharmony_ci
    ; psrad by DWORD_BIT-BYTE_BIT (expected to be 32-8 = 24; the constants
    ; are defined outside this file) moves the top byte to the bottom with
    ; sign extension, giving signed 32-bit integers.  cvtpi2ps converts two
    ; of them to floats in the low half of the xmm register and leaves the
    ; upper half unchanged ('**').
76cb93a386Sopenharmony_ci    psrad       mm4, (DWORD_BIT-BYTE_BIT)  ; mm4=(01)
77cb93a386Sopenharmony_ci    psrad       mm2, (DWORD_BIT-BYTE_BIT)  ; mm2=(23)
78cb93a386Sopenharmony_ci    cvtpi2ps    xmm0, mm4                  ; xmm0=(01**)
79cb93a386Sopenharmony_ci    cvtpi2ps    xmm1, mm2                  ; xmm1=(23**)
80cb93a386Sopenharmony_ci    psrad       mm5, (DWORD_BIT-BYTE_BIT)  ; mm5=(45)
81cb93a386Sopenharmony_ci    psrad       mm0, (DWORD_BIT-BYTE_BIT)  ; mm0=(67)
82cb93a386Sopenharmony_ci    cvtpi2ps    xmm2, mm5                  ; xmm2=(45**)
83cb93a386Sopenharmony_ci    cvtpi2ps    xmm3, mm0                  ; xmm3=(67**)
84cb93a386Sopenharmony_ci
    ; Same widening for the second row.  mm4 is free for reuse here because
    ; its previous contents were already converted into xmm0 above.
85cb93a386Sopenharmony_ci    punpcklwd   mm6, mm3                ; mm6=(***8***9)
86cb93a386Sopenharmony_ci    punpckhwd   mm3, mm3                ; mm3=(***A***B)
87cb93a386Sopenharmony_ci    punpcklwd   mm4, mm1                ; mm4=(***C***D)
88cb93a386Sopenharmony_ci    punpckhwd   mm1, mm1                ; mm1=(***E***F)
89cb93a386Sopenharmony_ci
90cb93a386Sopenharmony_ci    psrad       mm6, (DWORD_BIT-BYTE_BIT)  ; mm6=(89)
91cb93a386Sopenharmony_ci    psrad       mm3, (DWORD_BIT-BYTE_BIT)  ; mm3=(AB)
92cb93a386Sopenharmony_ci    cvtpi2ps    xmm4, mm6                  ; xmm4=(89**)
93cb93a386Sopenharmony_ci    cvtpi2ps    xmm5, mm3                  ; xmm5=(AB**)
94cb93a386Sopenharmony_ci    psrad       mm4, (DWORD_BIT-BYTE_BIT)  ; mm4=(CD)
95cb93a386Sopenharmony_ci    psrad       mm1, (DWORD_BIT-BYTE_BIT)  ; mm1=(EF)
96cb93a386Sopenharmony_ci    cvtpi2ps    xmm6, mm4                  ; xmm6=(CD**)
97cb93a386Sopenharmony_ci    cvtpi2ps    xmm7, mm1                  ; xmm7=(EF**)
98cb93a386Sopenharmony_ci
    ; Merge pairs of half-filled registers: movlhps copies the low two
    ; floats of the source into the high two floats of the destination.
99cb93a386Sopenharmony_ci    movlhps     xmm0, xmm1              ; xmm0=(0123)
100cb93a386Sopenharmony_ci    movlhps     xmm2, xmm3              ; xmm2=(4567)
101cb93a386Sopenharmony_ci    movlhps     xmm4, xmm5              ; xmm4=(89AB)
102cb93a386Sopenharmony_ci    movlhps     xmm6, xmm7              ; xmm6=(CDEF)
103cb93a386Sopenharmony_ci
    ; Store 16 floats.  XMMBLOCK is defined outside this file; from its use
    ; here the first index presumably selects the row and the second the
    ; group of four floats within it -- confirm against the macro.
104cb93a386Sopenharmony_ci    movaps      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
105cb93a386Sopenharmony_ci    movaps      XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm2
106cb93a386Sopenharmony_ci    movaps      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm4
107cb93a386Sopenharmony_ci    movaps      XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
108cb93a386Sopenharmony_ci
    ; Advance both pointers by two rows and loop until the counter is zero.
109cb93a386Sopenharmony_ci    add         esi, byte 2*SIZEOF_JSAMPROW
110cb93a386Sopenharmony_ci    add         edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
111cb93a386Sopenharmony_ci    dec         ecx
112cb93a386Sopenharmony_ci    jnz         near .convloop
113cb93a386Sopenharmony_ci
114cb93a386Sopenharmony_ci    emms                                ; empty MMX state
115cb93a386Sopenharmony_ci
116cb93a386Sopenharmony_ci    pop         edi
117cb93a386Sopenharmony_ci    pop         esi
118cb93a386Sopenharmony_ci;   pop         edx                     ; need not be preserved
119cb93a386Sopenharmony_ci;   pop         ecx                     ; need not be preserved
120cb93a386Sopenharmony_ci    pop         ebx
121cb93a386Sopenharmony_ci    pop         ebp
122cb93a386Sopenharmony_ci    ret
123cb93a386Sopenharmony_ci
124cb93a386Sopenharmony_ci; --------------------------------------------------------------------------
125cb93a386Sopenharmony_ci;
126cb93a386Sopenharmony_ci; Quantize/descale the coefficients, and store into coef_block
127cb93a386Sopenharmony_ci;
128cb93a386Sopenharmony_ci; GLOBAL(void)
129cb93a386Sopenharmony_ci; jsimd_quantize_float_sse(JCOEFPTR coef_block, FAST_FLOAT *divisors,
130cb93a386Sopenharmony_ci;                          FAST_FLOAT *workspace);
131cb93a386Sopenharmony_ci;
; Each loop iteration processes 16 values: the workspace floats are
; multiplied element-wise by the matching entries of divisors (mulps),
; converted to 32-bit integers (cvtps2pi), narrowed to 16 bits with signed
; saturation (packssdw), and stored to coef_block.  The loop runs
; DCTSIZE2/16 times.
; NOTE(review): the table is applied with a multiply, so it presumably holds
; reciprocals of the quantization steps -- confirm against the code that
; builds it.
;
; In:    [ebp+8] = coef_block, [ebp+12] = divisors, [ebp+16] = workspace
;        (stack arguments, addressed through ebp after the prologue)
; Out:   none (void); coef_block is filled
; Saved: esi, edi, ebp
; Clobb: eax, edx, mm0-mm7, xmm0-xmm7, flags
; Note:  workspace and divisors are read with movaps and mulps memory
;        operands, which fault on addresses that are not 16-byte aligned.
;
132cb93a386Sopenharmony_ci
133cb93a386Sopenharmony_ci%define coef_block  ebp + 8             ; JCOEFPTR coef_block
134cb93a386Sopenharmony_ci%define divisors    ebp + 12            ; FAST_FLOAT *divisors
135cb93a386Sopenharmony_ci%define workspace   ebp + 16            ; FAST_FLOAT *workspace
136cb93a386Sopenharmony_ci
137cb93a386Sopenharmony_ci    align       32
138cb93a386Sopenharmony_ci    GLOBAL_FUNCTION(jsimd_quantize_float_sse)
139cb93a386Sopenharmony_ci
140cb93a386Sopenharmony_ciEXTN(jsimd_quantize_float_sse):
141cb93a386Sopenharmony_ci    push        ebp
142cb93a386Sopenharmony_ci    mov         ebp, esp
143cb93a386Sopenharmony_ci;   push        ebx                     ; unused
144cb93a386Sopenharmony_ci;   push        ecx                     ; unused
145cb93a386Sopenharmony_ci;   push        edx                     ; need not be preserved
146cb93a386Sopenharmony_ci    push        esi
147cb93a386Sopenharmony_ci    push        edi
148cb93a386Sopenharmony_ci
149cb93a386Sopenharmony_ci    mov         esi, POINTER [workspace]    ; esi = input floats
150cb93a386Sopenharmony_ci    mov         edx, POINTER [divisors]     ; edx = per-element multipliers
151cb93a386Sopenharmony_ci    mov         edi, JCOEFPTR [coef_block]  ; edi = 16-bit output
152cb93a386Sopenharmony_ci    mov         eax, DCTSIZE2/16            ; loop counter: 16 values per pass
153cb93a386Sopenharmony_ci    alignx      16, 7
154cb93a386Sopenharmony_ci.quantloop:
    ; Load four groups of four floats and scale each by its table entry.
155cb93a386Sopenharmony_ci    movaps      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
156cb93a386Sopenharmony_ci    movaps      xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
157cb93a386Sopenharmony_ci    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
158cb93a386Sopenharmony_ci    mulps       xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
159cb93a386Sopenharmony_ci    movaps      xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
160cb93a386Sopenharmony_ci    movaps      xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
161cb93a386Sopenharmony_ci    mulps       xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
162cb93a386Sopenharmony_ci    mulps       xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
163cb93a386Sopenharmony_ci
    ; cvtps2pi converts only the low two floats of an xmm register, so the
    ; high two are first copied down into a scratch register with movhlps.
164cb93a386Sopenharmony_ci    movhlps     xmm4, xmm0              ; xmm4 low half = xmm0 high half
165cb93a386Sopenharmony_ci    movhlps     xmm5, xmm1              ; xmm5 low half = xmm1 high half
166cb93a386Sopenharmony_ci
    ; Float -> int32, rounded according to the current MXCSR rounding mode.
167cb93a386Sopenharmony_ci    cvtps2pi    mm0, xmm0
168cb93a386Sopenharmony_ci    cvtps2pi    mm1, xmm1
169cb93a386Sopenharmony_ci    cvtps2pi    mm4, xmm4
170cb93a386Sopenharmony_ci    cvtps2pi    mm5, xmm5
171cb93a386Sopenharmony_ci
172cb93a386Sopenharmony_ci    movhlps     xmm6, xmm2              ; xmm6 low half = xmm2 high half
173cb93a386Sopenharmony_ci    movhlps     xmm7, xmm3              ; xmm7 low half = xmm3 high half
174cb93a386Sopenharmony_ci
175cb93a386Sopenharmony_ci    cvtps2pi    mm2, xmm2
176cb93a386Sopenharmony_ci    cvtps2pi    mm3, xmm3
177cb93a386Sopenharmony_ci    cvtps2pi    mm6, xmm6
178cb93a386Sopenharmony_ci    cvtps2pi    mm7, xmm7
179cb93a386Sopenharmony_ci
    ; Narrow each pair of int32 registers to four int16 values with signed
    ; saturation; the destination supplies the two low words and the source
    ; the two high words, which restores the original element order.
180cb93a386Sopenharmony_ci    packssdw    mm0, mm4
181cb93a386Sopenharmony_ci    packssdw    mm1, mm5
182cb93a386Sopenharmony_ci    packssdw    mm2, mm6
183cb93a386Sopenharmony_ci    packssdw    mm3, mm7
184cb93a386Sopenharmony_ci
    ; Store 16 coefficients.  MMBLOCK is defined outside this file; it is
    ; indexed here the same way XMMBLOCK is for the loads above.
185cb93a386Sopenharmony_ci    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
186cb93a386Sopenharmony_ci    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
187cb93a386Sopenharmony_ci    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm2
188cb93a386Sopenharmony_ci    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3
189cb93a386Sopenharmony_ci
    ; Advance all three pointers by 16 elements and loop until done.
190cb93a386Sopenharmony_ci    add         esi, byte 16*SIZEOF_FAST_FLOAT
191cb93a386Sopenharmony_ci    add         edx, byte 16*SIZEOF_FAST_FLOAT
192cb93a386Sopenharmony_ci    add         edi, byte 16*SIZEOF_JCOEF
193cb93a386Sopenharmony_ci    dec         eax
194cb93a386Sopenharmony_ci    jnz         short .quantloop
195cb93a386Sopenharmony_ci
196cb93a386Sopenharmony_ci    emms                                ; empty MMX state
197cb93a386Sopenharmony_ci
198cb93a386Sopenharmony_ci    pop         edi
199cb93a386Sopenharmony_ci    pop         esi
200cb93a386Sopenharmony_ci;   pop         edx                     ; need not be preserved
201cb93a386Sopenharmony_ci;   pop         ecx                     ; unused
202cb93a386Sopenharmony_ci;   pop         ebx                     ; unused
203cb93a386Sopenharmony_ci    pop         ebp
204cb93a386Sopenharmony_ci    ret
205cb93a386Sopenharmony_ci
206cb93a386Sopenharmony_ci; For some reason, the OS X linker does not honor the request to align the
207cb93a386Sopenharmony_ci; segment unless we do this.
208cb93a386Sopenharmony_ci    align       32                      ; pad the end of the section to a 32-byte boundary
209