; back port from GOGO-no coda 2.24b by Takehiro TOMINAGA

; GOGO-no-coda
;	Copyright (C) 1999 shigeo
;	special thanks to Keiichi SAKAI

%include "nasm.h"

	globaldef fht_SSE

;------------------------------------------------------------------------
; Constants used by fht_SSE.
;
; Q_MMPP, Q_MPMP and D_1100 are used as 128-bit memory operands of
; xorps/subps, so they must stay 16-byte aligned: keep them directly
; after the "align 16" and keep each of them exactly four dwords long.
;------------------------------------------------------------------------
	segment_data
	align 16
; Sign-bit mask for lanes 2 and 3: xorps with it negates the two upper
; floats of a vector and leaves the two lower ones untouched.
Q_MMPP	dd	0x0,0x0,0x80000000,0x80000000
; Sign-bit mask for lanes 1 and 3 (every other float is negated).
Q_MPMP	dd	0x0,0x80000000,0x0,0x80000000
; {0, 0, 1, 1}: subtracted from {2*s*c, 2*s*s, ...} products so that the
; two upper lanes become 2*s*s - 1 = -cos(2x) while the lower lanes
; keep 2*s*c = sin(2x).
D_1100	dd 0.0, 0.0, 1.0, 1.0
; Twiddle seeds, one (cos, sin) pair per outer pass of fht_SSE; the
; function advances its table pointer by 8 bytes after every pass.
; The values are cos/sin of pi/8, pi/32, pi/128 and pi/512.
costab_fft:
	dd 9.238795325112867e-01
	dd 3.826834323650898e-01
	dd 9.951847266721969e-01
	dd 9.801714032956060e-02
	dd 9.996988186962042e-01
	dd 2.454122852291229e-02
	dd 9.999811752836011e-01
	dd 6.135884649154475e-03
; sqrt(2), used as an x87 memory operand in the first butterfly loop.
S_SQRT2	dd	1.414213562

27159b3361Sopenharmony_ci	segment_code
28159b3361Sopenharmony_ci
29159b3361Sopenharmony_ciPIC_OFFSETTABLE
30159b3361Sopenharmony_ci
31159b3361Sopenharmony_ci;------------------------------------------------------------------------
32159b3361Sopenharmony_ci;	by K. SAKAI
33159b3361Sopenharmony_ci;	99/08/18	PIII 23k[clk]
34159b3361Sopenharmony_ci;	99/08/19	̿�������촹�� PIII 22k[clk]
35159b3361Sopenharmony_ci;	99/08/20	bit reversal ����夫��ܿ����� PIII 17k[clk]
36159b3361Sopenharmony_ci;	99/08/23	���� unroll PIII 14k[clk]
37159b3361Sopenharmony_ci;	99/11/12	clean up
38159b3361Sopenharmony_ci;
39159b3361Sopenharmony_ci;void fht_SSE(float *fz, int n);
40159b3361Sopenharmony_ci	align 16
41159b3361Sopenharmony_cifht_SSE:
42159b3361Sopenharmony_ci	push	ebx
43159b3361Sopenharmony_ci	push	esi
44159b3361Sopenharmony_ci	push	edi
45159b3361Sopenharmony_ci	push	ebp
46159b3361Sopenharmony_ci
47159b3361Sopenharmony_ci%assign _P 4*5
48159b3361Sopenharmony_ci
49159b3361Sopenharmony_ci	;2���ܤΥ롼��
50159b3361Sopenharmony_ci	mov	eax,[esp+_P+0]	;eax=fz
51159b3361Sopenharmony_ci	mov	ebp,[esp+_P+4]	;=n
52159b3361Sopenharmony_ci	shl	ebp,3
53159b3361Sopenharmony_ci	add	ebp,eax		; fn  = fz + n, ���δؿ���λ�ޤ�����
54159b3361Sopenharmony_ci	push	ebp
55159b3361Sopenharmony_ci
56159b3361Sopenharmony_ci	call	get_pc.bp
57159b3361Sopenharmony_ci	add	ebp, PIC_BASE()
58159b3361Sopenharmony_ci
59159b3361Sopenharmony_ci	lea	ecx,[PIC_EBP_REL(costab_fft)]
60159b3361Sopenharmony_ci	xor	eax,eax
61159b3361Sopenharmony_ci	mov	al,8		; =k1=1*(sizeof float)	// 4, 16, 64, 256,...
62159b3361Sopenharmony_ci.lp2:				; do{
63159b3361Sopenharmony_ci	mov	esi,[esp+_P+4]	; esi=fi=fz
64159b3361Sopenharmony_ci	lea	edx,[eax+eax*2]
65159b3361Sopenharmony_ci	mov	ebx, esi
66159b3361Sopenharmony_ci
67159b3361Sopenharmony_ci; ��������2��������ԤǤ��ʤ���ʬ��FPU�Τۤ���®����
68159b3361Sopenharmony_ci	loopalign	16
69159b3361Sopenharmony_ci.lp20:				; do{
70159b3361Sopenharmony_ci;                       f0     = fi[0 ] + fi[k1];
71159b3361Sopenharmony_ci;                       f2     = fi[k2] + fi[k3];
72159b3361Sopenharmony_ci;                       f1     = fi[0 ] - fi[k1];
73159b3361Sopenharmony_ci;                       f3     = fi[k2] - fi[k3];
74159b3361Sopenharmony_ci;                       fi[0 ] = f0     + f2;
75159b3361Sopenharmony_ci;                       fi[k1] = f1     + f3;
76159b3361Sopenharmony_ci;                       fi[k2] = f0     - f2;
77159b3361Sopenharmony_ci;                       fi[k3] = f1     - f3;
78159b3361Sopenharmony_ci	lea	edi,[ebx+eax]	; edi=gi=fi+ki/2
79159b3361Sopenharmony_ci	fld	dword [ebx]
80159b3361Sopenharmony_ci	fadd	dword [ebx+eax*2]
81159b3361Sopenharmony_ci	fld	dword [ebx+eax*4]
82159b3361Sopenharmony_ci	fadd	dword [ebx+edx*2]
83159b3361Sopenharmony_ci
84159b3361Sopenharmony_ci	fld	dword [ebx]
85159b3361Sopenharmony_ci	fsub	dword [ebx+eax*2]
86159b3361Sopenharmony_ci	fld	dword [ebx+eax*4]
87159b3361Sopenharmony_ci	fsub	dword [ebx+edx*2]
88159b3361Sopenharmony_ci
89159b3361Sopenharmony_ci	fld	st1
90159b3361Sopenharmony_ci	fadd	st0,st1
91159b3361Sopenharmony_ci	fstp	dword [ebx+eax*2]
92159b3361Sopenharmony_ci	fsubp	st1,st0
93159b3361Sopenharmony_ci	fstp	dword [ebx+edx*2]
94159b3361Sopenharmony_ci
95159b3361Sopenharmony_ci	fld	st1
96159b3361Sopenharmony_ci	fadd	st0,st1
97159b3361Sopenharmony_ci	fstp	dword [ebx]
98159b3361Sopenharmony_ci	fsubp	st1,st0
99159b3361Sopenharmony_ci	fstp	dword [ebx+eax*4]
100159b3361Sopenharmony_ci
101159b3361Sopenharmony_ci	lea	ebx,[ebx + eax*8]	; = fi += (k1 * 4);
102159b3361Sopenharmony_ci;                       g0     = gi[0 ] + gi[k1];
103159b3361Sopenharmony_ci;                       g2     = SQRT2  * gi[k2];
104159b3361Sopenharmony_ci;                       g1     = gi[0 ] - gi[k1];
105159b3361Sopenharmony_ci;                       g3     = SQRT2  * gi[k3];
106159b3361Sopenharmony_ci;                       gi[0 ] = g0     + g2;
107159b3361Sopenharmony_ci;                       gi[k2] = g0     - g2;
108159b3361Sopenharmony_ci;                       gi[k1] = g1     + g3;
109159b3361Sopenharmony_ci;                       gi[k3] = g1     - g3;
110159b3361Sopenharmony_ci	fld	dword [edi]
111159b3361Sopenharmony_ci	fadd	dword [edi+eax*2]
112159b3361Sopenharmony_ci	fld	dword [PIC_EBP_REL(S_SQRT2)]
113159b3361Sopenharmony_ci	fmul	dword [edi+eax*4]
114159b3361Sopenharmony_ci
115159b3361Sopenharmony_ci	fld	dword [edi]
116159b3361Sopenharmony_ci	fsub	dword [edi+eax*2]
117159b3361Sopenharmony_ci	fld	dword [PIC_EBP_REL(S_SQRT2)]
118159b3361Sopenharmony_ci	fmul	dword [edi+edx*2]
119159b3361Sopenharmony_ci
120159b3361Sopenharmony_ci	fld	st1
121159b3361Sopenharmony_ci	fadd	st0,st1
122159b3361Sopenharmony_ci	fstp	dword [edi+eax*2]
123159b3361Sopenharmony_ci	fsubp	st1,st0
124159b3361Sopenharmony_ci	fstp	dword [edi+edx*2]
125159b3361Sopenharmony_ci
126159b3361Sopenharmony_ci	fld	st1
127159b3361Sopenharmony_ci	fadd	st0,st1
128159b3361Sopenharmony_ci	fstp	dword [edi]
129159b3361Sopenharmony_ci	fsubp	st1,st0
130159b3361Sopenharmony_ci	fstp	dword [edi+eax*4]
131159b3361Sopenharmony_ci
132159b3361Sopenharmony_ci	cmp	ebx,[esp]
133159b3361Sopenharmony_ci	jl	near .lp20		; while (fi<fn);
134159b3361Sopenharmony_ci
135159b3361Sopenharmony_ci
136159b3361Sopenharmony_ci;               i = 1; //for (i=1;i<kx;i++){
137159b3361Sopenharmony_ci;                       c1 = 1.0*t_c - 0.0*t_s;
138159b3361Sopenharmony_ci;                       s1 = 0.0*t_c + 1.0*t_s;
139159b3361Sopenharmony_ci	movlps	xmm6,[ecx] ; = { --,  --,  s1, c1}
140159b3361Sopenharmony_ci	movaps	xmm7,xmm6
141159b3361Sopenharmony_ci
142159b3361Sopenharmony_ci	shufps	xmm6,xmm6,R4(0,1,1,0)	; = {+c1, +s1, +s1, +c1} -> ɬ��
143159b3361Sopenharmony_ci;                       c2 = c1*c1 - s1*s1 = 1 - (2*s1)*s1;
144159b3361Sopenharmony_ci;                       s2 = c1*s1 + s1*c1 = 2*s1*c1;
145159b3361Sopenharmony_ci	shufps	xmm7,xmm7,R4(1,0,0,1)
146159b3361Sopenharmony_ci	movss	xmm5,xmm7		; = { --,  --,  --, s1}
147159b3361Sopenharmony_ci	xorps	xmm7,[PIC_EBP_REL(Q_MMPP)]	; = {-s1, -c1, +c1, +s1} -> ɬ��
148159b3361Sopenharmony_ci
149159b3361Sopenharmony_ci	addss	xmm5,xmm5		; = (--, --,  --, 2*s1)
150159b3361Sopenharmony_ci	add	esi,4		; esi = fi = fz + i
151159b3361Sopenharmony_ci	shufps	xmm5,xmm5,R4(0,0,0,0)	; = (2*s1, 2*s1, 2*s1, 2*s1)
152159b3361Sopenharmony_ci	mulps	xmm5,xmm6		; = (2*s1*c1, 2*s1*s1, 2*s1*s1, 2*s1*c1)
153159b3361Sopenharmony_ci	subps	xmm5,[PIC_EBP_REL(D_1100)]		; = (--, 2*s1*s1-1, --, 2*s1*c1) = {-- -c2 -- s2}
154159b3361Sopenharmony_ci	movaps	xmm4,xmm5
155159b3361Sopenharmony_ci	shufps	xmm5,xmm5,R4(2,0,2,0)	; = {-c2, s2, -c2, s2} -> ɬ��
156159b3361Sopenharmony_ci
157159b3361Sopenharmony_ci	xorps	xmm4,[PIC_EBP_REL(Q_MMPP)]		; = {--, c2, --, s2}
158159b3361Sopenharmony_ci	shufps	xmm4,xmm4,R4(0,2,0,2)	; = {s2, c2, s2, c2} -> ɬ��
159159b3361Sopenharmony_ci
160159b3361Sopenharmony_ci	loopalign	16
161159b3361Sopenharmony_ci.lp21:				; do{
162159b3361Sopenharmony_ci;                               a       = c2*fi[k1] + s2*gi[k1];
163159b3361Sopenharmony_ci;                               b       = s2*fi[k1] - c2*gi[k1];
164159b3361Sopenharmony_ci;                               c       = c2*fi[k3] + s2*gi[k3];
165159b3361Sopenharmony_ci;                               d       = s2*fi[k3] - c2*gi[k3];
166159b3361Sopenharmony_ci;                               f0      = fi[0 ]        + a;
167159b3361Sopenharmony_ci;                               g0      = gi[0 ]        + b;
168159b3361Sopenharmony_ci;                               f2      = fi[k1 * 2]    + c;
169159b3361Sopenharmony_ci;                               g2      = gi[k1 * 2]    + d;
170159b3361Sopenharmony_ci;                               f1      = fi[0 ]        - a;
171159b3361Sopenharmony_ci;                               g1      = gi[0 ]        - b;
172159b3361Sopenharmony_ci;                               f3      = fi[k1 * 2]    - c;
173159b3361Sopenharmony_ci;                               g3      = gi[k1 * 2]    - d;
174159b3361Sopenharmony_ci	lea	edi,[esi + eax*2 - 8]	; edi = gi = fz +k1-i
175159b3361Sopenharmony_ci
176159b3361Sopenharmony_ci	movss	xmm0,[esi + eax*2]	; = fi[k1]
177159b3361Sopenharmony_ci	movss	xmm2,[esi + edx*2]	; = fi[k3]
178159b3361Sopenharmony_ci	shufps	xmm0,xmm2,0x00	; = {fi[k3], fi[k3], fi[k1], fi[k1]}
179159b3361Sopenharmony_ci	movss	xmm1,[edi + eax*2]	; = fi[k1]
180159b3361Sopenharmony_ci	movss	xmm3,[edi + edx*2]	; = fi[k3]
181159b3361Sopenharmony_ci	shufps	xmm1,xmm3,0x00	; = {gi[k3], gi[k3], gi[k1], gi[k1]}
182159b3361Sopenharmony_ci	movss	xmm2,[esi]		; = fi[0]
183159b3361Sopenharmony_ci	mulps	xmm0,xmm4		; *= {+s2, +c2, +s2, +c2}
184159b3361Sopenharmony_ci	movss	xmm3,[esi + eax*4]	; = fi[k2]
185159b3361Sopenharmony_ci	unpcklps	xmm2,xmm3	; = {--, --, fi[k2], fi[0]}
186159b3361Sopenharmony_ci	mulps	xmm1,xmm5		; *= {-c2, +s2, -c2, +s2}
187159b3361Sopenharmony_ci	movss	xmm3,[edi + eax*4]	; = gi[k2]
188159b3361Sopenharmony_ci	addps	xmm0,xmm1		; = {d, c, b, a}
189159b3361Sopenharmony_ci	movss	xmm1,[edi]		; = gi[0]
190159b3361Sopenharmony_ci	unpcklps	xmm1,xmm3	; = {--,  --, gi[k2], gi[0]}
191159b3361Sopenharmony_ci	unpcklps	xmm2,xmm1	; = {gi[k2], fi[k2], gi[0], fi[0]}
192159b3361Sopenharmony_ci	movaps	xmm1,xmm2
193159b3361Sopenharmony_ci	addps	xmm1,xmm0	; = {g2, f2, g0, f0}
194159b3361Sopenharmony_ci	subps	xmm2,xmm0	; = {g3, f3, g1, f1}
195159b3361Sopenharmony_ci
196159b3361Sopenharmony_ci;                               a       = c1*f2     + s1*g3;
197159b3361Sopenharmony_ci;                               c       = s1*g2     + c1*f3;
198159b3361Sopenharmony_ci;                               b       = s1*f2     - c1*g3;
199159b3361Sopenharmony_ci;                               d       = c1*g2     - s1*f3;
200159b3361Sopenharmony_ci;                               fi[0 ]  = f0        + a;
201159b3361Sopenharmony_ci;                               gi[0 ]  = g0        + c;
202159b3361Sopenharmony_ci;                               gi[k1]  = g1        + b;
203159b3361Sopenharmony_ci;                               fi[k1]  = f1        + d;
204159b3361Sopenharmony_ci;                               fi[k1 * 2]  = f0    - a;
205159b3361Sopenharmony_ci;                               gi[k1 * 2]  = g0    - c;
206159b3361Sopenharmony_ci;                               gi[k3]      = g1    - b;
207159b3361Sopenharmony_ci;                               fi[k3]      = f1    - d;
208159b3361Sopenharmony_ci	movaps	xmm3,xmm1
209159b3361Sopenharmony_ci	movhlps	xmm1,xmm1	; = {g2, f2, g2, f2}
210159b3361Sopenharmony_ci	shufps	xmm3,xmm2,0x14	; = {f1, g1, g0, f0}
211159b3361Sopenharmony_ci	mulps	xmm1,xmm6	; *= {+c1, +s1, +s1, +c1}
212159b3361Sopenharmony_ci	shufps	xmm2,xmm2,0xBB	; = {f3, g3, f3, g3}
213159b3361Sopenharmony_ci	mulps	xmm2,xmm7	; *= {-s1, -c1, +c1, +s1}
214159b3361Sopenharmony_ci	addps	xmm1,xmm2	; = {d, b, c, a}
215159b3361Sopenharmony_ci	movaps	xmm2,xmm3
216159b3361Sopenharmony_ci	addps	xmm3,xmm1	; = {fi[k1], gi[k1], gi[0], fi[0]}
217159b3361Sopenharmony_ci	subps	xmm2,xmm1	; = {fi[k3], gi[k3], gi[k1*2], fi[k1*2]}
218159b3361Sopenharmony_ci	movhlps	xmm0,xmm3
219159b3361Sopenharmony_ci	movss	[esi],xmm3
220159b3361Sopenharmony_ci	shufps	xmm3,xmm3,0x55
221159b3361Sopenharmony_ci	movss	[edi+eax*2],xmm0
222159b3361Sopenharmony_ci	shufps	xmm0,xmm0,0x55
223159b3361Sopenharmony_ci	movss	[edi],xmm3
224159b3361Sopenharmony_ci	movss	[esi+eax*2],xmm0
225159b3361Sopenharmony_ci	movhlps	xmm0,xmm2
226159b3361Sopenharmony_ci	movss	[esi+eax*4],xmm2
227159b3361Sopenharmony_ci	shufps	xmm2,xmm2,0x55
228159b3361Sopenharmony_ci	movss	[edi+edx*2],xmm0
229159b3361Sopenharmony_ci	shufps	xmm0,xmm0,0x55
230159b3361Sopenharmony_ci	movss	[edi+eax*4],xmm2
231159b3361Sopenharmony_ci	movss	[esi+edx*2],xmm0
232159b3361Sopenharmony_ci	lea	esi,[esi + eax*8] ; fi += (k1 * 4);
233159b3361Sopenharmony_ci	cmp	esi,[esp]
234159b3361Sopenharmony_ci	jl	near .lp21		; while (fi<fn);
235159b3361Sopenharmony_ci
236159b3361Sopenharmony_ci
237159b3361Sopenharmony_ci; unroll����do loop��43+4̿��
238159b3361Sopenharmony_ci
239159b3361Sopenharmony_ci; ������ǤϤʤ�for�롼�פ�i=2�������unrolling����
240159b3361Sopenharmony_ci; kx=   2,   8,  32,  128
241159b3361Sopenharmony_ci; k4=  16,  64, 256, 1024
242159b3361Sopenharmony_ci;       0, 6/2,30/2,126/2
243159b3361Sopenharmony_ci
244159b3361Sopenharmony_ci	xor	ebx,ebx
245159b3361Sopenharmony_ci	mov	bl, 4*2		; = i = 4
246159b3361Sopenharmony_ci	cmp	ebx,eax		; i < k1
247159b3361Sopenharmony_ci	jnl	near .F22
248159b3361Sopenharmony_ci;               for (i=2;i<kx;i+=2){
249159b3361Sopenharmony_ci	loopalign	16
250159b3361Sopenharmony_ci.lp22:
251159b3361Sopenharmony_ci; at here, xmm6 is {c3, s3, s3, c3}
252159b3361Sopenharmony_ci;                       c1 = c3*t_c - s3*t_s;
253159b3361Sopenharmony_ci;                       s1 = c3*t_s + s3*t_c;
254159b3361Sopenharmony_ci	movlps	xmm0,[ecx]
255159b3361Sopenharmony_ci	shufps	xmm0,xmm0,R4(1,1,0,0)	; = {t_s, t_s, t_c, t_c}
256159b3361Sopenharmony_ci	mulps	xmm6,xmm0	; = {c3*ts, s3*ts, s3*tc, c3*tc}
257159b3361Sopenharmony_ci	movhlps	xmm4,xmm6	; = {--,    --,    c3*ts, s3*ts}
258159b3361Sopenharmony_ci	xorps	xmm4,[PIC_EBP_REL(Q_MPMP)]	; = {--,    --,   -c3*ts, s3*ts}
259159b3361Sopenharmony_ci	subps	xmm6,xmm4	; = {-,-, c3*ts+s3*tc, c3*tc-s3*ts}={-,-,s1,c1}
260159b3361Sopenharmony_ci
261159b3361Sopenharmony_ci;                       c3 = c1*t_c - s1*t_s;
262159b3361Sopenharmony_ci;                       s3 = s1*t_c + c1*t_s;
263159b3361Sopenharmony_ci	shufps	xmm6,xmm6,0x14	; = {c1, s1, s1, c1}
264159b3361Sopenharmony_ci	mulps	xmm0,xmm6	; = {ts*c1 ts*s1 tc*s1 tc*c1}
265159b3361Sopenharmony_ci	movhlps	xmm3,xmm0
266159b3361Sopenharmony_ci	xorps	xmm3,[PIC_EBP_REL(Q_MPMP)]
267159b3361Sopenharmony_ci	subps	xmm0,xmm3	; = {--, --, s3, c3}
268159b3361Sopenharmony_ci
269159b3361Sopenharmony_ci; {s2 s4 c4 c2} = {2*s1*c1 2*s3*c3 1-2*s3*s3 1-2*s1*s1}
270159b3361Sopenharmony_ci	unpcklps	xmm6,xmm0	; xmm6 = {s3, s1, c3, c1}
271159b3361Sopenharmony_ci	movaps	xmm7, xmm6
272159b3361Sopenharmony_ci	shufps	xmm6,xmm6,R4(2,3,1,0)	; xmm6 = {s1, s3, c3, c1}
273159b3361Sopenharmony_ci	addps	xmm7, xmm7		; {s3*2, s1*2,   --,   --}
274159b3361Sopenharmony_ci	mov	edi,[esp+_P+4]		; = fz
275159b3361Sopenharmony_ci	shufps	xmm7, xmm7, R4(2,3,3,2)	; {s1*2, s3*2, s3*2, s1*2}
276159b3361Sopenharmony_ci	sub	edi,ebx			; edi = fz - i/2
277159b3361Sopenharmony_ci	mulps	xmm7, xmm6		; {s1*s1*2, s3*s3*2, s3*c3*2, s1*c1*2}
278159b3361Sopenharmony_ci	lea	esi,[edi + ebx*2]	; esi = fi = fz +i/2
279159b3361Sopenharmony_ci	subps	xmm7, [PIC_EBP_REL(D_1100)]		; {-c2, -c4, s4, s2}
280159b3361Sopenharmony_ci	lea	edi,[edi + eax*2-4]	; edi = gi = fz +k1-i/2
281159b3361Sopenharmony_ci
282159b3361Sopenharmony_ci;                       fi = fz +i;
283159b3361Sopenharmony_ci;                       gi = fz +k1-i;
284159b3361Sopenharmony_ci;                       do{
285159b3361Sopenharmony_ci.lp220:
286159b3361Sopenharmony_ci; unroll���do loop��51+4̿��
287159b3361Sopenharmony_ci;                               a       = c2*fi[k1  ] + s2*gi[k1  ];
288159b3361Sopenharmony_ci;                               e       = c4*fi[k1+1] + s4*gi[k1-1];
289159b3361Sopenharmony_ci;                               f       = s4*fi[k1+1] - c4*gi[k1-1];
290159b3361Sopenharmony_ci;                               b       = s2*fi[k1  ] - c2*gi[k1  ];
291159b3361Sopenharmony_ci;                               c       = c2*fi[k3  ] + s2*gi[k3  ];
292159b3361Sopenharmony_ci;                               g       = c4*fi[k3+1] + s4*gi[k3-1];
293159b3361Sopenharmony_ci;                               h       = s4*fi[k3+1] - c4*gi[k3-1];
294159b3361Sopenharmony_ci;                               d       = s2*fi[k3  ] - c2*gi[k3  ];
295159b3361Sopenharmony_ci
296159b3361Sopenharmony_ci	movaps	xmm4,xmm7	; = {-c2 -c4  s4  s2}
297159b3361Sopenharmony_ci	xorps	xmm4,[PIC_EBP_REL(Q_MMPP)]	; = { c2  c4  s4  s2}
298159b3361Sopenharmony_ci	shufps	xmm4,xmm4,0x1B	; = { s2  s4  c4  c2}
299159b3361Sopenharmony_ci	movlps	xmm0,[esi+eax*2]
300159b3361Sopenharmony_ci	movlps	xmm1,[edi+eax*2]
301159b3361Sopenharmony_ci	movlps	xmm2,[esi+edx*2]
302159b3361Sopenharmony_ci	movlps	xmm3,[edi+edx*2]
303159b3361Sopenharmony_ci	shufps	xmm0,xmm0,0x14
304159b3361Sopenharmony_ci	shufps	xmm1,xmm1,0x41
305159b3361Sopenharmony_ci	shufps	xmm2,xmm2,0x14
306159b3361Sopenharmony_ci	shufps	xmm3,xmm3,0x41
307159b3361Sopenharmony_ci	mulps	xmm0,xmm4
308159b3361Sopenharmony_ci	mulps	xmm1,xmm7
309159b3361Sopenharmony_ci	mulps	xmm2,xmm4
310159b3361Sopenharmony_ci	mulps	xmm3,xmm7
311159b3361Sopenharmony_ci	addps	xmm0,xmm1	; xmm0 = {b, f, e, a}
312159b3361Sopenharmony_ci	addps	xmm2,xmm3	; xmm2 = {d, h, g, c}
313159b3361Sopenharmony_ci;17
314159b3361Sopenharmony_ci
315159b3361Sopenharmony_ci;                               f0      = fi[0   ]    + a;
316159b3361Sopenharmony_ci;                               f4      = fi[0 +1]    + e;
317159b3361Sopenharmony_ci;                               g4      = gi[0 -1]    + f;
318159b3361Sopenharmony_ci;                               g0      = gi[0   ]    + b;
319159b3361Sopenharmony_ci;                               f1      = fi[0   ]    - a;
320159b3361Sopenharmony_ci;                               f5      = fi[0 +1]    - e;
321159b3361Sopenharmony_ci;                               g5      = gi[0 -1]    - f;
322159b3361Sopenharmony_ci;                               g1      = gi[0   ]    - b;
323159b3361Sopenharmony_ci;                               f2      = fi[k2  ]    + c;
324159b3361Sopenharmony_ci;                               f6      = fi[k2+1]    + g;
325159b3361Sopenharmony_ci;                               g6      = gi[k2-1]    + h;
326159b3361Sopenharmony_ci;                               g2      = gi[k2  ]    + d;
327159b3361Sopenharmony_ci;                               f3      = fi[k2  ]    - c;
328159b3361Sopenharmony_ci;                               f7      = fi[k2+1]    - g;
329159b3361Sopenharmony_ci;                               g7      = gi[k2-1]    - h;
330159b3361Sopenharmony_ci;                               g3      = gi[k2  ]    - d;
331159b3361Sopenharmony_ci	movlps	xmm1,[esi      ]
332159b3361Sopenharmony_ci	movhps	xmm1,[edi      ]
333159b3361Sopenharmony_ci	movaps	xmm4,xmm1
334159b3361Sopenharmony_ci	subps	xmm1,xmm0	; xmm1 = {g1, g5, f5, f1}
335159b3361Sopenharmony_ci	movlps	xmm3,[esi+eax*4]
336159b3361Sopenharmony_ci	movhps	xmm3,[edi+eax*4]
337159b3361Sopenharmony_ci	movaps	xmm5,xmm3
338159b3361Sopenharmony_ci	subps	xmm3,xmm2	; xmm3 = {g3, g7, f7, f3}
339159b3361Sopenharmony_ci	addps	xmm0,xmm4	; xmm0 = {g0, g4, f4, f0}
340159b3361Sopenharmony_ci	addps	xmm2,xmm5	; xmm2 = {g2, g6, f6, f2}
341159b3361Sopenharmony_ci;10
342159b3361Sopenharmony_ci
343159b3361Sopenharmony_ci;                               a       = c1*f2     + s1*g3;	��*�� + ��*��
344159b3361Sopenharmony_ci;                               e       = c3*f6     + s3*g7;
345159b3361Sopenharmony_ci;                               g       = s3*g6     + c3*f7;
346159b3361Sopenharmony_ci;                               c       = s1*g2     + c1*f3;
347159b3361Sopenharmony_ci;                               d       = c1*g2     - s1*f3;	��*�� - ��*��
348159b3361Sopenharmony_ci;                               h       = c3*g6     - s3*f7;
349159b3361Sopenharmony_ci;                               f       = s3*f6     - c3*g7;
350159b3361Sopenharmony_ci;                               b       = s1*f2     - c1*g3;
351159b3361Sopenharmony_ci
352159b3361Sopenharmony_ci	movaps	xmm5,xmm6	; xmm6 = {s1, s3, c3, c1}
353159b3361Sopenharmony_ci	shufps	xmm5,xmm5,0x1B	; = {c1, c3, s3, s1}
354159b3361Sopenharmony_ci	movaps	xmm4,xmm2
355159b3361Sopenharmony_ci	mulps	xmm4,xmm6
356159b3361Sopenharmony_ci	shufps	xmm2,xmm2,0x1B	; xmm2 = {f2, f6, g6, g2}
357159b3361Sopenharmony_ci	mulps	xmm2,xmm6
358159b3361Sopenharmony_ci	mulps	xmm5,xmm3
359159b3361Sopenharmony_ci	mulps	xmm3,xmm6
360159b3361Sopenharmony_ci	shufps	xmm3,xmm3,0x1B
361159b3361Sopenharmony_ci	addps	xmm4,xmm3	; = {c, g, e, a}
362159b3361Sopenharmony_ci	subps	xmm2,xmm5	; = {b, f, h, d}
363159b3361Sopenharmony_ci;10
364159b3361Sopenharmony_ci
365159b3361Sopenharmony_ci;                               fi[0   ]  = f0        + a;
366159b3361Sopenharmony_ci;                               fi[0 +1]  = f4        + e;
367159b3361Sopenharmony_ci;                               gi[0 -1]  = g4        + g;
368159b3361Sopenharmony_ci;                               gi[0   ]  = g0        + c;
369159b3361Sopenharmony_ci;                               fi[k2  ]  = f0        - a;
370159b3361Sopenharmony_ci;                               fi[k2+1]  = f4        - e;
371159b3361Sopenharmony_ci;                               gi[k2-1]  = g4        - g;
372159b3361Sopenharmony_ci;                               gi[k2  ]  = g0        - c;
373159b3361Sopenharmony_ci;                               fi[k1  ]  = f1        + d;
374159b3361Sopenharmony_ci;                               fi[k1+1]  = f5        + h;
375159b3361Sopenharmony_ci;                               gi[k1-1]  = g5        + f;
376159b3361Sopenharmony_ci;                               gi[k1  ]  = g1        + b;
377159b3361Sopenharmony_ci;                               fi[k3  ]  = f1        - d;
378159b3361Sopenharmony_ci;                               fi[k3+1]  = f5        - h;
379159b3361Sopenharmony_ci;                               gi[k3-1]  = g5        - f;
380159b3361Sopenharmony_ci;                               gi[k3  ]  = g1        - b;
381159b3361Sopenharmony_ci	movaps	xmm3,xmm0
382159b3361Sopenharmony_ci	subps	xmm0,xmm4
383159b3361Sopenharmony_ci	movlps	[esi+eax*4],xmm0
384159b3361Sopenharmony_ci	movhps	[edi+eax*4],xmm0
385159b3361Sopenharmony_ci	addps	xmm4,xmm3
386159b3361Sopenharmony_ci	movlps	[esi      ],xmm4
387159b3361Sopenharmony_ci	movhps	[edi      ],xmm4
388159b3361Sopenharmony_ci
389159b3361Sopenharmony_ci	movaps	xmm5,xmm1
390159b3361Sopenharmony_ci	subps	xmm1,xmm2
391159b3361Sopenharmony_ci	movlps	[esi+edx*2],xmm1
392159b3361Sopenharmony_ci	movhps	[edi+edx*2],xmm1
393159b3361Sopenharmony_ci	addps	xmm2,xmm5
394159b3361Sopenharmony_ci	movlps	[esi+eax*2],xmm2
395159b3361Sopenharmony_ci	movhps	[edi+eax*2],xmm2
396159b3361Sopenharmony_ci; 14
397159b3361Sopenharmony_ci;                               gi     += k4;
398159b3361Sopenharmony_ci;                               fi     += k4;
399159b3361Sopenharmony_ci	lea	edi,[edi + eax*8] ; gi += (k1 * 4);
400159b3361Sopenharmony_ci	lea	esi,[esi + eax*8] ; fi += (k1 * 4);
401159b3361Sopenharmony_ci	cmp	esi,[esp]
402159b3361Sopenharmony_ci	jl	near .lp220		; while (fi<fn);
403159b3361Sopenharmony_ci;                       } while (fi<fn);
404159b3361Sopenharmony_ci
405159b3361Sopenharmony_ci	add	ebx,byte 2*4	; i+= 4
406159b3361Sopenharmony_ci	cmp	ebx,eax		; i < k1
407159b3361Sopenharmony_ci	shufps	xmm6,xmm6,R4(1,2,2,1)	; (--,s3,c3,--) => {c3, s3, s3, c3}
408159b3361Sopenharmony_ci	jl	near .lp22
409159b3361Sopenharmony_ci;               }
410159b3361Sopenharmony_ci.F22:
411159b3361Sopenharmony_ci	shl	eax,2
412159b3361Sopenharmony_ci	add	ecx, byte 8
413159b3361Sopenharmony_ci	cmp	eax,[esp+_P+8]	; while ((k1 * 4)<n);
414159b3361Sopenharmony_ci	jle	near .lp2
415159b3361Sopenharmony_ci	pop	ebp
416159b3361Sopenharmony_ci	pop	ebp
417159b3361Sopenharmony_ci	pop	edi
418159b3361Sopenharmony_ci	pop	esi
419159b3361Sopenharmony_ci	pop	ebx
420159b3361Sopenharmony_ci	ret
421159b3361Sopenharmony_ci
422159b3361Sopenharmony_ci	end
423