;	for new GOGO-no-coda (1999/09)
;	Copyright (C) 1999 shigeo
;	special thanks to Keiichi SAKAI, URURI
%include "nasm.h"

	globaldef fht_3DN
	globaldef fht
	externdef costab_fft
	externdef sintab_fft
	externdef gray_index

;------------------------------------------------------------------------
;	Static data used by fht_3DN.
;
;	Notation: a 64-bit MMX value is written [high dword:low dword].
;	The first dd of each pair below is the LOW dword.
;
;	Everything from t_s0 through D_Mc2s2 is scratch storage that
;	fht_3DN rewrites while it runs, so two calls running at the same
;	time would share (and overwrite) these values.
;------------------------------------------------------------------------
	segment_data
	align 16
D_MSB1_0	dd	0         ,0x80000000	;[1<<31:0]  xor mask: flips the sign of the high float only
D_SQRT2	dd	1.414213562,1.414213562	;[sqrt(2):sqrt(2)]
t_s0	dd	0	;[ t_c:t_s]  read as one qword starting at t_s0
t_c0	dd	0
t_c1	dd	0	;[-t_s:t_c]  read as one qword starting at t_c1
t_s1	dd	0
D_s1c1	dd	0, 0	;[ s1:c1]  current rotation, updated once per i
D_Mc1s1	dd	0, 0	;[-c1:s1]
D_s2c2	dd	0, 0	;[ s2:c2]  double-angle values derived from s1/c1
D_Mc2s2	dd	0, 0	;[-c2:s2]
D_0_1	dd	1.0, 0.0	;[0:1]  initial value of [s1:c1]
S_05	DD	0.5	;not referenced by fht_3DN
S_00005	DD	0.0005	;not referenced by fht_3DN
fht		dd	0	;function pointer (exported; initialised to 0 here)

;	---- code ----
	segment_code

;************************************************************************

;	by shigeo
;	99/08/16
;	23000clk  (original note, roughly: "that was tough")
;	18500clk bit reversal from gogo1 by URURI

;void fht(float *fz, int n);
;
;	fht_3DN: 3DNow! implementation of the prototype above.
;	Transforms fz[] in place with radix-4 butterfly passes; pass number
;	p (1,2,3,...) uses k1 = 1<<(2*p), i.e. k1 = 4, 16, 64, ... and
;	k4 = 4*k1, and passes repeat while k4 < n.
;
;	In   : [esp+4] = fz (float *), [esp+8] = n (int), both on the stack
;	Out  : fz[] overwritten; no return value is set up
;	Saves: ebx, esi, edi, ebp (pushed/popped here)
;	Clobb: eax, ecx, edx, mm0-mm7, flags; femms is executed before ret
;	Reads: one dword each from costab_fft + k*4 and sintab_fft + k*4
;	       per pass, where k = 2,4,6,...
;	Uses : static scratch t_s0..t_s1 and D_s1c1..D_Mc2s2
;	       (shared storage, so the routine is not reentrant)
;
;	n is expected to be a power of 4 and at least 16: the first pass
;	always touches fz[0..15], and every pass steps through fz[] in
;	blocks of k4 floats up to fz+n.  The outer loop stops as soon as
;	k4 >= n (unsigned), so an unexpected n cannot make it spin forever,
;	although the last pass may then still run past fz+n.
;
;	Notation in the comments: an MMX register is [high dword:low dword].
	align 16
fht_3DN:
	push	ebx
	push	esi
	push	edi
	push	ebp
%assign _P 4*4			;bytes pushed so far; arguments sit at [esp+_P+4...]
	;the very first loop ... has been moved outside fht()

	;main loop
	movq	mm7,[D_MSB1_0]	;mm7=[1<<31:0]  (sign mask, kept for the whole routine)

%assign LOCAL_STACK	16
	sub	esp,LOCAL_STACK
%assign _P (_P+LOCAL_STACK)
	xor	eax,eax
	mov	[esp],eax	;k=0
%define k dword [esp]
%define kx	dword [esp+4]
%define fn dword [esp+8]

.lp30:	;k=0; do{
	mov	ecx,k
	add	ecx,2
	mov	k,ecx		;k += 2
	mov	eax,1
	shl	eax,cl		;eax=k1 = 1<<k
	lea	ebx,[eax+eax]	;ebx=k2 = k1*2
	lea	ecx,[eax+eax*2]	;ecx=k3 = k2 + k1 = k1*3
	lea	edx,[ebx+ebx]	;edx=k4 = k1*4
	mov	esi,eax
	shr	esi,1		;esi=kx=k1>>1
	mov	kx,esi		;save (used later as the bound of the i loop)
	mov	edi,[esp+_P+4]	;edi=fi=fz
	lea	ebp,[edi+esi*4]	;ebp=gi=fz+kx
	mov	esi,[esp+_P+8]	;esi=n
	lea	esi,[edi+esi*4]	;esi=fn=fz+n
	movq	mm6,[D_SQRT2]	;mm6=[sqrt2:sqrt2]

.lp31:	;fn=fz+n; do{ FLOAT g0,f0,f1,...
	; --- fi butterfly: plain sums/differences of fi[0],fi[k1],fi[k2],fi[k3]
	movd	mm0,[edi]	;mm0=[0:fi[ 0]]
	movd	mm1,[edi+eax*4]	;mm1=[0:fi[k1]]
	punpckldq	mm0,mm0	;mm0=[fi_0 :fi_0 ]
	punpckldq	mm1,mm1	;mm1=[fi_k1:fi_k1]
	movd	mm2,[edi+ebx*4]
	movd	mm3,[edi+ecx*4]
	punpckldq	mm2,mm2	;mm2=[fi_k2:fi_k2]
	punpckldq	mm3,mm3	;mm3=[fi_k3:fi_k3]
	pxor	mm1,mm7		;mm1=[-fi_k1:fi_k1]
	pxor	mm3,mm7		;mm3=[-fi_k3:fi_k3]
	pfadd	mm0,mm1		;mm0=[f1:f0]=[fi_0 -fi_k1 : fi_0 +fi_k1]
	pfadd	mm2,mm3		;mm2=[f3:f2]=[fi_k2-fi_k3 : fi_k2+fi_k3]
	movq	mm3,mm0		;mm3=[f1:f0]
	pfadd	mm0,mm2		;mm0=[f1+f3:f0+f2]
	movd	[edi],mm0	;fi[0]=f0+f2
	psrlq	mm0,32		;mm0=[0:f1+f3]
	pfsub	mm3,mm2		;mm3=[f1-f3:f0-f2]
	movd	[edi+eax*4],mm0	;fi[k1]=f1+f3
	movd	[edi+ebx*4],mm3	;fi[k2]=f0-f2
	psrlq	mm3,32		;mm3=[0:f1-f3]
	movd	[edi+ecx*4],mm3	;fi[k3]=f1-f3

	; --- gi butterfly: gi[k2],gi[k3] are scaled by sqrt2 instead of combined
	movd	mm0,[ebp]	;mm0=[0:gi_0]
	movd	mm1,[ebp+eax*4]	;mm1=[0:gi_k1]
	punpckldq	mm0,mm0	;mm0=[gi_0 :gi_0 ]
	punpckldq	mm1,mm1	;mm1=[gi_k1:gi_k1]
	movd	mm2,[ebp+ebx*4]	;mm2=[0:gi_k2]
	pxor	mm1,mm7		;mm1=[-gi_k1:gi_k1]
	punpckldq	mm2,[ebp+ecx*4]	;mm2=[gi_k3:gi_k2]
	pfadd	mm0,mm1		;mm0=[g1:g0]=[gi_0 -gi_k1:gi_0 +gi_k1]
	pfmul	mm2,mm6		;mm2=[g3:g2]=sqrt2 * [gi_k3:gi_k2]
	movq	mm1,mm0		;mm1=[g1:g0]
	pfadd	mm0,mm2		;mm0=[g1+g3:g0+g2]
	movd	[ebp],mm0	;gi[0]=g0+g2
	psrlq	mm0,32		;mm0=[0:g1+g3]
	pfsub	mm1,mm2		;mm1=[g1-g3:g0-g2]
	movd	[ebp+eax*4],mm0	;gi[k1]=g1+g3
	movd	[ebp+ebx*4],mm1	;gi[k2]=g0-g2
	psrlq	mm1,32		;mm1=[0:g1-g3]
	movd	[ebp+ecx*4],mm1	;gi[k3]=g1-g3
	lea	edi,[edi+edx*4]	;fi += k4
	lea	ebp,[ebp+edx*4]	;gi += k4
	cmp	edi,esi
	jc	near .lp31	;}while(fi<fn);

;	everything up to here is probably O.K.

	mov	fn,esi		;fn=fz+n  (esi is reused as i below)
	;the following values continue to be used
	;eax=k1,ebx=k2,ecx=k3,edx=k4

	; fetch this pass's t_c / t_s and lay them out as two qwords:
	;   [t_c0:t_s0] = [ t_c:t_s]   and   [t_s1:t_c1] = [-t_s:t_c]
	mov	edi,k
	lea	ebp,[costab_fft+edi*4]
	mov	ebp,[ebp]	;ebp=t_c
	mov	[t_c0],ebp
	mov	[t_c1],ebp	;t_c
	lea	ebp,[sintab_fft+edi*4]
	mov	ebp,[ebp]	;ebp=t_s
	mov	[t_s0],ebp
	xor	ebp,0x80000000	;flip the float sign bit
	mov	[t_s1],ebp	;-t_s

	movq	mm1,[D_0_1]	;mm1=[0:1]
	movq	[D_s1c1],mm1	;[s1:c1]=[0:1] before the first rotation
	mov	esi,1		;esi=i=1

.lp32:	;	for(i=1;i<kx;i++){
	; rotate [s1:c1] by (t_c,t_s), then derive [s2:c2] and the two
	; swapped/negated copies used by the butterflies in .lp33
	movq	mm0,[D_s1c1]	;mm0=[s1:t]=[s1:c1]
	movq	mm2,mm0
	pfmul	mm0,[t_c1]	;mm0=[-s1*t_s: t*t_c]
	pfmul	mm2,[t_s0]	;mm2=[ s1*t_c: t*t_s]
	pfacc	mm0,mm2		;mm0=[s1:c1]=[ s1*t_c+t*t_s:-s1*t_s+t*t_c]
	movq	mm2,mm0		;mm2=[s1:c1]
	movq	[D_s1c1],mm0	;save
	movq	mm6,mm2
	punpckldq	mm5,mm6	;mm5=[c1:*]  (old low half of mm5 is discarded next)
	punpckhdq	mm6,mm5	;mm6=[ c1:s1]
	pxor	mm6,mm7		;mm6=[-c1:s1]
	movq	[D_Mc1s1],mm6	;save
	pfmul	mm2,mm2		;mm2=[s1*s1:c1*c1]
	movq	mm3,mm0		;mm3=[s1:c1]
	pxor	mm2,mm7		;mm2=[-s1*s1:c1*c1]
	psrlq	mm3,32		;mm3=[ 0:s1]
	pfacc	mm2,mm2		;mm2=[c2:c2]=[c1*c1-s1*s1:<]
	pfmul	mm0,mm3		;mm0=[ 0:c1*s1]
	pfadd	mm0,mm0		;mm0=[0:s2]=[ 0:2*c1*s1]
	punpckldq	mm2,mm0	;mm2=[s2:c2]
	movq	[D_s2c2],mm2	;save

	punpckldq	mm0,mm2	;mm0=[c2:s2]
	punpckhdq	mm2,mm0	;mm2=[c2:s2]
	pxor	mm2,mm7		;mm2=[-c2:s2]
	movq	[D_Mc2s2],mm2	;save

	mov	edi,[esp+_P+4]	;edi=fz
	lea	edi,[edi+esi*4]	;edi=fi=fz+i

	mov	ebp,[esp+_P+4]	;ebp=fz
	neg	esi		;esi=-i
	lea	ebp,[ebp+eax*4]	;ebp=fz+k1
	lea	ebp,[ebp+esi*4]	;ebp=gi=fz+k1-i
	neg	esi		;esi=i

.lp33:	;	do{ FLOAT a,b,g0,f0,f1,g1,f2,g2,f3,g3;

	; rotate (fi[k1],gi[k1]) by (c2,s2) and combine with (fi[0],gi[0])
	movd	mm0,[edi+eax*4]	;mm0=[0:fi_k1]
	punpckldq	mm0,[ebp+eax*4]	;mm0=[gi_k1:fi_k1]
	movq	mm1,mm0
	pfmul	mm0,[D_s2c2]	;mm0=[ s2*gi_k1:c2*fi_k1]
	pfmul	mm1,[D_Mc2s2]	;mm1=[-c2*gi_k1:s2*fi_k1]
	pfacc	mm0,mm1		;mm0=[b:a]
	movd	mm4,[edi]	;mm4=[0:fi_0]
	movq	mm3,mm0		;mm3=[b:a]
	punpckldq	mm4,[ebp]	;mm4=[gi_0:fi_0]
	pfadd	mm3,mm4		;mm3=[g0:f0]=[gi_0+b:fi_0+a]
	pfsub	mm4,mm0		;mm4=[g1:f1]=[gi_0-b:fi_0-a]

	; same for (fi[k3],gi[k3]) combined with (fi[k2],gi[k2])
	movd	mm0,[edi+ecx*4]	;mm0=[0:fi_k3]
	punpckldq	mm0,[ebp+ecx*4]	;mm0=[gi_k3:fi_k3]
	movq	mm1,mm0
	pfmul	mm0,[D_s2c2]	;mm0=[ s2*gi_k3:c2*fi_k3]
	pfmul	mm1,[D_Mc2s2]	;mm1=[-c2*gi_k3:s2*fi_k3]
	pfacc	mm0,mm1		;mm0=[b:a]
	movd	mm5,[edi+ebx*4]	;mm5=[0:fi_k2]
	movq	mm6,mm0		;mm6=[b:a]
	punpckldq	mm5,[ebp+ebx*4]	;mm5=[gi_k2:fi_k2]
	pfadd	mm6,mm5		;mm6=[g2:f2]=[gi_k2+b:fi_k2+a]
	pfsub	mm5,mm0		;mm5=[g3:f3]=[gi_k2-b:fi_k2-a]

	; rotate (f2,g3) by (c1,s1); results go to fi[0],fi[k2],gi[k1],gi[k3]
	punpckldq	mm1,mm6	;mm1=[f2:*]
	movq	mm0,[D_s1c1]	;mm0=[s1:c1]
	punpckhdq	mm1,mm5	;mm1=[g3:f2]
	pfmul	mm0,mm1		;mm0=[ s1*g3:c1*f2]
	movq	mm2,[D_Mc1s1]	;mm2=[-c1:s1]
	pfmul	mm2,mm1		;mm2=[-c1*g3:s1*f2]
	pfacc	mm0,mm2		;mm0=[b:a]

	punpckldq	mm1,mm3	;mm1=[f0:*]
	punpckhdq	mm1,mm4	;mm1=[g1:f0]
	movq	mm2,mm0		;mm2=[b:a]
	pfadd	mm0,mm1		;mm0=[g1+b:f0+a]
	pfsubr	mm2,mm1		;mm2=[g1-b:f0-a]  (reverse subtract: mm1-mm2)
	movd	[edi],mm0	;fi[0]=f0+a
	psrlq	mm0,32		;mm0=[0:g1+b]
	movd	[edi+ebx*4],mm2	;fi[k2]=f0-a
	psrlq	mm2,32		;mm2=[0:g1-b]
	movd	[ebp+eax*4],mm0	;gi[k1]=g1+b
	movd	[ebp+ecx*4],mm2	;gi[k3]=g1-b

	; rotate (f3,g2) by (c1,s1); results go to gi[0],gi[k2],fi[k3],fi[k1]
	psrlq	mm6,32		;mm6=[0:g2]
	movq	mm0,[D_s1c1]	;mm0=[s1:c1]
	punpckldq	mm5,mm6	;mm5=[g2:f3]
	pfmul	mm0,mm5		;mm0=[g2* s1:f3*c1]
	pfmul	mm5,[D_Mc1s1]	;mm5=[g2*-c1:f3*s1]
	pfacc	mm0,mm5		;mm0=[-b:a]
	psrlq	mm3,32		;mm3=[0:g0]
	movq	mm1,mm0		;mm1=[-b:a]
	punpckldq	mm3,mm4	;mm3=[f1:g0]
	pfadd	mm0,mm3		;mm0=[f1-b:g0+a]
	pfsubr	mm1,mm3		;mm1=[f1+b:g0-a]  (reverse subtract: mm3-mm1)
	movd	[ebp],mm0	;gi[0]=g0+a
	psrlq	mm0,32		;mm0=[0:f1-b]
	movd	[ebp+ebx*4],mm1	;gi[k2]=g0-a
	psrlq	mm1,32		;mm1=[0:f1+b]
	movd	[edi+ecx*4],mm0	;fi[k3]=f1-b
	movd	[edi+eax*4],mm1	;fi[k1]=f1+b

	lea	edi,[edi+edx*4]	;fi += k4
	lea	ebp,[ebp+edx*4]	;gi += k4
	cmp	edi,fn
	jc	near .lp33	;}while(fi<fn)
	inc	esi
	cmp	esi,kx		;kx >= 2 in every pass, so i reaches kx exactly
	jnz	near .lp32	;}
	cmp	edx,[esp+_P+8]
	jc	near .lp30	;}while(k4<n)  unsigned compare, so k4 >= n always ends the loop


.exit:
	add	esp,LOCAL_STACK
	femms			;leave MMX/3DNow! state before returning
	pop	ebp
	pop	edi
	pop	esi
	pop	ebx
	ret
268