; from a new GOGO-no-coda (1999/09)
;	Copyright (C) 1999 shigeo
;	special thanks to Keiichi SAKAI, URURI
; hacked and back-ported to LAME
;	 by Takehiro TOMINAGA Nov 2000

%include "nasm.h"

	globaldef fht_3DN

;-----------------------------------------------------------------------
; Constant data used by the fht_* routines in this file.
;
; costab layout (byte offsets):
;   +0   low dword 0x80000000 (IEEE-754 sign bit), high dword 0.
;        XORing it into an MMX register negates only the low float.
;   +8   1.414213562 (~sqrt(2)) in both dwords; loaded into mm6 for the
;        .do2 loops.
;   +16  cos/sin rows, every value stored twice.  Each .do1 pass
;        advances its table pointer by 16 bytes, i.e. by one cos row
;        plus one sin row.
;
; D_1_0_0_0: low dword 0.0, high dword 1.0; subtracted during the
;            double-angle twiddle setup at the top of the .for loops.
;-----------------------------------------------------------------------
	segment_data
	align	16
costab	dd	0x80000000, 0		; sign mask: high = 0, low = sign bit
	dd	1.414213562,1.414213562	; ~sqrt(2)
	dd	9.238795283293805e-01, 9.238795283293805e-01	; cos(pi/8)
	dd	3.826834424611044e-01, 3.826834424611044e-01	; sin(pi/8)
	dd	9.951847264044178e-01, 9.951847264044178e-01	; cos(pi/32)
	dd	9.801714304836734e-02, 9.801714304836734e-02	; sin(pi/32)
	dd	9.996988186794428e-01, 9.996988186794428e-01	; cos(pi/128)
	dd	2.454122920569705e-02, 2.454122920569705e-02	; sin(pi/128)
	dd	9.999811752815535e-01, 9.999811752815535e-01	; cos(pi/512)
	dd	6.135884819898878e-03, 6.135884819898878e-03	; sin(pi/512)
D_1_0_0_0	dd	0.0		, 1.0	; high = 1.0, low = 0.0

	segment_code

PIC_OFFSETTABLE


;-----------------------------------------------------------------------
; void fht_3DN(float *fz, int nn);
;
; In-place transform of the float array fz using MMX/3DNow! instructions
; (pfadd / pfsub / pfmul, finished with femms).
; NOTE(review): the name suggests a fast Hartley transform; confirm
; against the C caller.
;
; In:    two stack arguments.  After the four register pushes and the
;        20 bytes of locals they are read at [esp+40] (fz) and
;        [esp+44] (nn).
; Out:   results are stored back into fz; the loops run until the
;        working pointer reaches fz + nn*8 bytes.
; Stack: [esp+0]  = saved twiddle pair  s2 |  c2
;        [esp+8]  = saved twiddle pair -c2 |  s2
;        [esp+16] = end pointer (fz + nn*8)
;
; Notation in the comments: "x | y" means  high dword | low dword  of a
; 64-bit MMX register.
; r0..r5, pmov, pmovd, pupldq, puphdq, pushd/popd, proc/endproc,
; loopalign and the PIC_* helpers come from nasm.h.  The lane comments
; assume pmov/pmovd are 64/32-bit moves and pupldq/puphdq behave like
; punpckldq/punpckhdq -- TODO confirm against nasm.h.
;-----------------------------------------------------------------------

proc	fht_3DN

	pushd	ebp, ebx, esi, edi

	sub	esp, 20			;locals, see "Stack" above

	call	get_pc.bp
	add	ebp, PIC_BASE()		;ebp = base for PIC_EBP_REL() references

	mov	r0, [esp+40]		;fi = fz
	mov	r1, [esp+44]		;r1 = nn
	lea	r3, [PIC_EBP_REL(costab)]		;tri = costab
	lea	r4, [r0+r1*8]		;r4 = fn = fz + nn*8 bytes (end of data)
	mov	[esp+16], r4
	mov	r4, 8			;kx = k1/2 (as a byte offset)

	pmov	mm7, [r3]		;mm7 = 0 | 0x80000000 (negates the low float)

	loopalign 16
.do1:
	lea	r3, [r3+16]	;tri += 2 rows (next cos/sin pair)
	pmov	mm6, [PIC_EBP_REL(costab+8)]	;mm6 = SQRT2 | SQRT2
	lea	r2, [r4+r4*2]		;k3*fsize/2
	mov	r5, 4		;i = 1*fsize

	loopalign 16
.do2:
	lea	r1, [r0+r4]		;gi = fi + kx
	;f
	pmov	mm0, [r0]	;fi0
	pmov	mm1, [r0+r4*2]	;fi1
	pmov	mm2, [r0+r2*2]	;fi3
	pmov	mm3, [r0+r4*4]	;fi2

	pupldq	mm0, mm0	;fi0 | fi0
	pupldq	mm1, mm1	;fi1 | fi1
	pupldq	mm2, mm2	;fi3 | fi3
	pupldq	mm3, mm3	;fi2 | fi2

	pxor	mm1, mm7	;fi1 | -fi1
	pxor	mm3, mm7	;fi2 | -fi2

	pfsub	mm0, mm1	;fi0-fi1 | fi0+fi1 = f1 | f0
	pfsub	mm2, mm3	;fi3-fi2 | fi3+fi2 = f3 | f2

	pmov	mm4, mm0
	pfadd	mm0, mm2	;f1+f3|f0+f2 = new fi[k3] | fi[0]
	pfsub	mm4, mm2	;f1-f3|f0-f2 = new fi[k1] | fi[k2]

	pmovd	[r0], mm0	;fi[0]
	puphdq	mm0, mm0	;bring the high lane down for the 32-bit store
	pmovd	[r0+r4*4], mm4	;fi[k2]
	puphdq	mm4, mm4

	pmovd	[r0+r4*2], mm4	;fi[k1]
	pmovd	[r0+r2*2], mm0	;fi[k3]
	lea	r0, [r0+r4*8]	;fi += k4

	;g
	pmov	mm0, [r1]	;gi0
	pmov	mm1, [r1+r4*2]	;gi1
	pmov	mm2, [r1+r4*4]	;gi2
	pmov	mm3, [r1+r2*2]	;gi3

	pupldq	mm1, mm1	;gi1 | gi1
	pupldq	mm0, mm0	;gi0 | gi0
	pupldq	mm2, mm3	;gi3 | gi2

	pxor	mm1, mm7	;gi1 | -gi1

	pfsub	mm0, mm1	;gi0-gi1|gi0+gi1 = g1 | g0
	pfmul	mm2, mm6	;gi3*SQRT2|gi2*SQRT2 = g3 | g2

	pmov	mm4, mm0
	pfadd	mm0, mm2	;g1+g3|g0+g2 = gi1 | gi0
	pfsub	mm4, mm2	;g1-g3|g0-g2 = gi3 | gi2

	pmovd	[r1], mm0	;gi[0]
	puphdq	mm0, mm0
	pmovd	[r1+r4*4], mm4	;gi[k2]
	puphdq	mm4, mm4

	cmp	r0, [esp + 16]	;fi < fn ?  (the MMX stores below leave EFLAGS alone)
	pmovd	[r1+r4*2], mm0	;gi[k1]
	pmovd	[r1+r2*2], mm4	;gi[k3]

	jb near .do2

	; r5 is still 4 here, so this 8-byte load straddles two table rows:
	; low dword = second copy of the cos value, high dword = first copy
	; of the sin value.
	pmov	mm6, [r3+r5]	; this is not aligned address!!

	loopalign 16
.for:
;
; on entry (hi | lo):
; mm6 = s1 | c1
; mm7 = 0  | 0x80000000
;
	pmov	mm1, mm6
	mov	r0, [esp+40]	; fz
	puphdq	mm1, mm1	; s1 | s1
	lea	r1, [r0+r4*2]
	pfadd	mm1, mm1	; 2*s1 | 2*s1
	pfmul	mm1, mm6	; 2*s1*s1 | 2*s1*c1
	pfsub	mm1, [PIC_EBP_REL(D_1_0_0_0)] ; 2*s1*s1-1.0 | 2*s1*c1 = -c2 | s2

	pmov	mm0, mm1	; -c2 | s2
	pxor	mm7, mm6	; s1 | -c1

	pupldq	mm2, mm0	; s2 | **
	pupldq	mm3, mm6	; c1 | **
	puphdq	mm0, mm2	; s2 | -c2
	puphdq	mm6, mm3	; c1 | s1

	pxor	mm0, [PIC_EBP_REL(costab)]	; s2 | c2 (sign of the low lane flipped)

; mm0 =  s2| c2
; mm1 = -c2| s2
; mm6 =  c1| s1
; mm7 =  s1|-c1 (we use the opposite sign. from GOGO here)

	pmov	[esp], mm0	;keep both pairs; .do3 reuses mm0/mm1 as scratch
	pmov	[esp+8], mm1

	sub	r1, r5		;r1 = gi
	add	r0, r5		;r0 = fi

	loopalign 16
.do3:
	pmov	mm2, [r0+r4*2] ; fi[k1]
	pmov	mm4, [r1+r4*2] ; gi[k1]
	pmov	mm3, [r0+r2*2] ; fi[k3]
	pmov	mm5, [r1+r2*2] ; gi[k3]

	pupldq	mm2, mm2	; fi1 | fi1
	pupldq	mm4, mm4	; gi1 | gi1
	pupldq	mm3, mm3	; fi3 | fi3
	pupldq	mm5, mm5	; gi3 | gi3

	pfmul	mm2, mm0	; s2 * fi1 | c2 * fi1
	pfmul	mm4, mm1	;-c2 * gi1 | s2 * gi1
	pfmul	mm3, mm0	; s2 * fi3 | c2 * fi3
	pfmul	mm5, mm1	;-c2 * gi3 | s2 * gi3

	pfadd	mm2, mm4		;b | a
	pfadd	mm3, mm5		;d | c

	pmov	mm0, [r0]
	pmov	mm4, [r1]
	pmov	mm1, [r0+r4*4]
	pmov	mm5, [r1+r4*4]

	pupldq	mm0, mm4		;gi0 | fi0
	pupldq	mm1, mm5		;gi2 | fi2

	pmov	mm4, mm2
	pmov	mm5, mm3

	pfadd	mm2, mm0		;g0 | f0
	pfadd	mm3, mm1		;g2 | f2

	pfsub	mm0, mm4		;g1 | f1
	pfsub	mm1, mm5		;g3 | f3

	pmov	mm4, mm3
	pmov	mm5, mm1

	pupldq	mm4, mm4		;f2 | f2
	puphdq	mm5, mm5		;g3 | g3
	puphdq	mm3, mm3		;g2 | g2
	pupldq	mm1, mm1		;f3 | f3

	pfmul	mm4, mm6		;f2 * c1 | f2 * s1
	pfmul	mm5, mm7		;g3 * s1 | g3 *-c1
	pfmul	mm3, mm6		;g2 * c1 | g2 * s1
	pfmul	mm1, mm7		;f3 * s1 | f3 *-c1

	pfadd	mm4, mm5		;a | b
	pfsub	mm3, mm1		;d | c

	pmov	mm5, mm2
	pmov	mm1, mm0

	pupldq	mm2, mm2		;f0 | f0
	pupldq	mm0, mm0		;f1 | f1

	puphdq	mm1, mm2		;f0 | g1
	puphdq	mm5, mm0		;f1 | g0

	pmov	mm2, mm4
	pmov	mm0, mm3

	pfadd	mm4, mm1		;fi0 | gi1
	pfadd	mm3, mm5		;fi1 | gi0
	pfsub	mm1, mm2		;fi2 | gi3
	pfsub	mm5, mm0		;fi3 | gi2

	pmovd	[r1+r4*2], mm4	;gi[k1]
	puphdq	mm4, mm4
	pmovd	[r1], mm3		;gi[0]
	puphdq	mm3, mm3
	pmovd	[r1+r2*2], mm1	;gi[k3]
	puphdq	mm1, mm1
	pmovd	[r1+r4*4], mm5	;gi[k2]
	puphdq	mm5, mm5

	pmovd	[r0], mm4	;fi[0]
	pmovd	[r0+r4*2], mm3	;fi[k1]
	pmovd	[r0+r4*4], mm1	;fi[k2]
	pmovd	[r0+r2*2], mm5	;fi[k3]

	lea	r0, [r0+r4*8]	;fi += k4
	lea	r1, [r1+r4*8]	;gi += k4
	cmp	r0, [esp + 16]	;fi < fn ?
	pmov	mm0, [esp]	;reload  s2 | c2
	pmov	mm1, [esp+8]	;reload -c2 | s2

	jb near	.do3

	add	r5, 4		;i += 1*fsize
; mm6 =  c1| s1
; mm7 =  s1|-c1 (we use the opposite sign. from GOGO here)
; rotate (c1, s1) by the table angle: a = [r3] (cos row), b = [r3+8] (sin row)
	pfmul	mm6, [r3]	; c1*a | s1*a
	pfmul	mm7, [r3+8]	; s1*b |-c1*b
	cmp	r5, r4		;i < kx ?  (flags survive the MMX ops below)

	pfsub	mm6, mm7	; c1*a-s1*b | s1*a+c1*b = new c1 | new s1
	pupldq	mm7,mm6		; new s1 | **
	puphdq	mm6,mm7		; new s1 | new c1 (layout expected at .for)
	pmov	mm7, [PIC_EBP_REL(costab)]	; restore the sign mask
	jb near	.for

	mov	r0, [esp+40]	;fi = fz
	cmp	r4, [esp+40+4]	;old kx (bytes) < nn ?  (lea keeps the flags)
	lea	r4, [r4*4]	;kx *= 4

	jb near	.do1
.exitttt:
	femms			;leave MMX/3DNow! state
	add	esp,20
	popd	ebp, ebx, esi, edi
endproc


;-----------------------------------------------------------------------
; void fht_E3DN(float *fz, int nn);
;
; Same loop structure and stack layout as fht_3DN, but the butterflies
; use the additional 3DNow! instructions pfpnacc and pswapd.
; NOTE(review): the name suggests "extended 3DNow!" and a fast Hartley
; transform; confirm against the C caller.
; NOTE(review): only fht_3DN appears on the globaldef line of this file;
; whether proc exports fht_E3DN depends on nasm.h -- confirm.
;
; In:    two stack arguments.  After the four register pushes and the
;        20 bytes of locals they are read at [esp+40] (fz) and
;        [esp+44] (nn).
; Out:   results are stored back into fz; the loops run until the
;        working pointer reaches fz + nn*8 bytes.
; Stack: [esp+0]  = saved twiddle pair  s2 |  c2
;        [esp+8]  = saved twiddle pair -c2 |  s2
;        [esp+16] = end pointer (fz + nn*8)
;
; Notation in the comments: "x | y" means  high dword | low dword.
;   pfpnacc d, s : d.lo = d.lo - d.hi ; d.hi = s.lo + s.hi
;   pswapd  d, s : d = s with its two dwords exchanged
; r0..r5, pmov, pmovd, pupldq, puphdq, pushd/popd, proc/endproc,
; loopalign and the PIC_* helpers come from nasm.h.  The lane comments
; assume pmov/pmovd are 64/32-bit moves and pupldq/puphdq behave like
; punpckldq/punpckhdq -- TODO confirm against nasm.h.
;-----------------------------------------------------------------------

proc	fht_E3DN

	pushd	ebp, ebx, esi, edi

	sub	esp, 20			;locals, see "Stack" above

	call	get_pc.bp
	add	ebp, PIC_BASE()		;ebp = base for PIC_EBP_REL() references

	mov	r0, [esp+40]		;fi = fz
	mov	r1, [esp+44]		;r1 = nn
	lea	r3, [PIC_EBP_REL(costab)]		;tri = costab
	lea	r4, [r0+r1*8]		;r4 = fn = fz + nn*8 bytes (end of data)
	mov	[esp+16], r4
	mov	r4, 8			;kx = k1/2 (as a byte offset)

	pmov	mm7, [r3]		;mm7 = 0 | 0x80000000 (negates the low float)

	loopalign 16
.do1:
	lea	r3, [r3+16]	;tri += 2 rows (next cos/sin pair)
	pmov	mm6, [PIC_EBP_REL(costab+8)]	;mm6 = SQRT2 | SQRT2
	lea	r2, [r4+r4*2]		;k3*fsize/2
	mov	r5, 4		;i = 1*fsize

	loopalign 16
.do2:
	lea	r1, [r0+r4]		;gi = fi + kx
;f
	pmov	mm0, [r0]	; X  | fi0
	pmov	mm1, [r0+r4*4]	; X  | fi2
	pupldq	mm0, [r0+r4*2]	;fi1 | fi0
	pupldq	mm1, [r0+r2*2]	;fi3 | fi2
	pfpnacc	mm0, mm0	;fi0+fi1 | fi0-fi1 = f0|f1
	pfpnacc	mm1, mm1	;fi2+fi3 | fi2-fi3 = f2|f3

	pmov	mm2, mm0
	pfadd	mm0, mm1	;f0+f2|f1+f3 = fi0 | fi1
	pfsub	mm2, mm1	;f0-f2|f1-f3 = fi2 | fi3

	pmovd	[r0+r4*2], mm0	;fi[k1]
	pmovd	[r0+r2*2], mm2	;fi[k3]

	puphdq	mm0, mm0	;bring the high lanes down for the 32-bit stores
	puphdq	mm2, mm2
	pmovd	[r0], mm0	;fi[0]
	pmovd	[r0+r4*4], mm2	;fi[k2]

	lea	r0, [r0+r4*8]	;fi += k4
;g
	pmov	mm3, [r1]	;    gi0
	pmov	mm4, [r1+r2*2]	;    gi3
	pupldq	mm3, [r1+r4*2]	;gi1|gi0
	pupldq	mm4, [r1+r4*4]	;gi2|gi3

	pfpnacc	mm3, mm3	;gi0+gi1  |gi0-gi1   = f0|f1
	pfmul	mm4, mm6	;gi2*SQRT2|gi3*SQRT2 = f2|f3

	pmov	mm5, mm3
	pfadd	mm3, mm4	;f0+f2|f1+f3 = gi0 | gi1
	pfsub	mm5, mm4	;f0-f2|f1-f3 = gi2 | gi3

	cmp	r0, [esp + 16]	;fi < fn ?  (the MMX ops below leave EFLAGS alone)
	pmovd	[r1+r4*2], mm3	;gi[k1]
	pmovd	[r1+r2*2], mm5	;gi[k3]
	puphdq	mm3, mm3
	puphdq	mm5, mm5
	pmovd	[r1], mm3	;gi[0]
	pmovd	[r1+r4*4], mm5	;gi[k2]

	jb near .do2

	; r5 is still 4 here, so this 8-byte load straddles two table rows:
	; low dword = second copy of the cos value, high dword = first copy
	; of the sin value.
	pmov	mm6, [r3+r5]	; this is not aligned address!!

	loopalign 16
.for:
;
; on entry (hi | lo):
; mm6 = s1 | c1
; mm7 = 0  | 0x80000000
;
	pmov	mm5, mm6
	mov	r0, [esp+40]	; fz
	puphdq	mm5, mm5	; s1 | s1
	lea	r1, [r0+r4*2]
	pfadd	mm5, mm5	; 2*s1 | 2*s1
	pfmul	mm5, mm6	; 2*s1*s1 | 2*s1*c1
	pfsub	mm5, [PIC_EBP_REL(D_1_0_0_0)] ; 2*s1*s1-1.0 | 2*s1*c1 = -c2 | s2

	pswapd	mm4, mm5	; s2 |-c2
	pxor	mm4, mm7	; s2 | c2
	pxor	mm7, mm6	; s1 |-c1
	pswapd	mm6, mm6	; c1 | s1

; mm4 =  s2| c2
; mm5 = -c2| s2
; mm6 =  c1| s1
; mm7 =  s1|-c1 (we use the opposite sign. from GOGO here)

	pmov	[esp], mm4	;keep both pairs; .do3 reuses mm4/mm5 as scratch
	pmov	[esp+8], mm5

	sub	r1, r5		;r1 = gi
	add	r0, r5		;r0 = fi

	loopalign 16
.do3:
	pmov	mm0, [r0+r2*2] ; fi[k3]
	pmov	mm2, [r1+r2*2] ; gi[k3]
	pmov	mm1, [r0+r4*2] ; fi[k1]
	pmov	mm3, [r1+r4*2] ; gi[k1]

	pupldq	mm0, mm0	; fi3 | fi3
	pupldq	mm2, mm2	; gi3 | gi3
	pupldq	mm1, mm1	; fi1 | fi1
	pupldq	mm3, mm3	; gi1 | gi1

	pfmul	mm0, mm4	; s2 * fi3 | c2 * fi3
	pfmul	mm2, mm5	;-c2 * gi3 | s2 * gi3
	pfmul	mm1, mm4	; s2 * fi1 | c2 * fi1
	pfmul	mm3, mm5	;-c2 * gi1 | s2 * gi1

	pfadd	mm0, mm2		;d | c
	pfadd	mm1, mm3		;b | a

	pmov	mm2, [r0+r4*4]		;fi2
	pupldq	mm3, [r1+r4*4]		;gi2 | -
	pmov	mm4, [r0]		;fi0
	pupldq	mm5, [r1]		;gi0 | -

	pupldq	mm2, mm0		;c | fi2
	puphdq	mm3, mm0		;d | gi2
	pupldq	mm4, mm1		;a | fi0
	puphdq	mm5, mm1		;b | gi0

	pfpnacc	mm2, mm2		;f2 | f3
	pfpnacc	mm3, mm3		;g2 | g3
	pfpnacc	mm4, mm4		;f0 | f1
	pfpnacc	mm5, mm5		;g0 | g1

	pmov	mm0, mm2
	pmov	mm1, mm3
	pupldq	mm2, mm2		;f3 | f3
	pupldq	mm3, mm3		;g3 | g3
	puphdq	mm0, mm0		;f2 | f2
	puphdq	mm1, mm1		;g2 | g2

	pswapd	mm4, mm4		;f1 | f0
	pswapd	mm5, mm5		;g1 | g0

	pfmul	mm0, mm7		;f2 * s1 | f2 *-c1
	pfmul	mm3, mm6		;g3 * c1 | g3 * s1
	pfmul	mm1, mm6		;g2 * c1 | g2 * s1
	pfmul	mm2, mm7		;f3 * s1 | f3 *-c1

	pfsub	mm0, mm3		; b |-a
	pfsub	mm1, mm2		; d | c

	pmov	mm2, mm5
	pmov	mm3, mm4
	pupldq	mm4, mm0		;-a | f0
	pupldq	mm5, mm1		; c | g0
	puphdq	mm2, mm0		; b | g1
	puphdq	mm3, mm1		; d | f1

	pfpnacc	mm4, mm4		;fi2 | fi0
	pfpnacc	mm5, mm5		;gi0 | gi2
	pfpnacc	mm2, mm2		;gi1 | gi3
	pfpnacc	mm3, mm3		;fi1 | fi3

	pmovd	[r0], mm4		;fi[0]
	pmovd	[r1+r4*4], mm5		;gi[k2]
	pmovd	[r1+r2*2], mm2		;gi[k3]
	pmovd	[r0+r2*2], mm3		;fi[k3]

	puphdq	mm4, mm4
	puphdq	mm5, mm5
	puphdq	mm2, mm2
	puphdq	mm3, mm3
	pmovd	[r0+r4*4], mm4		;fi[k2]
	pmovd	[r1], mm5		;gi[0]
	pmovd	[r1+r4*2], mm2		;gi[k1]
	pmovd	[r0+r4*2], mm3		;fi[k1]

	lea	r0, [r0+r4*8]	;fi += k4
	lea	r1, [r1+r4*8]	;gi += k4
	cmp	r0, [esp + 16]	;fi < fn ?
	pmov	mm4, [esp]	;reload  s2 | c2
	pmov	mm5, [esp+8]	;reload -c2 | s2

	jb near	.do3

	add	r5, 4		;i += 1*fsize
; mm6 =  c1| s1
; mm7 =  s1|-c1 (we use the opposite sign. from GOGO here)
; rotate (c1, s1) by the table angle: a = [r3] (cos row), b = [r3+8] (sin row)
	pfmul	mm6, [r3]	; c1*a | s1*a
	pfmul	mm7, [r3+8]	; s1*b |-c1*b
	cmp	r5, r4		;i < kx ?  (flags survive the MMX ops below)

	pfsub	mm6, mm7	; c1*a-s1*b | s1*a+c1*b = new c1 | new s1
	; the swap puts the pair back into the "s1 | c1" layout that the
	; top of .for expects (same layout as the initial [r3+r5] load)
	pswapd	mm6, mm6	; s1*a+c1*b | c1*a-s1*b
	pmov	mm7, [PIC_EBP_REL(costab)]	; restore the sign mask
	jb near	.for

	mov	r0, [esp+40]	;fi = fz
	cmp	r4, [esp+40+4]	;old kx (bytes) < nn ?  (lea keeps the flags)
	lea	r4, [r4*4]	;kx *= 4

	jb near	.do1
.exitttt:
	femms			;leave MMX/3DNow! state
	add	esp,20
	popd	ebp, ebx, esi, edi
endproc
489