1;
2;    (C) Frank Klemm 1995,99,2000
3;    Dedicated to the LAME project
4;
5;
6        %include "nasm.h"
7
8        segment_code
9        
;-----------------------------------------------------------------------
; float_t  scalar04_float32_i387 (
;         const float32_t* const  p,
;         const float32_t* const  q );
;
; x87 dot product of 4 consecutive 32-bit floats at p and q.
; Products are added to the running sum one at a time, in ascending
; index order.  The sum is left on top of the FPU stack.
; Overwrites eax and edx.
;-----------------------------------------------------------------------
proc    scalar04_float32_i387
%$p     arg     4
%$q     arg     4

        mov     eax,[sp(%$p)]           ; eax = p
        mov     edx,[sp(%$q)]           ; edx = q

        fld     dword [eax]             ; sum = p[0] * q[0]
        fmul    dword [edx]

        ; Elements 1..3.  %rep expands at assembly time into the fully
        ; unrolled sequence; no run-time loop is generated.
%assign %$disp  4
%rep    3
        fld     dword [eax + %$disp]
        fmul    dword [edx + %$disp]
        faddp   st1,st0                 ; sum += p[i] * q[i]
%assign %$disp  %$disp + 4
%endrep
endproc
33
34
;-----------------------------------------------------------------------
; scalar08_float32_i387 (p, q)
;
; x87 dot product of 8 consecutive 32-bit floats at p and q.
; Products are accumulated one at a time in ascending index order;
; the sum is left on top of the FPU stack.  Overwrites eax and edx.
;-----------------------------------------------------------------------
proc    scalar08_float32_i387
%$p     arg     4
%$q     arg     4

        mov     eax,[sp(%$p)]           ; eax = p
        mov     edx,[sp(%$q)]           ; edx = q

        fld     dword [eax]             ; sum = p[0] * q[0]
        fmul    dword [edx]

        ; Elements 1..7, unrolled at assembly time.
%assign %$disp  4
%rep    7
        fld     dword [eax + %$disp]
        fmul    dword [edx + %$disp]
        faddp   st1,st0                 ; sum += p[i] * q[i]
%assign %$disp  %$disp + 4
%endrep
endproc
66
67
;-----------------------------------------------------------------------
; scalar12_float32_i387 (p, q)
;
; x87 dot product of 12 consecutive 32-bit floats at p and q.
; Products are accumulated one at a time in ascending index order;
; the sum is left on top of the FPU stack.  Overwrites eax and edx.
;-----------------------------------------------------------------------
proc    scalar12_float32_i387
%$p     arg     4
%$q     arg     4

        mov     eax,[sp(%$p)]           ; eax = p
        mov     edx,[sp(%$q)]           ; edx = q

        fld     dword [eax]             ; sum = p[0] * q[0]
        fmul    dword [edx]

        ; Elements 1..11, unrolled at assembly time.
%assign %$disp  4
%rep    11
        fld     dword [eax + %$disp]
        fmul    dword [edx + %$disp]
        faddp   st1,st0                 ; sum += p[i] * q[i]
%assign %$disp  %$disp + 4
%endrep
endproc
111
112
;-----------------------------------------------------------------------
; scalar16_float32_i387 (p, q)
;
; x87 dot product of 16 consecutive 32-bit floats at p and q.
; Products are accumulated one at a time in ascending index order;
; the sum is left on top of the FPU stack.  Overwrites eax and edx.
;-----------------------------------------------------------------------
proc    scalar16_float32_i387
%$p     arg     4
%$q     arg     4

        mov     eax,[sp(%$p)]           ; eax = p
        mov     edx,[sp(%$q)]           ; edx = q

        fld     dword [eax]             ; sum = p[0] * q[0]
        fmul    dword [edx]

        ; Elements 1..15, unrolled at assembly time.
%assign %$disp  4
%rep    15
        fld     dword [eax + %$disp]
        fmul    dword [edx + %$disp]
        faddp   st1,st0                 ; sum += p[i] * q[i]
%assign %$disp  %$disp + 4
%endrep
endproc
168
169
;-----------------------------------------------------------------------
; scalar20_float32_i387 (p, q)
;
; x87 dot product of 20 consecutive 32-bit floats at p and q.
; Products are accumulated one at a time in ascending index order;
; the sum is left on top of the FPU stack.  Overwrites eax and edx.
;-----------------------------------------------------------------------
proc    scalar20_float32_i387
%$p     arg     4
%$q     arg     4

        mov     eax,[sp(%$p)]           ; eax = p
        mov     edx,[sp(%$q)]           ; edx = q

        fld     dword [eax]             ; sum = p[0] * q[0]
        fmul    dword [edx]

        ; Elements 1..19, unrolled at assembly time.
%assign %$disp  4
%rep    19
        fld     dword [eax + %$disp]
        fmul    dword [edx + %$disp]
        faddp   st1,st0                 ; sum += p[i] * q[i]
%assign %$disp  %$disp + 4
%endrep
endproc
237
238
;-----------------------------------------------------------------------
; scalar24_float32_i387 (p, q)
;
; x87 dot product of 24 consecutive 32-bit floats at p and q.
; Products are accumulated one at a time in ascending index order;
; the sum is left on top of the FPU stack.  Overwrites eax and edx.
;-----------------------------------------------------------------------
proc    scalar24_float32_i387
%$p     arg     4
%$q     arg     4

        mov     eax,[sp(%$p)]           ; eax = p
        mov     edx,[sp(%$q)]           ; edx = q

        fld     dword [eax]             ; sum = p[0] * q[0]
        fmul    dword [edx]

        ; Elements 1..23, unrolled at assembly time.
%assign %$disp  4
%rep    23
        fld     dword [eax + %$disp]
        fmul    dword [edx + %$disp]
        faddp   st1,st0                 ; sum += p[i] * q[i]
%assign %$disp  %$disp + 4
%endrep
endproc
318
319
;-----------------------------------------------------------------------
; scalar32_float32_i387 (p, q)
;
; x87 dot product of 32 consecutive 32-bit floats at p and q.
; Products are accumulated one at a time in ascending index order;
; the sum is left on top of the FPU stack.  Overwrites eax and edx.
;-----------------------------------------------------------------------
proc    scalar32_float32_i387
%$p     arg     4
%$q     arg     4

        mov     eax,[sp(%$p)]           ; eax = p
        mov     edx,[sp(%$q)]           ; edx = q

        fld     dword [eax]             ; sum = p[0] * q[0]
        fmul    dword [edx]

        ; Elements 1..31 (byte offsets 4..124), unrolled at assembly time.
%assign %$disp  4
%rep    31
        fld     dword [eax + %$disp]
        fmul    dword [edx + %$disp]
        faddp   st1,st0                 ; sum += p[i] * q[i]
%assign %$disp  %$disp + 4
%endrep
endproc
423
424
;-----------------------------------------------------------------------
; float_t  scalar4n_float32_i387 (
;         const float32_t* const  p,
;         const float32_t* const  q,
;         const size_t            len );
;
; x87 dot product over len groups of four 32-bit floats: every loop
; pass consumes 16 bytes from each array and decrements len by one.
; Products are accumulated in ascending index order; the sum is left
; on top of the FPU stack.  Overwrites eax, edx and ecx.
;
; len == 0 returns 0.0 without reading p or q.  Without this guard the
; "dec ecx" below would wrap to 0xFFFFFFFF and the loop would run off
; the end of both arrays.
;-----------------------------------------------------------------------
proc    scalar4n_float32_i387
%$p     arg     4
%$q     arg     4
%$len   arg     4

        mov     eax,[sp(%$p)]           ; eax = p
        mov     edx,[sp(%$q)]           ; edx = q
        mov     ecx,[sp(%$len)]         ; ecx = number of 4-float groups

        test    ecx,ecx
        jnz     .first
        fldz                            ; empty input: result is 0.0
        jmp     .ret1

.first:
        ; First group: the initial product seeds the sum directly.
        fld     dword [eax]
        fmul    dword [edx]
        fld     dword [eax +  4]
        fmul    dword [edx +  4]
        faddp   st1,st0
        fld     dword [eax +  8]
        fmul    dword [edx +  8]
        faddp   st1,st0
        fld     dword [eax + 12]
        fmul    dword [edx + 12]
        faddp   st1,st0
        dec     ecx
        jz      .ret1                   ; only one group requested
        add     eax,byte 16
        add     edx,byte 16
.lbl1:
        ; Invariant: ecx = groups still to process (> 0), st0 = sum so far.
        fld     dword [eax]
        fmul    dword [edx]
        faddp   st1,st0
        fld     dword [eax +  4]
        fmul    dword [edx +  4]
        faddp   st1,st0
        fld     dword [eax +  8]
        fmul    dword [edx +  8]
        faddp   st1,st0
        fld     dword [eax + 12]
        fmul    dword [edx + 12]
        faddp   st1,st0
        add     eax,byte 16
        add     edx,byte 16
        dec     ecx
        jnz     .lbl1
.ret1:
endproc
473
474
;-----------------------------------------------------------------------
; float_t  scalar1n_float32_i387 (
;         const float32_t* const  p,
;         const float32_t* const  q,
;         const size_t            len );
;
; x87 dot product of len 32-bit floats (len counts single elements).
; The sum starts at 0.0, so len == 0 returns 0.0.
;
; len is consumed bit by bit:
;   bit 0 set -> one leading element is processed,
;   bit 1 set -> two further elements are processed,
;   the remaining value (len >> 2) is the number of 4-element passes.
; The sum is left on top of the FPU stack.  Overwrites eax, edx, ecx.
;-----------------------------------------------------------------------
proc    scalar1n_float32_i387
%$p     arg     4
%$q     arg     4
%$len   arg     4

        mov     eax,[sp(%$p)]           ; eax = p
        mov     edx,[sp(%$q)]           ; edx = q
        mov     ecx,[sp(%$len)]         ; ecx = element count
        fldz                            ; sum = 0.0 (standard x87 mnemonic)

        shr     ecx,1                   ; CF = bit 0 of len
        jnc     .lbl2
        fld     dword [eax]             ; odd count: take one element
        fmul    dword [edx]
        faddp   st1,st0
        add     eax,byte 4
        add     edx,byte 4
.lbl2:
        shr     ecx,1                   ; CF = bit 1 of len, ZF = (len >> 2) == 0
        jnc     .lbl3                   ; ZF from shr is still valid at .lbl3
        fld     dword [eax]             ; take two elements
        fmul    dword [edx]
        faddp   st1,st0
        fld     dword [eax + 4]
        fmul    dword [edx + 4]
        faddp   st1,st0
        add     eax,byte 8
        add     edx,byte 8
        test    ecx,ecx                 ; the adds clobbered ZF; recompute it
.lbl3:
        jz      .ret2                   ; no full groups of four left
.lbl4:
        ; Invariant: ecx = 4-element groups remaining (> 0).
        fld     dword [eax]
        fmul    dword [edx]
        faddp   st1,st0
        fld     dword [eax +  4]
        fmul    dword [edx +  4]
        faddp   st1,st0
        fld     dword [eax +  8]
        fmul    dword [edx +  8]
        faddp   st1,st0
        fld     dword [eax + 12]
        fmul    dword [edx + 12]
        faddp   st1,st0
        add     eax,byte 16
        add     edx,byte 16
        dec     ecx
        jnz     .lbl4
.ret2:
endproc
530
531
;-----------------------------------------------------------------------
; scalar04_float32_3DNow (p, q)
;
; 3DNow! dot product of 4 consecutive 32-bit floats at p and q.
; mm0 and mm1 each hold two packed products; they are added lane-wise,
; the resulting pair is written to the stack and its two halves are
; summed with x87.  The sum is left on top of the FPU stack.
;
; The 8-byte store starts at the p argument slot, so the incoming
; argument values on the stack are overwritten.
; pmov is a macro defined outside this file (presumably a 64-bit MMX
; move -- confirm in nasm.h).
; Overwrites eax, edx, mm0, mm1.
;-----------------------------------------------------------------------
proc    scalar04_float32_3DNow
%$p     arg     4
%$q     arg     4
        mov     edx,[sp(%$q)]           ; edx = q
        mov     eax,[sp(%$p)]           ; eax = p

        pmov    mm1,qword [eax+8]       ; mm1 = p[2..3]
        pmov    mm0,qword [eax]         ; mm0 = p[0..1]
        pfmul   mm1,qword [edx+8]       ; mm1 *= q[2..3]
        pfmul   mm0,qword [edx]         ; mm0 *= q[0..1]

        pfadd   mm0,mm1                 ; two partial sums in mm0
        pmov    qword [sp(%$p)],mm0     ; spill both lanes to the stack
        femms                           ; leave MMX state before using x87
        fld     dword [sp(%$p)]
        fadd    dword [sp(%$p)+4]       ; st0 = low lane + high lane
endproc
549
550
;-----------------------------------------------------------------------
; scalar08_float32_3DNow (p, q)
;
; 3DNow! dot product of 8 consecutive 32-bit floats at p and q.
; Two packed accumulators are used: mm0 collects the products at byte
; offsets 0 and 16, mm1 those at offsets 8 and 24.  They are merged,
; written to the stack (over the p/q argument slots) and the two lanes
; are summed with x87.  The sum is left on top of the FPU stack.
; pmov is a macro defined outside this file (presumably a 64-bit MMX
; move -- confirm in nasm.h).
; Overwrites eax, edx, mm0..mm3.
;-----------------------------------------------------------------------
proc    scalar08_float32_3DNow
%$p     arg     4
%$q     arg     4
        mov     edx,[sp(%$q)]           ; edx = q
        mov     eax,[sp(%$p)]           ; eax = p

        ; accumulator mm0: offsets 0 and 16
        pmov    mm0,qword [eax]
        pfmul   mm0,qword [edx]
        pmov    mm2,qword [eax+16]
        pfmul   mm2,qword [edx+16]
        pfadd   mm0,mm2

        ; accumulator mm1: offsets 8 and 24
        pmov    mm1,qword [eax+8]
        pfmul   mm1,qword [edx+8]
        pmov    mm3,qword [eax+24]
        pfmul   mm3,qword [edx+24]
        pfadd   mm1,mm3

        pfadd   mm0,mm1                 ; merge the two accumulators
        pmov    qword [sp(%$p)],mm0     ; spill both lanes to the stack
        femms                           ; leave MMX state before using x87
        fld     dword [sp(%$p)]
        fadd    dword [sp(%$p)+4]       ; st0 = low lane + high lane
endproc
575
576
;-----------------------------------------------------------------------
; scalar12_float32_3DNow (p, q)
;
; 3DNow! dot product of 12 consecutive 32-bit floats at p and q.
; mm0/mm1 are packed accumulators seeded from the first 16 bytes; each
; further 16-byte group is multiplied in mm2/mm3 and added to them.
; The merged pair is written to the stack (over the p/q argument
; slots) and its two lanes are summed with x87; the sum is left on top
; of the FPU stack.
; pmov is a macro defined outside this file (presumably a 64-bit MMX
; move -- confirm in nasm.h).
; Overwrites eax, edx, mm0..mm3.
;-----------------------------------------------------------------------
proc    scalar12_float32_3DNow
%$p     arg     4
%$q     arg     4
        mov     eax,[sp(%$p)]           ; eax = p
        mov     edx,[sp(%$q)]           ; edx = q

        pmov    mm0,qword [eax]
        pmov    mm1,qword [eax+8]
        pfmul   mm0,qword [edx]
        pfmul   mm1,qword [edx+8]

        ; Remaining 16-byte groups, unrolled at assembly time.
%assign %$disp  16
%rep    2
        pmov    mm2,qword [eax + %$disp]
        pmov    mm3,qword [eax + %$disp + 8]
        pfmul   mm2,qword [edx + %$disp]
        pfmul   mm3,qword [edx + %$disp + 8]
        pfadd   mm0,mm2
        pfadd   mm1,mm3
%assign %$disp  %$disp + 16
%endrep

        pfadd   mm0,mm1                 ; merge the two accumulators
        pmov    qword [sp(%$p)],mm0     ; spill both lanes to the stack
        femms                           ; leave MMX state before using x87
        fld     dword [sp(%$p)]
        fadd    dword [sp(%$p)+4]       ; st0 = low lane + high lane
endproc
608
609
;-----------------------------------------------------------------------
; scalar16_float32_3DNow (p, q)
;
; 3DNow! dot product of 16 consecutive 32-bit floats at p and q.
; mm0/mm1 are packed accumulators seeded from the first 16 bytes; each
; further 16-byte group is multiplied in mm2/mm3 and added to them.
; The merged pair is written to the stack (over the p/q argument
; slots) and its two lanes are summed with x87; the sum is left on top
; of the FPU stack.
; pmov is a macro defined outside this file (presumably a 64-bit MMX
; move -- confirm in nasm.h).
; Overwrites eax, edx, mm0..mm3.
;-----------------------------------------------------------------------
proc    scalar16_float32_3DNow
%$p     arg     4
%$q     arg     4
        mov     eax,[sp(%$p)]           ; eax = p
        mov     edx,[sp(%$q)]           ; edx = q

        pmov    mm0,qword [eax]
        pmov    mm1,qword [eax+8]
        pfmul   mm0,qword [edx]
        pfmul   mm1,qword [edx+8]

        ; Remaining 16-byte groups, unrolled at assembly time.
%assign %$disp  16
%rep    3
        pmov    mm2,qword [eax + %$disp]
        pmov    mm3,qword [eax + %$disp + 8]
        pfmul   mm2,qword [edx + %$disp]
        pfmul   mm3,qword [edx + %$disp + 8]
        pfadd   mm0,mm2
        pfadd   mm1,mm3
%assign %$disp  %$disp + 16
%endrep

        pfadd   mm0,mm1                 ; merge the two accumulators
        pmov    qword [sp(%$p)],mm0     ; spill both lanes to the stack
        femms                           ; leave MMX state before using x87
        fld     dword [sp(%$p)]
        fadd    dword [sp(%$p)+4]       ; st0 = low lane + high lane
endproc
648
649
;-----------------------------------------------------------------------
; scalar20_float32_3DNow (p, q)
;
; 3DNow! dot product of 20 consecutive 32-bit floats at p and q.
; mm0/mm1 are packed accumulators seeded from the first 16 bytes; each
; further 16-byte group is multiplied in mm2/mm3 and added to them.
; The merged pair is written to the stack (over the p/q argument
; slots) and its two lanes are summed with x87; the sum is left on top
; of the FPU stack.
; pmov is a macro defined outside this file (presumably a 64-bit MMX
; move -- confirm in nasm.h).
; Overwrites eax, edx, mm0..mm3.
;-----------------------------------------------------------------------
proc    scalar20_float32_3DNow
%$p     arg     4
%$q     arg     4
        mov     eax,[sp(%$p)]           ; eax = p
        mov     edx,[sp(%$q)]           ; edx = q

        pmov    mm0,qword [eax]
        pmov    mm1,qword [eax+8]
        pfmul   mm0,qword [edx]
        pfmul   mm1,qword [edx+8]

        ; Remaining 16-byte groups, unrolled at assembly time.
%assign %$disp  16
%rep    4
        pmov    mm2,qword [eax + %$disp]
        pmov    mm3,qword [eax + %$disp + 8]
        pfmul   mm2,qword [edx + %$disp]
        pfmul   mm3,qword [edx + %$disp + 8]
        pfadd   mm0,mm2
        pfadd   mm1,mm3
%assign %$disp  %$disp + 16
%endrep

        pfadd   mm0,mm1                 ; merge the two accumulators
        pmov    qword [sp(%$p)],mm0     ; spill both lanes to the stack
        femms                           ; leave MMX state before using x87
        fld     dword [sp(%$p)]
        fadd    dword [sp(%$p)+4]       ; st0 = low lane + high lane
endproc
695
696
;-----------------------------------------------------------------------
; scalar24_float32_3DNow (p, q)
;
; 3DNow! dot product of 24 consecutive 32-bit floats at p and q.
; mm0/mm1 are packed accumulators seeded from the first 16 bytes; each
; further 16-byte group is multiplied in mm2/mm3 and added to them.
; The merged pair is written to the stack (over the p/q argument
; slots) and its two lanes are summed with x87; the sum is left on top
; of the FPU stack.
; pmov is a macro defined outside this file (presumably a 64-bit MMX
; move -- confirm in nasm.h).
; Overwrites eax, edx, mm0..mm3.
;-----------------------------------------------------------------------
proc    scalar24_float32_3DNow
%$p     arg     4
%$q     arg     4
        mov     eax,[sp(%$p)]           ; eax = p
        mov     edx,[sp(%$q)]           ; edx = q

        pmov    mm0,qword [eax]
        pmov    mm1,qword [eax+8]
        pfmul   mm0,qword [edx]
        pfmul   mm1,qword [edx+8]

        ; Remaining 16-byte groups, unrolled at assembly time.
%assign %$disp  16
%rep    5
        pmov    mm2,qword [eax + %$disp]
        pmov    mm3,qword [eax + %$disp + 8]
        pfmul   mm2,qword [edx + %$disp]
        pfmul   mm3,qword [edx + %$disp + 8]
        pfadd   mm0,mm2
        pfadd   mm1,mm3
%assign %$disp  %$disp + 16
%endrep

        pfadd   mm0,mm1                 ; merge the two accumulators
        pmov    qword [sp(%$p)],mm0     ; spill both lanes to the stack
        femms                           ; leave MMX state before using x87
        fld     dword [sp(%$p)]
        fadd    dword [sp(%$p)+4]       ; st0 = low lane + high lane
endproc
749
;-----------------------------------------------------------------------
; scalar32_float32_3DNow (p, q)
;
; 3DNow! dot product of 32 consecutive 32-bit floats at p and q.
; mm0/mm1 are packed accumulators seeded from the first 16 bytes; each
; further 16-byte group (byte offsets 16..112) is multiplied in
; mm2/mm3 and added to them.  The merged pair is written to the stack
; (over the p/q argument slots) and its two lanes are summed with x87;
; the sum is left on top of the FPU stack.
; pmov is a macro defined outside this file (presumably a 64-bit MMX
; move -- confirm in nasm.h).
; Overwrites eax, edx, mm0..mm3.
;-----------------------------------------------------------------------
proc    scalar32_float32_3DNow
%$p     arg     4
%$q     arg     4
        mov     eax,[sp(%$p)]           ; eax = p
        mov     edx,[sp(%$q)]           ; edx = q

        pmov    mm0,qword [eax]
        pmov    mm1,qword [eax+8]
        pfmul   mm0,qword [edx]
        pfmul   mm1,qword [edx+8]

        ; Remaining 16-byte groups, unrolled at assembly time.
%assign %$disp  16
%rep    7
        pmov    mm2,qword [eax + %$disp]
        pmov    mm3,qword [eax + %$disp + 8]
        pfmul   mm2,qword [edx + %$disp]
        pfmul   mm3,qword [edx + %$disp + 8]
        pfadd   mm0,mm2
        pfadd   mm1,mm3
%assign %$disp  %$disp + 16
%endrep

        pfadd   mm0,mm1                 ; merge the two accumulators
        pmov    qword [sp(%$p)],mm0     ; spill both lanes to the stack
        femms                           ; leave MMX state before using x87
        fld     dword [sp(%$p)]
        fadd    dword [sp(%$p)+4]       ; st0 = low lane + high lane
endproc
816
817
;-----------------------------------------------------------------------
; scalar4n_float32_3DNow (p, q, len)
;
; 3DNow! dot product over len groups of four 32-bit floats: every loop
; pass consumes 16 bytes from each array and decrements len by one.
; mm0/mm1 are packed accumulators seeded from the first group; later
; groups are multiplied in mm2/mm3 and added to them.  The merged pair
; is written to the stack (over the p/q argument slots) and its two
; lanes are summed with x87; the sum is left on top of the FPU stack.
;
; len == 0 returns 0.0 without reading p or q.  Without this guard the
; "dec ecx" below would wrap to 0xFFFFFFFF and the loop would run off
; the end of both arrays.
;
; pmov is a macro defined outside this file (presumably a 64-bit MMX
; move -- confirm in nasm.h).
; Overwrites eax, edx, ecx, mm0..mm3.
;-----------------------------------------------------------------------
proc    scalar4n_float32_3DNow
%$p     arg     4
%$q     arg     4
%$len   arg     4

        mov     eax,[sp(%$p)]           ; eax = p
        mov     edx,[sp(%$q)]           ; edx = q
        mov     ecx,[sp(%$len)]         ; ecx = number of 4-float groups

        test    ecx,ecx
        jnz     .first
        fldz                            ; empty input: no MMX state entered yet
        jmp     .done

.first:
        pmov    mm0,qword [eax]
        pmov    mm1,qword [eax+8]
        pfmul   mm0,qword [edx]
        pfmul   mm1,qword [edx+8]
        dec     ecx
        jz      .ret4                   ; only one group requested

        add     eax,byte 16
        add     edx,byte 16
.lbl4:
        ; Invariant: ecx = groups still to process (> 0).
        pmov    mm2,qword [eax]
        pmov    mm3,qword [eax+8]
        pfmul   mm2,qword [edx]
        pfmul   mm3,qword [edx+8]
        add     eax,byte 16
        add     edx,byte 16
        pfadd   mm0,mm2
        pfadd   mm1,mm3
        dec     ecx
        jnz     .lbl4

.ret4:  pfadd   mm0,mm1                 ; merge the two accumulators
        pmov    qword [sp(%$p)],mm0     ; spill both lanes to the stack
        femms                           ; leave MMX state before using x87
        fld     dword [sp(%$p)]
        fadd    dword [sp(%$p)+4]       ; st0 = low lane + high lane
.done:
endproc
854
855
;-----------------------------------------------------------------------
; scalar1n_float32_3DNow (p, q, len)
;
; There is no 3DNow! implementation for arbitrary element counts; this
; entry point tail-jumps to the x87 routine of the same shape, which
; reads p, q and len from the same argument slots.
; (The jump previously targeted scalar24_float32_i387, which ignores
; len and always processes 24 elements.)
;-----------------------------------------------------------------------
proc    scalar1n_float32_3DNow
        jmp     scalar1n_float32_i387
endproc
859
860
; scalar04_float32_SIMD (p, q)
; No SSE code is provided for the 4-element case: this entry point
; tail-jumps to the x87 routine scalar04_float32_i387.
proc    scalar04_float32_SIMD
        jmp     scalar04_float32_i387
endproc
864
865
;-----------------------------------------------------------------------
; scalar08_float32_SIMD (p, q)
;
; SSE dot product of 8 consecutive 32-bit floats at p and q.
; xmm0 and xmm1 each hold four packed products; they are added
; lane-wise, the four partial sums are written to a 16-byte scratch
; area on the stack and summed left to right with x87.  The sum is
; left on top of the FPU stack.
;
; NOTE(review): p is loaded with movups (no alignment requirement),
; but q is used as a direct mulps memory operand, which SSE requires
; to be 16-byte aligned -- confirm that callers guarantee this.
; Overwrites eax, edx, xmm0, xmm1.
;-----------------------------------------------------------------------
proc    scalar08_float32_SIMD
%$p     arg     4
%$q     arg     4
        mov     edx,[sp(%$q)]           ; edx = q
        mov     eax,[sp(%$p)]           ; eax = p

        movups  xmm1, [eax+16]          ; xmm1 = p[4..7]
        movups  xmm0, [eax]             ; xmm0 = p[0..3]
        mulps   xmm1, [edx+16]          ; xmm1 *= q[4..7]
        mulps   xmm0, [edx]             ; xmm0 *= q[0..3]

        addps   xmm0,xmm1               ; four partial sums in xmm0

        sub     esp,16                  ; 16-byte scratch area
        movups  [esp],xmm0
        fld     dword [esp+ 0]          ; horizontal sum, lane 0 first
        fadd    dword [esp+ 4]
        fadd    dword [esp+ 8]
        fadd    dword [esp+12]
        add     esp,16                  ; release scratch area
endproc
886
887
; scalar12_float32_SIMD (p, q)
; No SSE code is provided for the 12-element case: this entry point
; tail-jumps to the x87 routine scalar12_float32_i387.
proc    scalar12_float32_SIMD
        jmp     scalar12_float32_i387
endproc
891
892
;-----------------------------------------------------------------------
; scalar16_float32_SIMD (p, q)
;
; SSE dot product of 16 consecutive 32-bit floats at p and q.
; Two packed accumulators are used: xmm0 collects the products at byte
; offsets 0 and 32, xmm1 those at offsets 16 and 48.  They are merged,
; the four partial sums are written to a 16-byte scratch area on the
; stack and summed left to right with x87.  The sum is left on top of
; the FPU stack.
;
; NOTE(review): p is loaded with movups (no alignment requirement),
; but q is used as a direct mulps memory operand, which SSE requires
; to be 16-byte aligned -- confirm that callers guarantee this.
; Overwrites eax, edx, xmm0..xmm3.
;-----------------------------------------------------------------------
proc    scalar16_float32_SIMD
%$p     arg     4
%$q     arg     4
        mov     edx,[sp(%$q)]           ; edx = q
        mov     eax,[sp(%$p)]           ; eax = p

        ; accumulator xmm0: offsets 0 and 32
        movups  xmm0, [eax]
        mulps   xmm0, [edx]
        movups  xmm2, [eax+32]
        mulps   xmm2, [edx+32]
        addps   xmm0,xmm2

        ; accumulator xmm1: offsets 16 and 48
        movups  xmm1, [eax+16]
        mulps   xmm1, [edx+16]
        movups  xmm3, [eax+48]
        mulps   xmm3, [edx+48]
        addps   xmm1,xmm3

        addps   xmm0,xmm1               ; merge the two accumulators

        sub     esp,16                  ; 16-byte scratch area
        movups  [esp],xmm0
        fld     dword [esp+ 0]          ; horizontal sum, lane 0 first
        fadd    dword [esp+ 4]
        fadd    dword [esp+ 8]
        fadd    dword [esp+12]
        add     esp,16                  ; release scratch area
endproc
920
921
; scalar20_float32_SIMD (p, q)
; No SSE code is provided for the 20-element case: this entry point
; tail-jumps to the x87 routine scalar20_float32_i387.
proc    scalar20_float32_SIMD
        jmp     scalar20_float32_i387
endproc
925
926
;-----------------------------------------------------------------------
; scalar24_float32_SIMD (p, q)
;
; SSE dot product of 24 consecutive 32-bit floats at p and q.
; xmm0/xmm1 are packed accumulators seeded from the first 32 bytes;
; each further 32-byte group is multiplied in xmm2/xmm3 and added to
; them.  After merging, the four partial sums are written to a 16-byte
; scratch area on the stack and summed left to right with x87.  The
; sum is left on top of the FPU stack.
;
; NOTE(review): p is loaded with movups (no alignment requirement),
; but q is used as a direct mulps memory operand, which SSE requires
; to be 16-byte aligned -- confirm that callers guarantee this.
; Overwrites eax, edx, xmm0..xmm3.
;-----------------------------------------------------------------------
proc    scalar24_float32_SIMD
%$p     arg     4
%$q     arg     4
        mov     eax,[sp(%$p)]           ; eax = p
        mov     edx,[sp(%$q)]           ; edx = q

        movups  xmm0, [eax]
        movups  xmm1, [eax+16]
        mulps   xmm0, [edx]
        mulps   xmm1, [edx+16]

        ; Remaining 32-byte groups, unrolled at assembly time.
%assign %$disp  32
%rep    2
        movups  xmm2, [eax + %$disp]
        movups  xmm3, [eax + %$disp + 16]
        mulps   xmm2, [edx + %$disp]
        mulps   xmm3, [edx + %$disp + 16]
        addps   xmm0,xmm2
        addps   xmm1,xmm3
%assign %$disp  %$disp + 32
%endrep

        addps   xmm0,xmm1               ; merge the two accumulators

        sub     esp,16                  ; 16-byte scratch area
        movups  [esp],xmm0
        fld     dword [esp+ 0]          ; horizontal sum, lane 0 first
        fadd    dword [esp+ 4]
        fadd    dword [esp+ 8]
        fadd    dword [esp+12]
        add     esp,16                  ; release scratch area
endproc
961
962
;-----------------------------------------------------------------------
; scalar32_float32_SIMD (p, q)
;
; SSE dot product of 32 consecutive 32-bit floats at p and q.
; xmm0/xmm1 are packed accumulators seeded from the first 32 bytes;
; each further 32-byte group is multiplied in xmm2/xmm3 and added to
; them.  Unlike the other SIMD routines in this file, the reduction
; folds the upper two lanes onto the lower two inside the SSE unit;
; the remaining pair is stored over the p/q argument slots and summed
; with x87.  The sum is left on top of the FPU stack.
;
; NOTE(review): p is loaded with movups (no alignment requirement),
; but q is used as a direct mulps memory operand, which SSE requires
; to be 16-byte aligned -- confirm that callers guarantee this.
; Overwrites eax, edx, xmm0..xmm3.
;-----------------------------------------------------------------------
proc    scalar32_float32_SIMD
%$p     arg     4
%$q     arg     4
        mov     eax,[sp(%$p)]           ; eax = p
        mov     edx,[sp(%$q)]           ; edx = q

        movups  xmm0, [eax]
        movups  xmm1, [eax+16]
        mulps   xmm0, [edx]
        mulps   xmm1, [edx+16]

        ; Remaining 32-byte groups (offsets 32, 64, 96), unrolled at
        ; assembly time.
%assign %$disp  32
%rep    3
        movups  xmm2, [eax + %$disp]
        movups  xmm3, [eax + %$disp + 16]
        mulps   xmm2, [edx + %$disp]
        mulps   xmm3, [edx + %$disp + 16]
        addps   xmm0,xmm2
        addps   xmm1,xmm3
%assign %$disp  %$disp + 32
%endrep

        addps   xmm0,xmm1               ; merge the two accumulators

        movhlps xmm1,xmm0               ; low half of xmm1 = lanes 2,3 of xmm0
        addps   xmm0,xmm1               ; lanes 0,1 now hold s0+s2 and s1+s3
        movlps  [sp(%$p)],xmm0          ; store those two lanes (8 bytes)
        fld     dword [sp(%$p)]
        fadd    dword [sp(%$p)+4]       ; st0 = (s0+s2) + (s1+s3)
endproc
1011
1012
; scalar4n_float32_SIMD (p, q, len)
; No SSE code is provided for the variable-length (groups of four)
; case: this entry point tail-jumps to scalar4n_float32_i387.
proc    scalar4n_float32_SIMD
        jmp     scalar4n_float32_i387
endproc
1016
1017
; scalar1n_float32_SIMD (p, q, len)
; No SSE code is provided for the arbitrary-length case: this entry
; point tail-jumps to scalar1n_float32_i387.
proc    scalar1n_float32_SIMD
        jmp     scalar1n_float32_i387
endproc
1021
1022; end of scalar.nas
1023