;******************************************************************************
;* 36 point SSE-optimized IMDCT transform
;* Copyright (c) 2011 Vitor Sessak
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
21
; x86inc/x86util macro layer used throughout this file (cglobal, INIT_XMM,
; SWAP, cpuflag, TRANSPOSE4x4PS, mova/movu, 3-operand SSE emulation, ...).
%include "libavutil/x86/x86util.asm"
23
SECTION_RODATA

; Every table below is a whole number of 16-byte rows; the code uses them as
; memory operands of packed SSE instructions.
; NOTE(review): 16-byte alignment of the section start is assumed to come from
; the SECTION_RODATA macro - confirm against x86inc.

; Lane masks for andps (lowest lane listed first): ~0 keeps a lane, 0 clears it.
ps_mask:  dd 0, ~0, ~0, ~0
ps_mask2: dd 0, ~0,  0, ~0
ps_mask3: dd 0,  0,  0, ~0
ps_mask4: dd 0, ~0,  0,  0

; Per-lane coefficients for the "Populate tmp[]" stage of imdct36_float.
; Each pair of lanes shares one value; the magnitudes are cosines of
; multiples of 10 degrees (0.5 = cos 60, 0.866 = cos 30, 0.940 = cos 20, ...).
ps_val1:  dd          -0.5,          -0.5, -0.8660254038, -0.8660254038
ps_val2:  dd           1.0,           1.0,  0.8660254038,  0.8660254038
ps_val3:  dd  0.1736481777,  0.1736481777,  0.3420201433,  0.3420201433
ps_val4:  dd -0.7660444431, -0.7660444431,  0.8660254038,  0.8660254038
ps_val5:  dd -0.9396926208, -0.9396926208, -0.9848077530, -0.9848077530
ps_val6:  dd           0.5,           0.5, -0.6427876097, -0.6427876097
ps_val7:  dd           1.0,           1.0, -0.6427876097, -0.6427876097

; Sign-bit masks for xorps: "p1" lanes are left alone, "m1" lanes are negated.
ps_p1p1m1m1: dd 0,          0, 0x80000000, 0x80000000
ps_p1m1p1m1: dd 0, 0x80000000,          0, 0x80000000

; Scale factors for BUTTERF/BUTTERF2 (non-SSE3 path), one 16-byte row per
; call, selected by the macro's byte-offset argument (0, 16, 32, 48, 64).
; The non-unit magnitudes equal 0.5 / cos((2k+1) * pi / 36).
ps_cosh:       dd 1.0, 0.50190991877167369479,  1.0,  5.73685662283492756461
               dd 1.0, 0.51763809020504152469,  1.0,  1.93185165257813657349
               dd 1.0, 0.55168895948124587824, -1.0, -1.18310079157624925896
               dd 1.0, 0.61038729438072803416, -1.0, -0.87172339781054900991
               dd 1.0, 0.70710678118654752439,  0.0,  0.0

; Same rows as ps_cosh with lanes 1 and 3 negated; used by the SSE3 path,
; where addsubps supplies the alternating sign instead of an xorps.
ps_cosh_sse3:  dd 1.0, -0.50190991877167369479,  1.0, -5.73685662283492756461
               dd 1.0, -0.51763809020504152469,  1.0, -1.93185165257813657349
               dd 1.0, -0.55168895948124587824, -1.0,  1.18310079157624925896
               dd 1.0, -0.61038729438072803416, -1.0,  0.87172339781054900991
               dd 1.0, -0.70710678118654752439,  0.0,  0.0

; Constants broadcast to all four lanes for four_imdct36_float, addressed as
; [costabs + 16*n].  Rows 0..8 are (signed) cosines, rows 9..17 hold the same
; magnitudes as the non-unit entries of ps_cosh, in increasing order.
costabs:  times 4 dd  0.98480773
          times 4 dd  0.93969262
          times 4 dd  0.86602539
          times 4 dd -0.76604444
          times 4 dd -0.64278764
          times 4 dd  0.50000000
          times 4 dd -0.50000000
          times 4 dd -0.34202015
          times 4 dd -0.17364818
          times 4 dd  0.50190992
          times 4 dd  0.51763808
          times 4 dd  0.55168896
          times 4 dd  0.61038726
          times 4 dd  0.70710677
          times 4 dd  0.87172341
          times 4 dd  1.18310082
          times 4 dd  1.93185163
          times 4 dd  5.73685646

; Scales the byte offsets used for stores through the out pointer.
; NOTE(review): presumably the decoder's subband count (out stride in
; floats) - confirm against the C caller.
%define SBLIMIT 32
SECTION .text
75
; PSHUFD dst, src, imm8
; Permute the four dwords of src into dst as selected by imm8.
; Uses pshufd when SSE2 is available and AVX is not; otherwise uses shufps
; with src as both inputs, which picks the same four lanes.
%macro PSHUFD 3
%if cpuflag(sse2) && notcpuflag(avx)
    pshufd %1, %2, %3
%else
    shufps %1, %2, %2, %3
%endif
%endmacro
83
; BUILDINVHIGHLOW dst, x, y
; input  %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4}
; output %1={x3,x4,y1,y2}
; In the non-AVX path %1 is written by movlhps before %2 is read, so %1 must
; not be the same register as %2.
%macro BUILDINVHIGHLOW 3
%if cpuflag(avx)
    shufps %1, %2, %3, 0x4e
%else
    movlhps %1, %3              ; high half of %1 = {y1,y2}
    movhlps %1, %2              ; low  half of %1 = {x3,x4}
%endif
%endmacro
94
; ROTLEFT dst, x, y
; input  %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4}
; output %1={x4,y1,y2,y3}
; SSSE3 and later: a single palignr by 12 bytes.
; Otherwise: build {x3,x4,y1,y2} first, then pick lanes 1,2 of that and
; lanes 1,2 of %3 with shufps 0x99.
%macro ROTLEFT 3
%if cpuflag(ssse3)
    palignr  %1, %3, %2, 12
%else
    BUILDINVHIGHLOW %1, %2, %3
    shufps  %1, %1, %3, 0x99
%endif
%endmacro
105
; INVERTHL dst, src
; input  %2={x1,x2,x3,x4}
; output %1={x3,x4,x1,x2}   (64-bit halves swapped)
%macro INVERTHL 2
%if cpuflag(sse2)
    PSHUFD  %1, %2, 0x4e
%else
    movhlps %1, %2              ; low  half of %1 = high half of %2
    movlhps %1, %2              ; high half of %1 = low  half of %2
%endif
%endmacro
114
; BUTTERF reg, tmp, offset
; Two-level butterfly on %1, in place; %2 is clobbered as scratch.
;   1. {a0,a1,a2,a3} -> {a0+a2, a1+a3, a0-a2, a1-a3}
;   2. scale each lane by the coefficient row at byte offset %3
;   3. {p0,p1,p2,p3} -> {p0+p1, p0-p1, p2+p3, p2-p3}
; The SSE3 path reads ps_cosh_sse3 (odd lanes pre-negated) and lets addsubps
; supply the alternating sign; the fallback reads ps_cosh and flips the odd
; lanes with an xorps before the add.
%macro BUTTERF 3
    INVERTHL %2, %1
    xorps    %1, [ps_p1p1m1m1]
    addps    %1, %2
%if cpuflag(sse3)
    mulps    %1, %1, [ps_cosh_sse3 + %3]
    PSHUFD   %2, %1, 0xb1
    addsubps %1, %1, %2
%else
    mulps    %1, [ps_cosh + %3]
    PSHUFD   %2, %1, 0xb1
    xorps    %1, [ps_p1m1p1m1]
    addps    %1, %2
%endif
%endmacro
130
; BUTTERF2 reg, tmp, offset
; Reduced form of BUTTERF: there is no cross-half add/sub stage, and the
; shuffle (0xe1) swaps only lanes 0 and 1.  After scaling by the coefficient
; row at byte offset %3, lanes 0/1 become {p0+p1, p0-p1}.  %2 is clobbered.
; It is invoked with offset 64 only, a row whose two high coefficients are 0.
%macro BUTTERF2 3
%if cpuflag(sse3)
    mulps    %1, %1, [ps_cosh_sse3 + %3]
    PSHUFD   %2, %1, 0xe1
    addsubps %1, %1, %2
%else
    mulps    %1, [ps_cosh + %3]
    PSHUFD   %2, %1, 0xe1
    xorps    %1, [ps_p1m1p1m1]
    addps    %1, %2
%endif
%endmacro
143
; STORE reg, tmp, base, stride
; Scatter the four floats of %1 to [%3], [%3 + %4], [%3 + 2*%4], [%3 + 3*%4]
; (%4 is a byte stride).
; SSE4 path: %1 is preserved and %2 is unused.
; Fallback : %1 is left pair-swapped and %2 is clobbered.
%macro STORE 4
%if cpuflag(sse4)
    movss     [%3       ], %1
    extractps dword [%3 +   %4], %1, 1
    extractps dword [%3 + 2*%4], %1, 2
    extractps dword [%3 + 3*%4], %1, 3
%else
    movhlps %2, %1              ; %2 low half = {lane2, lane3}
    movss   [%3       ], %1
    movss   [%3 + 2*%4], %2
    shufps  %1, %1, 0xb1        ; swap neighbours: {lane1, lane0, lane3, lane2}
    movss   [%3 +   %4], %1
    movhlps %2, %1              ; %2 low half = {lane3, lane2}
    movss   [%3 + 3*%4], %2
%endif
%endmacro
160
; LOAD dst, tmp, base, stride
; Gather the floats at [%3], [%3 + %4], [%3 + 2*%4], [%3 + 3*%4] into %1
; (%4 is a byte stride).  %2 is clobbered.
; Each access is a 64-bit load, so the float following every gathered element
; is read as well and then dropped by the final shufps.
%macro LOAD 4
    movlps  %1, [%3       ]
    movhps  %1, [%3 +   %4]
    movlps  %2, [%3 + 2*%4]
    movhps  %2, [%3 + 3*%4]
    shufps  %1, %2, 0x88        ; keep lanes 0 and 2 of %1 and of %2
%endmacro
168
; LOADA64 dst, addr
; Load 16 bytes from %2 into %1 using forms that do not need 16-byte
; alignment: one movu under AVX, two 64-bit half loads otherwise.
; NOTE(review): the name suggests the source is only guaranteed to be 64-bit
; aligned - confirm against the callers.
%macro LOADA64 2
%if cpuflag(avx)
   movu     %1, [%2]
%else
   movlps   %1, [%2]
   movhps   %1, [%2 + 8]
%endif
%endmacro
177
; DEFINE_IMDCT
; Emits imdct36_float for the instruction set chosen by the preceding
; INIT_XMM.  Arguments, in cglobal order: out, buf, in, win.
;   in  : 18 consecutive floats are read (bytes 0..71); never written here.
;   win : floats 0..39 are read, 16 bytes at a time, as mulps memory operands.
;   buf : float indices 0, 4, ..., 68 are read (via LOAD/movss), added to the
;         windowed result, and then overwritten with the next windowed values.
;   out : one float is written at each byte offset k*4*SBLIMIT, k = 0..17.
; m8 is used only when ARCH_X86_64 is set; otherwise the value is recomputed.
%macro DEFINE_IMDCT 0
cglobal imdct36_float, 4,4,9, out, buf, in, win

    ; for(i=17;i>=1;i--) in[i] += in[i-1];
    LOADA64 m0, inq
    LOADA64 m1, inq + 16

    ROTLEFT m5, m0, m1

    PSHUFD  m6, m0, 0x93
    andps   m6, m6, [ps_mask]
    addps   m0, m0, m6

    LOADA64 m2, inq + 32

    ROTLEFT m7, m1, m2

    addps   m1, m1, m5
    LOADA64 m3, inq + 48

    ROTLEFT m5, m2, m3

    xorps   m4, m4, m4
    movlps  m4, [inq+64]
    BUILDINVHIGHLOW m6, m3, m4
    shufps  m6, m6, m4, 0xa9

    addps   m4, m4, m6
    addps   m2, m2, m7
    addps   m3, m3, m5

    ; for(i=17;i>=3;i-=2) in[i] += in[i-2];
    movlhps m5, m5, m0
    andps   m5, m5, [ps_mask3]

    BUILDINVHIGHLOW m7, m0, m1
    andps   m7, m7, [ps_mask2]

    addps   m0, m0, m5

    BUILDINVHIGHLOW m6, m1, m2
    andps   m6, m6, [ps_mask2]

    addps  m1, m1, m7

    BUILDINVHIGHLOW m7, m2, m3
    andps   m7, m7, [ps_mask2]

    addps   m2, m2, m6

    movhlps m6, m6, m3
    andps   m6, m6, [ps_mask4]

    addps  m3, m3, m7
    addps  m4, m4, m6

    ; Populate tmp[]
    movlhps m6, m1, m5    ; zero out high values
    subps   m6, m6, m4

    subps  m5, m0, m3

    ; x86-64: park m0 - m3 in m8 so it can be reused below;
    ; x86-32: it is recomputed after m5 has served as scratch.
%if ARCH_X86_64
    SWAP   m5, m8
%endif

    mulps  m7, m2, [ps_val1]

%if ARCH_X86_64
    mulps  m5, m8, [ps_val2]
%else
    mulps  m5, m5, [ps_val2]
%endif
    addps  m7, m7, m5

    mulps  m5, m6, [ps_val1]
    subps  m7, m7, m5

%if ARCH_X86_64
    SWAP   m5, m8
%else
    subps  m5, m0, m3
%endif

    subps  m5, m5, m6
    addps  m5, m5, m2

    shufps m6, m4, m3, 0xe4
    subps  m6, m6, m2
    mulps  m6, m6, [ps_val3]

    addps  m4, m4, m1
    mulps  m4, m4, [ps_val4]

    shufps m1, m1, m0, 0xe4
    addps  m1, m1, m2
    mulps  m1, m1, [ps_val5]

    mulps  m3, m3, [ps_val6]
    mulps  m0, m0, [ps_val7]
    addps  m0, m0, m3

    xorps  m2, m1, [ps_p1p1m1m1]
    subps  m2, m2, m4
    addps  m2, m2, m0

    addps  m3, m4, m0
    subps  m3, m3, m6
    xorps  m3, m3, [ps_p1p1m1m1]

    shufps m0, m0, m4, 0xe4
    subps  m0, m0, m1
    addps  m0, m0, m6

    BUILDINVHIGHLOW m4, m2, m3
    shufps  m3, m3, m2, 0x4e

    ; we have tmp = {SWAPLH(m0), SWAPLH(m7), m3, m4, m5}

    BUTTERF  m0, m1, 0
    BUTTERF  m7, m2, 16
    BUTTERF  m3, m6, 32
    BUTTERF  m4, m1, 48
    BUTTERF2 m5, m1, 64

    ; permutation:
    ; m0    0  1  2  3     =>     2  6 10 14   m1
    ; m7    4  5  6  7     =>     3  7 11 15   m2
    ; m3    8  9 10 11     =>    17 13  9  5   m3
    ; m4   12 13 14 15     =>    16 12  8  4   m5
    ; m5   16 17 xx xx     =>     0  1 xx xx   m0

    unpckhps m1, m0, m7
    unpckhps m6, m3, m4
    movhlps  m2, m6, m1
    movlhps  m1, m1, m6

    unpcklps m5, m5, m4
    unpcklps m3, m3, m7
    movhlps  m4, m3, m5
    movlhps  m5, m5, m3
    SWAP m4, m3
    ; permutation done

    ; Windowing and overlap-add: out[] = value * win[] + buf[]
    PSHUFD  m6, m2, 0xb1
    movss   m4, [bufq + 4*68]
    movss   m7, [bufq + 4*64]
    unpcklps  m7, m7, m4
    mulps   m6, m6, [winq + 16*4]
    addps   m6, m6, m7
    movss   [outq + 64*SBLIMIT], m6
    shufps  m6, m6, m6, 0xb1
    movss   [outq + 68*SBLIMIT], m6

    mulps   m6, m3, [winq + 4*4]
    LOAD    m4, m7, bufq + 4*16, 16
    addps   m6, m6, m4
    STORE   m6, m7, outq + 16*SBLIMIT, 4*SBLIMIT

    shufps  m4, m0, m3, 0xb5
    mulps   m4, m4, [winq + 8*4]
    LOAD    m7, m6, bufq + 4*32, 16
    addps   m4, m4, m7
    STORE   m4, m6, outq + 32*SBLIMIT, 4*SBLIMIT

    shufps  m3, m3, m2, 0xb1
    mulps   m3, m3, [winq + 12*4]
    LOAD    m7, m6, bufq + 4*48, 16
    addps   m3, m3, m7
    STORE   m3, m7, outq + 48*SBLIMIT, 4*SBLIMIT

    mulps   m2, m2, [winq]
    LOAD    m6, m7, bufq, 16
    addps   m2, m2, m6
    STORE   m2, m7, outq, 4*SBLIMIT

    ; Refill buf[] with the remaining windowed values (win floats 20..39)
    mulps    m4, m1, [winq + 20*4]
    STORE    m4, m7, bufq, 16

    mulps    m3, m5, [winq + 24*4]
    STORE    m3, m7, bufq + 4*16, 16

    shufps   m0, m0, m5, 0xb0
    mulps    m0, m0, [winq + 28*4]
    STORE    m0, m7, bufq + 4*32, 16

    shufps   m5, m5, m1, 0xb1
    mulps    m5, m5, [winq + 32*4]
    STORE    m5, m7, bufq + 4*48, 16

    shufps   m1, m1, m1, 0xb1
    mulps    m1, m1, [winq + 36*4]
    movss    [bufq + 4*64], m1
    shufps   m1, m1, 0xb1
    movss    [bufq + 4*68], m1
    RET
%endmacro
375
; Instantiate imdct36_float once per instruction set; the cpuflag() tests in
; the helper macros pick the matching code path for each expansion.
INIT_XMM sse2
DEFINE_IMDCT

INIT_XMM sse3
DEFINE_IMDCT

INIT_XMM ssse3
DEFINE_IMDCT

; The AVX version is built only when the assembler supports AVX.
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEFINE_IMDCT
%endif
389
INIT_XMM sse

; Register-pressure helpers for four_imdct36_float, which names values 8..15
; in addition to m0..m7.
;   SPILL   r, n : move the value in m<r> into slot n
;   UNSPILL r, n : bring slot n back into m<r>
;   SPILLED(n)   : operand that names slot n
; x86-64: slots are the registers m8..m15 and SPILL/UNSPILL are plain SWAPs
;         (register renames at assembly time).
; x86-32: slot n lives at tmpq + 32*4 + (n-8)*16, directly after the 128
;         bytes of tmp that the function addresses as [tmpq+4*k]; movaps is
;         used, so tmp must be 16-byte aligned.
%if ARCH_X86_64
%define SPILL SWAP
%define UNSPILL SWAP
%define SPILLED(x) m %+ x
%else
%define SPILLED(x) [tmpq+(x-8)*16 + 32*4]
%macro SPILL 2 ; xmm#, mempos
    movaps SPILLED(%2), m%1
%endmacro
%macro UNSPILL 2 ; xmm#, mempos
    movaps m%1, SPILLED(%2)
%endmacro
%endif
405
; DEFINE_FOUR_IMDCT
; Emits four_imdct36_float for the instruction set chosen by the preceding
; INIT_XMM.  Four 18-float input blocks are processed side by side, one block
; per SIMD lane.  Arguments, in cglobal order: out, buf, in, win, tmp.
;   in  : four blocks of 18 floats, 72 bytes apart.  Blocks 0 and 2 are read
;         with mova, blocks 1 and 3 with movu (72 is not a multiple of 16).
;   win : 16-byte rows at byte offsets 4*k are used as mulps operands.
;   buf : 16-byte rows at byte offsets 4*k, k = 0, 4, ..., 68, are added to
;         the windowed result and then overwritten.
;   out : 16-byte rows are stored at byte offsets k*SBLIMIT, k = 0, 4, ..., 68.
;   tmp : scratch accessed with mova; bytes 0..127 are used directly, and
;         SPILL/UNSPILL/SPILLED may use more (they map to registers m8..m15
;         on x86-64).
%macro DEFINE_FOUR_IMDCT 0
cglobal four_imdct36_float, 5,5,16, out, buf, in, win, tmp
    ; elements 16 and 17 of each block: m0 = {in[16]} x4, m5 = {in[17]} x4
    movlps  m0, [inq+64]
    movhps  m0, [inq+64 +   72]
    movlps  m3, [inq+64 + 2*72]
    movhps  m3, [inq+64 + 3*72]

    shufps  m5, m0, m3, 0xdd
    shufps  m0, m0, m3, 0x88

    ; elements 12..15 of each block, transposed to one element per register
    mova     m1, [inq+48]
    movu     m6, [inq+48 +   72]
    mova     m7, [inq+48 + 2*72]
    movu     m3, [inq+48 + 3*72]

    TRANSPOSE4x4PS 1, 6, 7, 3, 4

    addps   m4, m6, m7
    mova    [tmpq+4*28], m4

    addps    m7, m3
    addps    m6, m1
    addps    m3, m0
    addps    m0, m5
    addps    m0, m7
    addps    m7, m6
    mova    [tmpq+4*12], m7
    SPILL   3, 12

    ; elements 8..11
    mova     m4, [inq+32]
    movu     m5, [inq+32 +   72]
    mova     m2, [inq+32 + 2*72]
    movu     m7, [inq+32 + 3*72]

    TRANSPOSE4x4PS 4, 5, 2, 7, 3

    addps   m1, m7
    SPILL   1, 11

    addps   m3, m5, m2
    SPILL   3, 13

    addps    m7, m2
    addps    m5, m4
    addps    m6, m7
    mova    [tmpq], m6
    addps   m7, m5
    mova    [tmpq+4*16], m7

    ; elements 4..7
    mova    m2, [inq+16]
    movu    m7, [inq+16 +   72]
    mova    m1, [inq+16 + 2*72]
    movu    m6, [inq+16 + 3*72]

    TRANSPOSE4x4PS 2, 7, 1, 6, 3

    addps   m4, m6
    addps   m6, m1
    addps   m1, m7
    addps   m7, m2
    addps   m5, m6
    SPILL   5, 15
    addps   m6, m7
    mulps   m6, [costabs + 16*2]
    mova    [tmpq+4*8], m6
    SPILL   1, 10
    SPILL   0, 14

    ; elements 0..3
    mova    m1, [inq]
    movu    m6, [inq +   72]
    mova    m3, [inq + 2*72]
    movu    m5, [inq + 3*72]

    TRANSPOSE4x4PS 1, 6, 3, 5, 0

    addps    m2, m5
    addps    m5, m3
    addps    m7, m5
    addps    m3, m6
    addps    m6, m1
    SPILL    7, 8
    addps    m5, m6
    SPILL    6, 9
    addps    m6, m4, SPILLED(12)
    subps    m6, m2
    UNSPILL  7, 11
    SPILL    5, 11
    subps    m5, m1, m7
    mulps    m7, [costabs + 16*5]
    addps    m7, m1
    mulps    m0, m6, [costabs + 16*6]
    addps    m0, m5
    mova     [tmpq+4*24], m0
    addps    m6, m5
    mova     [tmpq+4*4], m6
    addps    m6, m4, m2
    mulps    m6, [costabs + 16*1]
    subps    m4, SPILLED(12)
    mulps    m4, [costabs + 16*8]
    addps    m2, SPILLED(12)
    mulps    m2, [costabs + 16*3]
    subps    m5, m7, m6
    subps    m5, m2
    addps    m6, m7
    addps    m6, m4
    addps    m7, m2
    subps    m7, m4
    mova     [tmpq+4*20], m7
    mova     m2, [tmpq+4*28]
    mova     [tmpq+4*28], m5
    UNSPILL  7, 13
    subps    m5, m7, m2
    mulps    m5, [costabs + 16*7]
    UNSPILL  1, 10
    mulps    m1, [costabs + 16*2]
    addps    m4, m3, m2
    mulps    m4, [costabs + 16*4]
    addps    m2, m7
    addps    m7, m3
    mulps    m7, [costabs]
    subps    m3, m2
    mulps    m3, [costabs + 16*2]
    addps    m2, m7, m5
    addps    m2, m1
    SPILL    2, 10
    addps    m7, m4
    subps    m7, m1
    SPILL    7, 12
    subps    m5, m4
    subps    m5, m1
    UNSPILL  0, 14
    SPILL    5, 13
    addps    m1, m0, SPILLED(15)
    subps    m1, SPILLED(8)
    mova     m4, [costabs + 16*5]
    mulps    m4, [tmpq]
    UNSPILL  2, 9
    addps    m4, m2
    subps    m2, [tmpq]
    mulps    m5, m1, [costabs + 16*6]
    addps    m5, m2
    SPILL    5, 9
    addps    m2, m1
    SPILL    2, 14
    UNSPILL  5, 15
    subps    m7, m5, m0
    addps    m5, SPILLED(8)
    mulps    m5, [costabs + 16*1]
    mulps    m7, [costabs + 16*8]
    addps    m0, SPILLED(8)
    mulps    m0, [costabs + 16*3]
    subps    m2, m4, m5
    subps    m2, m0
    SPILL    2, 15
    addps    m5, m4
    addps    m5, m7
    addps    m4, m0
    subps    m4, m7
    SPILL    4, 8
    mova     m7, [tmpq+4*16]
    mova     m2, [tmpq+4*12]
    addps    m0, m7, m2
    subps    m0, SPILLED(11)
    mulps    m0, [costabs + 16*2]
    addps    m4, m7, SPILLED(11)
    mulps    m4, [costabs]
    subps    m7, m2
    mulps    m7, [costabs + 16*7]
    addps    m2, SPILLED(11)
    mulps    m2, [costabs + 16*4]
    addps    m1, m7, [tmpq+4*8]
    addps    m1, m4
    addps    m4, m2
    subps    m4, [tmpq+4*8]
    SPILL    4, 11
    subps    m7, m2
    subps    m7, [tmpq+4*8]

    ; Final butterflies, windowing and overlap-add.  Each group combines a
    ; pair of intermediate values, scales by a costabs row (9..17), stores
    ; value*win + buf to out, and writes value*win back to buf.
    addps    m4, m6, SPILLED(10)
    subps    m6, SPILLED(10)
    addps    m2, m5, m1
    mulps    m2, [costabs + 16*9]
    subps    m5, m1
    mulps    m5, [costabs + 16*17]
    subps    m1, m4, m2
    addps    m4, m2
    mulps    m2, m1, [winq+4*36]
    addps    m2, [bufq+4*36]
    mova     [outq+36*SBLIMIT], m2
    mulps    m1, [winq+4*32]
    addps    m1, [bufq+4*32]
    mova     [outq+32*SBLIMIT], m1
    mulps    m1, m4, [winq+4*116]
    mova     [bufq+4*36], m1
    mulps    m4, [winq+4*112]
    mova     [bufq+4*32], m4
    addps    m2, m6, m5
    subps    m6, m5
    mulps    m1, m6, [winq+4*68]
    addps    m1, [bufq+4*68]
    mova     [outq+68*SBLIMIT], m1
    mulps    m6, [winq]
    addps    m6, [bufq]
    mova     [outq], m6
    mulps    m1, m2, [winq+4*148]
    mova     [bufq+4*68], m1
    mulps    m2, [winq+4*80]
    mova     [bufq], m2
    addps    m5, m3, [tmpq+4*24]
    mova     m2, [tmpq+4*24]
    subps    m2, m3
    mova     m1, SPILLED(9)
    subps    m1, m0
    mulps    m1, [costabs + 16*10]
    addps    m0, SPILLED(9)
    mulps    m0, [costabs + 16*16]
    addps    m6, m5, m1
    subps    m5, m1
    mulps    m3, m5, [winq+4*40]
    addps    m3, [bufq+4*40]
    mova     [outq+40*SBLIMIT], m3
    mulps    m5, [winq+4*28]
    addps    m5, [bufq+4*28]
    mova     [outq+28*SBLIMIT], m5
    mulps    m1, m6, [winq+4*120]
    mova     [bufq+4*40], m1
    mulps    m6, [winq+4*108]
    mova     [bufq+4*28], m6
    addps    m1, m2, m0
    subps    m2, m0
    mulps    m5, m2, [winq+4*64]
    addps    m5, [bufq+4*64]
    mova     [outq+64*SBLIMIT], m5
    mulps    m2, [winq+4*4]
    addps    m2, [bufq+4*4]
    mova     [outq+4*SBLIMIT], m2
    mulps    m0, m1, [winq+4*144]
    mova     [bufq+4*64], m0
    mulps    m1, [winq+4*84]
    mova     [bufq+4*4], m1
    mova     m1, [tmpq+4*28]
    mova     m5, m1
    addps    m1, SPILLED(13)
    subps    m5, SPILLED(13)
    UNSPILL  3, 15
    addps    m2, m7, m3
    mulps    m2, [costabs + 16*11]
    subps    m3, m7
    mulps    m3, [costabs + 16*15]
    addps    m0, m2, m1
    subps    m1, m2
    SWAP     m0, m2
    mulps    m6, m1, [winq+4*44]
    addps    m6, [bufq+4*44]
    mova     [outq+44*SBLIMIT], m6
    mulps    m1, [winq+4*24]
    addps    m1, [bufq+4*24]
    mova     [outq+24*SBLIMIT], m1
    mulps    m0, m2, [winq+4*124]
    mova     [bufq+4*44], m0
    mulps    m2, [winq+4*104]
    mova     [bufq+4*24], m2
    addps    m0, m5, m3
    subps    m5, m3
    mulps    m1, m5, [winq+4*60]
    addps    m1, [bufq+4*60]
    mova     [outq+60*SBLIMIT], m1
    mulps    m5, [winq+4*8]
    addps    m5, [bufq+4*8]
    mova     [outq+8*SBLIMIT], m5
    mulps    m1, m0, [winq+4*140]
    mova     [bufq+4*60], m1
    mulps    m0, [winq+4*88]
    mova     [bufq+4*8], m0
    mova     m1, [tmpq+4*20]
    addps    m1, SPILLED(12)
    mova     m2, [tmpq+4*20]
    subps    m2, SPILLED(12)
    UNSPILL  7, 8
    subps    m0, m7, SPILLED(11)
    addps    m7, SPILLED(11)
    mulps    m4, m7, [costabs + 16*12]
    mulps    m0, [costabs + 16*14]
    addps    m5, m1, m4
    subps    m1, m4
    mulps    m7, m1, [winq+4*48]
    addps    m7, [bufq+4*48]
    mova     [outq+48*SBLIMIT], m7
    mulps    m1, [winq+4*20]
    addps    m1, [bufq+4*20]
    mova     [outq+20*SBLIMIT], m1
    mulps    m1, m5, [winq+4*128]
    mova     [bufq+4*48], m1
    mulps    m5, [winq+4*100]
    mova     [bufq+4*20], m5
    addps    m6, m2, m0
    subps    m2, m0
    mulps    m1, m2, [winq+4*56]
    addps    m1, [bufq+4*56]
    mova     [outq+56*SBLIMIT], m1
    mulps    m2, [winq+4*12]
    addps    m2, [bufq+4*12]
    mova     [outq+12*SBLIMIT], m2
    mulps    m0, m6, [winq+4*136]
    mova    [bufq+4*56], m0
    mulps    m6, [winq+4*92]
    mova     [bufq+4*12], m6
    UNSPILL  0, 14
    mulps    m0, [costabs + 16*13]
    mova     m3, [tmpq+4*4]
    addps    m2, m0, m3
    subps    m3, m0
    mulps    m0, m3, [winq+4*52]
    addps    m0, [bufq+4*52]
    mova     [outq+52*SBLIMIT], m0
    mulps    m3, [winq+4*16]
    addps    m3, [bufq+4*16]
    mova     [outq+16*SBLIMIT], m3
    mulps    m0, m2, [winq+4*132]
    mova     [bufq+4*52], m0
    mulps    m2, [winq+4*96]
    mova     [bufq+4*16], m2
    RET
%endmacro
729
; Instantiate four_imdct36_float for SSE and, when the assembler supports
; it, for AVX.
INIT_XMM sse
DEFINE_FOUR_IMDCT

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEFINE_FOUR_IMDCT
%endif
737