;*****************************************************************************
;* MMX/SSE2-optimized H.264 iDCT
;*****************************************************************************
;* Copyright (C) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2003-2008 x264 project
;*
;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
;*          Loren Merritt <lorenm@u.washington.edu>
;*          Holger Lubitz <hal@duncan.ol.sub.de>
;*          Min Chen <chenm001.163.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

scan8_mem: db  4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
           db  6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
           db  4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
           db  6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
           db  4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
           db  6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
           db  4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
           db  6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
           db  4+11*8, 5+11*8, 4+12*8, 5+12*8
           db  6+11*8, 7+11*8, 6+12*8, 7+12*8
           db  4+13*8, 5+13*8, 4+14*8, 5+14*8
           db  6+13*8, 7+13*8, 6+14*8, 7+14*8
%ifdef PIC
%define npicregs 1
%define scan8 picregq
%else
%define npicregs 0
%define scan8 scan8_mem
%endif

cextern pw_32
cextern pw_1

SECTION .text

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
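; Note: 4x4 iDCT + add. One 1D transform over the rows, a transpose,
; a +32 rounding bias, a second 1D transform, then each result is shifted
; right by 6 and added to the destination pixels with unsigned saturation
; (STORE_DIFFx2); the coefficient block is cleared to zero afterwards.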
%macro IDCT4_ADD 3
    ; Load dct coeffs
    movq         m0, [%2]
    movq         m1, [%2+8]
    movq         m2, [%2+16]
    movq         m3, [%2+24]

    IDCT4_1D      w, 0, 1, 2, 3, 4, 5
    mova         m6, [pw_32]
%if mmsize == 8
    TRANSPOSE4x4W 0, 1, 2, 3, 4
%else
    punpcklwd m0, m1
    punpcklwd m2, m3
    SBUTTERFLY dq, 0, 2, 4
    MOVHL m1, m0
    MOVHL m3, m2
%endif
    paddw        m0, m6
    IDCT4_1D      w, 0, 1, 2, 3, 4, 5
    pxor         m7, m7
    movq    [%2+ 0], m7
    movq    [%2+ 8], m7
    movq    [%2+16], m7
    movq    [%2+24], m7

    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, %1, %3
%endmacro

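; One 8-point 1D H.264 transform over m0-m7. Rows 0 and 4 are passed as the
; memory operands %1/%2 and loaded late, so the whole transform fits in
; eight registers; the final SWAP puts the outputs back in m0-m7 order.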
%macro IDCT8_1D 2
    psraw        m0, m1, 1
    SWAP 0, 1
    psraw        m4, m5, 1
    paddw        m4, m5
    paddw        m1, m0
    paddw        m4, m7
    paddw        m1, m5
    psubw        m4, m0
    paddw        m1, m3

    psubw        m0, m3
    psubw        m5, m3
    psraw        m3, 1
    paddw        m0, m7
    psubw        m5, m7
    psraw        m7, 1
    psubw        m0, m3
    psubw        m5, m7

    psraw        m7, m1, 2
    SWAP 7,1
    psraw        m3, m4, 2
    paddw        m3, m0
    psraw        m0, 2
    paddw        m1, m5
    psraw        m5, 2
    psubw        m0, m4
    psubw        m7, m5

    psraw        m5, m6, 1
    SWAP 5,6
    psraw        m4, m2, 1
    paddw        m6, m2
    psubw        m4, m5

    mova         m2, %1
    mova         m5, %2
    SUMSUB_BA    w, 5, 2
    SUMSUB_BA    w, 6, 5
    SUMSUB_BA    w, 4, 2
    SUMSUB_BA    w, 7, 6
    SUMSUB_BA    w, 0, 4
    SUMSUB_BA    w, 3, 2
    SUMSUB_BA    w, 1, 5
    SWAP         7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
%endmacro

%macro IDCT8_1D_FULL 1
    mova         m7, [%1+112]
    mova         m6, [%1+ 96]
    mova         m5, [%1+ 80]
    mova         m3, [%1+ 48]
    mova         m2, [%1+ 32]
    mova         m1, [%1+ 16]
    IDCT8_1D   [%1], [%1+ 64]
%endmacro

; %1=int16_t *block, %2=int16_t *dstblock
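; First pass of the MMX 8x8 iDCT: one 1D transform on a 4-coefficient-wide
; slice of %1, then the 8x4 result is transposed (as two 4x4 quarters) into
; the scratch buffer %2. Intended to be invoked once per half of the block.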
%macro IDCT8_ADD_MMX_START 2
    IDCT8_1D_FULL %1
    mova       [%1], m7
    TRANSPOSE4x4W 0, 1, 2, 3, 7
    mova         m7, [%1]
    mova    [%2   ], m0
    mova    [%2+16], m1
    mova    [%2+32], m2
    mova    [%2+48], m3
    TRANSPOSE4x4W 4, 5, 6, 7, 3
    mova    [%2+ 8], m4
    mova    [%2+24], m5
    mova    [%2+40], m6
    mova    [%2+56], m7
%endmacro

; %1=uint8_t *dst, %2=int16_t *dstblock, %3=int stride, [%4=int16_t *block to clear]
%macro IDCT8_ADD_MMX_END 3-4
    IDCT8_1D_FULL %2
    mova    [%2   ], m5
    mova    [%2+16], m6
    mova    [%2+32], m7

    pxor         m7, m7
%if %0 == 4
    movq   [%4+  0], m7
    movq   [%4+  8], m7
    movq   [%4+ 16], m7
    movq   [%4+ 24], m7
    movq   [%4+ 32], m7
    movq   [%4+ 40], m7
    movq   [%4+ 48], m7
    movq   [%4+ 56], m7
    movq   [%4+ 64], m7
    movq   [%4+ 72], m7
    movq   [%4+ 80], m7
    movq   [%4+ 88], m7
    movq   [%4+ 96], m7
    movq   [%4+104], m7
    movq   [%4+112], m7
    movq   [%4+120], m7
%endif
    STORE_DIFFx2 m0, m1, m5, m6, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m5, m6, m7, 6, %1, %3
    mova         m0, [%2   ]
    mova         m1, [%2+16]
    mova         m2, [%2+32]
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m4, m0, m5, m6, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m1, m2, m5, m6, m7, 6, %1, %3
%endmacro

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride, %4=tmp gpr
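; Full 8x8 iDCT + add in XMM registers: 1D transform, 8x8 transpose (using
; m8/m9 on x86-64, or two 16-byte spills into the coefficient buffer on
; x86-32), +32 bias, second 1D transform, then the rows are shifted right
; by 6 and added to dst with saturation while the coefficients are cleared.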
%macro IDCT8_ADD_SSE 4
    IDCT8_1D_FULL %2
%if ARCH_X86_64
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%else
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%2], [%2+16]
%endif
    paddw        m0, [pw_32]

%if ARCH_X86_64 == 0
    mova    [%2   ], m0
    mova    [%2+16], m4
    IDCT8_1D   [%2], [%2+ 16]
    mova    [%2   ], m6
    mova    [%2+16], m7
%else
    SWAP          0, 8
    SWAP          4, 9
    IDCT8_1D     m8, m9
    SWAP          6, 8
    SWAP          7, 9
%endif

    pxor         m7, m7
    lea          %4, [%3*3]
    STORE_DIFF   m0, m6, m7, [%1     ]
    STORE_DIFF   m1, m6, m7, [%1+%3  ]
    STORE_DIFF   m2, m6, m7, [%1+%3*2]
    STORE_DIFF   m3, m6, m7, [%1+%4  ]
%if ARCH_X86_64 == 0
    mova         m0, [%2   ]
    mova         m1, [%2+16]
%else
    SWAP          0, 8
    SWAP          1, 9
%endif
    mova   [%2+  0], m7
    mova   [%2+ 16], m7
    mova   [%2+ 32], m7
    mova   [%2+ 48], m7
    mova   [%2+ 64], m7
    mova   [%2+ 80], m7
    mova   [%2+ 96], m7
    mova   [%2+112], m7
    lea          %1, [%1+%3*4]
    STORE_DIFF   m4, m6, m7, [%1     ]
    STORE_DIFF   m5, m6, m7, [%1+%3  ]
    STORE_DIFF   m0, m6, m7, [%1+%3*2]
    STORE_DIFF   m1, m6, m7, [%1+%4  ]
%endmacro

INIT_XMM sse2
; void ff_h264_idct8_add_8_sse2(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_add_8, 3, 4, 10
    movsxdifnidn  r2, r2d
    IDCT8_ADD_SSE r0, r1, r2, r3
    RET

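; DC-only add: the DC coefficient is rounded as (dc + 32) >> 6 and broadcast;
; m0 keeps the positive part and m1 the negated part (packuswb clamps the
; other one to zero), so DC_ADD_MMXEXT_OP can apply a signed offset to 8-bit
; pixels with paddusb/psubusb. E.g. dc = 100 gives (100 + 32) >> 6 = 2,
; so 2 is added to every covered pixel.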
%macro DC_ADD_MMXEXT_INIT 2
    add          %1, 32
    sar          %1, 6
    movd         m0, %1d
    lea          %1, [%2*3]
    pshufw       m0, m0, 0
    pxor         m1, m1
    psubw        m1, m0
    packuswb     m0, m0
    packuswb     m1, m1
%endmacro

%macro DC_ADD_MMXEXT_OP 4
    %1           m2, [%2     ]
    %1           m3, [%2+%3  ]
    %1           m4, [%2+%3*2]
    %1           m5, [%2+%4  ]
    paddusb      m2, m0
    paddusb      m3, m0
    paddusb      m4, m0
    paddusb      m5, m0
    psubusb      m2, m1
    psubusb      m3, m1
    psubusb      m4, m1
    psubusb      m5, m1
    %1    [%2     ], m2
    %1    [%2+%3  ], m3
    %1    [%2+%3*2], m4
    %1    [%2+%4  ], m5
%endmacro

INIT_MMX mmxext
%if ARCH_X86_64
; void ff_h264_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_dc_add_8, 3, 4, 0
    movsxd       r2, r2d
    movsx        r3, word [r1]
    mov  dword [r1], 0
    DC_ADD_MMXEXT_INIT r3, r2
    DC_ADD_MMXEXT_OP mova, r0, r2, r3
    lea          r0, [r0+r2*4]
    DC_ADD_MMXEXT_OP mova, r0, r2, r3
    RET
%else
; void ff_h264_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_dc_add_8, 2, 3, 0
    movsx        r2, word [r1]
    mov  dword [r1], 0
    mov          r1, r2m
    DC_ADD_MMXEXT_INIT r2, r1
    DC_ADD_MMXEXT_OP mova, r0, r1, r2
    lea          r0, [r0+r1*4]
    DC_ADD_MMXEXT_OP mova, r0, r1, r2
    RET
%endif

INIT_XMM sse2
; void ff_h264_idct8_add4_8_sse2(uint8_t *dst, const int *block_offset,
;                                int16_t *block, int stride,
;                                const uint8_t nnzc[6 * 8])
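; Loops over the four luma 8x8 blocks: the nnzc entry looked up through
; scan8 selects between skipping the block, the DC-only shortcut
; (nnzc == 1 and a nonzero DC), and the full 8x8 iDCT + add.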
cglobal h264_idct8_add4_8, 5, 8 + npicregs, 10, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    movsxdifnidn r3, r3d
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    cmp          r6, 1
    jnz .no_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .no_dc
INIT_MMX cpuname
    mov   word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    add       dst2q, r0
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
    lea       dst2q, [dst2q+r3*4]
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    REP_RET
.no_dc:
INIT_XMM cpuname
    mov       dst2d, dword [r1+r5*4]
    add       dst2q, r0
    IDCT8_ADD_SSE dst2q, r2, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
.skipblock:
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    REP_RET

INIT_MMX mmx
h264_idct_add8_mmx_plane:
    movsxdifnidn r3, r3d
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    or          r6w, word [r2]
    test         r6, r6
    jz .skipblock
%if ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [dst2q]
%else
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    IDCT4_ADD    r0, r2, r3
.skipblock:
    inc          r5
    add          r2, 32
    test         r5, 3
    jnz .nextblock
    rep ret

cglobal h264_idct_add8_422_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
; dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    movsxdifnidn r3, r3d
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
%if ARCH_X86_64
    mov       dst2q, r0
%endif

    mov          r5, 16  ; i
    add          r2, 512 ; i * 16 * sizeof(dctcoef) ; #define dctcoef int16_t

    call         h264_idct_add8_mmx_plane
    add r5, 4
    call         h264_idct_add8_mmx_plane

%if ARCH_X86_64
    add       dst2q, gprsize ; dest[1]
%else
    add        r0mp, gprsize
%endif

    add r5, 4   ; set to 32
    add r2, 256 ; set to i * 16 * sizeof(dctcoef)

    call         h264_idct_add8_mmx_plane
    add r5, 4
    call         h264_idct_add8_mmx_plane

    RET ; TODO: check rep ret after a function call

; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered
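; DC-only add for two horizontally adjacent 4x4 blocks: the DCs at [r2] and
; [r2+32] are rounded ((dc + 32) >> 6), expanded to bytes and added with
; saturation to an 8x4 pixel area of dst.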
h264_idct_dc_add8_mmxext:
    movsxdifnidn r3, r3d
    movd         m0, [r2   ]          ;  0 0 X D
    mov word [r2+ 0], 0
    punpcklwd    m0, [r2+32]          ;  x X d D
    mov word [r2+32], 0
    paddsw       m0, [pw_32]
    psraw        m0, 6
    punpcklwd    m0, m0               ;  d d D D
    pxor         m1, m1               ;  0 0 0 0
    psubw        m1, m0               ; -d-d-D-D
    packuswb     m0, m1               ; -d-d-D-D d d D D
    pshufw       m1, m0, 0xFA         ; -d-d-d-d-D-D-D-D
    punpcklwd    m0, m0               ;  d d d d D D D D
    lea          r6, [r3*3]
    DC_ADD_MMXEXT_OP movq, r0, r3, r6
    ret

ALIGN 16
INIT_XMM sse2
; r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = int stride
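; Runs the 4x4 iDCT + add on two horizontally adjacent blocks at once:
; their rows are packed into the low and high halves of each XMM register
; (movq/movhps), so one pass covers an 8x4 pixel area of dst.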
h264_add8x4_idct_sse2:
    movsxdifnidn r3, r3d
    movq   m0, [r2+ 0]
    movq   m1, [r2+ 8]
    movq   m2, [r2+16]
    movq   m3, [r2+24]
    movhps m0, [r2+32]
    movhps m1, [r2+40]
    movhps m2, [r2+48]
    movhps m3, [r2+56]
    IDCT4_1D w,0,1,2,3,4,5
    TRANSPOSE2x4x4W 0,1,2,3,4
    paddw m0, [pw_32]
    IDCT4_1D w,0,1,2,3,4,5
    pxor  m7, m7
    mova [r2+ 0], m7
    mova [r2+16], m7
    mova [r2+32], m7
    mova [r2+48], m7
    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, r0, r3
    lea   r0, [r0+r3*2]
    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, r0, r3
    ret

%macro add16_sse2_cycle 2
    movzx       r0, word [r4+%2]
    test        r0, r0
    jz .cycle%1end
    mov        r0d, dword [r1+%1*8]
%if ARCH_X86_64
    add         r0, r5
%else
    add         r0, r0m
%endif
    call        h264_add8x4_idct_sse2
.cycle%1end:
%if %1 < 7
    add         r2, 64
%endif
%endmacro

; void ff_h264_idct_add16_8_sse2(uint8_t *dst, const int *block_offset,
;                                int16_t *block, int stride,
;                                const uint8_t nnzc[6 * 8])
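; The 16 luma 4x4 blocks are handled in pairs by h264_add8x4_idct_sse2.
; Each add16_sse2_cycle tests a 16-bit word of nnzc covering two adjacent
; scan8 entries, so an all-zero pair of blocks is skipped with one compare.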
cglobal h264_idct_add16_8, 5, 5 + ARCH_X86_64, 8
    movsxdifnidn r3, r3d
%if ARCH_X86_64
    mov         r5, r0
%endif
    ; unrolling the loop gives an average performance gain of 20-25%
    add16_sse2_cycle 0, 0xc
    add16_sse2_cycle 1, 0x14
    add16_sse2_cycle 2, 0xe
    add16_sse2_cycle 3, 0x16
    add16_sse2_cycle 4, 0x1c
    add16_sse2_cycle 5, 0x24
    add16_sse2_cycle 6, 0x1e
    add16_sse2_cycle 7, 0x26
REP_RET

%macro add16intra_sse2_cycle 2
    movzx       r0, word [r4+%2]
    test        r0, r0
    jz .try%1dc
    mov        r0d, dword [r1+%1*8]
%if ARCH_X86_64
    add         r0, r7
%else
    add         r0, r0m
%endif
    call        h264_add8x4_idct_sse2
    jmp .cycle%1end
.try%1dc:
    movsx       r0, word [r2   ]
    or         r0w, word [r2+32]
    jz .cycle%1end
    mov        r0d, dword [r1+%1*8]
%if ARCH_X86_64
    add         r0, r7
%else
    add         r0, r0m
%endif
    call        h264_idct_dc_add8_mmxext
.cycle%1end:
%if %1 < 7
    add         r2, 64
%endif
%endmacro

; void ff_h264_idct_add16intra_8_sse2(uint8_t *dst, const int *block_offset,
;                                     int16_t *block, int stride,
;                                     const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16intra_8, 5, 7 + ARCH_X86_64, 8
    movsxdifnidn r3, r3d
%if ARCH_X86_64
    mov         r7, r0
%endif
    add16intra_sse2_cycle 0, 0xc
    add16intra_sse2_cycle 1, 0x14
    add16intra_sse2_cycle 2, 0xe
    add16intra_sse2_cycle 3, 0x16
    add16intra_sse2_cycle 4, 0x1c
    add16intra_sse2_cycle 5, 0x24
    add16intra_sse2_cycle 6, 0x1e
    add16intra_sse2_cycle 7, 0x26
REP_RET

%macro add8_sse2_cycle 2
    movzx       r0, word [r4+%2]
    test        r0, r0
    jz .try%1dc
%if ARCH_X86_64
    mov        r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
    add         r0, [r7]
%else
    mov         r0, r0m
    mov         r0, [r0]
    add         r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
%endif
    call        h264_add8x4_idct_sse2
    jmp .cycle%1end
.try%1dc:
    movsx       r0, word [r2   ]
    or         r0w, word [r2+32]
    jz .cycle%1end
%if ARCH_X86_64
    mov        r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
    add         r0, [r7]
%else
    mov         r0, r0m
    mov         r0, [r0]
    add         r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
%endif
    call        h264_idct_dc_add8_mmxext
.cycle%1end:
%if %1 == 1
    add         r2, 384+64
%elif %1 < 3
    add         r2, 64
%endif
%endmacro

; void ff_h264_idct_add8_8_sse2(uint8_t **dest, const int *block_offset,
;                               int16_t *block, int stride,
;                               const uint8_t nnzc[6 * 8])
cglobal h264_idct_add8_8, 5, 7 + ARCH_X86_64, 8
    movsxdifnidn r3, r3d
    add          r2, 512
%if ARCH_X86_64
    mov          r7, r0
%endif
    add8_sse2_cycle 0, 0x34
    add8_sse2_cycle 1, 0x3c
%if ARCH_X86_64
    add          r7, gprsize
%else
    add        r0mp, gprsize
%endif
    add8_sse2_cycle 2, 0x5c
    add8_sse2_cycle 3, 0x64
REP_RET

; void ff_h264_luma_dc_dequant_idct_sse2(int16_t *output, int16_t *input, int qmul)
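; The 4x4 DC block gets an inverse Hadamard transform (WALSH4_1D, transpose,
; WALSH4_1D), each value is dequantized as (x * qmul + 128) >> 8, and the 16
; results are written to the DC slots of the 16 coefficient blocks, 32 bytes
; apart. qmul > 32767 takes the .big_qmul path, which shifts qmul down first
; (apparently to keep the pmaddwd operands within signed 16-bit range).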

%macro WALSH4_1D 5
    SUMSUB_BADC w, %4, %3, %2, %1, %5
    SUMSUB_BADC w, %4, %2, %3, %1, %5
    SWAP %1, %4, %3
%endmacro

%macro DEQUANT 1-3
%if cpuflag(sse2)
    movd      xmm4, t3d
    movq      xmm5, [pw_1]
    pshufd    xmm4, xmm4, 0
    movq2dq   xmm0, m0
    movq2dq   xmm1, m1
    movq2dq   xmm2, m2
    movq2dq   xmm3, m3
    punpcklwd xmm0, xmm5
    punpcklwd xmm1, xmm5
    punpcklwd xmm2, xmm5
    punpcklwd xmm3, xmm5
    pmaddwd   xmm0, xmm4
    pmaddwd   xmm1, xmm4
    pmaddwd   xmm2, xmm4
    pmaddwd   xmm3, xmm4
    psrad     xmm0, %1
    psrad     xmm1, %1
    psrad     xmm2, %1
    psrad     xmm3, %1
    packssdw  xmm0, xmm1
    packssdw  xmm2, xmm3
%else
    mova        m7, [pw_1]
    mova        m4, %1
    punpcklwd   %1, m7
    punpckhwd   m4, m7
    mova        m5, %2
    punpcklwd   %2, m7
    punpckhwd   m5, m7
    movd        m7, t3d
    punpckldq   m7, m7
    pmaddwd     %1, m7
    pmaddwd     %2, m7
    pmaddwd     m4, m7
    pmaddwd     m5, m7
    psrad       %1, %3
    psrad       %2, %3
    psrad       m4, %3
    psrad       m5, %3
    packssdw    %1, m4
    packssdw    %2, m5
%endif
%endmacro

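; Scatters the words of one register to DC positions 32 bytes (one 4x4
; block of int16 coefficients) apart in the output at t2; %2-%5 (and %6-%9
; for the 8-word SSE form) are the destination block indices.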
%macro STORE_WORDS 5-9
%if cpuflag(sse)
    movd  t0d, %1
    psrldq  %1, 4
    movd  t1d, %1
    psrldq  %1, 4
    mov [t2+%2*32], t0w
    mov [t2+%4*32], t1w
    shr   t0d, 16
    shr   t1d, 16
    mov [t2+%3*32], t0w
    mov [t2+%5*32], t1w
    movd  t0d, %1
    psrldq  %1, 4
    movd  t1d, %1
    mov [t2+%6*32], t0w
    mov [t2+%8*32], t1w
    shr   t0d, 16
    shr   t1d, 16
    mov [t2+%7*32], t0w
    mov [t2+%9*32], t1w
%else
    movd  t0d, %1
    psrlq  %1, 32
    movd  t1d, %1
    mov [t2+%2*32], t0w
    mov [t2+%4*32], t1w
    shr   t0d, 16
    shr   t1d, 16
    mov [t2+%3*32], t0w
    mov [t2+%5*32], t1w
%endif
%endmacro

%macro DEQUANT_STORE 1
%if cpuflag(sse2)
    DEQUANT     %1
    STORE_WORDS xmm0,  0,  1,  4,  5,  2,  3,  6,  7
    STORE_WORDS xmm2,  8,  9, 12, 13, 10, 11, 14, 15
%else
    DEQUANT     m0, m1, %1
    STORE_WORDS m0,  0,  1,  4,  5
    STORE_WORDS m1,  2,  3,  6,  7

    DEQUANT     m2, m3, %1
    STORE_WORDS m2,  8,  9, 12, 13
    STORE_WORDS m3, 10, 11, 14, 15
%endif
%endmacro

%macro IDCT_DC_DEQUANT 1
cglobal h264_luma_dc_dequant_idct, 3, 4, %1
    ; manually spill XMM registers for Win64 because
    ; the code here is initialized with INIT_MMX
    WIN64_SPILL_XMM %1
    movq        m3, [r1+24]
    movq        m2, [r1+16]
    movq        m1, [r1+ 8]
    movq        m0, [r1+ 0]
    WALSH4_1D    0,1,2,3,4
    TRANSPOSE4x4W 0,1,2,3,4
    WALSH4_1D    0,1,2,3,4

; shift, tmp, output, qmul
%if WIN64
    DECLARE_REG_TMP 0,3,1,2
    ; we can't avoid this, because r0 is the shift register (ecx) on win64
    xchg        r0, t2
%elif ARCH_X86_64
    DECLARE_REG_TMP 3,1,0,2
%else
    DECLARE_REG_TMP 1,3,0,2
%endif

    cmp        t3d, 32767
    jg .big_qmul
    add        t3d, 128 << 16
    DEQUANT_STORE 8
    RET
.big_qmul:
    bsr        t0d, t3d
    add        t3d, 128 << 16
    mov        t1d, 7
    cmp        t0d, t1d
    cmovg      t0d, t1d
    inc        t1d
    shr        t3d, t0b
    sub        t1d, t0d
    movd      xmm6, t1d
    DEQUANT_STORE xmm6
    RET
%endmacro

INIT_MMX sse2
IDCT_DC_DEQUANT 7

%ifdef __NASM_VER__
%if __NASM_MAJOR__ >= 2 && __NASM_MINOR__ >= 4
%unmacro STORE_DIFFx2 8 ; undefine the x86util.asm version; yasm does not have %unmacro
%endif
%endif
%macro STORE_DIFFx2 8 ; add1, add2, reg1, reg2, zero, shift, source, stride
    movd       %3, [%7]
    movd       %4, [%7+%8]
    psraw      %1, %6
    psraw      %2, %6
    punpcklbw  %3, %5
    punpcklbw  %4, %5
    paddw      %3, %1
    paddw      %4, %2
    packuswb   %3, %5
    packuswb   %4, %5
    movd     [%7], %3
    movd  [%7+%8], %4
%endmacro

%macro DC_ADD_INIT 1
    add      %1d, 32
    sar      %1d, 6
    movd     m0, %1d
    pshuflw  m0, m0, 0
    lea      %1, [3*stride_q]
    pxor     m1, m1
    psubw    m1, m0
    packuswb m0, m0
    packuswb m1, m1
%endmacro

%macro IDCT_XMM 1

INIT_XMM %1

cglobal h264_idct_add_8, 3, 3, 8, dst_, block_, stride_
    movsxdifnidn stride_q, stride_d
    IDCT4_ADD    dst_q, block_q, stride_q
RET

cglobal h264_idct_dc_add_8, 3, 4, 6, dst_, block_, stride_
    movsxdifnidn stride_q, stride_d
    movsx             r3d, word [block_q]
    mov   dword [block_q], 0
    DC_ADD_INIT r3
    DC_ADD_MMXEXT_OP movd, dst_q, stride_q, r3
RET

%endmacro

IDCT_XMM sse2
IDCT_XMM avx