1;******************************************************************************
2;* VP8 MMXEXT optimizations
3;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
4;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
5;*
6;* This file is part of FFmpeg.
7;*
8;* FFmpeg is free software; you can redistribute it and/or
9;* modify it under the terms of the GNU Lesser General Public
10;* License as published by the Free Software Foundation; either
11;* version 2.1 of the License, or (at your option) any later version.
12;*
13;* FFmpeg is distributed in the hope that it will be useful,
14;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16;* Lesser General Public License for more details.
17;*
18;* You should have received a copy of the GNU Lesser General Public
19;* License along with FFmpeg; if not, write to the Free Software
20;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21;******************************************************************************
22
23%include "libavutil/x86/x86util.asm"
24
25SECTION_RODATA
26
; 4-tap coefficients as 16-bit words, read by the MMXEXT put_vp8_epel4_h4 and
; fed to pmaddwd. Every row holds one pair of adjacent taps repeated four
; times (16 bytes); a filter is two consecutive rows (32 bytes): first the
; leading tap pair, then the trailing tap pair.
27fourtap_filter_hw_m: times 4 dw  -6, 123
28                     times 4 dw  12,  -1
29                     times 4 dw  -9,  93
30                     times 4 dw  50,  -6
31                     times 4 dw  -6,  50
32                     times 4 dw  93,  -9
33                     times 4 dw  -1,  12
34                     times 4 dw 123,  -6
35
; 6-tap coefficients as 16-bit words, read by the MMXEXT put_vp8_epel4_h6.
; Same row layout as above, but a filter is three rows (48 bytes): tap pairs
; 0/1, 2/3 and 4/5.
36sixtap_filter_hw_m:  times 4 dw   2, -11
37                     times 4 dw 108,  36
38                     times 4 dw  -8,   1
39                     times 4 dw   3, -16
40                     times 4 dw  77,  77
41                     times 4 dw -16,   3
42                     times 4 dw   1,  -8
43                     times 4 dw  36, 108
44                     times 4 dw -11,   2
45
; 4-tap coefficients as signed bytes for the SSSE3 pmaddubsw kernels
; (FILTER_SSSE3 h4/v4). Each row is one tap pair repeated eight times
; (16 bytes); a filter is two rows (32 bytes).
46fourtap_filter_hb_m: times 8 db  -6, 123
47                     times 8 db  12,  -1
48                     times 8 db  -9,  93
49                     times 8 db  50,  -6
50                     times 8 db  -6,  50
51                     times 8 db  93,  -9
52                     times 8 db  -1,  12
53                     times 8 db 123,  -6
54
; 6-tap coefficients as signed bytes for the SSSE3 pmaddubsw kernels
; (FILTER_SSSE3 h6/v6). A filter is three rows (48 bytes). Note the pairing:
; row 0 holds taps 0 and 5 (the outermost ones), row 1 taps 1 and 2, row 2
; taps 3 and 4. This matches the pixel pairings produced by
; filter_h6_shuf1/2/3 and by the row interleaves in the vertical kernel.
55sixtap_filter_hb_m:  times 8 db   2,   1
56                     times 8 db -11, 108
57                     times 8 db  36,  -8
58                     times 8 db   3,   3
59                     times 8 db -16,  77
60                     times 8 db  77, -16
61                     times 8 db   1,   2
62                     times 8 db  -8,  36
63                     times 8 db 108, -11
64
; 4-tap coefficients, one tap per row, each broadcast to eight 16-bit words
; (16 bytes) for use with pmullw. A filter is four consecutive rows
; (64 bytes). Despite the "_v" name this table is read both by the vertical
; FILTER_V v4 kernel and by the SSE2 horizontal put_vp8_epel8_h4.
65fourtap_filter_v_m:  times 8 dw  -6
66                     times 8 dw 123
67                     times 8 dw  12
68                     times 8 dw  -1
69                     times 8 dw  -9
70                     times 8 dw  93
71                     times 8 dw  50
72                     times 8 dw  -6
73                     times 8 dw  -6
74                     times 8 dw  50
75                     times 8 dw  93
76                     times 8 dw  -9
77                     times 8 dw  -1
78                     times 8 dw  12
79                     times 8 dw 123
80                     times 8 dw  -6
81
; 6-tap coefficients, one tap per row broadcast to eight words. A filter is
; six consecutive rows (96 bytes). Read by FILTER_V v6 and by the SSE2
; horizontal put_vp8_epel8_h6.
82sixtap_filter_v_m:   times 8 dw   2
83                     times 8 dw -11
84                     times 8 dw 108
85                     times 8 dw  36
86                     times 8 dw  -8
87                     times 8 dw   1
88                     times 8 dw   3
89                     times 8 dw -16
90                     times 8 dw  77
91                     times 8 dw  77
92                     times 8 dw -16
93                     times 8 dw   3
94                     times 8 dw   1
95                     times 8 dw  -8
96                     times 8 dw  36
97                     times 8 dw 108
98                     times 8 dw -11
99                     times 8 dw   2
100
; Bilinear weights as words for the non-SSSE3 kernels: row k (0-based) holds
; the value k+1 in all eight words. The kernels read the weight for fraction f
; at byte offset f*16-16 and the complementary weight 8-f at -f*16+7*16.
101bilinear_filter_vw_m: times 8 dw 1
102                      times 8 dw 2
103                      times 8 dw 3
104                      times 8 dw 4
105                      times 8 dw 5
106                      times 8 dw 6
107                      times 8 dw 7
108
; Bilinear weights as byte pairs for the SSSE3 kernels: row k (0-based) holds
; the pair (7-k, k+1), i.e. (8-f, f) for f = k+1, repeated eight times so that
; a single pmaddubsw on interleaved pixel pairs yields a*(8-f) + b*f.
109bilinear_filter_vb_m: times 8 db 7, 1
110                      times 8 db 6, 2
111                      times 8 db 5, 3
112                      times 8 db 4, 4
113                      times 8 db 3, 5
114                      times 8 db 2, 6
115                      times 8 db 1, 7
116
; Table-name indirection for position-independent builds.
; With PIC defined, every table name used in an effective address expands to
; picregq; each function first executes "lea picregq, [<table>_m]" so that the
; following [table+index] operands become [picregq+index]. npicregs is added
; to the GPR count in the cglobal lines of functions that need this extra
; register. Without PIC the names are plain aliases of the "_m" labels and no
; extra register is requested.
117%ifdef PIC
118%define fourtap_filter_hw  picregq
119%define sixtap_filter_hw   picregq
120%define fourtap_filter_hb  picregq
121%define sixtap_filter_hb   picregq
122%define fourtap_filter_v   picregq
123%define sixtap_filter_v    picregq
124%define bilinear_filter_vw picregq
125%define bilinear_filter_vb picregq
126%define npicregs 1
127%else
128%define fourtap_filter_hw  fourtap_filter_hw_m
129%define sixtap_filter_hw   sixtap_filter_hw_m
130%define fourtap_filter_hb  fourtap_filter_hb_m
131%define sixtap_filter_hb   sixtap_filter_hb_m
132%define fourtap_filter_v   fourtap_filter_v_m
133%define sixtap_filter_v    sixtap_filter_v_m
134%define bilinear_filter_vw bilinear_filter_vw_m
135%define bilinear_filter_vb bilinear_filter_vb_m
136%define npicregs 0
137%endif
138
; pshufb control masks that turn a run of loaded bytes into adjacent-pixel
; pairs for pmaddubsw.
; filter_h2_shuf: pairs (i, i+1); filter_h4_shuf: pairs (i+2, i+3).
139filter_h2_shuf:  db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5,  6, 6,  7,  7,  8
140filter_h4_shuf:  db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,  8, 8,  9,  9, 10
141
; 6-tap masks: shuf1 pairs bytes (i, i+5), shuf2 pairs (i+1, i+2), shuf3 pairs
; (i+3, i+4). This is the same tap pairing as the rows of sixtap_filter_hb_m.
142filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11,  7, 12
143filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6,  7, 7,  8,  8,  9
144filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8,  9, 9, 10, 10, 11
145
; IDCT multipliers, loaded into m6/m7 by vp8_idct_add for VP8_MULTIPLY_SUMSUB.
; 17734 is 35468/2: 35468 does not fit a signed word, so the macro doubles the
; input before the pmulhw instead.
146pw_20091: times 4 dw 20091
147pw_17734: times 4 dw 17734
148
; Word constants defined outside this file.
149cextern pw_3
150cextern pw_4
151cextern pw_64
152cextern pw_256
153
154SECTION .text
155
156;-------------------------------------------------------------------------------
157; subpel MC functions:
158;
159; void ff_put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, ptrdiff_t deststride,
160;                                                 uint8_t *src, ptrdiff_t srcstride,
161;                                                 int height,   int mx, int my);
162;-------------------------------------------------------------------------------
163
; FILTER_SSSE3 <width>
; Emits the four SSSE3 subpel filters (h6, h4, v4, v6) for a block <width>
; pixels wide; instantiated below for width 4 (INIT_MMX) and 8 (INIT_XMM).
; Every function produces one output row per loop iteration. Products are
; formed with pmaddubsw (unsigned pixel bytes times signed coefficient bytes,
; adjacent pairs summed into words), accumulated with saturating paddsw and
; rounded with pmulhrsw by 256, which evaluates (x*256 + 0x4000) >> 15, i.e.
; (x + 64) >> 7. packuswb then clamps to 0..255.
164%macro FILTER_SSSE3 1
; Horizontal 6-tap filter.
165cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, height, mx, picreg
    ; mx *= 3 so that mxq*8 below is 24*mx; with the -48 bias this indexes
    ; the 48-byte filters (byte offset 0 for mx == 2)
166    lea      mxd, [mxq*3]
167    mova      m3, [filter_h6_shuf2]
168    mova      m4, [filter_h6_shuf3]
169%ifdef PIC
170    lea  picregq, [sixtap_filter_hb_m]
171%endif
172    mova      m5, [sixtap_filter_hb+mxq*8-48] ; set up 6tap filter in bytes
173    mova      m6, [sixtap_filter_hb+mxq*8-32]
174    mova      m7, [sixtap_filter_hb+mxq*8-16]
175
176.nextrow:
    ; byte 0 of the load is src[-2], so byte k is src[k-2]
177    movu      m0, [srcq-2]
178    mova      m1, m0
179    mova      m2, m0
180%if mmsize == 8
181; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
182; shuffle with a memory operand
    ; interleaving src[-2..] with src[3..] yields the same (i, i+5) pairs
    ; that filter_h6_shuf1 would select
183    punpcklbw m0, [srcq+3]
184%else
185    pshufb    m0, [filter_h6_shuf1]
186%endif
187    pshufb    m1, m3
188    pshufb    m2, m4
    ; m0: taps 0/5, m1: taps 1/2, m2: taps 3/4
189    pmaddubsw m0, m5
190    pmaddubsw m1, m6
191    pmaddubsw m2, m7
192    paddsw    m0, m1
193    paddsw    m0, m2
194    pmulhrsw  m0, [pw_256]
195    packuswb  m0, m0
196    movh  [dstq], m0        ; store
197
198    ; go to next line
199    add     dstq, dststrideq
200    add     srcq, srcstrideq
201    dec  heightd            ; next row
202    jg .nextrow
203    REP_RET
204
; Horizontal 4-tap filter.
205cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
    ; mx *= 16; filters are 32 bytes each and the -16 bias makes mx == 1
    ; select byte offset 0
206    shl      mxd, 4
207    mova      m2, [pw_256]
208    mova      m3, [filter_h2_shuf]
209    mova      m4, [filter_h4_shuf]
210%ifdef PIC
211    lea  picregq, [fourtap_filter_hb_m]
212%endif
213    mova      m5, [fourtap_filter_hb+mxq-16] ; set up 4tap filter in bytes
214    mova      m6, [fourtap_filter_hb+mxq]
215
216.nextrow:
    ; byte 0 of the load is src[-1]
217    movu      m0, [srcq-1]
218    mova      m1, m0
    ; m0: pixel pairs for the first tap pair, m1: pairs for the second
219    pshufb    m0, m3
220    pshufb    m1, m4
221    pmaddubsw m0, m5
222    pmaddubsw m1, m6
223    paddsw    m0, m1
224    pmulhrsw  m0, m2
225    packuswb  m0, m0
226    movh  [dstq], m0        ; store
227
228    ; go to next line
229    add     dstq, dststrideq
230    add     srcq, srcstrideq
231    dec  heightd            ; next row
232    jg .nextrow
233    REP_RET
234
; Vertical 4-tap filter. The sixth argument slot (mx in the prototype above
; the macro, not needed by a vertical filter) is named picreg here and is
; overwritten with the table base in PIC builds.
235cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
236    shl      myd, 4
237%ifdef PIC
238    lea  picregq, [fourtap_filter_hb_m]
239%endif
240    mova      m5, [fourtap_filter_hb+myq-16]
241    mova      m6, [fourtap_filter_hb+myq]
242    mova      m7, [pw_256]
243
244    ; read 3 lines
    ; m0/m1/m2 = rows -1/0/+1 relative to the first output row; srcq ends up
    ; pointing at row 0 again
245    sub     srcq, srcstrideq
246    movh      m0, [srcq]
247    movh      m1, [srcq+  srcstrideq]
248    movh      m2, [srcq+2*srcstrideq]
249    add     srcq, srcstrideq
250
251.nextrow:
252    movh      m3, [srcq+2*srcstrideq]      ; read new row
    ; m4 = interleave(row -1, row 0), m2 = interleave(row +1, row +2); the
    ; interleaved mova instructions slide the row window down by one line
253    mova      m4, m0
254    mova      m0, m1
255    punpcklbw m4, m1
256    mova      m1, m2
257    punpcklbw m2, m3
258    pmaddubsw m4, m5
259    pmaddubsw m2, m6
260    paddsw    m4, m2
261    mova      m2, m3
262    pmulhrsw  m4, m7
263    packuswb  m4, m4
264    movh  [dstq], m4
265
266    ; go to next line
267    add      dstq, dststrideq
268    add      srcq, srcstrideq
269    dec   heightd                          ; next row
270    jg .nextrow
271    REP_RET
272
; Vertical 6-tap filter. All eight vector registers are used for row data, so
; the coefficient rows are read from memory through myq on every iteration.
273cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
274    lea      myd, [myq*3]
275%ifdef PIC
276    lea  picregq, [sixtap_filter_hb_m]
277%endif
    ; myq = table + 24*my; the selected filter's rows sit at myq-48/-32/-16
278    lea      myq, [sixtap_filter_hb+myq*8]
279
280    ; read 5 lines
    ; m0..m4 = rows -2..+2 relative to the first output row; srcq ends up
    ; pointing at row +1, so the loop's [srcq+2*srcstrideq] is row +3
281    sub     srcq, srcstrideq
282    sub     srcq, srcstrideq
283    movh      m0, [srcq]
284    movh      m1, [srcq+srcstrideq]
285    movh      m2, [srcq+srcstrideq*2]
286    lea     srcq, [srcq+srcstrideq*2]
287    add     srcq, srcstrideq
288    movh      m3, [srcq]
289    movh      m4, [srcq+srcstrideq]
290
291.nextrow:
292    movh      m5, [srcq+2*srcstrideq]      ; read new row
    ; m6 = interleave(row -2, row +3) for taps 0/5
293    mova      m6, m0
294    punpcklbw m6, m5
    ; save row -1 as next iteration's row -2, then m1 = interleave(row -1,
    ; row 0) for taps 1/2
295    mova      m0, m1
296    punpcklbw m1, m2
    ; m7 = interleave(row +1, row +2) for taps 3/4
297    mova      m7, m3
298    punpcklbw m7, m4
299    pmaddubsw m6, [myq-48]
300    pmaddubsw m1, [myq-32]
301    pmaddubsw m7, [myq-16]
302    paddsw    m6, m1
303    paddsw    m6, m7
    ; remaining window shift (m1..m4 <- m2..m5) is interleaved with the
    ; rounding/packing of the result
304    mova      m1, m2
305    mova      m2, m3
306    pmulhrsw  m6, [pw_256]
307    mova      m3, m4
308    packuswb  m6, m6
309    mova      m4, m5
310    movh  [dstq], m6
311
312    ; go to next line
313    add      dstq, dststrideq
314    add      srcq, srcstrideq
315    dec   heightd                          ; next row
316    jg .nextrow
317    REP_RET
318%endmacro
319
; 4-pixel-wide variants on 64-bit MMX registers, 8-pixel-wide on XMM.
320INIT_MMX ssse3
321FILTER_SSSE3 4
322INIT_XMM ssse3
323FILTER_SSSE3 8
324
325; 4-pixel-wide block, H-only 4-tap filter (MMXEXT); height rows are processed
; Each output pixel is computed from word-expanded pixel pairs with pmaddwd,
; two output pixels per register, then rounded with (x + 64) >> 7.
326INIT_MMX mmxext
327cglobal put_vp8_epel4_h4, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
    ; mx *= 16; filters are 32 bytes each, the -16 bias maps mx == 1 to offset 0
328    shl       mxd, 4
329%ifdef PIC
330    lea   picregq, [fourtap_filter_hw_m]
331%endif
332    movq      mm4, [fourtap_filter_hw+mxq-16] ; set up 4tap filter in words
333    movq      mm5, [fourtap_filter_hw+mxq]
    ; mm7 = rounding constant, mm6 = zero for byte->word unpacking
334    movq      mm7, [pw_64]
335    pxor      mm6, mm6
336
337.nextrow:
    ; pixel A below is src[-1]
338    movq      mm1, [srcq-1]                ; (ABCDEFGH) load 8 horizontal pixels
339
340    ; first set of 2 pixels
341    movq      mm2, mm1                     ; byte ABCD..
342    punpcklbw mm1, mm6                     ; byte->word ABCD
343    pshufw    mm0, mm2, 9                  ; byte CDEF..
344    punpcklbw mm0, mm6                     ; byte->word CDEF
345    pshufw    mm3, mm1, 0x94               ; word ABBC
346    pshufw    mm1, mm0, 0x94               ; word CDDE
347    pmaddwd   mm3, mm4                     ; multiply 2px with F0/F1
348    movq      mm0, mm1                     ; backup for second set of pixels
349    pmaddwd   mm1, mm5                     ; multiply 2px with F2/F3
350    paddd     mm3, mm1                     ; finish 1st 2px
351
352    ; second set of 2 pixels, use backup of above
353    punpckhbw mm2, mm6                     ; byte->word EFGH
354    pmaddwd   mm0, mm4                     ; multiply backed up 2px with F0/F1
355    pshufw    mm1, mm2, 0x94               ; word EFFG
356    pmaddwd   mm1, mm5                     ; multiply 2px with F2/F3
357    paddd     mm0, mm1                     ; finish 2nd 2px
358
359    ; merge two sets of 2 pixels into one set of 4, round/clip/store
360    packssdw  mm3, mm0                     ; merge dword->word (4px)
361    paddsw    mm3, mm7                     ; rounding
362    psraw     mm3, 7
363    packuswb  mm3, mm6                     ; clip and word->bytes
364    movd   [dstq], mm3                     ; store
365
366    ; go to next line
367    add      dstq, dststrideq
368    add      srcq, srcstrideq
369    dec   heightd                          ; next row
370    jg .nextrow
371    REP_RET
372
373; 4-pixel-wide block, H-only 6-tap filter (MMXEXT); height rows are processed
; Same scheme as the 4-tap version with a third tap pair. mm3 doubles as the
; zero register and as scratch, so it is re-zeroed inside the loop before it
; is needed as zero again.
374INIT_MMX mmxext
375cglobal put_vp8_epel4_h6, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
    ; mx *= 3 so that mxq*8 is 24*mx; filters are 48 bytes each and the -48
    ; bias maps mx == 2 to offset 0
376    lea       mxd, [mxq*3]
377%ifdef PIC
378    lea   picregq, [sixtap_filter_hw_m]
379%endif
380    movq      mm4, [sixtap_filter_hw+mxq*8-48] ; set up 6tap filter in words
381    movq      mm5, [sixtap_filter_hw+mxq*8-32]
382    movq      mm6, [sixtap_filter_hw+mxq*8-16]
383    movq      mm7, [pw_64]
384    pxor      mm3, mm3
385
386.nextrow:
    ; pixel A below is src[-2]
387    movq      mm1, [srcq-2]                ; (ABCDEFGH) load 8 horizontal pixels
388
389    ; first set of 2 pixels
390    movq      mm2, mm1                     ; byte ABCD..
391    punpcklbw mm1, mm3                     ; byte->word ABCD
392    pshufw    mm0, mm2, 0x9                ; byte CDEF..
393    punpckhbw mm2, mm3                     ; byte->word EFGH
394    punpcklbw mm0, mm3                     ; byte->word CDEF
395    pshufw    mm1, mm1, 0x94               ; word ABBC
396    pshufw    mm2, mm2, 0x94               ; word EFFG
397    pmaddwd   mm1, mm4                     ; multiply 2px with F0/F1
    ; from here on mm3 is no longer zero
398    pshufw    mm3, mm0, 0x94               ; word CDDE
399    movq      mm0, mm3                     ; backup for second set of pixels
400    pmaddwd   mm3, mm5                     ; multiply 2px with F2/F3
401    paddd     mm1, mm3                     ; add to 1st 2px cache
402    movq      mm3, mm2                     ; backup for second set of pixels
403    pmaddwd   mm2, mm6                     ; multiply 2px with F4/F5
404    paddd     mm1, mm2                     ; finish 1st 2px
405
406    ; second set of 2 pixels, use backup of above
407    movd      mm2, [srcq+3]                ; byte FGHI (prevent overreads)
408    pmaddwd   mm0, mm4                     ; multiply 1st backed up 2px with F0/F1
409    pmaddwd   mm3, mm5                     ; multiply 2nd backed up 2px with F2/F3
410    paddd     mm0, mm3                     ; add to 2nd 2px cache
    ; restore mm3 = 0; it stays zero until the pshufw in the next iteration
411    pxor      mm3, mm3
412    punpcklbw mm2, mm3                     ; byte->word FGHI
413    pshufw    mm2, mm2, 0xE9               ; word GHHI
414    pmaddwd   mm2, mm6                     ; multiply 2px with F4/F5
415    paddd     mm0, mm2                     ; finish 2nd 2px
416
417    ; merge two sets of 2 pixels into one set of 4, round/clip/store
418    packssdw  mm1, mm0                     ; merge dword->word (4px)
419    paddsw    mm1, mm7                     ; rounding
420    psraw     mm1, 7
421    packuswb  mm1, mm3                     ; clip and word->bytes
422    movd   [dstq], mm1                     ; store
423
424    ; go to next line
425    add      dstq, dststrideq
426    add      srcq, srcstrideq
427    dec   heightd                          ; next row
428    jg .nextrow
429    REP_RET
430
; 8-pixel-wide block, H-only 4-tap filter (SSE2).
; Loads four overlapping 8-byte windows (src-1 .. src+2), widens them to words
; and multiplies each by one broadcast tap from fourtap_filter_v, then rounds
; with (x + 64) >> 7.
431INIT_XMM sse2
432cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, dststride, src, srcstride, height, mx, picreg
    ; mx *= 32; filters are 64 bytes each, the -32 bias maps mx == 1 to offset 0
433    shl      mxd, 5
434%ifdef PIC
435    lea  picregq, [fourtap_filter_v_m]
436%endif
    ; mxq now points at the selected filter's first tap row
437    lea      mxq, [fourtap_filter_v+mxq-32]
438    pxor      m7, m7
439    mova      m4, [pw_64]
440    mova      m5, [mxq+ 0]
441    mova      m6, [mxq+16]
    ; when m8 is defined (presumably builds with 16 vector registers; the
    ; definition lives in the included macro layer) the last two taps are
    ; cached in registers, otherwise they are read from memory every row
442%ifdef m8
443    mova      m8, [mxq+32]
444    mova      m9, [mxq+48]
445%endif
446.nextrow:
447    movq      m0, [srcq-1]
448    movq      m1, [srcq-0]
449    movq      m2, [srcq+1]
450    movq      m3, [srcq+2]
451    punpcklbw m0, m7
452    punpcklbw m1, m7
453    punpcklbw m2, m7
454    punpcklbw m3, m7
455    pmullw    m0, m5
456    pmullw    m1, m6
457%ifdef m8
458    pmullw    m2, m8
459    pmullw    m3, m9
460%else
461    pmullw    m2, [mxq+32]
462    pmullw    m3, [mxq+48]
463%endif
    ; sum taps 0+1 and taps 2+3 first, then combine (all adds saturate)
464    paddsw    m0, m1
465    paddsw    m2, m3
466    paddsw    m0, m2
467    paddsw    m0, m4
468    psraw     m0, 7
469    packuswb  m0, m7
470    movh  [dstq], m0        ; store
471
472    ; go to next line
473    add     dstq, dststrideq
474    add     srcq, srcstrideq
475    dec  heightd            ; next row
476    jg .nextrow
477    REP_RET
478
; 8-pixel-wide block, H-only 6-tap filter (SSE2).
; Loads six overlapping 8-byte windows (src-2 .. src+3), widens them to words
; and multiplies each by one broadcast tap from sixtap_filter_v, then rounds
; with (x + 64) >> 7.
479INIT_XMM sse2
480cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, dststride, src, srcstride, height, mx, picreg
    ; mx = mx*3*16 = 48*mx; filters are 96 bytes each and the -96 bias maps
    ; mx == 2 to offset 0
481    lea      mxd, [mxq*3]
482    shl      mxd, 4
483%ifdef PIC
484    lea  picregq, [sixtap_filter_v_m]
485%endif
486    lea      mxq, [sixtap_filter_v+mxq-96]
487    pxor      m7, m7
488    mova      m6, [pw_64]
    ; cache all six taps in m8..m13 when those registers exist (see the
    ; matching %ifdef in the loop); otherwise taps are memory operands
489%ifdef m8
490    mova      m8, [mxq+ 0]
491    mova      m9, [mxq+16]
492    mova     m10, [mxq+32]
493    mova     m11, [mxq+48]
494    mova     m12, [mxq+64]
495    mova     m13, [mxq+80]
496%endif
497.nextrow:
498    movq      m0, [srcq-2]
499    movq      m1, [srcq-1]
500    movq      m2, [srcq-0]
501    movq      m3, [srcq+1]
502    movq      m4, [srcq+2]
503    movq      m5, [srcq+3]
504    punpcklbw m0, m7
505    punpcklbw m1, m7
506    punpcklbw m2, m7
507    punpcklbw m3, m7
508    punpcklbw m4, m7
509    punpcklbw m5, m7
510%ifdef m8
511    pmullw    m0, m8
512    pmullw    m1, m9
513    pmullw    m2, m10
514    pmullw    m3, m11
515    pmullw    m4, m12
516    pmullw    m5, m13
517%else
518    pmullw    m0, [mxq+ 0]
519    pmullw    m1, [mxq+16]
520    pmullw    m2, [mxq+32]
521    pmullw    m3, [mxq+48]
522    pmullw    m4, [mxq+64]
523    pmullw    m5, [mxq+80]
524%endif
    ; summation order: taps 1+4 and taps 0+5 first, then taps 2 and 3 are
    ; added to those partial sums before the final combine (all saturating)
525    paddsw    m1, m4
526    paddsw    m0, m5
527    paddsw    m1, m2
528    paddsw    m0, m3
529    paddsw    m0, m1
530    paddsw    m0, m6
531    psraw     m0, 7
532    packuswb  m0, m7
533    movh  [dstq], m0        ; store
534
535    ; go to next line
536    add     dstq, dststrideq
537    add     srcq, srcstrideq
538    dec  heightd            ; next row
539    jg .nextrow
540    REP_RET
541
; FILTER_V <width>
; Emits the word-multiply (pmullw) vertical 4-tap and 6-tap filters for a
; block <width> pixels wide; instantiated below for width 4 (MMXEXT) and
; width 8 (SSE2). Source rows are kept widened to words in a sliding register
; window; one output row is produced per loop iteration and rounded with
; (x + 64) >> 7.
542%macro FILTER_V 1
543; <width>-pixel-wide block, V-only 4-tap filter
544cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    ; my *= 32; filters are 64 bytes each, the -32 bias maps my == 1 to offset 0
545    shl      myd, 5
546%ifdef PIC
547    lea  picregq, [fourtap_filter_v_m]
548%endif
549    lea      myq, [fourtap_filter_v+myq-32]
550    mova      m6, [pw_64]
551    pxor      m7, m7
    ; m5 = last tap (row at +48); the other three taps are read from memory
552    mova      m5, [myq+48]
553
554    ; read 3 lines
    ; m0/m1/m2 = rows -1/0/+1 as words; srcq ends up pointing at row 0
555    sub     srcq, srcstrideq
556    movh      m0, [srcq]
557    movh      m1, [srcq+  srcstrideq]
558    movh      m2, [srcq+2*srcstrideq]
559    add     srcq, srcstrideq
560    punpcklbw m0, m7
561    punpcklbw m1, m7
562    punpcklbw m2, m7
563
564.nextrow:
565    ; first calculate negative taps (to prevent losing positive overflows)
566    movh      m4, [srcq+2*srcstrideq]      ; read new row
567    punpcklbw m4, m7
    ; keep an unmultiplied copy of the new row for the window shift
568    mova      m3, m4
569    pmullw    m0, [myq+0]
570    pmullw    m4, m5
571    paddsw    m4, m0
572
573    ; then calculate positive taps
    ; each row is copied into the next-older slot before being multiplied
    ; in place, which slides the window down by one line
574    mova      m0, m1
575    pmullw    m1, [myq+16]
576    paddsw    m4, m1
577    mova      m1, m2
578    pmullw    m2, [myq+32]
579    paddsw    m4, m2
580    mova      m2, m3
581
582    ; round/clip/store
583    paddsw    m4, m6
584    psraw     m4, 7
585    packuswb  m4, m7
586    movh  [dstq], m4
587
588    ; go to next line
589    add     dstq, dststrideq
590    add     srcq, srcstrideq
591    dec  heightd                           ; next row
592    jg .nextrow
593    REP_RET
594
595
596; <width>-pixel-wide block, V-only 6-tap filter
597cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    ; my = my*16*3 = 48*my; filters are 96 bytes each and the -96 bias maps
    ; my == 2 to offset 0
598    shl      myd, 4
599    lea      myq, [myq*3]
600%ifdef PIC
601    lea  picregq, [sixtap_filter_v_m]
602%endif
603    lea      myq, [sixtap_filter_v+myq-96]
604    pxor      m7, m7
605
606    ; read 5 lines
    ; m0..m4 = rows -2..+2 as words; srcq ends up pointing at row +1, so the
    ; loop's [srcq+2*srcstrideq] is row +3
607    sub     srcq, srcstrideq
608    sub     srcq, srcstrideq
609    movh      m0, [srcq]
610    movh      m1, [srcq+srcstrideq]
611    movh      m2, [srcq+srcstrideq*2]
612    lea     srcq, [srcq+srcstrideq*2]
613    add     srcq, srcstrideq
614    movh      m3, [srcq]
615    movh      m4, [srcq+srcstrideq]
616    punpcklbw m0, m7
617    punpcklbw m1, m7
618    punpcklbw m2, m7
619    punpcklbw m3, m7
620    punpcklbw m4, m7
621
622.nextrow:
623    ; first calculate negative taps (to prevent losing positive overflows)
    ; taps 1 and 4 (rows -1 and +2) are multiplied on copies, leaving m1/m4
    ; intact for the window shift below
624    mova      m5, m1
625    pmullw    m5, [myq+16]
626    mova      m6, m4
627    pmullw    m6, [myq+64]
628    paddsw    m6, m5
629
630    ; then calculate positive taps
631    movh      m5, [srcq+2*srcstrideq]      ; read new row
632    punpcklbw m5, m7
633    pmullw    m0, [myq+0]
634    paddsw    m6, m0
    ; slide the window: each register takes the next row's unmultiplied
    ; value before that row is multiplied in place
635    mova      m0, m1
636    mova      m1, m2
637    pmullw    m2, [myq+32]
638    paddsw    m6, m2
639    mova      m2, m3
640    pmullw    m3, [myq+48]
641    paddsw    m6, m3
642    mova      m3, m4
643    mova      m4, m5
644    pmullw    m5, [myq+80]
645    paddsw    m6, m5
646
647    ; round/clip/store
648    paddsw    m6, [pw_64]
649    psraw     m6, 7
650    packuswb  m6, m7
651    movh  [dstq], m6
652
653    ; go to next line
654    add     dstq, dststrideq
655    add     srcq, srcstrideq
656    dec  heightd                           ; next row
657    jg .nextrow
658    REP_RET
659%endmacro
660
; 4-pixel-wide variants on MMX registers, 8-pixel-wide on XMM.
661INIT_MMX mmxext
662FILTER_V 4
663INIT_XMM sse2
664FILTER_V 8
665
; FILTER_BILINEAR <width>
; Emits put_vp8_bilinear<width>_v and put_vp8_bilinear<width>_h for the
; instruction set selected by the preceding INIT_* line. With SSSE3 the
; weights come from the byte-pair table and one pmaddubsw forms
; a*(8-f) + b*f; otherwise both weights are loaded as words and applied with
; pmullw. Both variants then do psraw 2 followed by pavgw against zero
; ((x + 1) >> 1), which together give (a*(8-f) + b*f + 4) >> 3.
; Every loop iteration produces two output rows; the loop tail after each
; %endif is shared by both variants.
666%macro FILTER_BILINEAR 1
667%if cpuflag(ssse3)
668cglobal put_vp8_bilinear%1_v, 7, 7, 5, dst, dststride, src, srcstride, height, picreg, my
    ; my *= 16; the -16 bias selects table row my-1, i.e. weights (8-my, my)
669    shl      myd, 4
670%ifdef PIC
671    lea  picregq, [bilinear_filter_vb_m]
672%endif
673    pxor      m4, m4
674    mova      m3, [bilinear_filter_vb+myq-16]
675.nextrow:
    ; three source rows feed two output rows
676    movh      m0, [srcq+srcstrideq*0]
677    movh      m1, [srcq+srcstrideq*1]
678    movh      m2, [srcq+srcstrideq*2]
    ; m0 = interleave(row 0, row 1), m1 = interleave(row 1, row 2)
679    punpcklbw m0, m1
680    punpcklbw m1, m2
681    pmaddubsw m0, m3
682    pmaddubsw m1, m3
683    psraw     m0, 2
684    psraw     m1, 2
685    pavgw     m0, m4
686    pavgw     m1, m4
687%if mmsize==8
688    packuswb  m0, m0
689    packuswb  m1, m1
690    movh   [dstq+dststrideq*0], m0
691    movh   [dstq+dststrideq*1], m1
692%else
    ; both rows share one register: low half is row 0, high half is row 1
693    packuswb  m0, m1
694    movh   [dstq+dststrideq*0], m0
695    movhps [dstq+dststrideq*1], m0
696%endif
697%else ; cpuflag(ssse3)
698cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, picreg, my
699    shl      myd, 4
700%ifdef PIC
701    lea  picregq, [bilinear_filter_vw_m]
702%endif
703    pxor      m6, m6
    ; m5 = weight my (row my-1); after negating, -my*16 + 7*16 addresses row
    ; 7-my, so m4 = weight 8-my
704    mova      m5, [bilinear_filter_vw+myq-1*16]
705    neg      myq
706    mova      m4, [bilinear_filter_vw+myq+7*16]
707.nextrow:
708    movh      m0, [srcq+srcstrideq*0]
709    movh      m1, [srcq+srcstrideq*1]
710    movh      m3, [srcq+srcstrideq*2]
711    punpcklbw m0, m6
712    punpcklbw m1, m6
713    punpcklbw m3, m6
    ; row 1 is used twice: weighted by my for output row 0 (m1) and by 8-my
    ; for output row 1 (m2)
714    mova      m2, m1
715    pmullw    m0, m4
716    pmullw    m1, m5
717    pmullw    m2, m4
718    pmullw    m3, m5
719    paddsw    m0, m1
720    paddsw    m2, m3
721    psraw     m0, 2
722    psraw     m2, 2
723    pavgw     m0, m6
724    pavgw     m2, m6
725%if mmsize == 8
726    packuswb  m0, m0
727    packuswb  m2, m2
728    movh   [dstq+dststrideq*0], m0
729    movh   [dstq+dststrideq*1], m2
730%else
731    packuswb  m0, m2
732    movh   [dstq+dststrideq*0], m0
733    movhps [dstq+dststrideq*1], m0
734%endif
735%endif ; cpuflag(ssse3)
736
    ; shared loop tail: advance both pointers by two rows
737    lea     dstq, [dstq+dststrideq*2]
738    lea     srcq, [srcq+srcstrideq*2]
739    sub  heightd, 2
740    jg .nextrow
741    REP_RET
742
743%if cpuflag(ssse3)
744cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 5, dst, dststride, src, srcstride, height, mx, picreg
745    shl      mxd, 4
746%ifdef PIC
747    lea  picregq, [bilinear_filter_vb_m]
748%endif
749    pxor      m4, m4
    ; m2 pairs each pixel with its right-hand neighbour; m3 = (8-mx, mx)
750    mova      m2, [filter_h2_shuf]
751    mova      m3, [bilinear_filter_vb+mxq-16]
752.nextrow:
753    movu      m0, [srcq+srcstrideq*0]
754    movu      m1, [srcq+srcstrideq*1]
755    pshufb    m0, m2
756    pshufb    m1, m2
757    pmaddubsw m0, m3
758    pmaddubsw m1, m3
759    psraw     m0, 2
760    psraw     m1, 2
761    pavgw     m0, m4
762    pavgw     m1, m4
763%if mmsize==8
764    packuswb  m0, m0
765    packuswb  m1, m1
766    movh   [dstq+dststrideq*0], m0
767    movh   [dstq+dststrideq*1], m1
768%else
769    packuswb  m0, m1
770    movh   [dstq+dststrideq*0], m0
771    movhps [dstq+dststrideq*1], m0
772%endif
773%else ; cpuflag(ssse3)
774cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
775    shl      mxd, 4
776%ifdef PIC
777    lea  picregq, [bilinear_filter_vw_m]
778%endif
779    pxor      m6, m6
    ; m5 = weight mx, m4 = weight 8-mx (same indexing as the vertical case)
780    mova      m5, [bilinear_filter_vw+mxq-1*16]
781    neg      mxq
782    mova      m4, [bilinear_filter_vw+mxq+7*16]
783.nextrow:
    ; for each of the two rows load the pixels and their right neighbours
784    movh      m0, [srcq+srcstrideq*0+0]
785    movh      m1, [srcq+srcstrideq*0+1]
786    movh      m2, [srcq+srcstrideq*1+0]
787    movh      m3, [srcq+srcstrideq*1+1]
788    punpcklbw m0, m6
789    punpcklbw m1, m6
790    punpcklbw m2, m6
791    punpcklbw m3, m6
792    pmullw    m0, m4
793    pmullw    m1, m5
794    pmullw    m2, m4
795    pmullw    m3, m5
796    paddsw    m0, m1
797    paddsw    m2, m3
798    psraw     m0, 2
799    psraw     m2, 2
800    pavgw     m0, m6
801    pavgw     m2, m6
802%if mmsize == 8
803    packuswb  m0, m0
804    packuswb  m2, m2
805    movh   [dstq+dststrideq*0], m0
806    movh   [dstq+dststrideq*1], m2
807%else
808    packuswb  m0, m2
809    movh   [dstq+dststrideq*0], m0
810    movhps [dstq+dststrideq*1], m0
811%endif
812%endif ; cpuflag(ssse3)
813
    ; shared loop tail: advance both pointers by two rows
814    lea     dstq, [dstq+dststrideq*2]
815    lea     srcq, [srcq+srcstrideq*2]
816    sub  heightd, 2
817    jg .nextrow
818    REP_RET
819%endmacro
820
; Word-multiply variants (MMXEXT width 4, SSE2 width 8) followed by the
; pmaddubsw variants (SSSE3, widths 4 and 8).
821INIT_MMX mmxext
822FILTER_BILINEAR 4
823INIT_XMM sse2
824FILTER_BILINEAR 8
825INIT_MMX ssse3
826FILTER_BILINEAR 4
827INIT_XMM ssse3
828FILTER_BILINEAR 8
829
; Straight copy of an 8-byte-wide block from src to dst, two rows per
; iteration. Both rows are loaded before either is stored.
INIT_MMX mmx
cglobal put_vp8_pixels8, 5, 5, 0, dst, dststride, src, srcstride, height
.nextrow:
    ; fetch rows n and n+1
    movq    mm0, [srcq]
    movq    mm1, [srcq+srcstrideq]
    ; write rows n and n+1
    movq [dstq], mm0
    movq [dstq+dststrideq], mm1
    ; step both pointers down by two rows
    lea    srcq, [srcq+srcstrideq*2]
    lea    dstq, [dstq+dststrideq*2]
    ; loop while rows remain
    sub heightd, 2
    jg .nextrow
    REP_RET
842
; Straight copy of a 16-byte-wide block from src to dst, two rows per
; iteration. Source rows are read with unaligned loads (movups); destination
; rows are written with aligned stores (movaps), so dst and dststride must keep
; every row 16-byte aligned.
INIT_XMM sse
cglobal put_vp8_pixels16, 5, 5, 2, dst, dststride, src, srcstride, height
.nextrow:
    ; fetch rows n and n+1
    movups xmm0, [srcq]
    movups xmm1, [srcq+srcstrideq]
    ; write rows n and n+1
    movaps [dstq], xmm0
    movaps [dstq+dststrideq], xmm1
    ; step both pointers down by two rows
    lea    srcq, [srcq+srcstrideq*2]
    lea    dstq, [dstq+dststrideq*2]
    ; loop while rows remain
    sub heightd, 2
    jg .nextrow
    REP_RET
855
856;-----------------------------------------------------------------------------
857; void ff_vp8_idct_dc_add_<opt>(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
858;-----------------------------------------------------------------------------
859
; ADD_DC pos, neg, offset, mov
; Applies a signed per-pixel DC to four destination rows using only unsigned
; saturating byte arithmetic.
;   %1 = register holding the DC clamped to 0..255 per byte (positive part)
;   %2 = register holding the negated DC clamped to 0..255 per byte
;   %3 = byte offset added to every row address
;   %4 = load/store instruction to use (the callers in this file pass mova)
; As the callers build %1/%2 with packuswb from dc and 0-dc, one of the two is
; zero in every byte, so paddusb followed by psubusb is a clamped signed add.
; Expects dst1q (rows 0/1), dst2q (rows 2/3) and strideq to be set up.
; Clobbers m2-m5.
860%macro ADD_DC 4
861    %4        m2, [dst1q+%3]
862    %4        m3, [dst1q+strideq+%3]
863    %4        m4, [dst2q+%3]
864    %4        m5, [dst2q+strideq+%3]
865    paddusb   m2, %1
866    paddusb   m3, %1
867    paddusb   m4, %1
868    paddusb   m5, %1
869    psubusb   m2, %2
870    psubusb   m3, %2
871    psubusb   m4, %2
872    psubusb   m5, %2
873    %4 [dst1q+%3], m2
874    %4 [dst1q+strideq+%3], m3
875    %4 [dst2q+%3], m4
876    %4 [dst2q+strideq+%3], m5
877%endmacro
878
; VP8_IDCT_DC_ADD
; Emits vp8_idct_dc_add for the current instruction set: computes
; dc = (block[0] + 4) >> 3, clears the coefficient, and adds dc to a 4x4 block
; of destination pixels with clamping to 0..255.
879%macro VP8_IDCT_DC_ADD 0
880cglobal vp8_idct_dc_add, 3, 3, 6, dst, block, stride
881    ; load data
    ; movd fetches 32 bits, i.e. block[0] and block[1]; only word 0 is used
882    movd       m0, [blockq]
883    pxor       m1, m1
884
885    ; calculate DC
886    paddw      m0, [pw_4]
    ; this 32-bit store of zero clears block[0] and block[1]
887    movd [blockq], m1
    ; the block pointer is dead from here on: its register is renamed dst2
888    DEFINE_ARGS dst1, dst2, stride
889    lea     dst2q, [dst1q+strideq*2]
890    movd       m2, [dst1q]
891    movd       m3, [dst1q+strideq]
892    movd       m4, [dst2q]
893    movd       m5, [dst2q+strideq]
894    psraw      m0, 3
    ; broadcast word 0 (the DC) to all eight words
895    pshuflw    m0, m0, 0
896    punpcklqdq m0, m0
    ; m2 = rows 0,1 and m4 = rows 2,3, widened to words
897    punpckldq  m2, m3
898    punpckldq  m4, m5
899    punpcklbw  m2, m1
900    punpcklbw  m4, m1
901    paddw      m2, m0
902    paddw      m4, m0
    ; clamp to bytes: m2 = rows 0,1,2,3 as four dwords
903    packuswb   m2, m4
904    movd   [dst1q], m2
905%if cpuflag(sse4)
906    pextrd [dst1q+strideq], m2, 1
907    pextrd [dst2q], m2, 2
908    pextrd [dst2q+strideq], m2, 3
909%else
    ; without pextrd, shift the next row's dword down before each store
910    psrldq     m2, 4
911    movd [dst1q+strideq], m2
912    psrldq     m2, 4
913    movd [dst2q], m2
914    psrldq     m2, 4
915    movd [dst2q+strideq], m2
916%endif
917    RET
918%endmacro
919
920INIT_XMM sse2
921VP8_IDCT_DC_ADD
922INIT_XMM sse4
923VP8_IDCT_DC_ADD
924
925;-----------------------------------------------------------------------------
926; void ff_vp8_idct_dc_add4y_<opt>(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
927;-----------------------------------------------------------------------------
928
; DC-only IDCT for four horizontally adjacent 4x4 luma blocks (16x4 pixels).
; The four DC coefficients sit 32 bytes apart (one int16_t[16] block each).
; Each is turned into (dc + 4) >> 3, cleared in memory, and applied to its own
; 4-pixel-wide column group of the four destination rows.
; NOTE(review): the memory-operand punpcklwd and the mova accesses inside
; ADD_DC presumably require block and every dst row to be 16-byte aligned -
; confirm against the callers.
929INIT_XMM sse2
930cglobal vp8_idct_dc_add4y, 3, 3, 6, dst, block, stride
931    ; load data
932    movd      m0, [blockq+32*0] ; A
933    movd      m1, [blockq+32*2] ; C
934    punpcklwd m0, [blockq+32*1] ; A B
935    punpcklwd m1, [blockq+32*3] ; C D
936    punpckldq m0, m1        ; A B C D
937    pxor      m1, m1
938
939    ; calculate DC
940    paddw     m0, [pw_4]
    ; each 32-bit zero store clears the DC and the coefficient after it
941    movd [blockq+32*0], m1
942    movd [blockq+32*1], m1
943    movd [blockq+32*2], m1
944    movd [blockq+32*3], m1
945    psraw     m0, 3
    ; m1 = -dc; after packuswb m0 holds max(dc, 0) and m1 holds max(-dc, 0)
946    psubw     m1, m0
947    packuswb  m0, m0
948    packuswb  m1, m1
    ; two self-interleaves replicate every byte four times:
    ; AAAABBBBCCCCDDDD
949    punpcklbw m0, m0
950    punpcklbw m1, m1
951    punpcklbw m0, m0
952    punpcklbw m1, m1
953
954    ; add DC
955    DEFINE_ARGS dst1, dst2, stride
956    lea    dst2q, [dst1q+strideq*2]
957    ADD_DC    m0, m1, 0, mova
958    RET
959
960;-----------------------------------------------------------------------------
961; void ff_vp8_idct_dc_add4uv_<opt>(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
962;-----------------------------------------------------------------------------
963
; DC-only IDCT for four 4x4 chroma blocks arranged 2x2 (8x8 pixels), using
; 64-bit MMX registers. Blocks A and B cover the top four rows (left and right
; halves), blocks C and D the bottom four rows.
964INIT_MMX mmx
965cglobal vp8_idct_dc_add4uv, 3, 3, 0, dst, block, stride
966    ; load data
967    movd      m0, [blockq+32*0] ; A
968    movd      m1, [blockq+32*2] ; C
969    punpcklwd m0, [blockq+32*1] ; A B
970    punpcklwd m1, [blockq+32*3] ; C D
971    punpckldq m0, m1        ; A B C D
972    pxor      m6, m6
973
974    ; calculate DC
975    paddw     m0, [pw_4]
    ; each 32-bit zero store clears the DC and the coefficient after it
976    movd [blockq+32*0], m6
977    movd [blockq+32*1], m6
978    movd [blockq+32*2], m6
979    movd [blockq+32*3], m6
980    psraw     m0, 3
    ; m6 = -dc; after packuswb m0 holds max(dc, 0) and m6 holds max(-dc, 0)
981    psubw     m6, m0
982    packuswb  m0, m0
983    packuswb  m6, m6
984    punpcklbw m0, m0 ; AABBCCDD
985    punpcklbw m6, m6 ; AABBCCDD
986    movq      m1, m0
987    movq      m7, m6
    ; m0/m6 = positive/negative DC for the top rows, m1/m7 for the bottom
988    punpcklbw m0, m0 ; AAAABBBB
989    punpckhbw m1, m1 ; CCCCDDDD
990    punpcklbw m6, m6 ; AAAABBBB
991    punpckhbw m7, m7 ; CCCCDDDD
992
993    ; add DC
994    DEFINE_ARGS dst1, dst2, stride
995    lea    dst2q, [dst1q+strideq*2]
996    ADD_DC    m0, m6, 0, mova
    ; move both row pointers down four rows for blocks C and D
997    lea    dst1q, [dst1q+strideq*4]
998    lea    dst2q, [dst2q+strideq*4]
999    ADD_DC    m1, m7, 0, mova
1000    RET
1001
1002;-----------------------------------------------------------------------------
1003; void ff_vp8_idct_add_<opt>(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
1004;-----------------------------------------------------------------------------
1005
1006; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2)
1007;           this macro assumes that m6/m7 have words for 20091/17734 loaded
;           %3/%4 are temporary registers (clobbered).
;           mul_20091(x) is evaluated as x + ((x*20091) >> 16): pmulhw keeps
;           the high word of the product and x is added back afterwards.
;           mul_35468(x) is evaluated as ((2*x)*17734) >> 16, because 35468
;           does not fit into a signed 16-bit multiplier.
1008%macro VP8_MULTIPLY_SUMSUB 4
1009    mova      %3, %1
1010    mova      %4, %2
1011    pmulhw    %3, m6 ;20091(1)
1012    pmulhw    %4, m6 ;20091(2)
1013    paddw     %3, %1
1014    paddw     %4, %2
    ; double the inputs in place before the multiply by 17734
1015    paddw     %1, %1
1016    paddw     %2, %2
1017    pmulhw    %1, m7 ;35468(1)
1018    pmulhw    %2, m7 ;35468(2)
1019    psubw     %1, %4
1020    paddw     %2, %3
1021%endmacro
1022
1023; calculate x0=%1+%3; x1=%1-%3
1024;           x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4)
1025;           %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3)
1026;           %5/%6 are temporary registers
1027;           we assume m6/m7 have constant words 20091/17734 loaded in them
;           All six arguments are register numbers (0, 1, ...) rather than
;           register names: the macro prepends "m" itself where it calls
;           VP8_MULTIPLY_SUMSUB. The two trailing SWAPs rename registers so
;           that the results end up in the order listed above.
1028%macro VP8_IDCT_TRANSFORM4x4_1D 6
1029    SUMSUB_BA         w, %3,  %1,  %5     ;t0, t1
1030    VP8_MULTIPLY_SUMSUB m%2, m%4, m%5,m%6 ;t2, t3
1031    SUMSUB_BA         w, %4,  %3,  %5     ;tmp0, tmp3
1032    SUMSUB_BA         w, %2,  %1,  %5     ;tmp1, tmp2
1033    SWAP                 %4,  %1
1034    SWAP                 %4,  %3
1035%endmacro
1036
; Full 4x4 inverse DCT: transforms the 16 coefficients in block, clears them,
; and adds the result to a 4x4 block of destination pixels.
; The arithmetic runs on MMX registers (m0-m7 are mm0-mm7 under INIT_MMX);
; SSE is used only for the two 16-byte clearing stores through xmm0.
; The movaps stores require block to be 16-byte aligned.
1037INIT_MMX sse
1038cglobal vp8_idct_add, 3, 3, 0, dst, block, stride
1039    ; load block data
    ; one row of four coefficients per register
1040    movq         m0, [blockq+ 0]
1041    movq         m1, [blockq+ 8]
1042    movq         m2, [blockq+16]
1043    movq         m3, [blockq+24]
1044    movq         m6, [pw_20091]
1045    movq         m7, [pw_17734]
    ; zero all 32 bytes of the coefficient block
1046    xorps      xmm0, xmm0
1047    movaps [blockq+ 0], xmm0
1048    movaps [blockq+16], xmm0
1049
1050    ; actual IDCT
    ; two 1-D passes with a transpose after each; the +4 added to the first
    ; register before the second pass is the rounding bias for the final >> 3
    ; (that register feeds both x0 and x1, hence all four outputs)
1051    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
1052    TRANSPOSE4x4W            0, 1, 2, 3, 4
1053    paddw        m0, [pw_4]
1054    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
1055    TRANSPOSE4x4W            0, 1, 2, 3, 4
1056
1057    ; store
    ; STORE_DIFFx2 comes from the included utility file; presumably it
    ; shifts each pair of rows right by 3, adds them to the destination
    ; pixels and stores with clamping - verify against its definition.
    ; m6/m7 are reused as its scratch registers, m4 as its zero register.
1058    pxor         m4, m4
1059    DEFINE_ARGS dst1, dst2, stride
1060    lea       dst2q, [dst1q+2*strideq]
1061    STORE_DIFFx2 m0, m1, m6, m7, m4, 3, dst1q, strideq
1062    STORE_DIFFx2 m2, m3, m6, m7, m4, 3, dst2q, strideq
1063
1064    RET
1065
1066;-----------------------------------------------------------------------------
1067; void ff_vp8_luma_dc_wht(int16_t block[4][4][16], int16_t dc[16])
1068;-----------------------------------------------------------------------------
1069
; SCATTER_WHT a, b, base
; Stores the four words of m<a> and m<b> into the first coefficient of eight
; different 16-coefficient blocks (blocks are 2*16 bytes apart).
;   words 0..3 of m<a> go to blocks base+0, base+4, base+8, base+12
;   words 0..3 of m<b> go to blocks base+1, base+5, base+9, base+13
; Uses the dc1/dc2 general-purpose registers as scratch and shifts m<a>/m<b>
; right by 32 bits, so both vector registers are destroyed.
1070%macro SCATTER_WHT 3
    ; words 0 and 1 of each register via the low dword
1071    movd dc1d, m%1
1072    movd dc2d, m%2
1073    mov [blockq+2*16*(0+%3)], dc1w
1074    mov [blockq+2*16*(1+%3)], dc2w
1075    shr  dc1d, 16
1076    shr  dc2d, 16
    ; bring words 2 and 3 down while the GPR halves are being stored
1077    psrlq m%1, 32
1078    psrlq m%2, 32
1079    mov [blockq+2*16*(4+%3)], dc1w
1080    mov [blockq+2*16*(5+%3)], dc2w
1081    movd dc1d, m%1
1082    movd dc2d, m%2
1083    mov [blockq+2*16*(8+%3)], dc1w
1084    mov [blockq+2*16*(9+%3)], dc2w
1085    shr  dc1d, 16
1086    shr  dc2d, 16
1087    mov [blockq+2*16*(12+%3)], dc1w
1088    mov [blockq+2*16*(13+%3)], dc2w
1089%endmacro
1090
; HADAMARD4_1D a, b, c, d
; One 4-point Hadamard pass over registers m<a>..m<d> (arguments are register
; numbers): two rounds of paired sum/difference butterflies followed by a
; SWAP that renames the registers into output order.
; SUMSUB_BADC is defined in the included utility file, not here.
1091%macro HADAMARD4_1D 4
1092    SUMSUB_BADC w, %2, %1, %4, %3
1093    SUMSUB_BADC w, %4, %2, %3, %1
1094    SWAP %1, %4, %3
1095%endmacro
1096
; Inverse Walsh-Hadamard transform of the 16 luma DC values: reads dc[16],
; clears it, transforms it, and writes each result to coefficient 0 of one of
; the 16 blocks in block.
; The dc pointer arrives in the register named dc1; once the input has been
; loaded and cleared that register, together with dc2, becomes scratch for
; SCATTER_WHT. The movaps stores require dc to be 16-byte aligned.
1097INIT_MMX sse
1098cglobal vp8_luma_dc_wht, 2, 3, 0, block, dc1, dc2
1099    movq          m0, [dc1q]
1100    movq          m1, [dc1q+8]
1101    movq          m2, [dc1q+16]
1102    movq          m3, [dc1q+24]
    ; zero all 32 bytes of dc (xmm0 is independent of the MMX m0)
1103    xorps      xmm0, xmm0
1104    movaps [dc1q+ 0], xmm0
1105    movaps [dc1q+16], xmm0
    ; first pass, transpose, +3 rounding bias, second pass, then >> 3
1106    HADAMARD4_1D  0, 1, 2, 3
1107    TRANSPOSE4x4W 0, 1, 2, 3, 4
1108    paddw         m0, [pw_3]
1109    HADAMARD4_1D  0, 1, 2, 3
1110    psraw         m0, 3
1111    psraw         m1, 3
1112    psraw         m2, 3
1113    psraw         m3, 3
    ; no second transpose: the scatter writes word j of register i to block
    ; i + 4*j
1114    SCATTER_WHT   0, 1, 0
1115    SCATTER_WHT   2, 3, 2
1116    RET
1117