;******************************************************************************
;* MMX/SSSE3-optimized functions for H.264 chroma MC
;* Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>,
;*               2005-2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

rnd_rv40_2d_tbl: times 4 dw  0
                 times 4 dw 16
                 times 4 dw 32
                 times 4 dw 16
                 times 4 dw 32
                 times 4 dw 28
                 times 4 dw 32
                 times 4 dw 28
                 times 4 dw  0
                 times 4 dw 32
                 times 4 dw 16
                 times 4 dw 32
                 times 4 dw 32
                 times 4 dw 28
                 times 4 dw 32
                 times 4 dw 28
rnd_rv40_1d_tbl: times 4 dw  0
                 times 4 dw  2
                 times 4 dw  4
                 times 4 dw  2
                 times 4 dw  4
                 times 4 dw  3
                 times 4 dw  4
                 times 4 dw  3
                 times 4 dw  0
                 times 4 dw  4
                 times 4 dw  2
                 times 4 dw  4
                 times 4 dw  4
                 times 4 dw  3
                 times 4 dw  4
                 times 4 dw  3
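; The two tables above hold the per-(mx,my) rounding constants used by the
; RV40 chroma filter: rnd_2d entries are added before the final >>6 of the
; bilinear filter, rnd_1d entries before the >>3 of the 1-D filter. They are
; indexed by rnd_bias = ((my & 6) * 4 + mx) >> 1, computed in the functions
; below, with four identical words per entry so one movq/paddw applies them.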

cextern pw_3
cextern pw_4
cextern pw_8
pw_28: times 8 dw 28
cextern pw_32
cextern pw_64

SECTION .text

%macro mv0_pixels_mc8 0
    lea           r4, [r2*2 ]
.next4rows:
    movq         mm0, [r1   ]
    movq         mm1, [r1+r2]
    add           r1, r4
    CHROMAMC_AVG mm0, [r0   ]
    CHROMAMC_AVG mm1, [r0+r2]
    movq     [r0   ], mm0
    movq     [r0+r2], mm1
    add           r0, r4
    movq         mm0, [r1   ]
    movq         mm1, [r1+r2]
    add           r1, r4
    CHROMAMC_AVG mm0, [r0   ]
    CHROMAMC_AVG mm1, [r0+r2]
    movq     [r0   ], mm0
    movq     [r0+r2], mm1
    add           r0, r4
    sub          r3d, 4
    jne .next4rows
%endmacro

%macro chroma_mc8_mmx_func 2-3
%ifidn %2, rv40
%ifdef PIC
%define rnd_1d_rv40 r8
%define rnd_2d_rv40 r8
%define extra_regs 2
%else ; no-PIC
%define rnd_1d_rv40 rnd_rv40_1d_tbl
%define rnd_2d_rv40 rnd_rv40_2d_tbl
%define extra_regs 1
%endif ; PIC
%else
%define extra_regs 0
%endif ; rv40
; void ff_put/avg_h264_chroma_mc8_*(uint8_t *dst /* align 8 */,
;                                   uint8_t *src /* align 1 */,
;                                   ptrdiff_t stride, int h, int mx, int my)
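;
; All of the *_chroma_mc* functions below compute the standard bilinear
; chroma interpolation:
;   dst[i] = (A*src[i] + B*src[i+1] + C*src[i+stride] + D*src[i+stride+1]
;             + rnd_2d) >> 6
; with A = (8-mx)*(8-my), B = mx*(8-my), C = (8-mx)*my, D = mx*my.
; When one of mx/my is zero this degenerates to a 1-D filter,
;   dst[i] = ((8-t)*src[i] + t*src[i+dxy] + rnd_1d) >> 3,  t = mx+my,
; and when both are zero to a plain copy (mv0_pixels_mc8 above).
; The rounding constants differ per codec: 32/4 for H.264, 28/3 for the
; no-rounding VC-1 variant, and the per-(mx,my) tables for RV40.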
cglobal %1_%2_chroma_mc8%3, 6, 7 + extra_regs, 0
    mov          r6d, r5d
    or           r6d, r4d
    jne .at_least_one_non_zero
    ; mx == 0 AND my == 0 - no filter needed
    mv0_pixels_mc8
    REP_RET

.at_least_one_non_zero:
%ifidn %2, rv40
%if ARCH_X86_64
    mov           r7, r5
    and           r7, 6         ; &~1 for mx/my=[0,7]
    lea           r7, [r7*4+r4]
    sar          r7d, 1
%define rnd_bias r7
%define dest_reg r0
%else ; x86-32
    mov           r0, r5
    and           r0, 6         ; &~1 for mx/my=[0,7]
    lea           r0, [r0*4+r4]
    sar          r0d, 1
%define rnd_bias r0
%define dest_reg r5
%endif
%else ; vc1, h264
%define rnd_bias  0
%define dest_reg r0
%endif

    test         r5d, r5d
    mov           r6, 1         ; dxy = 1: horizontal filter (my == 0)
    je .my_is_zero
    test         r4d, r4d
    mov           r6, r2        ; dxy = stride: vertical filter (mx == 0)
    jne .both_non_zero
.my_is_zero:
    ; mx == 0 XOR my == 0 - 1 dimensional filter only
    or           r4d, r5d       ; x + y

%ifidn %2, rv40
%ifdef PIC
    lea           r8, [rnd_rv40_1d_tbl]
%endif
%if ARCH_X86_64 == 0
    mov           r5, r0m
%endif
%endif

    movd          m5, r4d
    movq          m4, [pw_8]
    movq          m6, [rnd_1d_%2+rnd_bias*8] ; mm6 = rnd >> 3
    punpcklwd     m5, m5
    punpckldq     m5, m5        ; mm5 = B = x
    pxor          m7, m7
    psubw         m4, m5        ; mm4 = A = 8-x

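; Each iteration of .next1drow filters one 8-pixel row:
; dst[0..7] = (A*src[0..7] + B*src[dxy..dxy+7] + rnd) >> 3, with the source
; bytes widened to words in two halves so the 8x8->16 multiplies cannot
; overflow.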
.next1drow:
    movq          m0, [r1   ]   ; mm0 = src[0..7]
    movq          m2, [r1+r6]   ; mm2 = src[1..8]

    movq          m1, m0
    movq          m3, m2
    punpcklbw     m0, m7
    punpckhbw     m1, m7
    punpcklbw     m2, m7
    punpckhbw     m3, m7
    pmullw        m0, m4        ; [mm0,mm1] = A * src[0..7]
    pmullw        m1, m4
    pmullw        m2, m5        ; [mm2,mm3] = B * src[1..8]
    pmullw        m3, m5

    paddw         m0, m6
    paddw         m1, m6
    paddw         m0, m2
    paddw         m1, m3
    psrlw         m0, 3
    psrlw         m1, 3
    packuswb      m0, m1
    CHROMAMC_AVG  m0, [dest_reg]
    movq  [dest_reg], m0        ; dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3

    add     dest_reg, r2
    add           r1, r2
    dec           r3d
    jne .next1drow
    REP_RET

.both_non_zero: ; general case, bilinear
    movd          m4, r4d         ; x
    movd          m6, r5d         ; y
%ifidn %2, rv40
%ifdef PIC
    lea           r8, [rnd_rv40_2d_tbl]
%endif
%if ARCH_X86_64 == 0
    mov           r5, r0m
%endif
%endif
    mov           r6, rsp         ; backup stack pointer
    and          rsp, ~(mmsize-1) ; align stack
    sub          rsp, 16          ; AA and DD

    punpcklwd     m4, m4
    punpcklwd     m6, m6
    punpckldq     m4, m4          ; mm4 = x words
    punpckldq     m6, m6          ; mm6 = y words
    movq          m5, m4
    pmullw        m4, m6          ; mm4 = x * y
    psllw         m5, 3
    psllw         m6, 3
    movq          m7, m5
    paddw         m7, m6
    movq     [rsp+8], m4          ; DD = x * y
    psubw         m5, m4          ; mm5 = B = 8x - xy
    psubw         m6, m4          ; mm6 = C = 8y - xy
    paddw         m4, [pw_64]
    psubw         m4, m7          ; mm4 = A = xy - (8x+8y) + 64
    pxor          m7, m7
    movq     [rsp  ], m4
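; A = xy - 8x - 8y + 64 = (8-x)*(8-y); together with B, C and D = xy above,
; the four weights sum to 64, so the >>6 below keeps the result in byte
; range. A and D are spilled to the stack ([rsp], [rsp+8]) because only
; eight MMX registers are available.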

    movq          m0, [r1  ]      ; mm0 = src[0..7]
    movq          m1, [r1+1]      ; mm1 = src[1..8]
.next2drow:
    add           r1, r2

    movq          m2, m0
    movq          m3, m1
    punpckhbw     m0, m7
    punpcklbw     m1, m7
    punpcklbw     m2, m7
    punpckhbw     m3, m7
    pmullw        m0, [rsp]
    pmullw        m2, [rsp]
    pmullw        m1, m5
    pmullw        m3, m5
    paddw         m2, m1          ; mm2 = A * src[0..3] + B * src[1..4]
    paddw         m3, m0          ; mm3 = A * src[4..7] + B * src[5..8]

    movq          m0, [r1]
    movq          m1, m0
    punpcklbw     m0, m7
    punpckhbw     m1, m7
    pmullw        m0, m6
    pmullw        m1, m6
    paddw         m2, m0
    paddw         m3, m1          ; [mm2,mm3] += C * src[0..7]

    movq          m1, [r1+1]
    movq          m0, m1
    movq          m4, m1
    punpcklbw     m0, m7
    punpckhbw     m4, m7
    pmullw        m0, [rsp+8]
    pmullw        m4, [rsp+8]
    paddw         m2, m0
    paddw         m3, m4          ; [mm2,mm3] += D * src[1..8]
    movq          m0, [r1]

    paddw         m2, [rnd_2d_%2+rnd_bias*8]
    paddw         m3, [rnd_2d_%2+rnd_bias*8]
    psrlw         m2, 6
    psrlw         m3, 6
    packuswb      m2, m3
    CHROMAMC_AVG  m2, [dest_reg]
    movq  [dest_reg], m2          ; dst[0..7] = ([mm2,mm3] + rnd) >> 6

    add     dest_reg, r2
    dec          r3d
    jne .next2drow
    mov          rsp, r6          ; restore stack pointer
    RET
%endmacro

%macro chroma_mc4_mmx_func 2
%define extra_regs 0
%ifidn %2, rv40
%ifdef PIC
%define extra_regs 1
%endif ; PIC
%endif ; rv40
cglobal %1_%2_chroma_mc4, 6, 6 + extra_regs, 0
    pxor          m7, m7
    movd          m2, r4d         ; x
    movd          m3, r5d         ; y
    movq          m4, [pw_8]
    movq          m5, [pw_8]
    punpcklwd     m2, m2
    punpcklwd     m3, m3
    punpcklwd     m2, m2
    punpcklwd     m3, m3
    psubw         m4, m2
    psubw         m5, m3

%ifidn %2, rv40
%ifdef PIC
   lea            r6, [rnd_rv40_2d_tbl]
%define rnd_2d_rv40 r6
%else
%define rnd_2d_rv40 rnd_rv40_2d_tbl
%endif
    and           r5, 6         ; &~1 for mx/my=[0,7]
    lea           r5, [r5*4+r4]
    sar          r5d, 1
%define rnd_bias r5
%else ; vc1, h264
%define rnd_bias 0
%endif

    movd          m0, [r1  ]
    movd          m6, [r1+1]
    add           r1, r2
    punpcklbw     m0, m7
    punpcklbw     m6, m7
    pmullw        m0, m4
    pmullw        m6, m2
    paddw         m6, m0

.next2rows:
    movd          m0, [r1  ]
    movd          m1, [r1+1]
    add           r1, r2
    punpcklbw     m0, m7
    punpcklbw     m1, m7
    pmullw        m0, m4
    pmullw        m1, m2
    paddw         m1, m0
    movq          m0, m1

    pmullw        m6, m5
    pmullw        m1, m3
    paddw         m6, [rnd_2d_%2+rnd_bias*8]
    paddw         m1, m6
    psrlw         m1, 6
    packuswb      m1, m1
    CHROMAMC_AVG4 m1, m6, [r0]
    movd        [r0], m1
    add           r0, r2

    movd          m6, [r1  ]
    movd          m1, [r1+1]
    add           r1, r2
    punpcklbw     m6, m7
    punpcklbw     m1, m7
    pmullw        m6, m4
    pmullw        m1, m2
    paddw         m1, m6
    movq          m6, m1
    pmullw        m0, m5
    pmullw        m1, m3
    paddw         m0, [rnd_2d_%2+rnd_bias*8]
    paddw         m1, m0
    psrlw         m1, 6
    packuswb      m1, m1
    CHROMAMC_AVG4 m1, m0, [r0]
    movd        [r0], m1
    add           r0, r2
    sub          r3d, 2
    jnz .next2rows
    REP_RET
%endmacro

%macro chroma_mc2_mmx_func 2
cglobal %1_%2_chroma_mc2, 6, 7, 0
    mov          r6d, r4d
    shl          r4d, 16
    sub          r4d, r6d
    add          r4d, 8
    imul         r5d, r4d         ; x*y<<16 | y*(8-x)
    shl          r4d, 3
    sub          r4d, r5d         ; x*(8-y)<<16 | (8-x)*(8-y)
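; r4d now packs the two 16-bit weights {x*(8-y), (8-x)*(8-y)} = {B, A} and
; r5d packs {x*y, y*(8-x)} = {D, C}; after punpckldq, m5 = {A,B,A,B} and
; m6 = {C,D,C,D}, so each pmaddwd against src[0,1,1,2] produces both output
; pixels of a 2-wide row in one instruction.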

    movd          m5, r4d
    movd          m6, r5d
    punpckldq     m5, m5          ; mm5 = {A,B,A,B}
    punpckldq     m6, m6          ; mm6 = {C,D,C,D}
    pxor          m7, m7
    movd          m2, [r1]
    punpcklbw     m2, m7
    pshufw        m2, m2, 0x94    ; mm2 = src[0,1,1,2]

.nextrow:
    add           r1, r2
    movq          m1, m2
    pmaddwd       m1, m5          ; mm1 = A * src[0,1] + B * src[1,2]
    movd          m0, [r1]
    punpcklbw     m0, m7
    pshufw        m0, m0, 0x94    ; mm0 = src[0,1,1,2]
    movq          m2, m0
    pmaddwd       m0, m6
    paddw         m1, [rnd_2d_%2]
    paddw         m1, m0          ; mm1 += C * src[0,1] + D * src[1,2]
    psrlw         m1, 6
    packssdw      m1, m7
    packuswb      m1, m7
    CHROMAMC_AVG4 m1, m3, [r0]
    movd         r5d, m1
    mov         [r0], r5w
    add           r0, r2
    sub          r3d, 1
    jnz .nextrow
    REP_RET
%endmacro

%define rnd_1d_h264 pw_4
%define rnd_2d_h264 pw_32
%define rnd_1d_vc1  pw_3
%define rnd_2d_vc1  pw_28

%macro NOTHING 2-3
%endmacro
%macro DIRECT_AVG 2
    PAVGB         %1, %2
%endmacro
%macro COPY_AVG 3
    movd          %2, %3
    PAVGB         %1, %2
%endmacro

INIT_MMX mmx
%define CHROMAMC_AVG  NOTHING
%define CHROMAMC_AVG4 NOTHING
chroma_mc8_mmx_func put, h264, _rnd
chroma_mc8_mmx_func put, vc1,  _nornd
chroma_mc8_mmx_func put, rv40
chroma_mc4_mmx_func put, h264
chroma_mc4_mmx_func put, rv40

INIT_MMX mmxext
chroma_mc2_mmx_func put, h264

%define CHROMAMC_AVG  DIRECT_AVG
%define CHROMAMC_AVG4 COPY_AVG
chroma_mc8_mmx_func avg, h264, _rnd
chroma_mc8_mmx_func avg, vc1,  _nornd
chroma_mc8_mmx_func avg, rv40
chroma_mc4_mmx_func avg, h264
chroma_mc4_mmx_func avg, rv40
chroma_mc2_mmx_func avg, h264

%macro chroma_mc8_ssse3_func 2-3
cglobal %1_%2_chroma_mc8%3, 6, 7, 8
    mov          r6d, r5d
    or           r6d, r4d
    jne .at_least_one_non_zero
    ; mx == 0 AND my == 0 - no filter needed
    mv0_pixels_mc8
    REP_RET

.at_least_one_non_zero:
    test         r5d, r5d
    je .my_is_zero
    test         r4d, r4d
    je .mx_is_zero

    ; general case, bilinear
    mov          r6d, r4d
    shl          r4d, 8
    sub           r4, r6
    mov           r6, 8
    add           r4, 8           ; x*255+8 = x<<8 | (8-x)
    sub          r6d, r5d
    imul          r6, r4          ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
    imul         r4d, r5d         ;    y *(x*255+8) =    y *x<<8 |    y *(8-x)
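; r6d and r4d now pack the byte-pair weights {A,B} = {(8-x)*(8-y), x*(8-y)}
; and {C,D} = {(8-x)*y, x*y}; they are broadcast into m7 and m6 so that
; pmaddubsw against bytes interleaved as src[i],src[i+1] yields
; A*src[i] + B*src[i+1] (resp. C/D) as one word per output pixel.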

    movd          m7, r6d
    movd          m6, r4d
    movdqa        m5, [rnd_2d_%2]
    movq          m0, [r1  ]
    movq          m1, [r1+1]
    pshuflw       m7, m7, 0
    pshuflw       m6, m6, 0
    punpcklbw     m0, m1
    movlhps       m7, m7
    movlhps       m6, m6

.next2rows:
    movq          m1, [r1+r2*1   ]
    movq          m2, [r1+r2*1+1]
    movq          m3, [r1+r2*2  ]
    movq          m4, [r1+r2*2+1]
    lea           r1, [r1+r2*2]
    punpcklbw     m1, m2
    movdqa        m2, m1
    punpcklbw     m3, m4
    movdqa        m4, m3
    pmaddubsw     m0, m7
    pmaddubsw     m1, m6
    pmaddubsw     m2, m7
    pmaddubsw     m3, m6
    paddw         m0, m5
    paddw         m2, m5
    paddw         m1, m0
    paddw         m3, m2
    psrlw         m1, 6
    movdqa        m0, m4
    psrlw         m3, 6
%ifidn %1, avg
    movq          m2, [r0   ]
    movhps        m2, [r0+r2]
%endif
    packuswb      m1, m3
    CHROMAMC_AVG  m1, m2
    movq     [r0   ], m1
    movhps   [r0+r2], m1
    sub          r3d, 2
    lea           r0, [r0+r2*2]
    jg .next2rows
    REP_RET

.my_is_zero:
    mov          r5d, r4d
    shl          r4d, 8
    add           r4, 8
    sub           r4, r5          ; 255*x+8 = x<<8 | (8-x)
    movd          m7, r4d
    movdqa        m6, [rnd_1d_%2]
    pshuflw       m7, m7, 0
    movlhps       m7, m7

.next2xrows:
    movq          m0, [r1     ]
    movq          m1, [r1   +1]
    movq          m2, [r1+r2  ]
    movq          m3, [r1+r2+1]
    punpcklbw     m0, m1
    punpcklbw     m2, m3
    pmaddubsw     m0, m7
    pmaddubsw     m2, m7
%ifidn %1, avg
    movq          m4, [r0   ]
    movhps        m4, [r0+r2]
%endif
    paddw         m0, m6
    paddw         m2, m6
    psrlw         m0, 3
    psrlw         m2, 3
    packuswb      m0, m2
    CHROMAMC_AVG  m0, m4
    movq     [r0   ], m0
    movhps   [r0+r2], m0
    sub          r3d, 2
    lea           r0, [r0+r2*2]
    lea           r1, [r1+r2*2]
    jg .next2xrows
    REP_RET

.mx_is_zero:
    mov          r4d, r5d
    shl          r5d, 8
    add           r5, 8
    sub           r5, r4          ; 255*y+8 = y<<8 | (8-y)
    movd          m7, r5d
    movdqa        m6, [rnd_1d_%2]
    pshuflw       m7, m7, 0
    movlhps       m7, m7

.next2yrows:
    movq          m0, [r1     ]
    movq          m1, [r1+r2  ]
    movdqa        m2, m1
    movq          m3, [r1+r2*2]
    lea           r1, [r1+r2*2]
    punpcklbw     m0, m1
    punpcklbw     m2, m3
    pmaddubsw     m0, m7
    pmaddubsw     m2, m7
%ifidn %1, avg
    movq          m4, [r0   ]
    movhps        m4, [r0+r2]
%endif
    paddw         m0, m6
    paddw         m2, m6
    psrlw         m0, 3
    psrlw         m2, 3
    packuswb      m0, m2
    CHROMAMC_AVG  m0, m4
    movq     [r0   ], m0
    movhps   [r0+r2], m0
    sub          r3d, 2
    lea           r0, [r0+r2*2]
    jg .next2yrows
    REP_RET
%endmacro

%macro chroma_mc4_ssse3_func 2
cglobal %1_%2_chroma_mc4, 6, 7, 0
    mov           r6, r4
    shl          r4d, 8
    sub          r4d, r6d
    mov           r6, 8
    add          r4d, 8           ; x*255+8 = x<<8 | (8-x)
    sub          r6d, r5d
    imul         r6d, r4d         ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
    imul         r4d, r5d         ;    y *(x*255+8) =    y *x<<8 |    y *(8-x)
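; Same byte-pair weight packing as in the mc8 SSSE3 case above, but kept in
; MMX registers and broadcast with pshufw since only 4 pixels are filtered
; per row.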

    movd          m7, r6d
    movd          m6, r4d
    movq          m5, [pw_32]
    movd          m0, [r1  ]
    pshufw        m7, m7, 0
    punpcklbw     m0, [r1+1]
    pshufw        m6, m6, 0

.next2rows:
    movd          m1, [r1+r2*1  ]
    movd          m3, [r1+r2*2  ]
    punpcklbw     m1, [r1+r2*1+1]
    punpcklbw     m3, [r1+r2*2+1]
    lea           r1, [r1+r2*2]
    movq          m2, m1
    movq          m4, m3
    pmaddubsw     m0, m7
    pmaddubsw     m1, m6
    pmaddubsw     m2, m7
    pmaddubsw     m3, m6
    paddw         m0, m5
    paddw         m2, m5
    paddw         m1, m0
    paddw         m3, m2
    psrlw         m1, 6
    movq          m0, m4
    psrlw         m3, 6
    packuswb      m1, m1
    packuswb      m3, m3
    CHROMAMC_AVG  m1, [r0  ]
    CHROMAMC_AVG  m3, [r0+r2]
    movd     [r0   ], m1
    movd     [r0+r2], m3
    sub          r3d, 2
    lea           r0, [r0+r2*2]
    jg .next2rows
    REP_RET
%endmacro

%define CHROMAMC_AVG NOTHING
INIT_XMM ssse3
chroma_mc8_ssse3_func put, h264, _rnd
chroma_mc8_ssse3_func put, vc1,  _nornd
INIT_MMX ssse3
chroma_mc4_ssse3_func put, h264

%define CHROMAMC_AVG DIRECT_AVG
INIT_XMM ssse3
chroma_mc8_ssse3_func avg, h264, _rnd
chroma_mc8_ssse3_func avg, vc1,  _nornd
INIT_MMX ssse3
chroma_mc4_ssse3_func avg, h264