1;*****************************************************************************
2;* MMX/SSE2/SSSE3-optimized H.264 QPEL code
3;*****************************************************************************
4;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
5;* Copyright (C) 2012 Daniel Kang
6;*
7;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
8;*
9;* This file is part of FFmpeg.
10;*
11;* FFmpeg is free software; you can redistribute it and/or
12;* modify it under the terms of the GNU Lesser General Public
13;* License as published by the Free Software Foundation; either
14;* version 2.1 of the License, or (at your option) any later version.
15;*
16;* FFmpeg is distributed in the hope that it will be useful,
17;* but WITHOUT ANY WARRANTY; without even the implied warranty of
18;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19;* Lesser General Public License for more details.
20;*
21;* You should have received a copy of the GNU Lesser General Public
22;* License along with FFmpeg; if not, write to the Free Software
23;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24;******************************************************************************
25
26%include "libavutil/x86/x86util.asm"
27
28SECTION_RODATA 32
29
30cextern pw_16
31cextern pw_5
32cextern pb_0
33
34SECTION .text
35
36
; op_avgh  reg, mem, tmpreg
; Rounded byte-average of the pixels in %1 with the pixels already at %2,
; written back to %2 using a movh-sized (4/8-byte) store.
; %3 is clobbered as scratch for loading the destination pixels.
%macro op_avgh 3
    movh   %3, %2              ; load current destination pixels
    pavgb  %1, %3              ; %1 = (new + old + 1) >> 1, per byte
    movh   %2, %1              ; store averaged pixels
%endmacro
42
; op_avg  reg, mem [, unused]
; Full-register variant of op_avgh: average %1 with the pixels at %2 and
; store back.  Accepts an optional third argument (ignored) so that all
; op_* macros can be invoked with the same three-operand syntax.
%macro op_avg 2-3
    pavgb  %1, %2              ; %1 = (new + old + 1) >> 1, per byte
    mova   %2, %1              ; store averaged pixels
%endmacro
47
; op_puth  reg, mem [, unused]
; Plain movh-sized (4/8-byte) store of %1 to %2; the optional third
; argument is ignored (kept for syntax parity with op_avgh).
%macro op_puth 2-3
    movh   %2, %1
%endmacro
51
; op_put  reg, mem [, unused]
; Plain full-register store of %1 to %2; the optional third argument is
; ignored (kept for syntax parity with op_avg).
%macro op_put 2-3
    mova   %2, %1
%endmacro
55
; void %1_h264_qpel4_h_lowpass(uint8_t *dst, const uint8_t *src,
;                              int dstStride, int srcStride)
; 4x4 horizontal H.264 six-tap luma half-pel filter:
;   dst[x] = clip8((a - 5b + 20c + 20d - 5e + f + 16) >> 5)
; with a..f = src[x-2..x+3].  20*(c+d) - 5*(b+e) is built as
; (4*(c+d) - (b+e)) * 5 to save a multiply.
; %1 selects the store op: put = overwrite dst, avg = average with dst.
%macro QPEL4_H_LOWPASS_OP 1
cglobal %1_h264_qpel4_h_lowpass, 4,5 ; dst, src, dstStride, srcStride
    movsxdifnidn  r2, r2d        ; sign-extend int strides on 64-bit ABIs
    movsxdifnidn  r3, r3d
    pxor          m7, m7         ; m7 = 0, used to widen bytes to words
    mova          m4, [pw_5]
    mova          m5, [pw_16]
    mov          r4d, 4          ; 4 output rows
.loop:
    movh          m1, [r1-1]     ; b
    movh          m2, [r1+0]     ; c
    movh          m3, [r1+1]     ; d
    movh          m0, [r1+2]     ; e
    punpcklbw     m1, m7
    punpcklbw     m2, m7
    punpcklbw     m3, m7
    punpcklbw     m0, m7
    paddw         m1, m0         ; m1 = b + e
    paddw         m2, m3         ; m2 = c + d
    movh          m0, [r1-2]     ; a
    movh          m3, [r1+3]     ; f
    punpcklbw     m0, m7
    punpcklbw     m3, m7
    paddw         m0, m3         ; m0 = a + f
    psllw         m2, 2          ; 4*(c+d)
    psubw         m2, m1         ; 4*(c+d) - (b+e)
    pmullw        m2, m4         ; 20*(c+d) - 5*(b+e)
    paddw         m0, m5         ; a + f + 16 (rounding)
    paddw         m0, m2
    psraw         m0, 5
    packuswb      m0, m0         ; clip to [0,255]
    op_%1h        m0, [r0], m6   ; store/average 4 pixels (m6 = scratch)
    add           r0, r2
    add           r1, r3
    dec          r4d
    jg         .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL4_H_LOWPASS_OP put
QPEL4_H_LOWPASS_OP avg
98
; void %1_h264_qpel8_h_lowpass(uint8_t *dst, const uint8_t *src,
;                              int dstStride, int srcStride)
; 8x8 horizontal six-tap filter (same maths as the qpel4 version) done
; with 64-bit MMX registers: each row is processed as a low half
; (pixels 0-3) and a high half (pixels 4-7) unpacked to words.
%macro QPEL8_H_LOWPASS_OP 1
cglobal %1_h264_qpel8_h_lowpass, 4,5 ; dst, src, dstStride, srcStride
    movsxdifnidn  r2, r2d
    movsxdifnidn  r3, r3d
    mov          r4d, 8          ; 8 output rows
    pxor          m7, m7         ; zero, for byte->word unpacking
    mova          m6, [pw_5]
.loop:
    mova          m0, [r1]       ; c (8 pixels)
    mova          m2, [r1+1]     ; d
    mova          m1, m0
    mova          m3, m2
    punpcklbw     m0, m7         ; c low half
    punpckhbw     m1, m7         ; c high half
    punpcklbw     m2, m7         ; d low
    punpckhbw     m3, m7         ; d high
    paddw         m0, m2         ; (c+d) low
    paddw         m1, m3         ; (c+d) high
    psllw         m0, 2          ; 4*(c+d)
    psllw         m1, 2
    mova          m2, [r1-1]     ; b
    mova          m4, [r1+2]     ; e
    mova          m3, m2
    mova          m5, m4
    punpcklbw     m2, m7         ; b low
    punpckhbw     m3, m7         ; b high = src[3..6] = f for the low half
    punpcklbw     m4, m7         ; e low  = src[2..5] = a for the high half
    punpckhbw     m5, m7         ; e high
    paddw         m2, m4         ; (b+e) low
    paddw         m5, m3         ; (b+e) high
    psubw         m0, m2         ; 4*(c+d) - (b+e), low
    psubw         m1, m5         ; ... high
    pmullw        m0, m6         ; 20*(c+d) - 5*(b+e), low
    pmullw        m1, m6         ; ... high
    movd          m2, [r1-2]     ; a for the low half (src[-2..1])
    movd          m5, [r1+7]     ; f for the high half (src[7..10])
    punpcklbw     m2, m7
    punpcklbw     m5, m7
    paddw         m2, m3         ; (a+f) low  (m3 reused, see above)
    paddw         m4, m5         ; (a+f) high (m4 reused, see above)
    mova          m5, [pw_16]
    paddw         m2, m5         ; + rounding
    paddw         m4, m5
    paddw         m0, m2
    paddw         m1, m4
    psraw         m0, 5
    psraw         m1, 5
    packuswb      m0, m1         ; clip and merge halves
    op_%1         m0, [r0], m4   ; store/average 8 pixels
    add           r0, r2
    add           r1, r3
    dec          r4d
    jg         .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL8_H_LOWPASS_OP put
QPEL8_H_LOWPASS_OP avg
158
; void %1_h264_qpel8_h_lowpass(uint8_t *dst, const uint8_t *src,
;                              int dstStride, int srcStride)  [ssse3]
; SSSE3 8-wide variant: one unaligned 16-byte load covers src[-2..13];
; after unpacking, PALIGNR builds the five shifted word vectors (b..f)
; from the a vector instead of doing five separate loads.
%macro QPEL8_H_LOWPASS_OP_XMM 1
cglobal %1_h264_qpel8_h_lowpass, 4,5,8 ; dst, src, dstStride, srcStride
    movsxdifnidn  r2, r2d
    movsxdifnidn  r3, r3d
    mov          r4d, 8          ; 8 output rows
    pxor          m7, m7         ; zero, for byte->word unpacking
    mova          m6, [pw_5]
.loop:
    movu          m1, [r1-2]     ; 16 bytes: src[-2..13]
    mova          m0, m1
    punpckhbw     m1, m7         ; words src[6..13]
    punpcklbw     m0, m7         ; words src[-2..5] = a
    mova          m2, m1
    mova          m3, m1
    mova          m4, m1
    mova          m5, m1
    palignr       m4, m0, 2      ; src[-1..6]  = b
    palignr       m3, m0, 4      ; src[0..7]   = c
    palignr       m2, m0, 6      ; src[1..8]   = d
    palignr       m1, m0, 8      ; src[2..9]   = e
    palignr       m5, m0, 10     ; src[3..10]  = f
    paddw         m0, m5         ; a + f
    paddw         m2, m3         ; c + d
    paddw         m1, m4         ; b + e
    psllw         m2, 2          ; 4*(c+d)
    psubw         m2, m1         ; 4*(c+d) - (b+e)
    paddw         m0, [pw_16]    ; a + f + 16 (rounding)
    pmullw        m2, m6         ; 20*(c+d) - 5*(b+e)
    paddw         m2, m0
    psraw         m2, 5
    packuswb      m2, m2         ; clip to [0,255]
    op_%1h        m2, [r0], m4   ; store/average 8 pixels
    add           r1, r3
    add           r0, r2
    dec          r4d
    jne        .loop
    REP_RET
%endmacro

INIT_XMM ssse3
QPEL8_H_LOWPASS_OP_XMM put
QPEL8_H_LOWPASS_OP_XMM avg
201
202
; void %1_h264_qpel4_h_lowpass_l2(uint8_t *dst, const uint8_t *src,
;                                 const uint8_t *src2,
;                                 int dstStride, int src2Stride)
; Same 4-wide six-tap filter as %1_h264_qpel4_h_lowpass, but the filter
; output is additionally averaged with a second prediction src2 before
; the final put/avg.  Note: src (r1) is advanced by dstStride (r3) —
; the callers use equal src/dst strides for the _l2 variants.
%macro QPEL4_H_LOWPASS_L2_OP 1
cglobal %1_h264_qpel4_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, src2Stride
    movsxdifnidn  r3, r3d
    movsxdifnidn  r4, r4d
    pxor          m7, m7         ; zero, for byte->word unpacking
    mova          m4, [pw_5]
    mova          m5, [pw_16]
    mov          r5d, 4          ; 4 output rows
.loop:
    movh          m1, [r1-1]     ; b
    movh          m2, [r1+0]     ; c
    movh          m3, [r1+1]     ; d
    movh          m0, [r1+2]     ; e
    punpcklbw     m1, m7
    punpcklbw     m2, m7
    punpcklbw     m3, m7
    punpcklbw     m0, m7
    paddw         m1, m0         ; b + e
    paddw         m2, m3         ; c + d
    movh          m0, [r1-2]     ; a
    movh          m3, [r1+3]     ; f
    punpcklbw     m0, m7
    punpcklbw     m3, m7
    paddw         m0, m3         ; a + f
    psllw         m2, 2          ; 4*(c+d)
    psubw         m2, m1         ; 4*(c+d) - (b+e)
    pmullw        m2, m4         ; 20*(c+d) - 5*(b+e)
    paddw         m0, m5         ; + 16 rounding
    paddw         m0, m2
    movh          m3, [r2]       ; second prediction
    psraw         m0, 5
    packuswb      m0, m0         ; clip to [0,255]
    pavgb         m0, m3         ; average with src2
    op_%1h        m0, [r0], m6
    add           r0, r3
    add           r1, r3         ; src advances by dstStride (see header)
    add           r2, r4
    dec          r5d
    jg         .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL4_H_LOWPASS_L2_OP put
QPEL4_H_LOWPASS_L2_OP avg
248
249
; void %1_h264_qpel8_h_lowpass_l2(uint8_t *dst, const uint8_t *src,
;                                 const uint8_t *src2,
;                                 int dstStride, int src2Stride)
; 8-wide MMX six-tap filter (identical maths/register tricks to
; %1_h264_qpel8_h_lowpass) with the output additionally averaged against
; a second prediction src2.  src is advanced by dstStride — callers use
; equal src/dst strides for the _l2 variants.
%macro QPEL8_H_LOWPASS_L2_OP 1
cglobal %1_h264_qpel8_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, src2Stride
    movsxdifnidn  r3, r3d
    movsxdifnidn  r4, r4d
    mov          r5d, 8          ; 8 output rows
    pxor          m7, m7         ; zero, for byte->word unpacking
    mova          m6, [pw_5]
.loop:
    mova          m0, [r1]       ; c
    mova          m2, [r1+1]     ; d
    mova          m1, m0
    mova          m3, m2
    punpcklbw     m0, m7
    punpckhbw     m1, m7
    punpcklbw     m2, m7
    punpckhbw     m3, m7
    paddw         m0, m2         ; (c+d) low
    paddw         m1, m3         ; (c+d) high
    psllw         m0, 2          ; 4*(c+d)
    psllw         m1, 2
    mova          m2, [r1-1]     ; b
    mova          m4, [r1+2]     ; e
    mova          m3, m2
    mova          m5, m4
    punpcklbw     m2, m7         ; b low
    punpckhbw     m3, m7         ; b high = f for the low half
    punpcklbw     m4, m7         ; e low  = a for the high half
    punpckhbw     m5, m7         ; e high
    paddw         m2, m4         ; (b+e) low
    paddw         m5, m3         ; (b+e) high
    psubw         m0, m2         ; 4*(c+d) - (b+e), low
    psubw         m1, m5         ; ... high
    pmullw        m0, m6         ; 20*(c+d) - 5*(b+e)
    pmullw        m1, m6
    movd          m2, [r1-2]     ; a low
    movd          m5, [r1+7]     ; f high
    punpcklbw     m2, m7
    punpcklbw     m5, m7
    paddw         m2, m3         ; (a+f) low
    paddw         m4, m5         ; (a+f) high
    mova          m5, [pw_16]
    paddw         m2, m5         ; + rounding
    paddw         m4, m5
    paddw         m0, m2
    paddw         m1, m4
    psraw         m0, 5
    psraw         m1, 5
    mova          m4, [r2]       ; second prediction
    packuswb      m0, m1         ; clip and merge halves
    pavgb         m0, m4         ; average with src2
    op_%1         m0, [r0], m4
    add           r0, r3
    add           r1, r3         ; src advances by dstStride (see header)
    add           r2, r4
    dec          r5d
    jg         .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL8_H_LOWPASS_L2_OP put
QPEL8_H_LOWPASS_L2_OP avg
312
313
; void %1_h264_qpel8_h_lowpass_l2(uint8_t *dst, const uint8_t *src,
;                                 const uint8_t *src2,
;                                 int dstStride, int src2Stride)  [ssse3]
; SSSE3 8-wide _l2 variant: one unaligned 16-byte load + PALIGNR builds
; all six tap vectors; output is averaged with src2 before put/avg.
; src advances by dstStride (callers use equal src/dst strides).
%macro QPEL8_H_LOWPASS_L2_OP_XMM 1
cglobal %1_h264_qpel8_h_lowpass_l2, 5,6,8 ; dst, src, src2, dstStride, src2Stride
    movsxdifnidn  r3, r3d
    movsxdifnidn  r4, r4d
    mov          r5d, 8          ; 8 output rows
    pxor          m7, m7         ; zero, for byte->word unpacking
    mova          m6, [pw_5]
.loop:
    lddqu         m1, [r1-2]     ; 16 bytes: src[-2..13]
    mova          m0, m1
    punpckhbw     m1, m7         ; words src[6..13]
    punpcklbw     m0, m7         ; words src[-2..5] = a
    mova          m2, m1
    mova          m3, m1
    mova          m4, m1
    mova          m5, m1
    palignr       m4, m0, 2      ; b
    palignr       m3, m0, 4      ; c
    palignr       m2, m0, 6      ; d
    palignr       m1, m0, 8      ; e
    palignr       m5, m0, 10     ; f
    paddw         m0, m5         ; a + f
    paddw         m2, m3         ; c + d
    paddw         m1, m4         ; b + e
    psllw         m2, 2          ; 4*(c+d)
    movh          m3, [r2]       ; second prediction
    psubw         m2, m1         ; 4*(c+d) - (b+e)
    paddw         m0, [pw_16]    ; + rounding
    pmullw        m2, m6         ; 20*(c+d) - 5*(b+e)
    paddw         m2, m0
    psraw         m2, 5
    packuswb      m2, m2         ; clip to [0,255]
    pavgb         m2, m3         ; average with src2
    op_%1h        m2, [r0], m4
    add           r1, r3         ; src advances by dstStride (see header)
    add           r0, r3
    add           r2, r4
    dec          r5d
    jg         .loop
    REP_RET
%endmacro

INIT_XMM ssse3
QPEL8_H_LOWPASS_L2_OP_XMM put
QPEL8_H_LOWPASS_L2_OP_XMM avg
359
360
; All functions that call this are required to have function arguments of
; dst, src, dstStride, srcStride  (r0..r3).
;
; FILT_V op — one row of the vertical six-tap filter.
; On entry: m0..m4 hold rows a..e unpacked to words, m7 = 0, r1 points
; at row f.  Emits clip8((a - 5b + 20c + 20d - 5e + f + 16) >> 5) to
; [r0] via op_<op>h, advances r0/r1 by one row, then SWAP-rotates the
; registers so the next invocation sees rows b..f in m0..m4.
; Clobbers m5 (row f) and m6 (accumulator).
%macro FILT_V 1
    mova      m6, m2             ; m6 = c
    movh      m5, [r1]           ; load row f
    paddw     m6, m3             ; c + d
    psllw     m6, 2              ; 4*(c+d)
    psubw     m6, m1             ; - b
    psubw     m6, m4             ; - e  -> 4*(c+d) - (b+e)
    punpcklbw m5, m7             ; widen f to words
    pmullw    m6, [pw_5]         ; 20*(c+d) - 5*(b+e)
    paddw     m0, [pw_16]        ; a + 16 (rounding)
    add       r1, r3
    paddw     m0, m5             ; a + 16 + f
    paddw     m6, m0
    psraw     m6, 5
    packuswb  m6, m6             ; clip to [0,255]
    op_%1h    m6, [r0], m0 ; 1   ; store/average one row
    add       r0, r2
    SWAP       0, 1, 2, 3, 4, 5  ; rotate window: rows b..f -> m0..m4
%endmacro
382
; void %1_h264_qpel4_v_lowpass(uint8_t *dst, const uint8_t *src,
;                              int dstStride, int srcStride)
; 4x4 vertical six-tap filter: primes m0..m4 with the five rows starting
; two above src, then runs FILT_V once per output row (fully unrolled).
%macro QPEL4_V_LOWPASS_OP 1
cglobal %1_h264_qpel4_v_lowpass, 4,4 ; dst, src, dstStride, srcStride
    movsxdifnidn  r2, r2d
    movsxdifnidn  r3, r3d
    sub           r1, r3         ; rewind src two rows: filter needs
    sub           r1, r3         ; src[-2*stride] as first tap
    pxor          m7, m7         ; zero, for byte->word unpacking
    movh          m0, [r1]       ; row a
    movh          m1, [r1+r3]    ; row b
    lea           r1, [r1+2*r3]
    movh          m2, [r1]       ; row c
    movh          m3, [r1+r3]    ; row d
    lea           r1, [r1+2*r3]
    movh          m4, [r1]       ; row e
    add           r1, r3         ; r1 now points at row f for FILT_V
    punpcklbw     m0, m7
    punpcklbw     m1, m7
    punpcklbw     m2, m7
    punpcklbw     m3, m7
    punpcklbw     m4, m7
    FILT_V        %1             ; rows 0..3, window slides each call
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    RET
%endmacro

INIT_MMX mmxext
QPEL4_V_LOWPASS_OP put
QPEL4_V_LOWPASS_OP avg
413
414
415
; void %1_h264_qpel8or16_v_lowpass(uint8_t *dst, const uint8_t *src,
;                                  int dstStride, int srcStride, int h)
; Vertical six-tap filter over 8 columns (movh loads 8 bytes) for h = 8
; or 16 rows, unrolled in two banks of 8 FILT_V calls.
; The sse2 entry rewinds src by two rows itself; the non-sse2 "_op"
; entry omits the rewind — presumably its C wrapper pre-adjusts src
; (only the sse2 variant is instantiated in this file).
%macro QPEL8OR16_V_LOWPASS_OP 1
%if cpuflag(sse2)
cglobal %1_h264_qpel8or16_v_lowpass, 5,5,8 ; dst, src, dstStride, srcStride, h
    movsxdifnidn  r2, r2d
    movsxdifnidn  r3, r3d
    sub           r1, r3         ; rewind src two rows (first tap is
    sub           r1, r3         ; src[-2*stride])
%else
cglobal %1_h264_qpel8or16_v_lowpass_op, 5,5,8 ; dst, src, dstStride, srcStride, h
    movsxdifnidn  r2, r2d
    movsxdifnidn  r3, r3d
%endif
    pxor          m7, m7         ; zero, for byte->word unpacking
    movh          m0, [r1]       ; row a
    movh          m1, [r1+r3]    ; row b
    lea           r1, [r1+2*r3]
    movh          m2, [r1]       ; row c
    movh          m3, [r1+r3]    ; row d
    lea           r1, [r1+2*r3]
    movh          m4, [r1]       ; row e
    add           r1, r3         ; r1 now points at row f for FILT_V
    punpcklbw     m0, m7
    punpcklbw     m1, m7
    punpcklbw     m2, m7
    punpcklbw     m3, m7
    punpcklbw     m4, m7
    FILT_V        %1             ; rows 0..7
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    cmp          r4d, 16         ; 16-row case continues with 8 more rows
    jne         .end
    FILT_V        %1             ; rows 8..15
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
.end:
    REP_RET
%endmacro

INIT_XMM sse2
QPEL8OR16_V_LOWPASS_OP put
QPEL8OR16_V_LOWPASS_OP avg
467
468
; All functions that use this are required to have args:
; src, tmp, srcSize  (r0..r2).
;
; FILT_HV offset — one row of the vertical pass of the 2D (hv) filter.
; Like FILT_V, but instead of clipping to bytes it stores the 16-bit
; intermediate (a - 5b + 20c + 20d - 5e + f + 16, rounding already
; included) to [r1+offset] for the horizontal second pass.
; On entry: m0..m4 = rows a..e as words, m7 = 0, r0 points at row f.
; Clobbers m5, m6; SWAP rotates the row window for the next call.
%macro FILT_HV 1 ; offset
    mova           m6, m2        ; m6 = c
    movh           m5, [r0]      ; load row f
    paddw          m6, m3        ; c + d
    psllw          m6, 2         ; 4*(c+d)
    paddw          m0, [pw_16]   ; a + 16 (rounding)
    psubw          m6, m1        ; - b
    psubw          m6, m4        ; - e  -> 4*(c+d) - (b+e)
    punpcklbw      m5, m7        ; widen f to words
    pmullw         m6, [pw_5]    ; 20*(c+d) - 5*(b+e)
    paddw          m0, m5        ; a + 16 + f
    add            r0, r2
    paddw          m6, m0
    mova      [r1+%1], m6        ; store 16-bit intermediates to tmp
    SWAP            0, 1, 2, 3, 4, 5
%endmacro
487
; 4x4 2D (hv) six-tap filter, split into two passes over an int16 tmp
; buffer whose rows are 24 bytes apart.
;
; %1_h264_qpel4_hv_lowpass_v(const uint8_t *src, int16_t *tmp, int srcStride)
; Vertical pass: primes rows a..e, then FILT_HV stores 4 words of 16-bit
; intermediates per row at tmp offsets 0, 24, 48, 72.
; NOTE(review): the horizontal pass below reads 9 columns per tmp row,
; but one _v call stores only 4 — presumably the C wrapper calls _v more
; than once at different tmp offsets; confirm against the caller.
%macro QPEL4_HV1_LOWPASS_OP 1
cglobal %1_h264_qpel4_hv_lowpass_v, 3,3 ; src, tmp, srcStride
    movsxdifnidn  r2, r2d
    pxor          m7, m7         ; zero, for byte->word unpacking
    movh          m0, [r0]       ; row a
    movh          m1, [r0+r2]    ; row b
    lea           r0, [r0+2*r2]
    movh          m2, [r0]       ; row c
    movh          m3, [r0+r2]    ; row d
    lea           r0, [r0+2*r2]
    movh          m4, [r0]       ; row e
    add           r0, r2         ; r0 now points at row f for FILT_HV
    punpcklbw     m0, m7
    punpcklbw     m1, m7
    punpcklbw     m2, m7
    punpcklbw     m3, m7
    punpcklbw     m4, m7
    FILT_HV       0*24           ; tmp rows are 24 bytes apart
    FILT_HV       1*24
    FILT_HV       2*24
    FILT_HV       3*24
    RET

; %1_h264_qpel4_hv_lowpass_h(const int16_t *tmp, uint8_t *dst, int dstStride)
; Horizontal pass: applies the six-tap filter to the word intermediates
; and scales by >>10 overall, done as (((a+f-(b+e))>>2 - (b+e) +
; (c+d))>>2 + (c+d)) >> 6 to keep everything within 16 bits.  The first
; (c+d) add saturates (paddsw) to avoid wraparound of the wide
; intermediates; the result is clipped to bytes and put/avg'd to dst.
cglobal %1_h264_qpel4_hv_lowpass_h, 3,4 ; tmp, dst, dstStride
    movsxdifnidn  r2, r2d
    mov          r3d, 4          ; 4 output rows
.loop:
    mova          m0, [r0]       ; tmp[0..3]
    paddw         m0, [r0+10]    ; + tmp[5..8]  -> a + f
    mova          m1, [r0+2]     ; tmp[1..4]
    paddw         m1, [r0+8]     ; + tmp[4..7]  -> b + e
    mova          m2, [r0+4]     ; tmp[2..5]
    paddw         m2, [r0+6]     ; + tmp[3..6]  -> c + d
    psubw         m0, m1
    psraw         m0, 2
    psubw         m0, m1
    paddsw        m0, m2         ; saturating add, see header
    psraw         m0, 2
    paddw         m0, m2
    psraw         m0, 6
    packuswb      m0, m0         ; clip to [0,255]
    op_%1h        m0, [r1], m7
    add           r0, 24         ; next tmp row
    add           r1, r2
    dec          r3d
    jnz        .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL4_HV1_LOWPASS_OP put
QPEL4_HV1_LOWPASS_OP avg
540
; void %1_h264_qpel8or16_hv1_lowpass_op(const uint8_t *src, int16_t *tmp,
;                                       int srcStride, int size)
; First (vertical) pass of the 8/16-wide 2D filter: primes rows a..e,
; then FILT_HV stores 8 words of 16-bit intermediates per row into tmp,
; rows 48 bytes apart, for size (8 or 16) rows.  %1 only affects the
; symbol name — the tmp output is op-independent (only "put" is
; instantiated below).
%macro QPEL8OR16_HV1_LOWPASS_OP 1
cglobal %1_h264_qpel8or16_hv1_lowpass_op, 4,4,8 ; src, tmp, srcStride, size
    movsxdifnidn  r2, r2d
    pxor          m7, m7         ; zero, for byte->word unpacking
    movh          m0, [r0]       ; row a
    movh          m1, [r0+r2]    ; row b
    lea           r0, [r0+2*r2]
    movh          m2, [r0]       ; row c
    movh          m3, [r0+r2]    ; row d
    lea           r0, [r0+2*r2]
    movh          m4, [r0]       ; row e
    add           r0, r2         ; r0 now points at row f for FILT_HV
    punpcklbw     m0, m7
    punpcklbw     m1, m7
    punpcklbw     m2, m7
    punpcklbw     m3, m7
    punpcklbw     m4, m7
    FILT_HV     0*48             ; tmp rows are 48 bytes apart
    FILT_HV     1*48
    FILT_HV     2*48
    FILT_HV     3*48
    FILT_HV     4*48
    FILT_HV     5*48
    FILT_HV     6*48
    FILT_HV     7*48
    cmp          r3d, 16         ; size == 16: continue with 8 more rows
    jne         .end
    FILT_HV     8*48
    FILT_HV     9*48
    FILT_HV    10*48
    FILT_HV    11*48
    FILT_HV    12*48
    FILT_HV    13*48
    FILT_HV    14*48
    FILT_HV    15*48
.end:
    REP_RET
%endmacro

INIT_XMM sse2
QPEL8OR16_HV1_LOWPASS_OP put
582
583
584
; void %1_h264_qpel8or16_hv2_lowpass_op(uint8_t *dst, const int16_t *tmp,
;                                       int dstStride, int unused, int h)
; Second (horizontal) pass of the 2D filter, MMX, 8 pixels per row for h
; rows.  tmp rows are 48 bytes apart.  Per half (4 pixels): a+f, b+e,
; c+d are built from word loads, then scaled by >>10 overall via
; (((a+f-(b+e))>>2 - (b+e) + (c+d))>>2 + (c+d)) >> 6; the first (c+d)
; add saturates (paddsw) so the wide intermediates cannot wrap.
; The 4th argument exists only so this signature matches the ssse3
; version's (dst, tmp, dstStride, tmpStride, size).
%macro QPEL8OR16_HV2_LOWPASS_OP 1
cglobal %1_h264_qpel8or16_hv2_lowpass_op, 5,5 ; dst, tmp, dstStride, unused, h
    movsxdifnidn  r2, r2d
.loop:
    mova          m0, [r1]       ; low half:  tmp[0..3]
    mova          m3, [r1+8]     ; high half: tmp[4..7]
    mova          m1, [r1+2]     ; tmp[1..4]
    mova          m4, [r1+10]    ; tmp[5..8]
    paddw         m0, m4         ; a+f low   (tmp[0..3] + tmp[5..8])
    paddw         m1, m3         ; b+e low   (tmp[1..4] + tmp[4..7])
    paddw         m3, [r1+18]    ; a+f high  (tmp[4..7] + tmp[9..12])
    paddw         m4, [r1+16]    ; b+e high  (tmp[5..8] + tmp[8..11])
    mova          m2, [r1+4]     ; tmp[2..5]
    mova          m5, [r1+12]    ; tmp[6..9]
    paddw         m2, [r1+6]     ; c+d low   (+ tmp[3..6])
    paddw         m5, [r1+14]    ; c+d high  (+ tmp[7..10])
    psubw         m0, m1
    psubw         m3, m4
    psraw         m0, 2
    psraw         m3, 2
    psubw         m0, m1
    psubw         m3, m4
    paddsw        m0, m2         ; saturating first add, see header
    paddsw        m3, m5
    psraw         m0, 2
    psraw         m3, 2
    paddw         m0, m2
    paddw         m3, m5
    psraw         m0, 6
    psraw         m3, 6
    packuswb      m0, m3         ; clip and merge halves
    op_%1         m0, [r0], m7
    add           r1, 48         ; next tmp row
    add           r0, r2
    dec          r4d
    jne        .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL8OR16_HV2_LOWPASS_OP put
QPEL8OR16_HV2_LOWPASS_OP avg
628
; void %1_h264_qpel8or16_hv2_lowpass(uint8_t *dst, const int16_t *tmp,
;                                    int dstStride, int tmpStride,
;                                    int size)                     [ssse3]
; Second (horizontal) pass of the 2D six-tap filter over the 16-bit
; intermediates from the hv1 pass (tmp rows 48 bytes apart), for
; size = 8 or 16 pixels per row and size rows.  tmpStride (r3) is
; accepted for interface symmetry but unused.
; The >>10 overall scaling is done in three steps —
;   (((a+f - (b+e)) >> 2 - (b+e) + (c+d)) >> 2 + (c+d)) >> 6
; — with the FIRST (c+d) add saturating (paddsw) so the wide
; intermediates cannot wrap, exactly as in the mmxext and qpel4 hv
; versions above.  (Fix: these three adds were non-saturating paddw,
; which can overflow 16 bits and produce wrapped pixels.)
%macro QPEL8OR16_HV2_LOWPASS_OP_XMM 1
cglobal %1_h264_qpel8or16_hv2_lowpass, 5,5,8 ; dst, tmp, dstStride, tmpStride, size
    movsxdifnidn  r2, r2d
    movsxdifnidn  r3, r3d
    cmp          r4d, 16
    je         .op16
.loop8:                          ; 8 pixels/row: one 16-byte row + overread
    mova          m1, [r1+16]    ; tmp[8..15]
    mova          m0, [r1]       ; tmp[0..7] = a
    mova          m2, m1
    mova          m3, m1
    mova          m4, m1
    mova          m5, m1
    palignr       m5, m0, 10     ; tmp[5..12] = f
    palignr       m4, m0, 8      ; tmp[4..11] = e
    palignr       m3, m0, 6      ; tmp[3..10] = d
    palignr       m2, m0, 4      ; tmp[2..9]  = c
    palignr       m1, m0, 2      ; tmp[1..8]  = b
    paddw         m0, m5         ; a + f
    paddw         m1, m4         ; b + e
    paddw         m2, m3         ; c + d
    psubw         m0, m1
    psraw         m0, 2
    psubw         m0, m1
    paddsw        m0, m2         ; saturating first add (was paddw)
    psraw         m0, 2
    paddw         m0, m2
    psraw         m0, 6
    packuswb      m0, m0         ; clip to [0,255]
    op_%1h        m0, [r0], m7
    add           r1, 48         ; next tmp row
    add           r0, r2
    dec          r4d
    jne       .loop8
    jmp        .done
.op16:                           ; 16 pixels/row: low half in m7/m5, high in m5/m4
    mova          m4, [r1+32]    ; tmp[16..23]
    mova          m5, [r1+16]    ; tmp[8..15]
    mova          m7, [r1]       ; tmp[0..7]
    mova          m3, m4
    mova          m2, m4
    mova          m1, m4
    mova          m0, m4
    palignr       m0, m5, 10     ; high half: f
    palignr       m1, m5, 8      ; e
    palignr       m2, m5, 6      ; d
    palignr       m3, m5, 4      ; c
    palignr       m4, m5, 2      ; b
    paddw         m0, m5         ; a + f (high)
    paddw         m1, m4         ; b + e (high)
    paddw         m2, m3         ; c + d (high)
    mova          m6, m5
    mova          m4, m5
    mova          m3, m5
    palignr       m4, m7, 8      ; low half: e
    palignr       m6, m7, 2      ; b
    palignr       m3, m7, 10     ; f
    paddw         m4, m6         ; b + e (low)
    mova          m6, m5
    palignr       m5, m7, 6      ; d
    palignr       m6, m7, 4      ; c
    paddw         m3, m7         ; a + f (low)
    paddw         m5, m6         ; c + d (low)
    psubw         m0, m1
    psubw         m3, m4
    psraw         m0, 2
    psraw         m3, 2
    psubw         m0, m1
    psubw         m3, m4
    paddsw        m0, m2         ; saturating first add (was paddw)
    paddsw        m3, m5         ; saturating first add (was paddw)
    psraw         m0, 2
    psraw         m3, 2
    paddw         m0, m2
    paddw         m3, m5
    psraw         m0, 6
    psraw         m3, 6
    packuswb      m3, m0         ; clip; merge low (m3) and high (m0) halves
    op_%1         m3, [r0], m7
    add           r1, 48         ; next tmp row
    add           r0, r2
    dec          r4d
    jne        .op16
.done:
    REP_RET
%endmacro

INIT_XMM ssse3
QPEL8OR16_HV2_LOWPASS_OP_XMM put
QPEL8OR16_HV2_LOWPASS_OP_XMM avg
719
720
; void %1_pixels4_l2_shift5(uint8_t *dst, const int16_t *src16,
;                           const uint8_t *src8, int dstStride,
;                           int src8Stride, int h)
; Downscales 16-bit intermediates (src16, rows 24 bytes apart) by >> 5,
; clips to bytes, averages with the 8-bit prediction src8, and put/avg's
; 4 pixels per row.  Fully unrolled for 4 rows — the h argument (r5) is
; accepted for interface uniformity but unused.
%macro PIXELS4_L2_SHIFT5 1
cglobal %1_pixels4_l2_shift5,6,6 ; dst, src16, src8, dstStride, src8Stride, h
    movsxdifnidn  r3, r3d
    movsxdifnidn  r4, r4d
    mova          m0, [r1]       ; row 0 intermediates
    mova          m1, [r1+24]    ; row 1 (24-byte pitch)
    psraw         m0, 5
    psraw         m1, 5
    packuswb      m0, m0         ; clip to [0,255]
    packuswb      m1, m1
    pavgb         m0, [r2]       ; average with src8 row 0
    pavgb         m1, [r2+r4]    ; ... row 1
    op_%1h        m0, [r0], m4
    op_%1h        m1, [r0+r3], m5
    lea           r2, [r2+r4*2]  ; advance two rows
    lea           r0, [r0+r3*2]
    mova          m0, [r1+48]    ; row 2
    mova          m1, [r1+72]    ; row 3
    psraw         m0, 5
    psraw         m1, 5
    packuswb      m0, m0
    packuswb      m1, m1
    pavgb         m0, [r2]
    pavgb         m1, [r2+r4]
    op_%1h        m0, [r0], m4
    op_%1h        m1, [r0+r3], m5
    RET
%endmacro

INIT_MMX mmxext
PIXELS4_L2_SHIFT5 put
PIXELS4_L2_SHIFT5 avg
753
754
; void %1_pixels8_l2_shift5(uint8_t *dst, const int16_t *src16,
;                           const uint8_t *src8, int dstStride,
;                           int src8Stride, int h)
; 8-wide version of pixels4_l2_shift5: >> 5, clip, average with src8,
; put/avg.  src16 rows are 48 bytes apart; processes two rows per
; iteration, so h must be even.
%macro PIXELS8_L2_SHIFT5 1
cglobal %1_pixels8_l2_shift5, 6, 6 ; dst, src16, src8, dstStride, src8Stride, h
    movsxdifnidn  r3, r3d
    movsxdifnidn  r4, r4d
.loop:
    mova          m0, [r1]       ; row 0, words 0..3
    mova          m1, [r1+8]     ; row 0, words 4..7
    mova          m2, [r1+48]    ; row 1, words 0..3 (48-byte pitch)
    mova          m3, [r1+48+8]  ; row 1, words 4..7
    psraw         m0, 5
    psraw         m1, 5
    psraw         m2, 5
    psraw         m3, 5
    packuswb      m0, m1         ; clip row 0 to bytes
    packuswb      m2, m3         ; clip row 1
    pavgb         m0, [r2]       ; average with src8
    pavgb         m2, [r2+r4]
    op_%1         m0, [r0], m4
    op_%1         m2, [r0+r3], m5
    lea           r2, [r2+2*r4]  ; advance two rows
    add           r1, 48*2
    lea           r0, [r0+2*r3]
    sub          r5d, 2
    jne        .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PIXELS8_L2_SHIFT5 put
PIXELS8_L2_SHIFT5 avg
785
786
%if ARCH_X86_64 ; uses xmm8-15, which only exist on x86-64
; void %1_h264_qpel16_h_lowpass_l2(uint8_t *dst, const uint8_t *src,
;                                  const uint8_t *src2, int dstStride,
;                                  int src2Stride)               [ssse3]
; 16-wide horizontal six-tap filter averaged with a second prediction
; src2.  Two unaligned loads cover src[-2..21]; after unpacking, the
; high 8 pixels live in m1..m5/m11 and the low 8 in m0/m6/m8/m9/m12,
; built with PALIGNR.  src advances by dstStride (equal strides at the
; call sites), src2 by src2Stride.
%macro QPEL16_H_LOWPASS_L2_OP 1
cglobal %1_h264_qpel16_h_lowpass_l2, 5, 6, 16 ; dst, src, src2, dstStride, src2Stride
    movsxdifnidn  r3, r3d
    movsxdifnidn  r4, r4d
    mov          r5d, 16         ; 16 output rows
    pxor         m15, m15        ; zero, for byte->word unpacking
    mova         m14, [pw_5]
    mova         m13, [pw_16]
.loop:
    lddqu         m1, [r1+6]     ; src[6..21]
    lddqu         m7, [r1-2]     ; src[-2..13]
    mova          m0, m1
    punpckhbw     m1, m15        ; words src[14..21]
    punpcklbw     m0, m15        ; words src[6..13]  = a (high half)
    punpcklbw     m7, m15        ; words src[-2..5]  = a (low half)
    mova          m2, m1
    mova          m6, m0
    mova          m3, m1
    mova          m8, m0
    mova          m4, m1
    mova          m9, m0
    mova         m12, m0
    mova         m11, m1
    palignr      m11, m0, 10     ; f (high)
    palignr      m12, m7, 10     ; f (low)
    palignr       m4, m0, 2      ; b (high)
    palignr       m9, m7, 2      ; b (low)
    palignr       m3, m0, 4      ; c (high)
    palignr       m8, m7, 4      ; c (low)
    palignr       m2, m0, 6      ; d (high)
    palignr       m6, m7, 6      ; d (low)
    paddw        m11, m0         ; a+f (high)
    palignr       m1, m0, 8      ; e (high)
    palignr       m0, m7, 8      ; e (low)
    paddw         m7, m12        ; a+f (low)
    paddw         m2, m3         ; c+d (high)
    paddw         m6, m8         ; c+d (low)
    paddw         m1, m4         ; b+e (high)
    paddw         m0, m9         ; b+e (low)
    psllw         m2, 2          ; 4*(c+d)
    psllw         m6, 2
    psubw         m2, m1         ; 4*(c+d) - (b+e)
    psubw         m6, m0
    paddw        m11, m13        ; a+f+16 (rounding)
    paddw         m7, m13
    pmullw        m2, m14        ; 20*(c+d) - 5*(b+e)
    pmullw        m6, m14
    lddqu         m3, [r2]       ; second prediction, 16 pixels
    paddw         m2, m11
    paddw         m6, m7
    psraw         m2, 5
    psraw         m6, 5
    packuswb      m6, m2         ; clip; low half from m6, high from m2
    pavgb         m6, m3         ; average with src2
    op_%1         m6, [r0], m11
    add           r1, r3         ; src advances by dstStride (see header)
    add           r0, r3
    add           r2, r4
    dec          r5d
    jg         .loop
    REP_RET
%endmacro

INIT_XMM ssse3
QPEL16_H_LOWPASS_L2_OP put
QPEL16_H_LOWPASS_L2_OP avg
%endif
855