;*****************************************************************************
;* SIMD-optimized motion compensation estimation
;*****************************************************************************
;* Copyright (c) 2000, 2001 Fabrice Bellard
;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

cextern pb_1
cextern pb_80

SECTION .text

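; DIFF_PIXELS_1 %1=dst, %2=scratch, %3=pix1 address, %4=pix2 address
; Loads mmsize/2 pixels from each source and leaves pix1 - pix2 as signed words
; in %1. The interleave trick: after the two punpcklbw, each word lane of %2
; holds 256*p1 + p2 while each lane of %1 holds 257*p1, so the word subtraction
; yields exactly p1 - p2 without needing a zeroed register.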
%macro DIFF_PIXELS_1 4
    movh            %1, %3
    movh            %2, %4
    punpcklbw       %2, %1
    punpcklbw       %1, %1
    psubw           %1, %2
%endmacro

; %1=uint8_t *pix1, %2=uint8_t *pix2, %3=static offset, %4=stride, %5=stride*3
; %6=temporary storage location
; this macro requires mmsize bytes of aligned stack space at %6 (except on
; x86-64 with SSE, where the spare register m8 is used instead)
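; After this macro m0-m7 hold the 8 rows of pix1 - pix2 as signed words and the
; pointers %1/%2 are restored; roughly, as an illustrative sketch (off = %3,
; stride = %4):
;     for (y = 0; y < 8; y++)
;         for (x = 0; x < mmsize/2; x++)
;             m[y][x] = pix1[y*stride + x + off] - pix2[y*stride + x + off];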
%macro DIFF_PIXELS_8 6
    DIFF_PIXELS_1   m0, m7, [%1     +%3], [%2     +%3]
    DIFF_PIXELS_1   m1, m7, [%1+%4  +%3], [%2+%4  +%3]
    DIFF_PIXELS_1   m2, m7, [%1+%4*2+%3], [%2+%4*2+%3]
    add             %1, %5
    add             %2, %5
    DIFF_PIXELS_1   m3, m7, [%1     +%3], [%2     +%3]
    DIFF_PIXELS_1   m4, m7, [%1+%4  +%3], [%2+%4  +%3]
    DIFF_PIXELS_1   m5, m7, [%1+%4*2+%3], [%2+%4*2+%3]
    DIFF_PIXELS_1   m6, m7, [%1+%5  +%3], [%2+%5  +%3]
%ifdef m8
    DIFF_PIXELS_1   m7, m8, [%1+%4*4+%3], [%2+%4*4+%3]
%else
    mova          [%6], m0
    DIFF_PIXELS_1   m7, m0, [%1+%4*4+%3], [%2+%4*4+%3]
    mova            m0, [%6]
%endif
    sub             %1, %5
    sub             %2, %5
%endmacro

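; HADAMARD8: three rounds of butterflies (sums and differences) across m0-m7,
; i.e. an in-place 8-point Hadamard transform of the values sharing a word lane,
; up to sign and ordering, which is irrelevant here since only the sum of the
; absolute values of the coefficients is used afterwards.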
%macro HADAMARD8 0
    SUMSUB_BADC       w, 0, 1, 2, 3
    SUMSUB_BADC       w, 4, 5, 6, 7
    SUMSUB_BADC       w, 0, 2, 1, 3
    SUMSUB_BADC       w, 4, 6, 5, 7
    SUMSUB_BADC       w, 0, 4, 1, 5
    SUMSUB_BADC       w, 2, 6, 3, 7
%endmacro

%macro ABS1_SUM 3
    ABS1            %1, %2
    paddusw         %3, %1
%endmacro

%macro ABS2_SUM 6
    ABS2            %1, %2, %3, %4
    paddusw         %5, %1
    paddusw         %6, %2
%endmacro

%macro ABS_SUM_8x8_64 1
    ABS2            m0, m1, m8, m9
    ABS2_SUM        m2, m3, m8, m9, m0, m1
    ABS2_SUM        m4, m5, m8, m9, m0, m1
    ABS2_SUM        m6, m7, m8, m9, m0, m1
    paddusw         m0, m1
%endmacro

%macro ABS_SUM_8x8_32 1
    mova          [%1], m7
    ABS1            m0, m7
    ABS1            m1, m7
    ABS1_SUM        m2, m7, m0
    ABS1_SUM        m3, m7, m1
    ABS1_SUM        m4, m7, m0
    ABS1_SUM        m5, m7, m1
    ABS1_SUM        m6, m7, m0
    mova            m2, [%1]
    ABS1_SUM        m2, m7, m1
    paddusw         m0, m1
%endmacro

; FIXME: HSUM saturates at 64k, while an 8x8 hadamard or dct block can reach
; about 100k on extreme inputs. Such inputs are very unlikely in natural video,
; and it is even more unlikely that no alternative mv/mode would have a lower cost.
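; HSUM %1=input/scratch, %2=scratch, %3=GPR destination
; Horizontally adds (with unsigned saturation) all word lanes of %1; the 16-bit
; sum lands in the low 16 bits of %3, and callers mask with 0xFFFF because the
; upper bits of the movd may still contain a partial sum.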
%macro HSUM 3
%if cpuflag(sse2)
    movhlps         %2, %1
    paddusw         %1, %2
    pshuflw         %2, %1, 0xE
    paddusw         %1, %2
    pshuflw         %2, %1, 0x1
    paddusw         %1, %2
    movd            %3, %1
%elif cpuflag(mmxext)
    pshufw          %2, %1, 0xE
    paddusw         %1, %2
    pshufw          %2, %1, 0x1
    paddusw         %1, %2
    movd            %3, %1
%elif cpuflag(mmx)
    mova            %2, %1
    psrlq           %1, 32
    paddusw         %1, %2
    mova            %2, %1
    psrlq           %1, 16
    paddusw         %1, %2
    movd            %3, %1
%endif
%endmacro

%macro STORE4 5
    mova [%1+mmsize*0], %2
    mova [%1+mmsize*1], %3
    mova [%1+mmsize*2], %4
    mova [%1+mmsize*3], %5
%endmacro

%macro LOAD4 5
    mova            %2, [%1+mmsize*0]
    mova            %3, [%1+mmsize*1]
    mova            %4, [%1+mmsize*2]
    mova            %5, [%1+mmsize*3]
%endmacro

%macro hadamard8_16_wrapper 2
cglobal hadamard8_diff, 4, 4, %1
%ifndef m8
    %assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
    SUB            rsp, pad
%endif
    call hadamard8x8_diff %+ SUFFIX
%ifndef m8
    ADD            rsp, pad
%endif
    RET

cglobal hadamard8_diff16, 5, 6, %1
%ifndef m8
    %assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
    SUB            rsp, pad
%endif

    call hadamard8x8_diff %+ SUFFIX
    mov            r5d, eax

    add             r1, 8
    add             r2, 8
    call hadamard8x8_diff %+ SUFFIX
    add            r5d, eax

    cmp            r4d, 16
    jne .done

    lea             r1, [r1+r3*8-8]
    lea             r2, [r2+r3*8-8]
    call hadamard8x8_diff %+ SUFFIX
    add            r5d, eax

    add             r1, 8
    add             r2, 8
    call hadamard8x8_diff %+ SUFFIX
    add            r5d, eax

.done:
    mov            eax, r5d
%ifndef m8
    ADD            rsp, pad
%endif
    RET
%endmacro

%macro HADAMARD8_DIFF 0-1
%if cpuflag(sse2)
hadamard8x8_diff %+ SUFFIX:
    lea                          r0, [r3*3]
    DIFF_PIXELS_8                r1, r2,  0, r3, r0, rsp+gprsize
    HADAMARD8
%if ARCH_X86_64
    TRANSPOSE8x8W                 0,  1,  2,  3,  4,  5,  6,  7,  8
%else
    TRANSPOSE8x8W                 0,  1,  2,  3,  4,  5,  6,  7, [rsp+gprsize], [rsp+mmsize+gprsize]
%endif
    HADAMARD8
    ABS_SUM_8x8         rsp+gprsize
    HSUM                        m0, m1, eax
    and                         eax, 0xFFFF
    ret

hadamard8_16_wrapper %1, 3
%elif cpuflag(mmx)
ALIGN 16
; int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1,
;                               uint8_t *src2, ptrdiff_t stride, int h)
; r0 = void *s = unused, int h = unused (always 8)
; note how r1, r2 and r3 are not clobbered in this function, so 16x16
; can simply call this 2x2 times (and that's why we access rsp+gprsize
; everywhere, which is the rsp of the calling function)
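; Roughly what one 8x8 call computes, as an illustrative C sketch (not the real
; reference code; hadamard1d_cols/hadamard1d_rows are hypothetical helpers):
;     int hadamard8x8_diff(const uint8_t *a, const uint8_t *b, ptrdiff_t stride)
;     {
;         int16_t d[8][8];
;         int sum = 0;
;         for (int y = 0; y < 8; y++)
;             for (int x = 0; x < 8; x++)
;                 d[y][x] = a[y*stride + x] - b[y*stride + x];
;         hadamard1d_cols(d);         /* HADAMARD8 across m0-m7          */
;         hadamard1d_rows(d);         /* transpose, then HADAMARD8 again */
;         for (int y = 0; y < 8; y++)
;             for (int x = 0; x < 8; x++)
;                 sum += abs(d[y][x]);
;         return sum & 0xFFFF;        /* saturating adds + final mask    */
;     }
; The 16x16 entry point simply sums two or four such 8x8 calls, depending on h.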
hadamard8x8_diff %+ SUFFIX:
    lea                          r0, [r3*3]

    ; first 4x8 pixels
    DIFF_PIXELS_8                r1, r2,  0, r3, r0, rsp+gprsize+0x60
    HADAMARD8
    mova         [rsp+gprsize+0x60], m7
    TRANSPOSE4x4W                 0,  1,  2,  3,  7
    STORE4              rsp+gprsize, m0, m1, m2, m3
    mova                         m7, [rsp+gprsize+0x60]
    TRANSPOSE4x4W                 4,  5,  6,  7,  0
    STORE4         rsp+gprsize+0x40, m4, m5, m6, m7

    ; second 4x8 pixels
    DIFF_PIXELS_8                r1, r2,  4, r3, r0, rsp+gprsize+0x60
    HADAMARD8
    mova         [rsp+gprsize+0x60], m7
    TRANSPOSE4x4W                 0,  1,  2,  3,  7
    STORE4         rsp+gprsize+0x20, m0, m1, m2, m3
    mova                         m7, [rsp+gprsize+0x60]
    TRANSPOSE4x4W                 4,  5,  6,  7,  0

    LOAD4          rsp+gprsize+0x40, m0, m1, m2, m3
    HADAMARD8
    ABS_SUM_8x8_32 rsp+gprsize+0x60
    mova         [rsp+gprsize+0x60], m0

    LOAD4          rsp+gprsize     , m0, m1, m2, m3
    LOAD4          rsp+gprsize+0x20, m4, m5, m6, m7
    HADAMARD8
    ABS_SUM_8x8_32 rsp+gprsize
    paddusw                      m0, [rsp+gprsize+0x60]

    HSUM                         m0, m1, eax
    and                         rax, 0xFFFF
    ret

hadamard8_16_wrapper 0, 14
%endif
%endmacro

%if HAVE_ALIGNED_STACK == 0
INIT_MMX mmxext
HADAMARD8_DIFF
%endif

INIT_XMM sse2
%if ARCH_X86_64
%define ABS_SUM_8x8 ABS_SUM_8x8_64
%else
%define ABS_SUM_8x8 ABS_SUM_8x8_32
%endif
HADAMARD8_DIFF 10

INIT_XMM ssse3
%define ABS_SUM_8x8 ABS_SUM_8x8_64
HADAMARD8_DIFF 9

; int ff_sse*_*(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
;               ptrdiff_t line_size, int h)
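; Sum of squared errors over a w x h block, roughly (illustrative C sketch; the
; MpegEncContext pointer is unused by the asm and omitted here, w is 8 or 16):
;     int sse_w(const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t line_size,
;               int h, int w)
;     {
;         int sum = 0;
;         for (int y = 0; y < h; y++)
;             for (int x = 0; x < w; x++) {
;                 int d = pix1[y*line_size + x] - pix2[y*line_size + x];
;                 sum += d * d;
;             }
;         return sum;
;     }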

%macro SUM_SQUARED_ERRORS 1
cglobal sse%1, 5,5,8, v, pix1, pix2, lsize, h
%if %1 == mmsize
    shr       hd, 1
%endif
    pxor      m0, m0         ; mm0 = 0
    pxor      m7, m7         ; mm7 holds the sum

.next2lines: ; FIXME why are these unaligned movs? pix1[] is aligned
    movu      m1, [pix1q]    ; m1 = pix1[0][0-15], [0-7] for mmx
    movu      m2, [pix2q]    ; m2 = pix2[0][0-15], [0-7] for mmx
%if %1 == mmsize
    movu      m3, [pix1q+lsizeq] ; m3 = pix1[1][0-15], [0-7] for mmx
    movu      m4, [pix2q+lsizeq] ; m4 = pix2[1][0-15], [0-7] for mmx
%else  ; %1 / 2 == mmsize; mmx only
    mova      m3, [pix1q+8]  ; m3 = pix1[0][8-15]
    mova      m4, [pix2q+8]  ; m4 = pix2[0][8-15]
%endif

    ; goal: compute |mm1 - mm2| and |mm3 - mm4|
    ; algo: subtract mm1 from mm2 with unsigned saturation and vice versa,
    ;       then OR the two results to get the absolute difference
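    ;       e.g. with a=3, b=10: a-b saturates to 0 and b-a gives 7, so the OR
    ;       of the two is |a-b| = 7 in that byte lane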
    mova      m5, m1
    mova      m6, m3
    psubusb   m1, m2
    psubusb   m3, m4
    psubusb   m2, m5
    psubusb   m4, m6

    por       m2, m1
    por       m4, m3

    ; now convert to 16-bit vectors so we can square them
    mova      m1, m2
    mova      m3, m4

    punpckhbw m2, m0
    punpckhbw m4, m0
    punpcklbw m1, m0         ; mm1 now spread over (mm1,mm2)
    punpcklbw m3, m0         ; mm4 now spread over (mm3,mm4)

    pmaddwd   m2, m2
    pmaddwd   m4, m4
    pmaddwd   m1, m1
    pmaddwd   m3, m3

    paddd     m1, m2
    paddd     m3, m4
    paddd     m7, m1
    paddd     m7, m3

%if %1 == mmsize
    lea    pix1q, [pix1q + 2*lsizeq]
    lea    pix2q, [pix2q + 2*lsizeq]
%else
    add    pix1q, lsizeq
    add    pix2q, lsizeq
%endif
    dec       hd
    jnz .next2lines

    HADDD     m7, m1
    movd     eax, m7         ; return value
    RET
%endmacro

INIT_MMX mmx
SUM_SQUARED_ERRORS 8

INIT_MMX mmx
SUM_SQUARED_ERRORS 16

INIT_XMM sse2
SUM_SQUARED_ERRORS 16

;-----------------------------------------------
;int ff_sum_abs_dctelem(int16_t *block)
;-----------------------------------------------
; %1 = number of xmm registers used
; %2 = number of inline loops
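; Roughly, in C (illustrative sketch; the accumulation saturates and the result
; is masked to 16 bits):
;     int sum_abs_dctelem(const int16_t *block)
;     {
;         int sum = 0;
;         for (int i = 0; i < 64; i++)
;             sum += abs(block[i]);
;         return sum & 0xFFFF;
;     }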

%macro SUM_ABS_DCTELEM 2
cglobal sum_abs_dctelem, 1, 1, %1, block
    pxor    m0, m0
    pxor    m1, m1
%assign %%i 0
%rep %2
    mova      m2, [blockq+mmsize*(0+%%i)]
    mova      m3, [blockq+mmsize*(1+%%i)]
    mova      m4, [blockq+mmsize*(2+%%i)]
    mova      m5, [blockq+mmsize*(3+%%i)]
    ABS1_SUM  m2, m6, m0
    ABS1_SUM  m3, m6, m1
    ABS1_SUM  m4, m6, m0
    ABS1_SUM  m5, m6, m1
%assign %%i %%i+4
%endrep
    paddusw m0, m1
    HSUM    m0, m1, eax
    and     eax, 0xFFFF
    RET
%endmacro

INIT_XMM sse2
SUM_ABS_DCTELEM 7, 2
INIT_XMM ssse3
SUM_ABS_DCTELEM 6, 2

;------------------------------------------------------------------------------
; int ff_hf_noise*_mmx(uint8_t *pix1, ptrdiff_t lsize, int h)
;------------------------------------------------------------------------------
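; Sum of absolute second-order differences, roughly (illustrative C sketch; w is
; 8 or 16, and the asm handles the rightmost column slightly differently):
;     int hf_noise_w(const uint8_t *pix1, ptrdiff_t lsize, int h, int w)
;     {
;         int sum = 0;
;         for (int y = 0; y < h - 1; y++, pix1 += lsize)
;             for (int x = 0; x < w - 1; x++)
;                 sum += abs((pix1[x] - pix1[x + 1]) -
;                            (pix1[x + lsize] - pix1[x + lsize + 1]));
;         return sum;
;     }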
; %1 = 8/16, %2-%5 = m#: computes the horizontal differences of one row of pix1
; into m%2 (low half) and m%4 (high half) as words; m%3/m%5 are scratch and m7
; must hold zero
%macro HF_NOISE_PART1 5
    mova      m%2, [pix1q]
%if %1 == 8
    mova      m%3, m%2
    psllq     m%2, 8
    psrlq     m%3, 8
    psrlq     m%2, 8
%else
    mova      m%3, [pix1q+1]
%endif
    mova      m%4, m%2
    mova      m%5, m%3
    punpcklbw m%2, m7
    punpcklbw m%3, m7
    punpckhbw m%4, m7
    punpckhbw m%5, m7
    psubw     m%2, m%3
    psubw     m%4, m%5
%endmacro

; %1-%4 = m#: accumulates |m%1 - m%3| + |m%2 - m%4| (per word) into m6;
; m%1/m%2 are clobbered and m1/m3 are used as scratch
%macro HF_NOISE_PART2 4
    psubw     m%1, m%3
    psubw     m%2, m%4
    pxor       m3, m3
    pxor       m1, m1
    pcmpgtw    m3, m%1
    pcmpgtw    m1, m%2
    pxor      m%1, m3
    pxor      m%2, m1
    psubw     m%1, m3
    psubw     m%2, m1
    paddw     m%2, m%1
    paddw      m6, m%2
%endmacro

; %1 = 8/16
%macro HF_NOISE 1
cglobal hf_noise%1, 3,3,0, pix1, lsize, h
    sub        hd, 2
    pxor       m7, m7
    pxor       m6, m6
    HF_NOISE_PART1 %1, 0, 1, 2, 3
    add     pix1q, lsizeq
    HF_NOISE_PART1 %1, 4, 1, 5, 3
    HF_NOISE_PART2     0, 2, 4, 5
    add     pix1q, lsizeq
.loop:
    HF_NOISE_PART1 %1, 0, 1, 2, 3
    HF_NOISE_PART2     4, 5, 0, 2
    add     pix1q, lsizeq
    HF_NOISE_PART1 %1, 4, 1, 5, 3
    HF_NOISE_PART2     0, 2, 4, 5
    add     pix1q, lsizeq
    sub        hd, 2
    jne .loop

    mova       m0, m6
    punpcklwd  m0, m7
    punpckhwd  m6, m7
    paddd      m6, m0
    mova       m0, m6
    psrlq      m6, 32
    paddd      m0, m6
    movd      eax, m0   ; eax = result of hf_noise8/16;
    REP_RET                 ; return eax;
%endmacro

INIT_MMX mmx
HF_NOISE 8
HF_NOISE 16

;---------------------------------------------------------------------------------------
;int ff_sad_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h);
;---------------------------------------------------------------------------------------
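; Plain SAD over a w x h block, roughly (illustrative C sketch; the unused
; MpegEncContext pointer is omitted, w is 8 or 16):
;     int sad_w(const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride,
;               int h, int w)
;     {
;         int sum = 0;
;         for (int y = 0; y < h; y++)
;             for (int x = 0; x < w; x++)
;                 sum += abs(pix1[y*stride + x] - pix2[y*stride + x]);
;         return sum;
;     }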
;%1 = 8/16
%macro SAD 1
cglobal sad%1, 5, 5, 3, v, pix1, pix2, stride, h
    movu      m2, [pix2q]
    movu      m1, [pix2q+strideq]
    psadbw    m2, [pix1q]
    psadbw    m1, [pix1q+strideq]
    paddw     m2, m1
%if %1 != mmsize
    movu      m0, [pix2q+8]
    movu      m1, [pix2q+strideq+8]
    psadbw    m0, [pix1q+8]
    psadbw    m1, [pix1q+strideq+8]
    paddw     m2, m0
    paddw     m2, m1
%endif
    sub       hd, 2

align 16
.loop:
    lea    pix1q, [pix1q+strideq*2]
    lea    pix2q, [pix2q+strideq*2]
    movu      m0, [pix2q]
    movu      m1, [pix2q+strideq]
    psadbw    m0, [pix1q]
    psadbw    m1, [pix1q+strideq]
    paddw     m2, m0
    paddw     m2, m1
%if %1 != mmsize
    movu      m0, [pix2q+8]
    movu      m1, [pix2q+strideq+8]
    psadbw    m0, [pix1q+8]
    psadbw    m1, [pix1q+strideq+8]
    paddw     m2, m0
    paddw     m2, m1
%endif
    sub       hd, 2
    jg .loop
%if mmsize == 16
    movhlps   m0, m2
    paddw     m2, m0
%endif
    movd     eax, m2
    RET
%endmacro

INIT_MMX mmxext
SAD 8
SAD 16
INIT_XMM sse2
SAD 16

;------------------------------------------------------------------------------------------
;int ff_sad_x2_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h);
;------------------------------------------------------------------------------------------
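; Same as plain SAD, except pix2 is replaced by its horizontal half-pel
; interpolation; illustratively (rounding as done by pavgb):
;     ref = (pix2[y*stride + x] + pix2[y*stride + x + 1] + 1) >> 1;
;     sum += abs(pix1[y*stride + x] - ref);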
;%1 = 8/16
%macro SAD_X2 1
cglobal sad%1_x2, 5, 5, 5, v, pix1, pix2, stride, h
    movu      m0, [pix2q]
    movu      m2, [pix2q+strideq]
%if mmsize == 16
    movu      m3, [pix2q+1]
    movu      m4, [pix2q+strideq+1]
    pavgb     m0, m3
    pavgb     m2, m4
%else
    pavgb     m0, [pix2q+1]
    pavgb     m2, [pix2q+strideq+1]
%endif
    psadbw    m0, [pix1q]
    psadbw    m2, [pix1q+strideq]
    paddw     m0, m2
%if %1 != mmsize
    movu      m1, [pix2q+8]
    movu      m2, [pix2q+strideq+8]
    pavgb     m1, [pix2q+9]
    pavgb     m2, [pix2q+strideq+9]
    psadbw    m1, [pix1q+8]
    psadbw    m2, [pix1q+strideq+8]
    paddw     m0, m1
    paddw     m0, m2
%endif
    sub       hd, 2

align 16
.loop:
    lea    pix1q, [pix1q+2*strideq]
    lea    pix2q, [pix2q+2*strideq]
    movu      m1, [pix2q]
    movu      m2, [pix2q+strideq]
%if mmsize == 16
    movu      m3, [pix2q+1]
    movu      m4, [pix2q+strideq+1]
    pavgb     m1, m3
    pavgb     m2, m4
%else
    pavgb     m1, [pix2q+1]
    pavgb     m2, [pix2q+strideq+1]
%endif
    psadbw    m1, [pix1q]
    psadbw    m2, [pix1q+strideq]
    paddw     m0, m1
    paddw     m0, m2
%if %1 != mmsize
    movu      m1, [pix2q+8]
    movu      m2, [pix2q+strideq+8]
    pavgb     m1, [pix2q+9]
    pavgb     m2, [pix2q+strideq+9]
    psadbw    m1, [pix1q+8]
    psadbw    m2, [pix1q+strideq+8]
    paddw     m0, m1
    paddw     m0, m2
%endif
    sub       hd, 2
    jg .loop
%if mmsize == 16
    movhlps   m1, m0
    paddw     m0, m1
%endif
    movd     eax, m0
    RET
%endmacro

INIT_MMX mmxext
SAD_X2 8
SAD_X2 16
INIT_XMM sse2
SAD_X2 16

;------------------------------------------------------------------------------------------
;int ff_sad_y2_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h);
;------------------------------------------------------------------------------------------
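; Same as plain SAD, except pix2 is replaced by its vertical half-pel
; interpolation; illustratively (rounding as done by pavgb):
;     ref = (pix2[y*stride + x] + pix2[(y + 1)*stride + x] + 1) >> 1;
;     sum += abs(pix1[y*stride + x] - ref);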
;%1 = 8/16
%macro SAD_Y2 1
cglobal sad%1_y2, 5, 5, 4, v, pix1, pix2, stride, h
    movu      m1, [pix2q]
    movu      m0, [pix2q+strideq]
    movu      m3, [pix2q+2*strideq]
    pavgb     m1, m0
    pavgb     m0, m3
    psadbw    m1, [pix1q]
    psadbw    m0, [pix1q+strideq]
    paddw     m0, m1
    mova      m1, m3
%if %1 != mmsize
    movu      m4, [pix2q+8]
    movu      m5, [pix2q+strideq+8]
    movu      m6, [pix2q+2*strideq+8]
    pavgb     m4, m5
    pavgb     m5, m6
    psadbw    m4, [pix1q+8]
    psadbw    m5, [pix1q+strideq+8]
    paddw     m0, m4
    paddw     m0, m5
    mova      m4, m6
%endif
    add    pix2q, strideq
    sub       hd, 2

align 16
.loop:
    lea    pix1q, [pix1q+2*strideq]
    lea    pix2q, [pix2q+2*strideq]
    movu      m2, [pix2q]
    movu      m3, [pix2q+strideq]
    pavgb     m1, m2
    pavgb     m2, m3
    psadbw    m1, [pix1q]
    psadbw    m2, [pix1q+strideq]
    paddw     m0, m1
    paddw     m0, m2
    mova      m1, m3
%if %1 != mmsize
    movu      m5, [pix2q+8]
    movu      m6, [pix2q+strideq+8]
    pavgb     m4, m5
    pavgb     m5, m6
    psadbw    m4, [pix1q+8]
    psadbw    m5, [pix1q+strideq+8]
    paddw     m0, m4
    paddw     m0, m5
    mova      m4, m6
%endif
    sub       hd, 2
    jg .loop
%if mmsize == 16
    movhlps   m1, m0
    paddw     m0, m1
%endif
    movd     eax, m0
    RET
%endmacro

INIT_MMX mmxext
SAD_Y2 8
SAD_Y2 16
INIT_XMM sse2
SAD_Y2 16

;-------------------------------------------------------------------------------------------
;int ff_sad_approx_xy2_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h);
;-------------------------------------------------------------------------------------------
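; Approximate SAD against the 2D (xy) half-pel interpolation of pix2. The exact
; reference per pixel would be
;     ref = (pix2[y*stride + x]       + pix2[y*stride + x + 1] +
;            pix2[(y + 1)*stride + x] + pix2[(y + 1)*stride + x + 1] + 2) >> 2;
; the asm approximates that rounding by nesting pavgb (averaging the two per-row
; averages) and biasing the shared row average down by 1 (pb_1), hence "_approx".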
;%1 = 8/16
%macro SAD_APPROX_XY2 1
cglobal sad%1_approx_xy2, 5, 5, 7, v, pix1, pix2, stride, h
    mova      m4, [pb_1]
    movu      m1, [pix2q]
    movu      m0, [pix2q+strideq]
    movu      m3, [pix2q+2*strideq]
%if mmsize == 16
    movu      m5, [pix2q+1]
    movu      m6, [pix2q+strideq+1]
    movu      m2, [pix2q+2*strideq+1]
    pavgb     m1, m5
    pavgb     m0, m6
    pavgb     m3, m2
%else
    pavgb     m1, [pix2q+1]
    pavgb     m0, [pix2q+strideq+1]
    pavgb     m3, [pix2q+2*strideq+1]
%endif
    psubusb   m0, m4
    pavgb     m1, m0
    pavgb     m0, m3
    psadbw    m1, [pix1q]
    psadbw    m0, [pix1q+strideq]
    paddw     m0, m1
    mova      m1, m3
%if %1 != mmsize
    movu      m5, [pix2q+8]
    movu      m6, [pix2q+strideq+8]
    movu      m7, [pix2q+2*strideq+8]
    pavgb     m5, [pix2q+1+8]
    pavgb     m6, [pix2q+strideq+1+8]
    pavgb     m7, [pix2q+2*strideq+1+8]
    psubusb   m6, m4
    pavgb     m5, m6
    pavgb     m6, m7
    psadbw    m5, [pix1q+8]
    psadbw    m6, [pix1q+strideq+8]
    paddw     m0, m5
    paddw     m0, m6
    mova      m5, m7
%endif
    add    pix2q, strideq
    sub       hd, 2

align 16
.loop:
    lea    pix1q, [pix1q+2*strideq]
    lea    pix2q, [pix2q+2*strideq]
    movu      m2, [pix2q]
    movu      m3, [pix2q+strideq]
%if mmsize == 16
    movu      m5, [pix2q+1]
    movu      m6, [pix2q+strideq+1]
    pavgb     m2, m5
    pavgb     m3, m6
%else
    pavgb     m2, [pix2q+1]
    pavgb     m3, [pix2q+strideq+1]
%endif
    psubusb   m2, m4
    pavgb     m1, m2
    pavgb     m2, m3
    psadbw    m1, [pix1q]
    psadbw    m2, [pix1q+strideq]
    paddw     m0, m1
    paddw     m0, m2
    mova      m1, m3
%if %1 != mmsize
    movu      m6, [pix2q+8]
    movu      m7, [pix2q+strideq+8]
    pavgb     m6, [pix2q+8+1]
    pavgb     m7, [pix2q+strideq+8+1]
    psubusb   m6, m4
    pavgb     m5, m6
    pavgb     m6, m7
    psadbw    m5, [pix1q+8]
    psadbw    m6, [pix1q+strideq+8]
    paddw     m0, m5
    paddw     m0, m6
    mova      m5, m7
%endif
    sub       hd, 2
    jg .loop
%if mmsize == 16
    movhlps   m1, m0
    paddw     m0, m1
%endif
    movd     eax, m0
    RET
%endmacro

INIT_MMX mmxext
SAD_APPROX_XY2 8
SAD_APPROX_XY2 16
INIT_XMM sse2
SAD_APPROX_XY2 16

;--------------------------------------------------------------------
;int ff_vsad_intra(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
;                  ptrdiff_t line_size, int h);
;--------------------------------------------------------------------
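; Vertical SAD within pix1 itself; pix2 is unused. Roughly (illustrative C
; sketch, w is 8 or 16):
;     int vsad_intra_w(const uint8_t *pix1, ptrdiff_t line_size, int h, int w)
;     {
;         int sum = 0;
;         for (int y = 0; y < h - 1; y++, pix1 += line_size)
;             for (int x = 0; x < w; x++)
;                 sum += abs(pix1[x] - pix1[x + line_size]);
;         return sum;
;     }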
; %1 = 8/16
%macro VSAD_INTRA 1
cglobal vsad_intra%1, 5, 5, 3, v, pix1, pix2, lsize, h
    mova      m0, [pix1q]
%if %1 == mmsize
    mova      m2, [pix1q+lsizeq]
    psadbw    m0, m2
%else
    mova      m2, [pix1q+lsizeq]
    mova      m3, [pix1q+8]
    mova      m4, [pix1q+lsizeq+8]
    psadbw    m0, m2
    psadbw    m3, m4
    paddw     m0, m3
%endif
    sub       hd, 2

.loop:
    lea    pix1q, [pix1q + 2*lsizeq]
%if %1 == mmsize
    mova      m1, [pix1q]
    psadbw    m2, m1
    paddw     m0, m2
    mova      m2, [pix1q+lsizeq]
    psadbw    m1, m2
    paddw     m0, m1
%else
    mova      m1, [pix1q]
    mova      m3, [pix1q+8]
    psadbw    m2, m1
    psadbw    m4, m3
    paddw     m0, m2
    paddw     m0, m4
    mova      m2, [pix1q+lsizeq]
    mova      m4, [pix1q+lsizeq+8]
    psadbw    m1, m2
    psadbw    m3, m4
    paddw     m0, m1
    paddw     m0, m3
%endif
    sub       hd, 2
    jg     .loop

%if mmsize == 16
    pshufd m1, m0, 0xe
    paddd  m0, m1
%endif
    movd eax, m0
    RET
%endmacro

INIT_MMX mmxext
VSAD_INTRA 8
VSAD_INTRA 16
INIT_XMM sse2
VSAD_INTRA 16

;---------------------------------------------------------------------
;int ff_vsad_approx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
;                   ptrdiff_t line_size, int h);
;---------------------------------------------------------------------
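; Vertical SAD of the difference image pix1 - pix2, roughly (illustrative C
; sketch, w is 8 or 16; the asm forms the per-pixel differences with wrapping
; byte arithmetic, hence "_approx"):
;     int vsad_approx_w(const uint8_t *p1, const uint8_t *p2,
;                       ptrdiff_t line_size, int h, int w)
;     {
;         int sum = 0;
;         for (int y = 0; y < h - 1; y++, p1 += line_size, p2 += line_size)
;             for (int x = 0; x < w; x++)
;                 sum += abs((p1[x] - p2[x]) -
;                            (p1[x + line_size] - p2[x + line_size]));
;         return sum;
;     }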
; %1 = 8/16
%macro VSAD_APPROX 1
cglobal vsad%1_approx, 5, 5, 5, v, pix1, pix2, lsize, h
    mova   m1, [pb_80]
    mova   m0, [pix1q]
%if %1 == mmsize ; vsad8_mmxext, vsad16_sse2
    mova   m4, [pix1q+lsizeq]
%if mmsize == 16
    movu   m3, [pix2q]
    movu   m2, [pix2q+lsizeq]
    psubb  m0, m3
    psubb  m4, m2
%else
    psubb  m0, [pix2q]
    psubb  m4, [pix2q+lsizeq]
%endif
    pxor   m0, m1
    pxor   m4, m1
    psadbw m0, m4
%else ; vsad16_mmxext
    mova   m3, [pix1q+8]
    psubb  m0, [pix2q]
    psubb  m3, [pix2q+8]
    pxor   m0, m1
    pxor   m3, m1
    mova   m4, [pix1q+lsizeq]
    mova   m5, [pix1q+lsizeq+8]
    psubb  m4, [pix2q+lsizeq]
    psubb  m5, [pix2q+lsizeq+8]
    pxor   m4, m1
    pxor   m5, m1
    psadbw m0, m4
    psadbw m3, m5
    paddw  m0, m3
%endif
    sub    hd, 2

.loop:
    lea pix1q, [pix1q + 2*lsizeq]
    lea pix2q, [pix2q + 2*lsizeq]
    mova   m2, [pix1q]
%if %1 == mmsize ; vsad8_mmxext, vsad16_sse2
%if mmsize == 16
    movu   m3, [pix2q]
    psubb  m2, m3
%else
    psubb  m2, [pix2q]
%endif
    pxor   m2, m1
    psadbw m4, m2
    paddw  m0, m4
    mova   m4, [pix1q+lsizeq]
    movu   m3, [pix2q+lsizeq]
    psubb  m4, m3
    pxor   m4, m1
    psadbw m2, m4
    paddw  m0, m2
%else ; vsad16_mmxext
    mova   m3, [pix1q+8]
    psubb  m2, [pix2q]
    psubb  m3, [pix2q+8]
    pxor   m2, m1
    pxor   m3, m1
    psadbw m4, m2
    psadbw m5, m3
    paddw  m0, m4
    paddw  m0, m5
    mova   m4, [pix1q+lsizeq]
    mova   m5, [pix1q+lsizeq+8]
    psubb  m4, [pix2q+lsizeq]
    psubb  m5, [pix2q+lsizeq+8]
    pxor   m4, m1
    pxor   m5, m1
    psadbw m2, m4
    psadbw m3, m5
    paddw  m0, m2
    paddw  m0, m3
%endif
    sub    hd, 2
    jg  .loop

%if mmsize == 16
    pshufd m1, m0, 0xe
    paddd  m0, m1
%endif
    movd  eax, m0
    RET
%endmacro

INIT_MMX mmxext
VSAD_APPROX 8
VSAD_APPROX 16
INIT_XMM sse2
VSAD_APPROX 16