1cabdff1aSopenharmony_ci;*****************************************************************************
2cabdff1aSopenharmony_ci;* SIMD-optimized motion compensation estimation
3cabdff1aSopenharmony_ci;*****************************************************************************
4cabdff1aSopenharmony_ci;* Copyright (c) 2000, 2001 Fabrice Bellard
5cabdff1aSopenharmony_ci;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6cabdff1aSopenharmony_ci;*
7cabdff1aSopenharmony_ci;* This file is part of FFmpeg.
8cabdff1aSopenharmony_ci;*
9cabdff1aSopenharmony_ci;* FFmpeg is free software; you can redistribute it and/or
10cabdff1aSopenharmony_ci;* modify it under the terms of the GNU Lesser General Public
11cabdff1aSopenharmony_ci;* License as published by the Free Software Foundation; either
12cabdff1aSopenharmony_ci;* version 2.1 of the License, or (at your option) any later version.
13cabdff1aSopenharmony_ci;*
14cabdff1aSopenharmony_ci;* FFmpeg is distributed in the hope that it will be useful,
15cabdff1aSopenharmony_ci;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16cabdff1aSopenharmony_ci;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17cabdff1aSopenharmony_ci;* Lesser General Public License for more details.
18cabdff1aSopenharmony_ci;*
19cabdff1aSopenharmony_ci;* You should have received a copy of the GNU Lesser General Public
20cabdff1aSopenharmony_ci;* License along with FFmpeg; if not, write to the Free Software
21cabdff1aSopenharmony_ci;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22cabdff1aSopenharmony_ci;*****************************************************************************
23cabdff1aSopenharmony_ci
24cabdff1aSopenharmony_ci%include "libavutil/x86/x86util.asm"
25cabdff1aSopenharmony_ci
26cabdff1aSopenharmony_ciSECTION_RODATA
27cabdff1aSopenharmony_ci
28cabdff1aSopenharmony_cicextern pb_1
29cabdff1aSopenharmony_cicextern pb_80
30cabdff1aSopenharmony_ci
31cabdff1aSopenharmony_ciSECTION .text
32cabdff1aSopenharmony_ci
; DIFF_PIXELS_1 dst, tmp, src1, src2
; %1 = destination register, %2 = scratch register (clobbered),
; %3 = first memory operand, %4 = second memory operand
; Leaves (%3 byte - %4 byte) as 16-bit words in %1 for the bytes that movh
; loads. No zero register is needed: both words get the %3 byte as their high
; byte, so the high bytes cancel in the word subtraction.
%macro DIFF_PIXELS_1 4
    movh            %2, %4      ; %2 = bytes of the second operand
    movh            %1, %3      ; %1 = bytes of the first operand
    punpcklbw       %2, %1      ; word of %2 = (%3 byte << 8) | %4 byte
    punpcklbw       %1, %1      ; word of %1 = (%3 byte << 8) | %3 byte
    psubw           %1, %2      ; %1 = %3 byte - %4 byte, per word
%endmacro
40cabdff1aSopenharmony_ci
; DIFF_PIXELS_8 pix1, pix2, offset, stride, stride3, tmp
; %1 = uint8_t *pix1, %2 = uint8_t *pix2, %3 = static byte offset,
; %4 = stride, %5 = stride*3, %6 = address of temporary storage
; Fills m0..m7 with the word differences of eight consecutive rows (row n ends
; up in m<n>), one DIFF_PIXELS_1 per row. %1 and %2 are advanced by %5 after
; the third row so the remaining rows are reachable with the multiples of %4
; used below, and both pointers are restored before the macro ends.
; When m8 is not defined, mmsize bytes of (aligned) storage at %6 are required:
; m0 is parked there while it serves as scratch for the last row. When m8
; exists it is the scratch register and %6 is not touched.
%macro DIFF_PIXELS_8 6
    ; rows 0-2: m7 is still unused and acts as scratch
    DIFF_PIXELS_1   m0, m7, [%1     +%3], [%2     +%3]
    DIFF_PIXELS_1   m1, m7, [%1+%4  +%3], [%2+%4  +%3]
    DIFF_PIXELS_1   m2, m7, [%1+%4*2+%3], [%2+%4*2+%3]
    add             %2, %5
    add             %1, %5
    ; rows 3-6, addressed relative to row 3
    DIFF_PIXELS_1   m3, m7, [%1     +%3], [%2     +%3]
    DIFF_PIXELS_1   m4, m7, [%1+%4  +%3], [%2+%4  +%3]
    DIFF_PIXELS_1   m5, m7, [%1+%4*2+%3], [%2+%4*2+%3]
    DIFF_PIXELS_1   m6, m7, [%1+%5  +%3], [%2+%5  +%3]
    ; row 7 lands in m7 itself, so another scratch register is needed
%ifndef m8
    mova          [%6], m0
    DIFF_PIXELS_1   m7, m0, [%1+%4*4+%3], [%2+%4*4+%3]
    mova            m0, [%6]
%else
    DIFF_PIXELS_1   m7, m8, [%1+%4*4+%3], [%2+%4*4+%3]
%endif
    ; undo the pointer advance
    sub             %2, %5
    sub             %1, %5
%endmacro
64cabdff1aSopenharmony_ci
; HADAMARD8 (no arguments)
; Three rounds of SUMSUB_BADC (provided by the included x86util.asm) over the
; word lanes of m0..m7. The register pairs are one apart in the first round,
; two apart in the second and four apart in the third; the two invocations of
; a round touch disjoint registers.
; NOTE(review): this reads as the butterfly network of an 8-point Hadamard
; transform across the eight registers - confirm against SUMSUB_BADC.
%macro HADAMARD8 0
    ; round 1: (0,1) (2,3) (4,5) (6,7)
    SUMSUB_BADC       w, 4, 5, 6, 7
    SUMSUB_BADC       w, 0, 1, 2, 3
    ; round 2: (0,2) (1,3) (4,6) (5,7)
    SUMSUB_BADC       w, 4, 6, 5, 7
    SUMSUB_BADC       w, 0, 2, 1, 3
    ; round 3: (0,4) (1,5) (2,6) (3,7)
    SUMSUB_BADC       w, 2, 6, 3, 7
    SUMSUB_BADC       w, 0, 4, 1, 5
%endmacro
73cabdff1aSopenharmony_ci
; ABS1_SUM src, tmp, acc
; %1 = register to take the absolute value of (overwritten by ABS1),
; %2 = scratch register handed to ABS1, %3 = accumulator
; Adds the result of ABS1 on %1 to %3 with an unsigned saturating word add.
%macro ABS1_SUM 3
    ABS1            %1, %2
    paddusw         %3, %1
%endmacro
78cabdff1aSopenharmony_ci
; ABS2_SUM src0, src1, tmp0, tmp1, acc0, acc1
; Two-register form of ABS1_SUM: ABS2 is applied to %1/%2 with %3/%4 as its
; scratch arguments, then %5 += %1 and %6 += %2 using unsigned saturating word
; adds. %1 and %2 are overwritten by ABS2.
%macro ABS2_SUM 6
    ABS2            %1, %2, %3, %4
    paddusw         %6, %2
    paddusw         %5, %1
%endmacro
84cabdff1aSopenharmony_ci
; ABS_SUM_8x8_64 tmp_addr
; Sums the absolute values of the words held in m0..m7 into m0 (unsigned
; saturating adds); even-numbered registers are collected in m0, odd-numbered
; ones in m1, and the two are merged at the end. m8/m9 are passed to ABS2 as
; scratch. m1..m7 are overwritten.
; %1 is not used; the parameter exists so that this macro can be invoked the
; same way as ABS_SUM_8x8_32 through the ABS_SUM_8x8 alias.
%macro ABS_SUM_8x8_64 1
    ABS2            m0, m1, m8, m9
    ABS2_SUM        m4, m5, m8, m9, m0, m1
    ABS2_SUM        m2, m3, m8, m9, m0, m1
    ABS2_SUM        m6, m7, m8, m9, m0, m1
    paddusw         m0, m1      ; merge the even and odd partial sums
%endmacro
92cabdff1aSopenharmony_ci
; ABS_SUM_8x8_32 tmp_addr
; %1 = address of mmsize bytes of temporary storage (written with mova)
; Eight-register variant of ABS_SUM_8x8_64: leaves the unsigned saturating sum
; of the absolute values of m0..m7 in m0. Since no spare register exists, m7 is
; parked at [%1] so it can serve as the ABS1 scratch register, and is brought
; back through m2 once m2 itself has been accumulated. m1..m7 are overwritten.
%macro ABS_SUM_8x8_32 1
    mova          [%1], m7      ; free m7 for use as scratch
    ABS1            m1, m7      ; m1 starts the odd-register sum
    ABS1            m0, m7      ; m0 starts the even-register sum
    ABS1_SUM        m3, m7, m1
    ABS1_SUM        m2, m7, m0
    ABS1_SUM        m5, m7, m1
    ABS1_SUM        m4, m7, m0
    ABS1_SUM        m6, m7, m0
    mova            m2, [%1]    ; the original m7
    ABS1_SUM        m2, m7, m1
    paddusw         m0, m1      ; merge the even and odd partial sums
%endmacro
106cabdff1aSopenharmony_ci
; FIXME: HSUM saturates at 64k, while an 8x8 hadamard or dct block can get up to
; about 100k on extreme inputs. But that's very unlikely to occur in natural video,
; and it's even more unlikely to not have any alternative mvs/modes with lower cost.
;
; HSUM src, tmp, dst
; %1 = register holding word partial sums (modified), %2 = scratch register,
; %3 = 32-bit general purpose register receiving the result
; Folds the words of %1 into its lowest word with unsigned saturating adds and
; copies the low dword of %1 to %3. Only the low 16 bits of %3 are the sum; the
; callers in this file mask %3 with 0xFFFF afterwards.
%macro HSUM 3
%if cpuflag(sse2)
    movhlps         %2, %1              ; low half of %2 = high half of %1
    paddusw         %1, %2              ; 8 words -> 4 words
    pshuflw         %2, %1, 0xE         ; words 2,3 moved to positions 0,1
    paddusw         %1, %2              ; 4 words -> 2 words
    pshuflw         %2, %1, 0x1         ; word 1 moved to position 0
    paddusw         %1, %2              ; 2 words -> 1 word
    movd            %3, %1
%elif cpuflag(mmxext)
    pshufw          %2, %1, 0xE         ; words 2,3 moved to positions 0,1
    paddusw         %1, %2              ; 4 words -> 2 words
    pshufw          %2, %1, 0x1         ; word 1 moved to position 0
    paddusw         %1, %2              ; 2 words -> 1 word
    movd            %3, %1
%elif cpuflag(mmx)
    ; no word shuffle available: fold with 64-bit shifts instead
    mova            %2, %1
    psrlq           %1, 32
    paddusw         %1, %2
    mova            %2, %1
    psrlq           %1, 16
    paddusw         %1, %2
    movd            %3, %1
%endif
%endmacro
135cabdff1aSopenharmony_ci
; STORE4 base, r0, r1, r2, r3
; %1 = address expression (without brackets), %2-%5 = registers
; Writes %2..%5 with mova to four consecutive mmsize-byte slots starting at
; [%1].
%macro STORE4 5
    mova [%1], %2
    mova [%1+mmsize], %3
    mova [%1+mmsize*2], %4
    mova [%1+mmsize*3], %5
%endmacro
142cabdff1aSopenharmony_ci
; LOAD4 base, r0, r1, r2, r3
; %1 = address expression (without brackets), %2-%5 = registers
; Counterpart of STORE4: reads %2..%5 with mova from four consecutive
; mmsize-byte slots starting at [%1].
%macro LOAD4 5
    mova            %2, [%1]
    mova            %3, [%1+mmsize]
    mova            %4, [%1+mmsize*2]
    mova            %5, [%1+mmsize*3]
%endmacro
149cabdff1aSopenharmony_ci
; hadamard8_16_wrapper xmm_regs, stack_slots
; %1 = XMM register count handed to cglobal
; %2 = number of mmsize-sized scratch slots to reserve on the stack for the
;      8x8 helper (only reserved when m8 is not defined)
; Emits the two public entry points built on hadamard8x8_diff %+ SUFFIX:
;   hadamard8_diff   - one 8x8 block
;   hadamard8_diff16 - two 8x8 blocks side by side, plus the two blocks eight
;                      rows further down when r4d (h) equals 16
; Arguments follow the helper's use of r1 (src1), r2 (src2) and r3 (stride).
; The helper addresses its scratch memory as rsp+gprsize, i.e. just above the
; return address pushed by the call, which is why the space is reserved here
; and not in the helper.
%macro hadamard8_16_wrapper 2
cglobal hadamard8_diff, 4, 4, %1
%ifndef m8
    %assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
    SUB            rsp, pad
%endif
    call hadamard8x8_diff %+ SUFFIX     ; result is already in eax
%ifndef m8
    ADD            rsp, pad
%endif
    RET

cglobal hadamard8_diff16, 5, 6, %1
%ifndef m8
    %assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
    SUB            rsp, pad
%endif

    ; left block of the upper row; r5d collects the total
    call hadamard8x8_diff %+ SUFFIX
    mov            r5d, eax

    ; right block of the upper row
    add             r2, 8
    add             r1, 8
    call hadamard8x8_diff %+ SUFFIX
    add            r5d, eax

    ; anything other than h == 16 stops after the upper row
    cmp            r4d, 16
    jne .finish

    ; left block of the lower row: eight rows down, eight columns back
    lea             r2, [r2+r3*8-8]
    lea             r1, [r1+r3*8-8]
    call hadamard8x8_diff %+ SUFFIX
    add            r5d, eax

    ; right block of the lower row
    add             r2, 8
    add             r1, 8
    call hadamard8x8_diff %+ SUFFIX
    add            r5d, eax

.finish:
    mov            eax, r5d
%ifndef m8
    ADD            rsp, pad
%endif
    RET
%endmacro
196cabdff1aSopenharmony_ci
; HADAMARD8_DIFF [xmm_regs]
; %1 = XMM register count forwarded to hadamard8_16_wrapper (only used by the
;      SSE2+ branch)
; Emits the internal helper hadamard8x8_diff %+ SUFFIX for the current
; instruction set, followed by the public hadamard8_diff / hadamard8_diff16
; entry points generated by hadamard8_16_wrapper.
;
; Helper contract:
;   in:  r1 = src1, r2 = src2, r3 = stride
;   out: eax = 16-bit sum produced by HSUM (see the FIXME on HSUM about
;        saturation)
;   r0 is overwritten with stride*3; r1, r2 and r3 are left unchanged, so the
;   16x16 wrapper can call the helper repeatedly and only adjust the pointers
;   scratch memory is addressed as rsp+gprsize, i.e. the caller's rsp before
;   the call pushed the return address
%macro HADAMARD8_DIFF 0-1
%if cpuflag(sse2)
; One register holds a full row of eight word differences:
; difference -> HADAMARD8 -> transpose -> HADAMARD8 -> sum of absolute values.
hadamard8x8_diff %+ SUFFIX:
    lea                          r0, [r3*3]
    DIFF_PIXELS_8                r1, r2,  0, r3, r0, rsp+gprsize
    HADAMARD8
%if ARCH_X86_64
    TRANSPOSE8x8W                 0,  1,  2,  3,  4,  5,  6,  7,  8
%else
    ; no ninth register: the transpose is given two stack slots instead
    TRANSPOSE8x8W                 0,  1,  2,  3,  4,  5,  6,  7, [rsp+gprsize], [rsp+mmsize+gprsize]
%endif
    HADAMARD8
    ABS_SUM_8x8         rsp+gprsize
    HSUM                        m0, m1, eax
    and                         eax, 0xFFFF
    ret

hadamard8_16_wrapper %1, 3
%elif cpuflag(mmx)
ALIGN 16
; int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1,
;                               uint8_t *src2, ptrdiff_t stride, int h)
; r0 = void *s = unused, int h = unused (always 8)
; Note how r1, r2 and r3 are not clobbered in this function, so the 16x16
; version can simply call it 2x2 times. That is also why rsp+gprsize is
; accessed everywhere: it is the rsp of the calling function.
;
; A register only holds four word differences here, so the block is handled as
; two 4-column halves. Scratch layout relative to rsp+gprsize:
;   0x00-0x1f  transposed m0-m3 of the first half
;   0x20-0x3f  transposed m0-m3 of the second half
;   0x40-0x5f  transposed m4-m7 of the first half
;   0x60-0x67  spill slot / partial sum
hadamard8x8_diff %+ SUFFIX:
    lea                          r0, [r3*3]

    ; first 4x8 pixels
    DIFF_PIXELS_8                r1, r2,  0, r3, r0, rsp+gprsize+0x60
    HADAMARD8
    mova         [rsp+gprsize+0x60], m7         ; m7 is the transpose scratch
    TRANSPOSE4x4W                 0,  1,  2,  3,  7
    STORE4              rsp+gprsize, m0, m1, m2, m3
    mova                         m7, [rsp+gprsize+0x60]
    TRANSPOSE4x4W                 4,  5,  6,  7,  0
    STORE4         rsp+gprsize+0x40, m4, m5, m6, m7

    ; second 4x8 pixels
    DIFF_PIXELS_8                r1, r2,  4, r3, r0, rsp+gprsize+0x60
    HADAMARD8
    mova         [rsp+gprsize+0x60], m7
    TRANSPOSE4x4W                 0,  1,  2,  3,  7
    STORE4         rsp+gprsize+0x20, m0, m1, m2, m3
    mova                         m7, [rsp+gprsize+0x60]
    TRANSPOSE4x4W                 4,  5,  6,  7,  0

    ; second pass on m4-m7 of both halves (the second half is still in m4-m7)
    LOAD4          rsp+gprsize+0x40, m0, m1, m2, m3
    HADAMARD8
    ABS_SUM_8x8_32 rsp+gprsize+0x60
    mova         [rsp+gprsize+0x60], m0         ; keep the first partial sum

    ; second pass on m0-m3 of both halves
    LOAD4          rsp+gprsize     , m0, m1, m2, m3
    LOAD4          rsp+gprsize+0x20, m4, m5, m6, m7
    HADAMARD8
    ABS_SUM_8x8_32 rsp+gprsize
    paddusw                      m0, [rsp+gprsize+0x60]

    HSUM                         m0, m1, eax
    ; 32-bit mask, matching the SSE2 helper: movd into eax already cleared the
    ; upper half of rax, so a 64-bit operation is not needed
    and                         eax, 0xFFFF
    ret

hadamard8_16_wrapper 0, 14
%endif
%endmacro

; The MMXEXT version is only assembled when the stack is not guaranteed to be
; aligned.
%if HAVE_ALIGNED_STACK == 0
INIT_MMX mmxext
HADAMARD8_DIFF
%endif

; SSE2: ABS_SUM_8x8_64 needs m8/m9 as scratch, so it is only selected on x86-64.
INIT_XMM sse2
%if ARCH_X86_64
%define ABS_SUM_8x8 ABS_SUM_8x8_64
%else
%define ABS_SUM_8x8 ABS_SUM_8x8_32
%endif
HADAMARD8_DIFF 10

; NOTE(review): the _64 variant is selected unconditionally for SSSE3,
; presumably because ABS2 does not touch its scratch arguments there - confirm
; against ABS2 in x86util.asm.
INIT_XMM ssse3
%define ABS_SUM_8x8 ABS_SUM_8x8_64
HADAMARD8_DIFF 9
280cabdff1aSopenharmony_ci
; int ff_sse*_*(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
;               ptrdiff_t line_size, int h)
;
; SUM_SQUARED_ERRORS width
; %1 = block width in pixels (8 or 16)
; Emits sse%1, which returns in eax the sum of (pix1[x] - pix2[x])^2 over the
; rows it visits. v is not read.
; When %1 == mmsize a register covers a whole row: h is halved up front and
; every iteration consumes two rows (h is expected to be even). Otherwise
; (16 pixels wide with 8-byte registers) every iteration consumes the two
; halves of a single row.
; The loop tests its counter at the bottom, so at least one iteration runs.
%macro SUM_SQUARED_ERRORS 1
cglobal sse%1, 5,5,8, v, pix1, pix2, lsize, h
    pxor      m7, m7         ; m7 = running sum (dwords)
    pxor      m0, m0         ; m0 = 0, used to widen bytes to words
%if mmsize == %1
    shr       hd, 1          ; two rows per iteration
%endif

.next_rows: ; FIXME why are these unaligned movs? pix1[] is aligned
    movu      m2, [pix2q]    ; m2 = pix2[0][0-15], [0-7] for mmx
    movu      m1, [pix1q]    ; m1 = pix1[0][0-15], [0-7] for mmx
%if mmsize == %1
    movu      m4, [pix2q+lsizeq] ; m4 = pix2[1][0-15], [0-7] for mmx
    movu      m3, [pix1q+lsizeq] ; m3 = pix1[1][0-15], [0-7] for mmx
%else  ; %1 / 2 == mmsize; mmx only
    mova      m4, [pix2q+8]  ; m4 = pix2[0][8-15]
    mova      m3, [pix1q+8]  ; m3 = pix1[0][8-15]
%endif

    ; absolute difference of unsigned bytes: saturating subtraction in both
    ; directions (one of the two results is always zero), then OR
    mova      m6, m3
    mova      m5, m1
    psubusb   m3, m4
    psubusb   m1, m2
    psubusb   m4, m6
    psubusb   m2, m5

    por       m4, m3         ; m4 = |m3 - m4|
    por       m2, m1         ; m2 = |m1 - m2|

    ; widen to 16-bit words so the differences can be squared
    mova      m3, m4
    mova      m1, m2

    punpckhbw m4, m0
    punpckhbw m2, m0
    punpcklbw m3, m0         ; second difference now spread over (m3,m4)
    punpcklbw m1, m0         ; first difference now spread over (m1,m2)

    ; pmaddwd of a register with itself: squares, summed pairwise into dwords
    pmaddwd   m4, m4
    pmaddwd   m2, m2
    pmaddwd   m3, m3
    pmaddwd   m1, m1

    paddd     m3, m4
    paddd     m1, m2
    paddd     m7, m3
    paddd     m7, m1

%if mmsize == %1
    lea    pix2q, [pix2q + lsizeq*2]
    lea    pix1q, [pix1q + lsizeq*2]
%else
    add    pix2q, lsizeq
    add    pix1q, lsizeq
%endif
    dec       hd
    jnz .next_rows

    HADDD     m7, m1         ; horizontal add of the dword partial sums
    movd     eax, m7         ; return value
    RET
%endmacro

INIT_MMX mmx
SUM_SQUARED_ERRORS 8

INIT_MMX mmx
SUM_SQUARED_ERRORS 16

INIT_XMM sse2
SUM_SQUARED_ERRORS 16
358cabdff1aSopenharmony_ci
;-----------------------------------------------
;int ff_sum_abs_dctelem(int16_t *block)
;-----------------------------------------------
; %1 = number of xmm registers used
; %2 = number of inline loops
;
; Every inline repetition reads four registers' worth of words from block with
; mova (so block has to satisfy mova's alignment requirement) and adds their
; absolute values to two accumulators, m0 and m1. With the 16-byte registers
; and %2 = 2 used below, that covers 64 words.
; The sums are unsigned saturating 16-bit values and the result in eax is
; masked to 16 bits.
%macro SUM_ABS_DCTELEM 2
cglobal sum_abs_dctelem, 1, 1, %1, block
    pxor    m1, m1
    pxor    m0, m0
%assign %%reg 0
%rep %2
    mova      m3, [blockq+mmsize*(%%reg+1)]
    mova      m2, [blockq+mmsize*(%%reg+0)]
    mova      m5, [blockq+mmsize*(%%reg+3)]
    mova      m4, [blockq+mmsize*(%%reg+2)]
    ABS1_SUM  m3, m6, m1
    ABS1_SUM  m2, m6, m0
    ABS1_SUM  m4, m6, m0
    ABS1_SUM  m5, m6, m1
%assign %%reg %%reg+4
%endrep
    paddusw m0, m1          ; merge the two accumulators
    HSUM    m0, m1, eax
    and     eax, 0xFFFF     ; only the low word of the HSUM result is the sum
    RET
%endmacro

INIT_XMM sse2
SUM_ABS_DCTELEM 7, 2
INIT_XMM ssse3
SUM_ABS_DCTELEM 6, 2
391cabdff1aSopenharmony_ci
;------------------------------------------------------------------------------
; int ff_hf_noise*_mmx(uint8_t *pix1, ptrdiff_t lsize, int h)
;------------------------------------------------------------------------------
; HF_NOISE_PART1 width, a, b, c, d
; %1 = 8/16. %2-5=m#
; Horizontal gradient of the row at [pix1q]: the differences between each byte
; and its right-hand neighbour are left as words, low half in m%2 and high half
; in m%4. m%3 and m%5 are scratch. m7 has to be zero on entry.
; %1 == 8:   the neighbours are derived from the loaded bytes by shifting, and
;            the top byte of both operands is cleared so that the last lane
;            becomes 0 and nothing beyond the loaded bytes is used.
; otherwise: the neighbours are loaded from [pix1q+1].
%macro HF_NOISE_PART1 5
    mova      m%2, [pix1q]
%if %1 == 8
    mova      m%3, m%2
    psrlq     m%3, 8        ; m%3 = bytes 1.., top byte zero
    psllq     m%2, 8
    psrlq     m%2, 8        ; m%2 = bytes 0.., top byte cleared
%else
    mova      m%3, [pix1q+1]
%endif
    mova      m%5, m%3
    mova      m%4, m%2
    punpcklbw m%3, m7       ; widen the low halves to words
    punpcklbw m%2, m7
    punpckhbw m%5, m7       ; widen the high halves to words
    punpckhbw m%4, m7
    psubw     m%4, m%5
    psubw     m%2, m%3
%endmacro
415cabdff1aSopenharmony_ci
; HF_NOISE_PART2 a_lo, a_hi, b_lo, b_hi
; %1-4 = m#
; Computes m%1 -= m%3 and m%2 -= m%4, takes the absolute value of both word
; vectors and adds them to the accumulator m6 (wrapping word adds).
; m%3 and m%4 are only read. Clobbers m1, m3, m%1 and m%2, so none of the
; arguments may be 1 or 3.
%macro HF_NOISE_PART2 4
    psubw     m%2, m%4
    psubw     m%1, m%3
    ; absolute value without a dedicated instruction:
    ; mask = (x < 0) ? 0xFFFF : 0;  |x| = (x ^ mask) - mask
    pxor       m1, m1
    pxor       m3, m3
    pcmpgtw    m1, m%2
    pcmpgtw    m3, m%1
    pxor      m%2, m1
    pxor      m%1, m3
    psubw     m%2, m1
    psubw     m%1, m3
    paddw     m%2, m%1
    paddw      m6, m%2
%endmacro
431cabdff1aSopenharmony_ci
; HF_NOISE width
; %1 = 8/16 (forwarded to HF_NOISE_PART1)
; Emits hf_noise%1(pix1, lsize, h). For every pair of vertically adjacent rows
; the horizontal gradients (HF_NOISE_PART1) are subtracted from each other and
; the absolute values are accumulated as words in m6 (HF_NOISE_PART2); the
; word lanes are widened and added up at the end and returned in eax.
; Register use: m7 = 0, m6 = accumulator, (m0,m2) and (m4,m5) alternate as the
; gradients of the previous and the current row, m1/m3 are scratch.
; Two rows are consumed before the loop and two per iteration, and the loop
; tests its counter at the bottom, so h is expected to be even and at least 4.
%macro HF_NOISE 1
cglobal hf_noise%1, 3,3,0, pix1, lsize, h
    sub        hd, 2                ; rows 0 and 1 are handled before the loop
    pxor       m7, m7
    pxor       m6, m6
    HF_NOISE_PART1 %1, 0, 1, 2, 3
    add     pix1q, lsizeq
    HF_NOISE_PART1 %1, 4, 1, 5, 3
    HF_NOISE_PART2     0, 2, 4, 5
    add     pix1q, lsizeq
.loop:
    HF_NOISE_PART1 %1, 0, 1, 2, 3
    HF_NOISE_PART2     4, 5, 0, 2
    add     pix1q, lsizeq
    HF_NOISE_PART1 %1, 4, 1, 5, 3
    HF_NOISE_PART2     0, 2, 4, 5
    add     pix1q, lsizeq
    sub        hd, 2
    ; signed "greater than zero" test, as in the sad loops: identical to the
    ; former jne for even h >= 4, but an odd or too small h can no longer step
    ; past zero and keep the loop running
    jg .loop

    ; widen the word sums in m6 to dwords (m7 = 0) and add them up
    mova       m0, m6
    punpcklwd  m0, m7
    punpckhwd  m6, m7
    paddd      m6, m0
    mova       m0, m6
    psrlq      m6, 32
    paddd      m0, m6
    movd      eax, m0   ; eax = result of hf_noise8/16
    REP_RET                 ; return eax;
%endmacro

INIT_MMX mmx
HF_NOISE 8
HF_NOISE 16
467cabdff1aSopenharmony_ci
;---------------------------------------------------------------------------------------
;int ff_sad<8|16>_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h);
;---------------------------------------------------------------------------------------
;%1 = 8/16
; Emits sad%1: sum of absolute differences between pix1 and pix2 over a block
; %1 pixels wide, two rows at a time, returned in eax. v is not read.
; pix2 is loaded with movu while pix1 is used directly as the memory operand of
; psadbw. When %1 != mmsize (16 wide with 8-byte registers) the right half of
; each row is handled by a second set of loads at offset 8.
; The first two rows are summed before the loop, which tests its counter at
; the bottom, so h is expected to be even and at least 4.
%macro SAD 1
cglobal sad%1, 5, 5, 3, v, pix1, pix2, stride, h
    ; rows 0 and 1 initialise the accumulator m2
    movu      m1, [pix2q+strideq]
    movu      m2, [pix2q]
    psadbw    m1, [pix1q+strideq]
    psadbw    m2, [pix1q]
    paddw     m2, m1
%if mmsize != %1
    movu      m1, [pix2q+strideq+8]
    movu      m0, [pix2q+8]
    psadbw    m1, [pix1q+strideq+8]
    psadbw    m0, [pix1q+8]
    paddw     m2, m1
    paddw     m2, m0
%endif
    sub       hd, 2

align 16
.next_rows:
    lea    pix2q, [pix2q+2*strideq]
    lea    pix1q, [pix1q+2*strideq]
    movu      m1, [pix2q+strideq]
    movu      m0, [pix2q]
    psadbw    m1, [pix1q+strideq]
    psadbw    m0, [pix1q]
    paddw     m2, m1
    paddw     m2, m0
%if mmsize != %1
    movu      m1, [pix2q+strideq+8]
    movu      m0, [pix2q+8]
    psadbw    m1, [pix1q+strideq+8]
    psadbw    m0, [pix1q+8]
    paddw     m2, m1
    paddw     m2, m0
%endif
    sub       hd, 2
    jg .next_rows
%if mmsize == 16
    ; psadbw leaves one partial sum per 64-bit half: add the upper to the lower
    movhlps   m0, m2
    paddw     m2, m0
%endif
    movd     eax, m2
    RET
%endmacro

INIT_MMX mmxext
SAD 8
SAD 16
INIT_XMM sse2
SAD 16
522cabdff1aSopenharmony_ci
;------------------------------------------------------------------------------------------
;int ff_sad<8|16>_x2_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h);
;------------------------------------------------------------------------------------------
;%1 = 8/16
; Emits sad%1_x2: like sad%1, but every pix2 row is first averaged (pavgb)
; with the same row shifted by one byte, i.e. pix2[x] with pix2[x+1], before
; the sum of absolute differences against pix1 is taken. v is not read.
; With 16-byte registers the shifted row is loaded with movu into m3/m4
; because it cannot be used as a direct memory operand there; with 8-byte
; registers it is used as the pavgb memory operand.
; The first two rows are summed before the loop, which tests its counter at
; the bottom, so h is expected to be even and at least 4.
%macro SAD_X2 1
cglobal sad%1_x2, 5, 5, 5, v, pix1, pix2, stride, h
    ; rows 0 and 1 initialise the accumulator m0
    movu      m2, [pix2q+strideq]
    movu      m0, [pix2q]
%if mmsize == 16
    movu      m4, [pix2q+strideq+1]
    movu      m3, [pix2q+1]
    pavgb     m2, m4
    pavgb     m0, m3
%else
    pavgb     m2, [pix2q+strideq+1]
    pavgb     m0, [pix2q+1]
%endif
    psadbw    m2, [pix1q+strideq]
    psadbw    m0, [pix1q]
    paddw     m0, m2
%if mmsize != %1
    ; right half of a 16 pixel wide row with 8-byte registers
    movu      m2, [pix2q+strideq+8]
    movu      m1, [pix2q+8]
    pavgb     m2, [pix2q+strideq+9]
    pavgb     m1, [pix2q+9]
    psadbw    m2, [pix1q+strideq+8]
    psadbw    m1, [pix1q+8]
    paddw     m0, m2
    paddw     m0, m1
%endif
    sub       hd, 2

align 16
.next_rows:
    lea    pix2q, [pix2q+strideq*2]
    lea    pix1q, [pix1q+strideq*2]
    movu      m2, [pix2q+strideq]
    movu      m1, [pix2q]
%if mmsize == 16
    movu      m4, [pix2q+strideq+1]
    movu      m3, [pix2q+1]
    pavgb     m2, m4
    pavgb     m1, m3
%else
    pavgb     m2, [pix2q+strideq+1]
    pavgb     m1, [pix2q+1]
%endif
    psadbw    m2, [pix1q+strideq]
    psadbw    m1, [pix1q]
    paddw     m0, m2
    paddw     m0, m1
%if mmsize != %1
    movu      m2, [pix2q+strideq+8]
    movu      m1, [pix2q+8]
    pavgb     m2, [pix2q+strideq+9]
    pavgb     m1, [pix2q+9]
    psadbw    m2, [pix1q+strideq+8]
    psadbw    m1, [pix1q+8]
    paddw     m0, m2
    paddw     m0, m1
%endif
    sub       hd, 2
    jg .next_rows
%if mmsize == 16
    ; psadbw leaves one partial sum per 64-bit half: add the upper to the lower
    movhlps   m1, m0
    paddw     m0, m1
%endif
    movd     eax, m0
    RET
%endmacro

INIT_MMX mmxext
SAD_X2 8
SAD_X2 16
INIT_XMM sse2
SAD_X2 16
599cabdff1aSopenharmony_ci
;------------------------------------------------------------------------------------------
; int ff_sad<8|16>_y2_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
;                           ptrdiff_t stride, int h);
;
; SAD between pix1 and the vertically averaged pix2: row r of pix1 is compared
; with pavgb(pix2 row r, pix2 row r+1), so h+1 rows of pix2 are read.
;
; %1 = block width in pixels (8/16)
;
; - v is not referenced by the code.
; - pix1 is used directly as a psadbw memory operand (which requires 16-byte
;   alignment in the SSE2 build); pix2 is always loaded with movu.
; - Two rows are handled before the loop and two more per loop iteration, and
;   the loop body always runs at least once, so h is expected to be even
;   and >= 4.
; - The sum is returned in eax.
;------------------------------------------------------------------------------------------
%macro SAD_Y2 1
cglobal sad%1_y2, 5, 5, 4, v, pix1, pix2, stride, h
    ; Rows 0 and 1 of the result, built from pix2 rows 0..2.
    movu      m1, [pix2q]
    movu      m0, [pix2q+strideq]
    movu      m3, [pix2q+2*strideq]
    pavgb     m1, m0                    ; avg(pix2 row 0, row 1)
    pavgb     m0, m3                    ; avg(pix2 row 1, row 2)
    psadbw    m1, [pix1q]
    psadbw    m0, [pix1q+strideq]
    paddw     m0, m1                    ; m0 = running total
    mova      m1, m3                    ; m1 = last pix2 row loaded (carried into the loop)
%if %1 != mmsize
    ; 16-wide block on 8-byte registers: same work for bytes 8..15.
    movu      m4, [pix2q+8]
    movu      m5, [pix2q+strideq+8]
    movu      m6, [pix2q+2*strideq+8]
    pavgb     m4, m5
    pavgb     m5, m6
    psadbw    m4, [pix1q+8]
    psadbw    m5, [pix1q+strideq+8]
    paddw     m0, m4
    paddw     m0, m5
    mova      m4, m6                    ; m4 = carried row, right half
%endif
    add    pix2q, strideq               ; pix2q runs one row ahead of pix1q from here on
    sub       hd, 2                     ; two rows done (flags are not tested here)

align 16
.loop:
    lea    pix1q, [pix1q+2*strideq]
    lea    pix2q, [pix2q+2*strideq]
    movu      m2, [pix2q]               ; row following the carried one
    movu      m3, [pix2q+strideq]
    pavgb     m1, m2                    ; avg(carried row, next row)
    pavgb     m2, m3
    psadbw    m1, [pix1q]
    psadbw    m2, [pix1q+strideq]
    paddw     m0, m1
    paddw     m0, m2
    mova      m1, m3                    ; carry the newest row to the next iteration
%if %1 != mmsize
    movu      m5, [pix2q+8]
    movu      m6, [pix2q+strideq+8]
    pavgb     m4, m5
    pavgb     m5, m6
    psadbw    m4, [pix1q+8]
    psadbw    m5, [pix1q+strideq+8]
    paddw     m0, m4
    paddw     m0, m5
    mova      m4, m6
%endif
    sub       hd, 2
    jg .loop
%if mmsize == 16
    ; psadbw leaves one partial sum per 64-bit half; fold high into low.
    movhlps   m1, m0
    paddw     m0, m1
%endif
    movd     eax, m0
    RET
%endmacro

; 8- and 16-wide versions on MMX registers, 16-wide version on XMM registers.
INIT_MMX mmxext
SAD_Y2 8
SAD_Y2 16
INIT_XMM sse2
SAD_Y2 16
669cabdff1aSopenharmony_ci
;-------------------------------------------------------------------------------------------
; int ff_sad<8|16>_approx_xy2_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
;                                   ptrdiff_t stride, int h);
;
; Approximate SAD between pix1 and pix2 averaged both horizontally and
; vertically. Each pix2 row is first averaged with itself shifted by one pixel
; (pavgb of [p] and [p+1]); vertically adjacent averaged rows are then averaged
; again with pavgb. Before that second step every odd row (1, 3, ...) has 1
; subtracted with unsigned saturation (psubusb with pb_1), presumably to offset
; the upward rounding of the two cascaded pavgb steps. The result is therefore
; not the exactly rounded 4-tap average.
;
; %1 = block width in pixels (8/16)
;
; - v is not referenced by the code.
; - pix1 is used directly as a psadbw memory operand (which requires 16-byte
;   alignment in the SSE2 build); pix2 may be unaligned.
; - h+1 rows of pix2 are read, each one byte wider than the block.
; - Two rows are handled before the loop and two more per loop iteration, and
;   the loop body always runs at least once, so h is expected to be even
;   and >= 4.
; - The sum is returned in eax.
;-------------------------------------------------------------------------------------------
%macro SAD_APPROX_XY2 1
cglobal sad%1_approx_xy2, 5, 5, 7, v, pix1, pix2, stride, h
    mova      m4, [pb_1]                ; m4 = constant used for the rounding correction
    movu      m1, [pix2q]
    movu      m0, [pix2q+strideq]
    movu      m3, [pix2q+2*strideq]
%if mmsize == 16
    ; The +1 addresses are loaded with movu before pavgb in the XMM build.
    movu      m5, [pix2q+1]
    movu      m6, [pix2q+strideq+1]
    movu      m2, [pix2q+2*strideq+1]
    pavgb     m1, m5
    pavgb     m0, m6
    pavgb     m3, m2
%else
    pavgb     m1, [pix2q+1]
    pavgb     m0, [pix2q+strideq+1]
    pavgb     m3, [pix2q+2*strideq+1]
%endif
    ; m1/m0/m3 = horizontally averaged rows 0/1/2.
    psubusb   m0, m4                    ; row 1 minus 1, saturating at 0
    pavgb     m1, m0                    ; rows 0 and 1
    pavgb     m0, m3                    ; rows 1 and 2
    psadbw    m1, [pix1q]
    psadbw    m0, [pix1q+strideq]
    paddw     m0, m1                    ; m0 = running total
    mova      m1, m3                    ; m1 = averaged row 2, carried into the loop
%if %1 != mmsize
    ; 16-wide block on 8-byte registers: same work for bytes 8..15.
    movu      m5, [pix2q+8]
    movu      m6, [pix2q+strideq+8]
    movu      m7, [pix2q+2*strideq+8]
    pavgb     m5, [pix2q+1+8]
    pavgb     m6, [pix2q+strideq+1+8]
    pavgb     m7, [pix2q+2*strideq+1+8]
    psubusb   m6, m4
    pavgb     m5, m6
    pavgb     m6, m7
    psadbw    m5, [pix1q+8]
    psadbw    m6, [pix1q+strideq+8]
    paddw     m0, m5
    paddw     m0, m6
    mova      m5, m7                    ; m5 = carried averaged row, right half
%endif
    add    pix2q, strideq               ; pix2q runs one row ahead of pix1q from here on
    sub       hd, 2                     ; two rows done (flags are not tested here)

align 16
.loop:
    lea    pix1q, [pix1q+2*strideq]
    lea    pix2q, [pix2q+2*strideq]
    movu      m2, [pix2q]               ; row following the carried one
    movu      m3, [pix2q+strideq]
%if mmsize == 16
    movu      m5, [pix2q+1]
    movu      m6, [pix2q+strideq+1]
    pavgb     m2, m5
    pavgb     m3, m6
%else
    pavgb     m2, [pix2q+1]
    pavgb     m3, [pix2q+strideq+1]
%endif
    psubusb   m2, m4                    ; correction on the odd row of the pair
    pavgb     m1, m2                    ; carried row and corrected row
    pavgb     m2, m3
    psadbw    m1, [pix1q]
    psadbw    m2, [pix1q+strideq]
    paddw     m0, m1
    paddw     m0, m2
    mova      m1, m3                    ; carry the newest averaged row
%if %1 != mmsize
    movu      m6, [pix2q+8]
    movu      m7, [pix2q+strideq+8]
    pavgb     m6, [pix2q+8+1]
    pavgb     m7, [pix2q+strideq+8+1]
    psubusb   m6, m4
    pavgb     m5, m6
    pavgb     m6, m7
    psadbw    m5, [pix1q+8]
    psadbw    m6, [pix1q+strideq+8]
    paddw     m0, m5
    paddw     m0, m6
    mova      m5, m7
%endif
    sub       hd, 2
    jg .loop
%if mmsize == 16
    ; psadbw leaves one partial sum per 64-bit half; fold high into low.
    movhlps   m1, m0
    paddw     m0, m1
%endif
    movd     eax, m0
    RET
%endmacro

; 8- and 16-wide versions on MMX registers, 16-wide version on XMM registers.
INIT_MMX mmxext
SAD_APPROX_XY2 8
SAD_APPROX_XY2 16
INIT_XMM sse2
SAD_APPROX_XY2 16
770cabdff1aSopenharmony_ci
;--------------------------------------------------------------------
; int ff_vsad_intra<8|16>_<opt>(MpegEncContext *v, uint8_t *pix1,
;                               uint8_t *pix2, ptrdiff_t line_size,
;                               int h);
;
; Vertical SAD inside a single block: sum over all vertically adjacent
; row pairs of pix1 of the absolute byte differences between the two
; rows. v and pix2 are not referenced by the code.
;
; %1 = block width in pixels (8/16)
;
; - Rows of pix1 are read with aligned loads (mova).
; - One row pair is handled before the loop and two per loop
;   iteration; the loop body always runs at least once, so h is
;   expected to be even and >= 4, giving h-1 row pairs in total.
; - The sum is returned in eax.
;--------------------------------------------------------------------
%macro VSAD_INTRA 1
cglobal vsad_intra%1, 5, 5, 3, v, pix1, pix2, lsize, h
    mova      m0, [pix1q]
%if %1 == mmsize
    mova      m2, [pix1q+lsizeq]        ; m2 = row 1, carried into the loop
    psadbw    m0, m2                    ; m0 = SAD(row 0, row 1) = running total
%else
    ; 16-wide block on 8-byte registers: m2/m4 carry left/right halves.
    mova      m2, [pix1q+lsizeq]
    mova      m3, [pix1q+8]
    mova      m4, [pix1q+lsizeq+8]
    psadbw    m0, m2
    psadbw    m3, m4
    paddw     m0, m3
%endif
    sub       hd, 2                     ; rows 0 and 1 consumed (flags are not tested here)

.loop:
    lea    pix1q, [pix1q + 2*lsizeq]
%if %1 == mmsize
    mova      m1, [pix1q]
    psadbw    m2, m1                    ; SAD(carried row, this row)
    paddw     m0, m2
    mova      m2, [pix1q+lsizeq]        ; becomes the carried row
    psadbw    m1, m2                    ; SAD(this row, next row)
    paddw     m0, m1
%else
    mova      m1, [pix1q]
    mova      m3, [pix1q+8]
    psadbw    m2, m1
    psadbw    m4, m3
    paddw     m0, m2
    paddw     m0, m4
    mova      m2, [pix1q+lsizeq]
    mova      m4, [pix1q+lsizeq+8]
    psadbw    m1, m2
    psadbw    m3, m4
    paddw     m0, m1
    paddw     m0, m3
%endif
    sub       hd, 2
    jg     .loop

%if mmsize == 16
    ; Fold the upper 64-bit partial sum into the lower one.
    pshufd m1, m0, 0xe
    paddd  m0, m1
%endif
    movd eax, m0
    RET
%endmacro

; 8- and 16-wide versions on MMX registers, 16-wide version on XMM registers.
INIT_MMX mmxext
VSAD_INTRA 8
VSAD_INTRA 16
INIT_XMM sse2
VSAD_INTRA 16
831cabdff1aSopenharmony_ci
;---------------------------------------------------------------------
; int ff_vsad<8|16>_approx_<opt>(MpegEncContext *v, uint8_t *pix1,
;                                uint8_t *pix2, ptrdiff_t line_size,
;                                int h);
;
; Vertical SAD of the difference between two blocks. For every row r
; the byte-wise difference d[r] = pix1[r] - pix2[r] is formed with
; wrapping arithmetic (psubb) and XORed with pb_80; the result is the
; sum over adjacent rows of SAD(d[r], d[r+1]).
; Assuming pb_80 holds 0x80 in every byte (it is declared extern, so
; this is taken from its name), the XOR maps signed bytes onto
; unsigned order so that the unsigned psadbw measures the distance
; between signed differences. Because each difference is kept in only
; 8 bits, the result is approximate.
;
; %1 = block width in pixels (8/16)
;
; - v is not referenced by the code.
; - pix1 rows are read with aligned loads (mova). pix2 is loaded with
;   movu in the XMM build and used as a direct memory operand in the
;   MMX build.
; - One row pair is handled before the loop and two per loop
;   iteration; the loop body always runs at least once, so h is
;   expected to be even and >= 4, giving h-1 row pairs in total.
; - The sum is returned in eax.
;---------------------------------------------------------------------
%macro VSAD_APPROX 1
cglobal vsad%1_approx, 5, 5, 5, v, pix1, pix2, lsize, h
    mova   m1, [pb_80]                  ; m1 = bias constant, live for the whole function
    mova   m0, [pix1q]
%if %1 == mmsize ; vsad8_mmxext, vsad16_sse2
    mova   m4, [pix1q+lsizeq]
%if mmsize == 16
    movu   m3, [pix2q]
    movu   m2, [pix2q+lsizeq]
    psubb  m0, m3
    psubb  m4, m2
%else
    psubb  m0, [pix2q]
    psubb  m4, [pix2q+lsizeq]
%endif
    pxor   m0, m1                       ; biased d[0]
    pxor   m4, m1                       ; biased d[1], carried into the loop
    psadbw m0, m4                       ; m0 = running total
%else ; vsad16_mmxext
    ; m4/m5 carry the left/right halves of the last biased difference row.
    mova   m3, [pix1q+8]
    psubb  m0, [pix2q]
    psubb  m3, [pix2q+8]
    pxor   m0, m1
    pxor   m3, m1
    mova   m4, [pix1q+lsizeq]
    mova   m5, [pix1q+lsizeq+8]
    psubb  m4, [pix2q+lsizeq]
    psubb  m5, [pix2q+lsizeq+8]
    pxor   m4, m1
    pxor   m5, m1
    psadbw m0, m4
    psadbw m3, m5
    paddw  m0, m3
%endif
    sub    hd, 2                        ; rows 0 and 1 consumed (flags are not tested here)

.loop:
    lea pix1q, [pix1q + 2*lsizeq]
    lea pix2q, [pix2q + 2*lsizeq]
    mova   m2, [pix1q]
%if %1 == mmsize ; vsad8_mmxext, vsad16_sse2
%if mmsize == 16
    movu   m3, [pix2q]
    psubb  m2, m3
%else
    psubb  m2, [pix2q]
%endif
    pxor   m2, m1                       ; biased difference of this row
    psadbw m4, m2                       ; against the carried row
    paddw  m0, m4
    mova   m4, [pix1q+lsizeq]
    ; Same operand form as for the first row of the pair: a separate
    ; unaligned load is only needed in the XMM build.
%if mmsize == 16
    movu   m3, [pix2q+lsizeq]
    psubb  m4, m3
%else
    psubb  m4, [pix2q+lsizeq]
%endif
    pxor   m4, m1                       ; becomes the carried row
    psadbw m2, m4
    paddw  m0, m2
%else ; vsad16_mmxext
    mova   m3, [pix1q+8]
    psubb  m2, [pix2q]
    psubb  m3, [pix2q+8]
    pxor   m2, m1
    pxor   m3, m1
    psadbw m4, m2
    psadbw m5, m3
    paddw  m0, m4
    paddw  m0, m5
    mova   m4, [pix1q+lsizeq]
    mova   m5, [pix1q+lsizeq+8]
    psubb  m4, [pix2q+lsizeq]
    psubb  m5, [pix2q+lsizeq+8]
    pxor   m4, m1
    pxor   m5, m1
    psadbw m2, m4
    psadbw m3, m5
    paddw  m0, m2
    paddw  m0, m3
%endif
    sub    hd, 2
    jg  .loop

%if mmsize == 16
    ; Fold the upper 64-bit partial sum into the lower one.
    pshufd m1, m0, 0xe
    paddd  m0, m1
%endif
    movd  eax, m0
    RET
%endmacro

; 8- and 16-wide versions on MMX registers, 16-wide version on XMM registers.
INIT_MMX mmxext
VSAD_APPROX 8
VSAD_APPROX 16
INIT_XMM sse2
VSAD_APPROX 16
930