1cabdff1aSopenharmony_ci;*****************************************************************************
2cabdff1aSopenharmony_ci;* x86util.asm
3cabdff1aSopenharmony_ci;*****************************************************************************
4cabdff1aSopenharmony_ci;* Copyright (C) 2008-2010 x264 project
5cabdff1aSopenharmony_ci;*
6cabdff1aSopenharmony_ci;* Authors: Loren Merritt <lorenm@u.washington.edu>
7cabdff1aSopenharmony_ci;*          Holger Lubitz <holger@lubitz.org>
8cabdff1aSopenharmony_ci;*
9cabdff1aSopenharmony_ci;* This file is part of FFmpeg.
10cabdff1aSopenharmony_ci;*
11cabdff1aSopenharmony_ci;* FFmpeg is free software; you can redistribute it and/or
12cabdff1aSopenharmony_ci;* modify it under the terms of the GNU Lesser General Public
13cabdff1aSopenharmony_ci;* License as published by the Free Software Foundation; either
14cabdff1aSopenharmony_ci;* version 2.1 of the License, or (at your option) any later version.
15cabdff1aSopenharmony_ci;*
16cabdff1aSopenharmony_ci;* FFmpeg is distributed in the hope that it will be useful,
17cabdff1aSopenharmony_ci;* but WITHOUT ANY WARRANTY; without even the implied warranty of
18cabdff1aSopenharmony_ci;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19cabdff1aSopenharmony_ci;* Lesser General Public License for more details.
20cabdff1aSopenharmony_ci;*
21cabdff1aSopenharmony_ci;* You should have received a copy of the GNU Lesser General Public
22cabdff1aSopenharmony_ci;* License along with FFmpeg; if not, write to the Free Software
23cabdff1aSopenharmony_ci;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24cabdff1aSopenharmony_ci;******************************************************************************
25cabdff1aSopenharmony_ci
; Configuration symbols defined ahead of the x86inc.asm include below
; (presumably read by that file when it builds symbol names -- confirm there).
%define private_prefix ff
%define public_prefix  avpriv
; Alias: anything that references cpuflags_mmxext resolves to cpuflags_mmx2.
%define cpuflags_mmxext cpuflags_mmx2

%include "libavutil/x86/x86inc.asm"
31cabdff1aSopenharmony_ci
; PASS8ROWS(base, base3, stride, stride3)
; Expands to eight comma-separated memory operands, [base] through
; [base + 7*stride], ready to be handed to another macro as eight arguments.
; Rows 3..7 are addressed relative to base3, so for the expansion to match
; that range the caller supplies base3 = base + 3*stride and
; stride3 = 3*stride.
%define PASS8ROWS(base, base3, stride, stride3) \
    [base], \
    [base + stride], \
    [base + 2*stride], \
    [base3], \
    [base3 + stride], \
    [base3 + 2*stride], \
    [base3 + stride3], \
    [base3 + stride*4]
36cabdff1aSopenharmony_ci
; SBUTTERFLY type, src0, src1, tmp
; Interleaves the low halves of src0/src1 into src0 and the high halves of
; src0/src1 into src1.
;   %1 - element-size suffix appended to punpckl/punpckh (bw, wd, dq, qdq),
;        or dqqq to interleave whole 128-bit lanes of 256-bit registers
;   %2 - index of the register holding src0
;   %3 - index of the register holding src1
;   %4 - index of a scratch register (its old contents are lost)
; Example for %1 = wd:
;   in:  src0 = x0 x1 x2 x3 z0 z1 z2 z3
;        src1 = y0 y1 y2 y3 q0 q1 q2 q3
;   out: src0 = x0 y0 x1 y1 x2 y2 x3 y3
;        src1 = z0 q0 z1 q1 z2 q2 z3 q3
; The high interleave is always produced in m%4; the closing SWAP exchanges
; the names of %3 and %4 so that it is addressed as src1 afterwards.
%macro SBUTTERFLY 4
%ifidn %1, dqqq
    ; 256-bit lane variant: m%4 collects the lanes chosen by q0301 (constant
    ; defined outside this file), m%2 gets src1's low lane as its upper lane.
    vperm2i128  m%4, m%2, m%3, q0301
    vinserti128 m%2, m%2, xm%3, 1
%elif avx_enabled != 0
    ; Three-operand form: no copy of src0 is needed.
    punpckh%1 m%4, m%2, m%3
    punpckl%1 m%2, m%3
%else
    ; Two-operand form: duplicate src0 first because punpckl overwrites it.
    mova      m%4, m%2
    punpckl%1 m%2, m%3
    punpckh%1 m%4, m%3
%endif
    SWAP %3, %4
%endmacro
61cabdff1aSopenharmony_ci
; SBUTTERFLY2 type, src0, src1, tmp
; Interleave variant written only with three-operand instructions.
;   %1 - element-size suffix appended to punpckl/punpckh
;   %2 - index of the register holding src0
;   %3 - index of the register holding src1
;   %4 - index of a scratch register
; The low interleave is built in m%4 and the high interleave in m%2; the
; three-way SWAP then renames the registers (presumably so that %2 names the
; low and %3 the high interleave, as in SBUTTERFLY -- confirm against SWAP's
; definition in x86inc.asm).
%macro SBUTTERFLY2 4
    punpckl%1 m%4, m%2, m%3
    punpckh%1 m%2, m%2, m%3
    SWAP %2, %4, %3
%endmacro
67cabdff1aSopenharmony_ci
; SBUTTERFLYPS src0, src1, tmp
; Single-precision float counterpart of SBUTTERFLY2, using unpcklps/unpckhps.
;   %1 - index of the register holding src0
;   %2 - index of the register holding src1
;   %3 - index of a scratch register
; The low interleave is built in m%3 and the high interleave in m%1; the
; three-way SWAP then renames the registers (presumably leaving low in %1 and
; high in %2 -- confirm against SWAP's definition in x86inc.asm).
%macro SBUTTERFLYPS 3
    unpcklps m%3, m%1, m%2
    unpckhps m%1, m%1, m%2
    SWAP %1, %3, %2
%endmacro
73cabdff1aSopenharmony_ci
; SBUTTERFLYPD src0, src1, tmp
; 64-bit interleave done with float moves:
;   m%3 = { low 64 bits of src0, low 64 bits of src1 }   (movlhps)
;   m%2 = { high 64 bits of src0, high 64 bits of src1 } (movhlps)
; SWAP then renames m%3 to %1, so %1 names the low pair and %2 the high pair.
;   %1 - index of the register holding src0
;   %2 - index of the register holding src1
;   %3 - index of a scratch register
; NOTE(review): the three-operand spellings rely on x86inc.asm's instruction
; wrappers when AVX is not enabled -- confirm there.
%macro SBUTTERFLYPD 3
    movlhps m%3, m%1, m%2
    movhlps m%2, m%2, m%1
    SWAP %1, %3
%endmacro
79cabdff1aSopenharmony_ci
; TRANSPOSE4x4B r0, r1, r2, r3, tmp
; Two SBUTTERFLY stages: a byte interleave of the pairs (%1,%2) and (%3,%4),
; then a word interleave of the pairs (%1,%3) and (%2,%4).
;   %1-%4 - indices of the registers holding the four rows
;   %5    - index of a scratch register
; The final SWAP exchanges the names of %2 and %3.
%macro TRANSPOSE4x4B 5
    SBUTTERFLY bw, %1, %2, %5
    SBUTTERFLY bw, %3, %4, %5
    SBUTTERFLY wd, %1, %3, %5
    SBUTTERFLY wd, %2, %4, %5
    SWAP %2, %3
%endmacro
87cabdff1aSopenharmony_ci
; TRANSPOSE4x4W r0, r1, r2, r3, tmp
; Two SBUTTERFLY stages: a word interleave of the pairs (%1,%2) and (%3,%4),
; then a dword interleave of the pairs (%1,%3) and (%2,%4).
;   %1-%4 - indices of the registers holding the four rows
;   %5    - index of a scratch register
; The final SWAP exchanges the names of %2 and %3.
%macro TRANSPOSE4x4W 5
    SBUTTERFLY wd, %1, %2, %5
    SBUTTERFLY wd, %3, %4, %5
    SBUTTERFLY dq, %1, %3, %5
    SBUTTERFLY dq, %2, %4, %5
    SWAP %2, %3
%endmacro
95cabdff1aSopenharmony_ci
; TRANSPOSE2x4x4B r0, r1, r2, r3, tmp
; Three SBUTTERFLY stages (byte, word, then dword interleave). Unlike
; TRANSPOSE4x4B the last stage pairs (%1,%2) and (%3,%4), and there is no
; closing SWAP.
;   %1-%4 - indices of the registers holding the rows
;   %5    - index of a scratch register
%macro TRANSPOSE2x4x4B 5
    SBUTTERFLY bw,  %1, %2, %5
    SBUTTERFLY bw,  %3, %4, %5
    SBUTTERFLY wd,  %1, %3, %5
    SBUTTERFLY wd,  %2, %4, %5
    SBUTTERFLY dq,  %1, %2, %5
    SBUTTERFLY dq,  %3, %4, %5
%endmacro
104cabdff1aSopenharmony_ci
; TRANSPOSE2x4x4W r0, r1, r2, r3, tmp
; Three SBUTTERFLY stages (word, dword, then qword interleave). The last
; stage pairs (%1,%2) and (%3,%4), and there is no closing SWAP.
;   %1-%4 - indices of the registers holding the rows
;   %5    - index of a scratch register
%macro TRANSPOSE2x4x4W 5
    SBUTTERFLY wd,  %1, %2, %5
    SBUTTERFLY wd,  %3, %4, %5
    SBUTTERFLY dq,  %1, %3, %5
    SBUTTERFLY dq,  %2, %4, %5
    SBUTTERFLY qdq, %1, %2, %5
    SBUTTERFLY qdq, %3, %4, %5
%endmacro
113cabdff1aSopenharmony_ci
; TRANSPOSE4x4D r0, r1, r2, r3, tmp
; Two SBUTTERFLY stages: a dword interleave of the pairs (%1,%2) and (%3,%4),
; then a qword interleave of the pairs (%1,%3) and (%2,%4).
;   %1-%4 - indices of the registers holding the four rows
;   %5    - index of a scratch register
; The final SWAP exchanges the names of %2 and %3.
%macro TRANSPOSE4x4D 5
    SBUTTERFLY dq,  %1, %2, %5
    SBUTTERFLY dq,  %3, %4, %5
    SBUTTERFLY qdq, %1, %3, %5
    SBUTTERFLY qdq, %2, %4, %5
    SWAP %2, %3
%endmacro
121cabdff1aSopenharmony_ci
; TRANSPOSE4x4PS r0, r1, r2, r3, tmp
; identical behavior to TRANSPOSE4x4D, but using SSE1 float ops
; (SBUTTERFLYPS for the 32-bit stage, SBUTTERFLYPD for the 64-bit stage).
;   %1-%4 - indices of the registers holding the four rows
;   %5    - index of a scratch register
%macro TRANSPOSE4x4PS 5
    SBUTTERFLYPS %1, %2, %5
    SBUTTERFLYPS %3, %4, %5
    SBUTTERFLYPD %1, %3, %5
    SBUTTERFLYPD %2, %4, %5
    SWAP %2, %3
%endmacro
130cabdff1aSopenharmony_ci
; TRANSPOSE8x4D r0..r7, t0 [, t1 [, flag]]
; Dword/qword interleave network over eight registers.
;   %1-%8 - indices of the eight row registers
; x86-64 branch:
;   %9    - index of a scratch register
; x86-32 branch (no spare register is assumed):
;   %9, %10 - memory operands used as spill slots
;   %11     - optional; when present the final reload of m%3 from %9 is
;             skipped and that value is left in %9
%macro TRANSPOSE8x4D 9-11
%if ARCH_X86_64
    SBUTTERFLY dq,  %1, %2, %9
    SBUTTERFLY dq,  %3, %4, %9
    SBUTTERFLY dq,  %5, %6, %9
    SBUTTERFLY dq,  %7, %8, %9
    SBUTTERFLY qdq, %1, %3, %9
    SBUTTERFLY qdq, %2, %4, %9
    SBUTTERFLY qdq, %5, %7, %9
    SBUTTERFLY qdq, %6, %8, %9
    SWAP %2, %5
    SWAP %4, %7
%else
; in:  m0..m7
; out: m0..m7, unless %11 in which case m2 is in %9
; spills into %9 and %10
    ; Park row %7 in memory so its register can act as the first scratch.
    movdqa %9, m%7
    SBUTTERFLY dq,  %1, %2, %7
    ; Park the fresh %2 result, restore row %7, and use %2 as scratch instead.
    movdqa %10, m%2
    movdqa m%7, %9
    SBUTTERFLY dq,  %3, %4, %2
    SBUTTERFLY dq,  %5, %6, %2
    SBUTTERFLY dq,  %7, %8, %2
    SBUTTERFLY qdq, %1, %3, %2
    ; Park %3, bring %2 back from its slot, and use %3 as scratch from here on.
    movdqa %9, m%3
    movdqa m%2, %10
    SBUTTERFLY qdq, %2, %4, %3
    SBUTTERFLY qdq, %5, %7, %3
    SBUTTERFLY qdq, %6, %8, %3
    SWAP %2, %5
    SWAP %4, %7
%if %0<11
    ; Default: reload the value parked in %9 so all outputs are in registers.
    movdqa m%3, %9
%endif
%endif
%endmacro
167cabdff1aSopenharmony_ci
; TRANSPOSE8x8W r0..r7, t0 [, t1 [, flag]]
; Word/dword/qword interleave network over eight registers.
;   %1-%8 - indices of the eight row registers
; x86-64 branch:
;   %9    - index of a scratch register
; x86-32 branch (no spare register is assumed):
;   %9, %10 - memory operands used as spill slots
;   %11     - optional; when present, row %7 is expected to be in %9 already
;             on entry (the initial store is skipped) and the final reload
;             from %10 is skipped as well
%macro TRANSPOSE8x8W 9-11
%if ARCH_X86_64
    SBUTTERFLY wd,  %1, %2, %9
    SBUTTERFLY wd,  %3, %4, %9
    SBUTTERFLY wd,  %5, %6, %9
    SBUTTERFLY wd,  %7, %8, %9
    SBUTTERFLY dq,  %1, %3, %9
    SBUTTERFLY dq,  %2, %4, %9
    SBUTTERFLY dq,  %5, %7, %9
    SBUTTERFLY dq,  %6, %8, %9
    SBUTTERFLY qdq, %1, %5, %9
    SBUTTERFLY qdq, %2, %6, %9
    SBUTTERFLY qdq, %3, %7, %9
    SBUTTERFLY qdq, %4, %8, %9
    SWAP %2, %5
    SWAP %4, %7
%else
; in:  m0..m7, unless %11 in which case m6 is in %9
; out: m0..m7, unless %11 in which case m4 is in %10
; spills into %9 and %10
%if %0<11
    ; Park row %7 in memory so its register can act as the first scratch.
    movdqa %9, m%7
%endif
    SBUTTERFLY wd,  %1, %2, %7
    ; Park the fresh %2 result, restore row %7, and use %2 as scratch instead.
    movdqa %10, m%2
    movdqa m%7, %9
    SBUTTERFLY wd,  %3, %4, %2
    SBUTTERFLY wd,  %5, %6, %2
    SBUTTERFLY wd,  %7, %8, %2
    SBUTTERFLY dq,  %1, %3, %2
    ; Park %3, bring %2 back, and use %3 as scratch.
    movdqa %9, m%3
    movdqa m%2, %10
    SBUTTERFLY dq,  %2, %4, %3
    SBUTTERFLY dq,  %5, %7, %3
    SBUTTERFLY dq,  %6, %8, %3
    SBUTTERFLY qdq, %1, %5, %3
    SBUTTERFLY qdq, %2, %6, %3
    ; Park %2, bring %3 back, and use %2 as scratch for the last two steps.
    movdqa %10, m%2
    movdqa m%3, %9
    SBUTTERFLY qdq, %3, %7, %2
    SBUTTERFLY qdq, %4, %8, %2
    SWAP %2, %5
    SWAP %4, %7
%if %0<11
    ; Default: reload the value parked in %10 so all outputs are in registers.
    movdqa m%5, %10
%endif
%endif
%endmacro
216cabdff1aSopenharmony_ci
; TRANSPOSE16x16W r0..r15, t0, t1 [, flag]
; Interleave network over sixteen registers: a 128-bit lane stage (dqqq),
; then word, dword and qword stages. The dqqq stage uses vperm2i128 /
; vinserti128, i.e. 256-bit registers.
;   %1-%16   - indices of the sixteen row registers
;   %17, %18 - memory operands used as spill slots
;   %19      - optional; when present, row %7 is expected to be in %17
;              already on entry and the final reload from %18 is skipped
; No spare register is assumed: one row at a time is parked in a spill slot
; and its register serves as the SBUTTERFLY scratch.
%macro TRANSPOSE16x16W 18-19
; in:  m0..m15, unless %19 in which case m6 is in %17
; out: m0..m15, unless %19 in which case m4 is in %18
; spills into %17 and %18
%if %0 < 19
    mova       %17, m%7
%endif

    ; Stage 1: 128-bit lanes, scratch = %7 (then %14 once %7 is restored).
    SBUTTERFLY dqqq, %1,  %9, %7
    SBUTTERFLY dqqq, %2, %10, %7
    SBUTTERFLY dqqq, %3, %11, %7
    SBUTTERFLY dqqq, %4, %12, %7
    SBUTTERFLY dqqq, %5, %13, %7
    SBUTTERFLY dqqq, %6, %14, %7
    mova       %18, m%14
    mova       m%7, %17
    SBUTTERFLY dqqq, %7, %15, %14
    SBUTTERFLY dqqq, %8, %16, %14

    ; Stage 2: words, scratch = %14 (then %12 once %14 is restored).
    SBUTTERFLY  wd,  %1,  %2, %14
    SBUTTERFLY  wd,  %3,  %4, %14
    SBUTTERFLY  wd,  %5,  %6, %14
    SBUTTERFLY  wd,  %7,  %8, %14
    SBUTTERFLY  wd,  %9, %10, %14
    SBUTTERFLY  wd, %11, %12, %14
    mova       %17, m%12
    mova      m%14, %18
    SBUTTERFLY  wd, %13, %14, %12
    SBUTTERFLY  wd, %15, %16, %12

    ; Stage 3: dwords, scratch = %12 (then %11 once %12 is restored).
    SBUTTERFLY  dq,  %1,  %3, %12
    SBUTTERFLY  dq,  %2,  %4, %12
    SBUTTERFLY  dq,  %5,  %7, %12
    SBUTTERFLY  dq,  %6,  %8, %12
    SBUTTERFLY  dq,  %9, %11, %12
    mova       %18, m%11
    mova      m%12, %17
    SBUTTERFLY  dq, %10, %12, %11
    SBUTTERFLY  dq, %13, %15, %11
    SBUTTERFLY  dq, %14, %16, %11

    ; Stage 4: qwords, scratch = %11 (then %5 once %11 is restored).
    SBUTTERFLY qdq,  %1,  %5, %11
    SBUTTERFLY qdq,  %2,  %6, %11
    SBUTTERFLY qdq,  %3,  %7, %11
    SBUTTERFLY qdq,  %4,  %8, %11

    SWAP        %2, %5
    SWAP        %4, %7

    SBUTTERFLY qdq,  %9, %13, %11
    SBUTTERFLY qdq, %10, %14, %11
    mova      m%11, %18
    mova       %18, m%5
    SBUTTERFLY qdq, %11, %15, %5
    SBUTTERFLY qdq, %12, %16, %5

%if %0 < 19
    ; Default: reload the value parked in %18 so all outputs are in registers.
    mova       m%5, %18
%endif

    SWAP       %10, %13
    SWAP       %12, %15
%endmacro
280cabdff1aSopenharmony_ci
; TRANSPOSE_8X8B r0..r7
; Byte transpose built from a byte interleave followed by TRANSPOSE4x4W.
;   %1-%8 - indices of the eight row registers
; Rejected at assembly time when mmsize == 8.
; Steps: each odd/even pair (%1,%2), (%3,%4), ... is byte-interleaved into the
; odd-numbered argument; TRANSPOSE4x4W then runs on %1,%3,%5,%7 with %2 as its
; scratch; finally MOVHL (defined outside this file; presumably copies the
; high half of its source into the low half of its destination -- confirm)
; fills %2,%4,%6,%8 from %1,%3,%5,%7.
%macro TRANSPOSE_8X8B 8
    %if mmsize == 8
        %error "This macro does not support mmsize == 8"
    %endif
    punpcklbw m%1, m%2
    punpcklbw m%3, m%4
    punpcklbw m%5, m%6
    punpcklbw m%7, m%8
    TRANSPOSE4x4W %1, %3, %5, %7, %2
    MOVHL m%2, m%1
    MOVHL m%4, m%3
    MOVHL m%6, m%5
    MOVHL m%8, m%7
%endmacro
295cabdff1aSopenharmony_ci
; PABSW dst, src
; Packed absolute value of signed words: dst = |src|.
; Requires dst != src; ABS1/ABS2 are the in-place variants.
; Plain-MMX path: the result is computed in src, which is then renamed to
; dst by SWAP, so the original src value is not preserved.
%macro PABSW 2
%if cpuflag(ssse3)
    pabsw      %1, %2
%else
    pxor       %1, %1       ; dst = 0, common start of both fallbacks
%if cpuflag(mmxext)
    psubw      %1, %2       ; dst = -src
    pmaxsw     %1, %2       ; dst = max(-src, src)
%else
    pcmpgtw    %1, %2       ; dst = sign mask of src (-1 where src < 0)
    pxor       %2, %1       ; src ^= mask
    psubw      %2, %1       ; src -= mask, giving |src|
    SWAP       %1, %2       ; the result is addressed as dst from here on
%endif
%endif
%endmacro
312cabdff1aSopenharmony_ci
; PSIGNW dst, sign
; Applies a per-word sign to dst.
;   SSSE3:    psignw -- negates/keeps/zeroes each word of dst according to
;             the sign of the matching word of %2.
;   fallback: dst = (dst ^ %2) - %2, which negates a word where %2 is -1 and
;             leaves it untouched where %2 is 0.
; NOTE: the two paths read %2 differently (signed value vs. 0/-1 mask), so
; the caller has to prepare %2 to suit the path that gets assembled.
%macro PSIGNW 2
%if notcpuflag(ssse3)
    pxor       %1, %2
    psubw      %1, %2
%else
    psignw     %1, %2
%endif
%endmacro
321cabdff1aSopenharmony_ci
; ABS1 a, tmp
; In-place packed absolute value of signed words: a = |a|.
;   %1 - register to convert
;   %2 - scratch register, clobbered on the non-SSSE3 paths
%macro ABS1 2
%if cpuflag(ssse3)
    pabsw      %1, %1
%else
    pxor       %2, %2       ; tmp = 0, common start of both fallbacks
%if cpuflag(mmxext)
    psubw      %2, %1       ; tmp = -a
    pmaxsw     %1, %2       ; a = max(a, -a)
%else
    pcmpgtw    %2, %1       ; tmp = sign mask of a
    pxor       %1, %2       ; a ^= mask
    psubw      %1, %2       ; a -= mask
%endif
%endif
%endmacro
336cabdff1aSopenharmony_ci
; ABS2 a, b, tmp0, tmp1
; In-place packed absolute value of signed words for two registers at once,
; with the two dependency chains interleaved.
;   %1, %2 - registers to convert
;   %3, %4 - scratch registers, clobbered on the non-SSSE3 paths
%macro ABS2 4
%if cpuflag(ssse3)
    pabsw      %1, %1
    pabsw      %2, %2
%else
    pxor       %3, %3       ; tmp0 = 0
    pxor       %4, %4       ; tmp1 = 0
%if cpuflag(mmxext)
    psubw      %3, %1       ; tmp0 = -a
    psubw      %4, %2       ; tmp1 = -b
    pmaxsw     %1, %3       ; a = max(a, -a)
    pmaxsw     %2, %4       ; b = max(b, -b)
%else
    pcmpgtw    %3, %1       ; tmp0 = sign mask of a
    pcmpgtw    %4, %2       ; tmp1 = sign mask of b
    pxor       %1, %3
    pxor       %2, %4
    psubw      %1, %3
    psubw      %2, %4
%endif
%endif
%endmacro
359cabdff1aSopenharmony_ci
; ABSB a, tmp
; In-place packed absolute value of signed bytes.
;   %1 - source/destination mmreg
;   %2 - temp mmreg (unused for SSSE3)
; Fallback: the unsigned minimum of a and -a is |a|.
%macro ABSB 2
%if notcpuflag(ssse3)
    pxor    %2, %2
    psubb   %2, %1          ; tmp = -a
    pminub  %1, %2          ; a = min_unsigned(a, -a)
%else
    pabsb   %1, %1
%endif
%endmacro
369cabdff1aSopenharmony_ci
; ABSB2 src1, src2, tmp1, tmp2
; In-place packed absolute value of signed bytes for two registers at once.
;   %1, %2 - source/destination registers
;   %3, %4 - temp registers (unused for SSSE3)
%macro ABSB2 4
%if notcpuflag(ssse3)
    pxor    %3, %3
    pxor    %4, %4
    psubb   %3, %1          ; tmp1 = -src1
    psubb   %4, %2          ; tmp2 = -src2
    pminub  %1, %3          ; src1 = min_unsigned(src1, -src1)
    pminub  %2, %4          ; src2 = min_unsigned(src2, -src2)
%else
    pabsb   %1, %1
    pabsb   %2, %2
%endif
%endmacro
383cabdff1aSopenharmony_ci
; ABSD2 a, b, tmp0, tmp1
; In-place packed absolute value of signed dwords for two registers at once,
; using the sign-mask method: x = (x ^ mask) - mask, mask = (0 > x).
;   %1, %2 - registers to convert
;   %3, %4 - scratch registers (clobbered; they end up holding the sign masks)
%macro ABSD2 4
    pxor    %3, %3
    pxor    %4, %4
    pcmpgtd %3, %1
    pcmpgtd %4, %2
    pxor    %1, %3
    pxor    %2, %4
    psubd   %1, %3
    psubd   %2, %4
%endmacro
394cabdff1aSopenharmony_ci
; ABS4 a, b, c, d, tmp0, tmp1
; Word absolute value of four registers, done as two ABS2 invocations that
; share the same pair of scratch registers.
;   %1-%4 - registers to convert
;   %5, %6 - scratch registers
%macro ABS4 6
    ABS2 %1, %2, %5, %6
    ABS2 %3, %4, %5, %6
%endmacro
399cabdff1aSopenharmony_ci
; SPLATB_LOAD dst, addr, mask
; Loads the byte at [addr] and replicates it across dst.
;   %1 - destination register
;   %2 - address expression of the byte (used without brackets)
;   %3 - pshufb control operand, only used on the SSSE3 path
; The four bytes ending at the wanted byte are loaded, so the wanted byte
; sits in byte 3 of dst; %3 is therefore presumably a mask that selects
; byte 3 for every output byte -- confirm at the call sites.
%macro SPLATB_LOAD 3
    movd      %1, [%2-3] ;to avoid crossing a cacheline
%if cpuflag(ssse3)
    pshufb    %1, %3
%else
    punpcklbw %1, %1        ; word 3 now holds the wanted byte twice
    SPLATW    %1, %1, 3     ; SPLATW is defined outside this file
%endif
%endmacro
410cabdff1aSopenharmony_ci
; SPLATB_REG dst, gpr, mask
; Replicates the low byte of a general-purpose register across dst.
;   %1 - destination register
;   %2 - general-purpose register name; its dword form (%2d) is moved
;   %3 - pshufb control operand, only used on the SSSE3 path (presumably a
;        mask selecting byte 0 everywhere -- confirm at the call sites)
%macro SPLATB_REG 3
    movd      %1, %2d
%if cpuflag(ssse3)
    pshufb    %1, %3
%else
    punpcklbw %1, %1        ; word 0 now holds the wanted byte twice
    SPLATW    %1, %1, 0     ; SPLATW is defined outside this file
%endif
%endmacro
421cabdff1aSopenharmony_ci
; HADDD sum, junk
; Horizontal add of the dwords in %1; the total ends up in the low dword of
; %1 (the remaining elements hold partial sums).
;   %1 - register to reduce
;   %2 - scratch register, clobbered
%macro HADDD 2 ; sum junk
%if sizeof%1 == 32
; 256-bit input: fold the upper 128-bit lane onto the lower one. The two
; %defines shadow the operand names with their xmm-prefixed forms for the
; rest of the macro and are removed again by the %undefs at the end.
%define %2 xmm%2
    vextracti128 %2, %1, 1
%define %1 xmm%1
    paddd   %1, %2
%endif
%if mmsize >= 16
%if cpuflag(xop) && sizeof%1 == 16
    ; XOP: add adjacent dword pairs into qwords first.
    vphadddq %1, %1
%endif
    ; Fold the upper 64 bits onto the lower 64 bits.
    movhlps %2, %1
    paddd   %1, %2
%endif
%if notcpuflag(xop) || sizeof%1 != 16
; Final step: bring dword 1 down to dword 0 and add.
%if cpuflag(mmxext)
    PSHUFLW %2, %1, q0032
%else ; mmx
    mova    %2, %1
    psrlq   %2, 32
%endif
    paddd   %1, %2
%endif
%undef %1
%undef %2
%endmacro
448cabdff1aSopenharmony_ci
; HADDW reg, tmp
; Horizontal add of the signed words in %1.
;   %1 - register to reduce
;   %2 - scratch register, clobbered
; Generic path: pmaddwd against the pw_1 constant (defined outside this
; file) sums adjacent word pairs into dwords, then HADDD finishes the job.
; XOP path for 16-byte registers: vphaddwq sums each group of four words
; into a qword, and the two qwords are then added together.
%macro HADDW 2 ; reg, tmp
%if notcpuflag(xop) || sizeof%1 != 16
    pmaddwd   %1, [pw_1]
    HADDD     %1, %2
%else
    vphaddwq  %1, %1
    movhlps   %2, %1
    paddd     %1, %2
%endif
%endmacro
459cabdff1aSopenharmony_ci
; HADDPS dst, src, tmp
; Pairwise horizontal add of packed floats, matching haddps dst, src:
; sums of adjacent pairs of dst go to the low half of dst, sums of adjacent
; pairs of src go to the high half.
;   %1 - destination, also the first source
;   %2 - second source
;   %3 - scratch register, only touched without SSE3
; Fallback: gather the even-indexed and the odd-indexed elements with two
; shufps (q2020 / q3131 are constants defined outside this file) and add.
%macro HADDPS 3 ; dst, src, tmp
%if notcpuflag(sse3)
    movaps  %3, %1
    shufps  %1, %2, q2020
    shufps  %3, %2, q3131
    addps   %1, %3
%else
    haddps  %1, %1, %2
%endif
%endmacro
470cabdff1aSopenharmony_ci
; PALIGNR [dst,] src1, src2, imm, tmp
; palignr with a shift/or emulation for CPUs without SSSE3.
;   4 arguments: src1 doubles as the destination
;   5 arguments: an explicit destination precedes src1
;   imm - byte shift count
;   tmp - scratch register; only used by the emulation
; Emulation: dst = (src1 << (size - imm) bytes) | (src2 >> imm bytes), where
; size is 8 for mmsize == 8 and 16 otherwise.
%macro PALIGNR 4-5
%if cpuflag(ssse3)
; Native instruction; the trailing tmp argument is simply dropped.
%if %0==5
    palignr %1, %2, %3, %4
%else
    palignr %1, %2, %3
%endif
%else ; [dst,] src1, src2, imm, tmp
    ; Remember the destination name before the arguments are rotated.
    %define %%dst %1
%if %0==5
%ifnidn %1, %2
    mova    %%dst, %2
%endif
    ; Drop the explicit dst so that %1..%4 = src1, src2, imm, tmp in both
    ; calling forms.
    %rotate 1
%endif
; tmp receives src2 unless they are already the same register.
%ifnidn %4, %2
    mova    %4, %2
%endif
%if mmsize==8
    ; 64-bit registers: psllq/psrlq take a bit count.
    psllq   %%dst, (8-%3)*8
    psrlq   %4, %3*8
%else
    ; 128-bit registers: pslldq/psrldq take a byte count.
    pslldq  %%dst, 16-%3
    psrldq  %4, %3
%endif
    por     %%dst, %4
%endif
%endmacro
499cabdff1aSopenharmony_ci
; PAVGB dst, src [, tmp, mask]
; Rounded-up average of unsigned bytes: dst = (dst + src + 1) >> 1.
;   %1 - destination, also the first source
;   %2 - second source
;   %3 - scratch register, only used on the plain-MMX path
;   %4 - mask operand, only used on the plain-MMX path
; Plain-MMX path computes (a | b) - (((a ^ b) & mask) >> 1). Because the
; shift is done on whole qwords, the mask has to clear the low bit of every
; byte (presumably 0xFE in each byte -- confirm at the call sites). The
; result is produced in %3 and renamed to %1 by SWAP.
; If none of mmxext / 3dnow / mmx is set, nothing is emitted.
%macro PAVGB 2-4
%if cpuflag(mmxext)
    pavgb   %1, %2
%elif cpuflag(3dnow)
    pavgusb %1, %2
%elif cpuflag(mmx)
    movu   %3, %2
    por    %3, %1
    pxor   %1, %2
    pand   %1, %4
    psrlq  %1, 1
    psubb  %3, %1
    SWAP   %1, %3
%endif
%endmacro
515cabdff1aSopenharmony_ci
; PSHUFLW operands...
; Shuffles the low four words of a register: forwards all operands verbatim
; to pshuflw, or to pshufw when the register size is 8 bytes (mmsize == 8).
%macro PSHUFLW 1+
    %if mmsize != 8
        pshuflw %1
    %else
        pshufw %1
    %endif
%endmacro
523cabdff1aSopenharmony_ci
; PSWAPD dst, src
; Writes src to dst with its two dwords exchanged (64-bit MMX registers).
;   mmxext:   pshufw with the q1032 selector (constant defined outside this
;             file)
;   3dnowext: native pswapd
;   3dnow:    dst = src >> 32, then punpckldq puts the low dword of src on
;             top; this path requires dst != src
; If none of the three flags is set, nothing is emitted.
%macro PSWAPD 2
%if cpuflag(mmxext)
    pshufw    %1, %2, q1032
%elif cpuflag(3dnowext)
    pswapd    %1, %2
%elif cpuflag(3dnow)
    movq      %1, %2
    psrlq     %1, 32
    punpckldq %1, %2
%endif
%endmacro
535cabdff1aSopenharmony_ci
; DEINTB mask1, reg1, mask2, reg2, src
; Splits the bytes of two registers into masked and shifted parts.
;   %1 - index of the register receiving (mask & reg1)
;   %2 - index of reg1; afterwards holds reg1 with every word shifted right
;        by 8, i.e. the upper byte of each word
;   %3 - index of the register receiving (mask & reg2)
;   %4 - index of reg2; afterwards holds the upper byte of each word
;   %5 - the byte mask: either the index of a register that already holds it,
;        or any other operand, which is first loaded into m%1
; The mask is presumably 0x00FF per word so that %1/%3 get the lower byte of
; each word -- confirm at the call sites.
%macro DEINTB 5 ; mask, reg1, mask, reg2, optional src to fill masks from
%ifnum %5
    pand   m%3, m%5, m%4 ; src .. y6 .. y4
    pand   m%1, m%5, m%2 ; dst .. y6 .. y4
%else
    mova   m%1, %5
    pand   m%3, m%1, m%4 ; src .. y6 .. y4
    pand   m%1, m%1, m%2 ; dst .. y6 .. y4
%endif
    psrlw  m%2, 8        ; dst .. y7 .. y5
    psrlw  m%4, 8        ; src .. y7 .. y5
%endmacro
548cabdff1aSopenharmony_ci
; SUMSUB_BA type, a, b [, tmp]
; Butterfly: a = a + b, b = b - a (using the original a).
;   %1 - element-size suffix appended to padd/psub
;   %2 - index of register a
;   %3 - index of register b
;   %4 - optional index of a scratch register
; Without a scratch register the difference is obtained as 2*b - (a + b).
; With one, the AVX form builds the sum in tmp and renames it to a via SWAP;
; the non-AVX form keeps a copy of a in tmp instead.
%macro SUMSUB_BA 3-4
%if %0 == 3
    padd%1  m%2, m%3        ; a = a + b
    padd%1  m%3, m%3        ; b = 2*b
    psub%1  m%3, m%2        ; b = 2*b - (a + b)
%elif avx_enabled != 0
    padd%1  m%4, m%2, m%3   ; tmp = a + b
    psub%1  m%3, m%2        ; b = b - a
    SWAP    %2, %4          ; the sum is now addressed as a
%else
    mova    m%4, m%2        ; tmp = a
    padd%1  m%2, m%3        ; a = a + b
    psub%1  m%3, m%4        ; b = b - tmp
%endif
%endmacro
566cabdff1aSopenharmony_ci
; SUMSUB_BADC type, a, b, c, d [, tmp]
; Two butterflies: (a, b) -> (a + b, b - a) and (c, d) -> (c + d, d - c).
;   %1    - element-size suffix appended to padd/psub
;   %2-%5 - indices of registers a, b, c, d
;   %6    - optional index of a scratch register
; Without a scratch register the two butterflies are written out with their
; instructions interleaved; with one, the work is delegated to SUMSUB_BA.
%macro SUMSUB_BADC 5-6
%if %0 == 5
    padd%1  m%2, m%3        ; a = a + b
    padd%1  m%4, m%5        ; c = c + d
    padd%1  m%3, m%3        ; b = 2*b
    padd%1  m%5, m%5        ; d = 2*d
    psub%1  m%3, m%2        ; b = 2*b - (a + b)
    psub%1  m%5, m%4        ; d = 2*d - (c + d)
%else
    SUMSUB_BA %1, %2, %3, %6
    SUMSUB_BA %1, %4, %5, %6
%endif
%endmacro
580cabdff1aSopenharmony_ci
; SUMSUB2_AB type, a, b, out
; Computes a = 2*a + b and out = a - 2*b (using the original a).
;   %1 - element-size suffix appended to padd/psub
;   %2 - index of register a
;   %3 - b: either a register index, or any other operand (e.g. memory) that
;        is used verbatim
;   %4 - index of the register receiving a - 2*b
%macro SUMSUB2_AB 4
%ifnum %3
    ; b lives in a register.
    psub%1  m%4, m%2, m%3
    psub%1  m%4, m%3
    padd%1  m%2, m%2
    padd%1  m%2, m%3
%else
    ; b is a non-register operand: copy a first, then fold b in twice.
    mova    m%4, m%2
    padd%1  m%2, m%2
    padd%1  m%2, %3
    psub%1  m%4, %3
    psub%1  m%4, %3
%endif
%endmacro
595cabdff1aSopenharmony_ci
; SUMSUB2_BA type, a, b, tmp
; Computes a = a + 2*b and b = b - 2*a (using the original a).
;   %1 - element-size suffix appended to padd/psub
;   %2 - index of register a
;   %3 - index of register b
;   %4 - index of a scratch register
; The AVX form builds the sum in tmp and renames it to a via SWAP; the
; non-AVX form keeps a copy of a in tmp instead.
%macro SUMSUB2_BA 4
%if avx_enabled != 0
    padd%1  m%4, m%2, m%3   ; tmp = a + b
    padd%1  m%4, m%3        ; tmp = a + 2*b
    psub%1  m%3, m%2        ; b = b - a
    psub%1  m%3, m%2        ; b = b - 2*a
    SWAP     %2,  %4        ; the sum is now addressed as a
%else
    mova    m%4, m%2        ; tmp = a
    padd%1  m%2, m%3        ; a = a + b
    padd%1  m%2, m%3        ; a = a + 2*b
    psub%1  m%3, m%4        ; b = b - tmp
    psub%1  m%3, m%4        ; b = b - 2*tmp
%endif
%endmacro
611cabdff1aSopenharmony_ci
; SUMSUBD2_AB type, a, b, t0, t1
; Computes, from the original values, a = (a >> 1) - b and b = (b >> 1) + a,
; with arithmetic right shifts.
;   %1 - element-size suffix appended to psra/padd/psub
;   %2 - index of register a
;   %3 - index of register b
;   %4, %5 - temporaries: register indices when %4 is numeric, otherwise
;            operands (e.g. memory) that receive copies of b and a
%macro SUMSUBD2_AB 5
%ifnum %4
    ; Register temporaries: results are built in m%5 / m%4 and then renamed
    ; to %2 / %3 by the SWAPs.
    psra%1  m%5, m%2, 1  ; m%5 = %2>>1
    psra%1  m%4, m%3, 1  ; m%4 = %3>>1
    padd%1  m%4, m%2     ; m%4 = %3>>1+%2
    psub%1  m%5, m%3     ; m%5 = %2>>1-%3
    SWAP     %2, %5
    SWAP     %3, %4
%else
    ; Non-register temporaries: save both originals, then work in place.
    mova    %5, m%2
    mova    %4, m%3
    psra%1  m%3, 1  ; %3: %3>>1
    psra%1  m%2, 1  ; %2: %2>>1
    padd%1  m%3, %5 ; %3: %3>>1+%2
    psub%1  m%2, %4 ; %2: %2>>1-%3
%endif
%endmacro
629cabdff1aSopenharmony_ci
; DCT4_1D r0, r1, r2, r3, tmp
; One-dimensional 4-point transform on words, composed of:
;   1. butterflies on the pairs (%4,%1) and (%3,%2)   (SUMSUB_BADC)
;   2. a butterfly on the two sums (%3,%4)            (SUMSUB_BA)
;   3. 2*a+b / a-2*b on the two differences (%1,%2)   (SUMSUB2_AB)
; followed by a SWAP that renames the registers holding the results.
;   %1-%4 - indices of the four input registers
;   %5    - either the index of a scratch register, or an address
;           expression (used as [%5]) of a memory slot that serves as the
;           temporary instead
%macro DCT4_1D 5
%ifnum %5
    SUMSUB_BADC w, %4, %1, %3, %2, %5
    SUMSUB_BA   w, %3, %4, %5
    SUMSUB2_AB  w, %1, %2, %5
    SWAP %1, %3, %4, %5, %2
%else
    SUMSUB_BADC w, %4, %1, %3, %2
    SUMSUB_BA   w, %3, %4
    ; No spare register: park m%2 in memory and reuse its register as the
    ; output of SUMSUB2_AB, which reads the parked value from [%5].
    mova     [%5], m%2
    SUMSUB2_AB  w, %1, [%5], %2
    SWAP %1, %3, %4, %2
%endif
%endmacro
644cabdff1aSopenharmony_ci
; IDCT4_1D type, r0, r1, r2, r3, tmp0 [, tmp1]
; One-dimensional 4-point inverse transform; the resulting rows are listed
; at the bottom of the macro.
;   %1    - element-size suffix (w, or a wider type)
;   %2-%5 - indices of the four input registers
;   %6    - either the index of a scratch register, or the base address
;           expression of a memory scratch area
;   %7    - index of a second scratch register; only used when %6 is numeric
; With a memory scratch area the two temporaries live at [%6] and [%6+16]
; for words, or [%6] and [%6+32] for any other type.
%macro IDCT4_1D 6-7
%ifnum %6
    SUMSUBD2_AB %1, %3, %5, %7, %6
    ; %3: %3>>1-%5 %5: %3+%5>>1
    SUMSUB_BA   %1, %4, %2, %7
    ; %4: %2+%4 %2: %2-%4
    SUMSUB_BADC %1, %5, %4, %3, %2, %7
    ; %5: %2+%4 + (%3+%5>>1)
    ; %4: %2+%4 - (%3+%5>>1)
    ; %3: %2-%4 + (%3>>1-%5)
    ; %2: %2-%4 - (%3>>1-%5)
%else
%ifidn %1, w
    SUMSUBD2_AB %1, %3, %5, [%6], [%6+16]
%else
    SUMSUBD2_AB %1, %3, %5, [%6], [%6+32]
%endif
    SUMSUB_BA   %1, %4, %2
    SUMSUB_BADC %1, %5, %4, %3, %2
%endif
    ; Rename the registers so the rows come out in order.
    SWAP %2, %5, %4
    ; %2: %2+%4 + (%3+%5>>1) row0
    ; %3: %2-%4 + (%3>>1-%5) row1
    ; %4: %2-%4 - (%3>>1-%5) row2
    ; %5: %2+%4 - (%3+%5>>1) row3
%endmacro
671cabdff1aSopenharmony_ci
672cabdff1aSopenharmony_ci
; LOAD_DIFF dst, tmp, zero, pix1, pix2
; Loads bytes from %4 and %5 with movh, widens them to words and leaves the
; word-wise difference %4 - %5 in %1. %2 is clobbered.
;   %3: register used as the high bytes when unpacking (expected to hold
;       zero), or the literal token `none` when no such register exists
%macro LOAD_DIFF 5
%ifnidn %3, none
    movh       %1, %4
    punpcklbw  %1, %3
    movh       %2, %5
    punpcklbw  %2, %3
    psubw      %1, %2
%else
    ; No zero register: give both operands the same high byte (taken from
    ; %5), which cancels out in the subtraction.
    movh       %1, %4
    movh       %2, %5
    punpcklbw  %1, %2
    punpcklbw  %2, %2
    psubw      %1, %2
%endif
%endmacro
688cabdff1aSopenharmony_ci
; STORE_DCT r0, r1, r2, r3, base, offset
; Writes registers m%1..m%4 to [%5+%6]: the low qword of each goes to
; offsets 0/8/16/24, the high qword of each to offsets 32/40/48/56.
; The eight stores hit disjoint 8-byte slots and are grouped per register.
%macro STORE_DCT 6
    movq   [%5+%6+ 0], m%1
    movhps [%5+%6+32], m%1
    movq   [%5+%6+ 8], m%2
    movhps [%5+%6+40], m%2
    movq   [%5+%6+16], m%3
    movhps [%5+%6+48], m%3
    movq   [%5+%6+24], m%4
    movhps [%5+%6+56], m%4
%endmacro
699cabdff1aSopenharmony_ci
; LOAD_DIFF_8x4P d0, d1, d2, d3, t0, t1, zero[, ptr1[, ptr2[, advance]]]
;   %1-%4: destination register indices, one per row
;   %5,%6: temp register indices, used alternately
;   %7:    register index passed to LOAD_DIFF as m%7 (its unpack operand)
;   %8,%9: row pointers, default r0 and r2
;   %10:   when non-zero, both pointers are advanced by four rows afterwards
; Row offsets are hard-wired: r1/r3 for row 1, 2*r1/2*r3 for row 2 and
; r4/r5 for row 3.
; NOTE(review): presumably r4 = 3*r1 and r5 = 3*r3 - verify against callers.
%macro LOAD_DIFF_8x4P 7-10 r0,r2,0
    LOAD_DIFF  m%1, m%5, m%7, [%8],      [%9]
    LOAD_DIFF  m%2, m%6, m%7, [%8+r1],   [%9+r3]
    LOAD_DIFF  m%3, m%5, m%7, [%8+2*r1], [%9+2*r3]
    LOAD_DIFF  m%4, m%6, m%7, [%8+r4],   [%9+r5]
%if (%10) != 0
    lea        %8, [%8+4*r1]
    lea        %9, [%9+4*r3]
%endif
%endmacro
710cabdff1aSopenharmony_ci
; DIFFx2 res0, res1, tmp, zero, pix0, pix1[, unused]
; For each of the two rows: arithmetic-shift the word residual right by 6,
; add the pixels loaded from memory (widened to words against %4, which is
; expected to hold zero) with signed saturation, then pack both rows to
; unsigned bytes in %2 (row %2 in the low half, row %1 in the high half).
; %1 and %3 are clobbered; %7 is accepted but never referenced.
%macro DIFFx2 6-7
    ; row held in %1
    movh       %3, %5
    punpcklbw  %3, %4
    psraw      %1, 6
    paddsw     %1, %3

    ; row held in %2
    movh       %3, %6
    punpcklbw  %3, %4
    psraw      %2, 6
    paddsw     %2, %3

    packuswb   %2, %1
%endmacro
722cabdff1aSopenharmony_ci
; STORE_DIFF res, tmp, zero, mem
; Adds the word residual in %1 (after an arithmetic right shift by 6) to the
; pixels at %4 and writes the result back to %4 as unsigned-saturated bytes.
;   %1: residual words, clobbered
;   %2: scratch register
;   %3: unpack operand, expected to hold zero
;   %4: memory operand that is both read and written with movh
%macro STORE_DIFF 4
    movh       %2, %4       ; fetch the current pixels
    punpcklbw  %2, %3       ; bytes -> words
    psraw      %1, 6
    paddsw     %1, %2
    packuswb   %1, %1       ; words -> bytes with unsigned saturation
    movh       %4, %1
%endmacro
731cabdff1aSopenharmony_ci
; STORE_DIFFx2 add1, add2, reg1, reg2, zero, shift, source, stride
; Two-row version of STORE_DIFF with a caller-chosen shift: rows [%7] and
; [%7+%8] each get the word residual (%1 resp. %2, arithmetically shifted
; right by %6) added and are stored back as unsigned-saturated bytes.
; %1-%4 are clobbered; %5 is expected to hold zero. Both rows are loaded
; before either one is written.
%macro STORE_DIFFx2 8
    movh       %3, [%7]
    movh       %4, [%7+%8]
    psraw      %1, %6
    psraw      %2, %6
    punpcklbw  %3, %5       ; bytes -> words
    punpcklbw  %4, %5
    paddw      %3, %1
    paddw      %4, %2
    packuswb   %3, %5       ; words -> bytes with unsigned saturation
    packuswb   %4, %5
    movh       [%7], %3
    movh       [%7+%8], %4
%endmacro
746cabdff1aSopenharmony_ci
; PMINUB dst, src, tmp
; Unsigned byte minimum: dst = min(dst, src).
; %3 is only written by the emulation used when mmxext is not available.
%macro PMINUB 3
%if notcpuflag(mmxext)
    mova     %3, %1
    psubusb  %3, %2         ; tmp = max(dst - src, 0)
    psubb    %1, %3         ; dst - tmp = min(dst, src)
%else
    pminub   %1, %2
%endif
%endmacro
756cabdff1aSopenharmony_ci
; SPLATW dst, src[, index]
; Replicates word number %3 (default 0) of %2 into %1.
; Instruction selection:
;   avx2 and index 0 -> vpbroadcastw
;   mmsize == 16     -> pshuflw + punpcklqdq
;   mmxext           -> pshufw
;   otherwise        -> two word unpacks (bit 1 of the index picks the
;                       first half, bit 0 the second)
%macro SPLATW 2-3 0
%if cpuflag(avx2) && %3 == 0
    vpbroadcastw %1, %2
%elif mmsize == 16
    ; index*0x55 repeats the 2-bit index in all four selector fields
    pshuflw    %1, %2, (%3)*0x55
    punpcklqdq %1, %1
%elif cpuflag(mmxext)
    pshufw     %1, %2, (%3)*0x55
%else
    %ifnidn %1, %2
        mova       %1, %2
    %endif
    %if (%3 & 2) == 0
        punpcklwd  %1, %1
    %else
        punpckhwd  %1, %1
    %endif
    %if (%3 & 1) == 0
        punpcklwd  %1, %1
    %else
        punpckhwd  %1, %1
    %endif
%endif
%endmacro
781cabdff1aSopenharmony_ci
; SPLATD reg
; Replicates the lowest dword of %1 across the whole register, in place.
; Expands to nothing when mmsize != 8 and neither sse2 nor sse is enabled.
%macro SPLATD 1
%if mmsize != 8
    %if cpuflag(sse2)
        pshufd  %1, %1, 0
    %elif cpuflag(sse)
        shufps  %1, %1, 0
    %endif
%else
    punpckldq  %1, %1
%endif
%endmacro
791cabdff1aSopenharmony_ci
; CLIPUB dst, min, max
; Clamps unsigned bytes: dst = min(max(dst, %2), %3).
%macro CLIPUB 3
    pmaxub  %1, %2          ; raise to the lower bound first
    pminub  %1, %3          ; then cap at the upper bound
%endmacro
796cabdff1aSopenharmony_ci
; CLIPW dst, min, max
; Clamps signed words: dst = min(max(dst, %2), %3).
%macro CLIPW 3
    pmaxsw  %1, %2          ; raise to the lower bound first
    pminsw  %1, %3          ; then cap at the upper bound
%endmacro
801cabdff1aSopenharmony_ci
; PMINSD dst, src, tmp/unused
; Signed dword minimum, dst = min(dst, src).
;   sse4:      native pminsd
;   sse2:      dst is converted to float, minps'd against %2 and converted
;              back, so %2 is consumed as packed floats on this path
;   otherwise: compare/mask emulation that clobbers %3
%macro PMINSD 3
%if notcpuflag(sse4)
    %if cpuflag(sse2)
        cvtdq2ps  %1, %1
        minps     %1, %2
        cvtps2dq  %1, %1
    %else
        mova      %3, %2
        pcmpgtd   %3, %1    ; mask = src > dst, i.e. dst is the smaller one
        pxor      %1, %2
        pand      %1, %3    ; keep dst^src only where dst wins
        pxor      %1, %2    ; -> dst where dst wins, src elsewhere
    %endif
%else
    pminsd    %1, %2
%endif
%endmacro
817cabdff1aSopenharmony_ci
; PMAXSD dst, src, tmp/unused
; Signed dword maximum, dst = max(dst, src).
; Without sse4 a compare/select emulation is used and %3 is clobbered.
%macro PMAXSD 3
%if notcpuflag(sse4)
    mova      %3, %1
    pcmpgtd   %3, %2        ; mask = dst > src
    pand      %1, %3        ; dst where dst is the larger one
    pandn     %3, %2        ; src everywhere else
    por       %1, %3
%else
    pmaxsd    %1, %2
%endif
%endmacro
829cabdff1aSopenharmony_ci
; CLIPD src/dst, min, max[, tmp]
; Clamps signed dwords in %1 to the range [%2, %3]; the upper bound is
; applied first, then the lower bound.
;   sse4:      integer pminsd/pmaxsd, %4 unused
;   sse2:      done in the float domain, so %2 and %3 must hold floats;
;              %4 unused
;   otherwise: PMINSD/PMAXSD emulation with %4 as scratch
%macro CLIPD 3-4
%if notcpuflag(sse4)
    %if cpuflag(sse2)
        cvtdq2ps  %1, %1
        minps     %1, %3
        maxps     %1, %2
        cvtps2dq  %1, %1
    %else
        PMINSD    %1, %3, %4
        PMAXSD    %1, %2, %4
    %endif
%else
    pminsd  %1, %3
    pmaxsd  %1, %2
%endif
%endmacro
844cabdff1aSopenharmony_ci
; VBROADCASTSS dst, src
;   %1: destination xmm/ymm register
;   %2: 32-bit memory operand or xmm register
; Broadcasts the low single-precision float of %2 to every element of %1.
; A source counts as a register when `sizeof%2` evaluates to a number.
%macro VBROADCASTSS 2
%if cpuflag(avx2)
    vbroadcastss  %1, %2
%elif cpuflag(avx)
    %ifnnum sizeof%2        ; avx1, memory source
        vbroadcastss  %1, %2
    %else                   ; avx1, register source
        shufps  xmm%1, xmm%2, xmm%2, q0000
        %if sizeof%1 >= 32  ; copy the low half into the high half
            vinsertf128  %1, %1, xmm%1, 1
        %endif
    %endif
%else
    %ifnnum sizeof%2        ; sse, memory source
        movss   %1, %2
        shufps  %1, %1, 0
    %else                   ; sse, register source
        shufps  %1, %2, %2, q0000
    %endif
%endif
%endmacro
866cabdff1aSopenharmony_ci
; VBROADCASTSD dst, src
;   %1: destination xmm/ymm register
;   %2: 64-bit memory operand
; Broadcasts one double to every element of %1.
%macro VBROADCASTSD 2
%if cpuflag(avx) && mmsize == 32
    vbroadcastsd %1, %2
%else
    %if notcpuflag(sse3)    ; plain sse2: load the low half, then mirror it
        movsd        %1, %2
        movlhps      %1, %1
    %else
        movddup      %1, %2
    %endif
%endif
%endmacro
877cabdff1aSopenharmony_ci
; VPBROADCASTD dst, src
;   %1: destination xmm/ymm register
;   %2: 32-bit memory operand or xmm register
; Broadcasts the low dword of %2 to every element of %1. With AVX1 and a
; 32-byte destination there is no integer form, so assembly stops with an
; error. A source counts as a register when `sizeof%2` evaluates to a number.
%macro VPBROADCASTD 2
%if cpuflag(avx2)
    vpbroadcastd  %1, %2
%elif cpuflag(avx) && sizeof%1 >= 32
    %error vpbroadcastd not possible with ymm on avx1. try vbroadcastss
%else
    %ifnnum sizeof%2        ; memory source
        movd    %1, %2
        pshufd  %1, %1, 0
    %else                   ; register source
        pshufd  %1, %2, q0000
    %endif
%endif
%endmacro
892cabdff1aSopenharmony_ci
; VBROADCASTI128 dst, src
;   %1: destination xmm/ymm register
;   %2: 128-bit source value
; With registers of 16 bytes or less this is a plain mova; with wider
; registers the 128-bit value is broadcast using vbroadcasti128.
%macro VBROADCASTI128 2
%if mmsize <= 16
    mova           %1, %2
%else
    vbroadcasti128 %1, %2
%endif
%endmacro
900cabdff1aSopenharmony_ci
; SHUFFLE_MASK_W w0, w1, w2, w3, w4, w5, w6, w7
; Emits 16 bytes of data: a byte-shuffle mask built from eight word indices.
; An index n below 0x80 becomes the byte pair (2n, 2n+1); a value of 0x80 or
; above is emitted twice unchanged.
%macro SHUFFLE_MASK_W 8
    %rep 8
        %if %1 < 0x80
            db %1*2, %1*2+1
        %else
            db %1
            db %1
        %endif
        %rotate 1
    %endrep
%endmacro
912cabdff1aSopenharmony_ci
; PMOVSXWD dst, src
; Sign-extends the low words of %2 to dwords in %1.
; Without sse4 the words are duplicated into both halves of each dword and
; then shifted right arithmetically by 16.
%macro PMOVSXWD 2
%if notcpuflag(sse4)
    %ifnidn %1, %2
    mova         %1, %2
    %endif
    punpcklwd    %1, %1     ; each word now fills a whole dword twice
    psrad        %1, 16     ; keep the upper copy, sign-extended
%else
    pmovsxwd     %1, %2
%endif
%endmacro
924cabdff1aSopenharmony_ci
; FMULADD_PS dst, a, b, c, tmp
; Computes dst = a*b + c on packed floats. Uses fmaddps when fma3 or fma4 is
; enabled and a separate mulps/addps pair otherwise.
; %5 is only written when no FMA is available and dst is the same token as
; c, because the product cannot go into dst without destroying c.
%macro FMULADD_PS 5
    %if notcpuflag(fma3) && notcpuflag(fma4)
        %ifnidn %1, %4
            mulps   %1, %2, %3
            addps   %1, %4
        %else
            mulps   %5, %2, %3
            addps   %1, %4, %5
        %endif
    %else
        fmaddps %1, %2, %3, %4
    %endif
%endmacro
937cabdff1aSopenharmony_ci
; LSHIFT reg, bytes
; Shifts %1 left by %2 bytes: psllq with a bit count of 8*%2 when
; mmsize <= 8, pslldq with the byte count otherwise.
%macro LSHIFT 2
%if mmsize <= 8
    psllq   %1, 8*(%2)
%else
    pslldq  %1, %2
%endif
%endmacro
945cabdff1aSopenharmony_ci
; RSHIFT reg, bytes
; Shifts %1 right (logically) by %2 bytes: psrlq with a bit count of 8*%2
; when mmsize <= 8, psrldq with the byte count otherwise.
%macro RSHIFT 2
%if mmsize <= 8
    psrlq   %1, 8*(%2)
%else
    psrldq  %1, %2
%endif
%endmacro
953cabdff1aSopenharmony_ci
; MOVHL dst, src
; Puts the high qword of %2 into the low qword of %1. What ends up in the
; high qword of %1 depends on the variant chosen (the movhlps fallback
; leaves it untouched, the others duplicate the high qword of %2).
%macro MOVHL 2
%ifnidn %1, %2
    %if cpuflag(avx)
        punpckhqdq %1, %2, %2
    %elif cpuflag(sse4)
        pshufd     %1, %2, q3232 ; pshufd is slow on some older CPUs, so only use it on more modern ones
    %else
        movhlps    %1, %2        ; may cause an int/float domain transition and has a dependency on dst
    %endif
%else
    punpckhqdq %1, %2
%endif
%endmacro
965cabdff1aSopenharmony_ci
; HSUMPS dst/src, tmp
; Horizontal sum of packed single-precision floats.
; On return every element of %1 holds the sum; %2 is clobbered.
%macro HSUMPS 2
%if notcpuflag(avx)
    ; Two-operand form: a bit faster than emulating the short avx-like
    ; sequence below.
    movaps      %2, %1
    shufps      %1, %1, q1032
    addps       %1, %2
    movaps      %2, %1
    shufps      %1, %1, q0321
    addps       %1, %2
    ; all %1 members should be equal for as long as float a+b==b+a
%else
    %if sizeof%1>=32
        ; fold the two 128-bit halves together first
        vperm2f128  %2, %1, %1, (0)*16+(1)
        addps       %1, %2
    %endif
    shufps      %2, %1, %1, q1032
    addps       %1, %2
    shufps      %2, %1, %1, q0321
    addps       %1, %2
%endif
%endmacro
988cabdff1aSopenharmony_ci
; BLENDVPS dst/src_a, src_b, mask
; Variable blend of packed floats, with an emulation for targets that lack
; blendvps.
;
; - avx:  four-operand blendvps, any register may be the mask.
; - sse4: the instruction hard-codes xmm0 as the mask, so assembly stops
;         with an error when %3 is anything else.
; - else: bitwise emulation dst ^= (src_b ^ dst) & mask; src_b (%2) is
;         destroyed on this path.
%macro BLENDVPS 3
%if cpuflag(avx)
    blendvps  %1, %1, %2, %3
%elif cpuflag(sse4)
    %ifnidn %3,xmm0
        %error sse41 blendvps uses xmm0 as default 3d operand, you used %3
    %endif
    blendvps  %1, %2, %3
%else
    xorps  %2, %1           ; src_b ^ dst
    andps  %2, %3           ; keep the difference only where the mask is set
    xorps  %1, %2           ; flip those bits of dst over to src_b
%endif
%endmacro
1007cabdff1aSopenharmony_ci
; PBLENDVB dst/src_a, src_b, mask
; Variable byte blend, with an emulation for targets that lack pblendvb.
;
; - avx:  four-operand pblendvb, any register may be the mask. A 32-byte
;         destination additionally needs avx2; on avx1 assembly stops with
;         an error.
; - sse4: the instruction hard-codes xmm0 as the mask, so assembly stops
;         with an error when %3 is anything else.
; - else: bitwise emulation dst ^= (src_b ^ dst) & mask; src_b (%2) is
;         destroyed on this path.
%macro PBLENDVB 3
%if cpuflag(avx)
    ; cpuflag(avx) is already established by the enclosing %if
    %if notcpuflag(avx2) && sizeof%1 >= 32
        %error pblendvb not possible with ymm on avx1, try blendvps.
    %endif
    pblendvb  %1, %1, %2, %3
%elif cpuflag(sse4)
    %ifnidn %3,xmm0
        %error sse41 pblendvb uses xmm0 as default 3d operand, you used %3
    %endif
    pblendvb  %1, %2, %3
%else
    pxor  %2, %1            ; src_b ^ dst
    pand  %2, %3            ; keep the difference only where the mask is set
    pxor  %1, %2            ; flip those bits of dst over to src_b
%endif
%endmacro
1029