;******************************************************************************
;*
;* Copyright (c) 2000-2001 Fabrice Bellard <fabrice@bellard.org>
;* Copyright (c)      Nick Kurshev <nickols_k@mail.ru>
;* Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
;* Copyright (c) 2002 Zdenek Kabelac <kabi@informatics.muni.cz>
;* Copyright (c) 2013 Daniel Kang
;*
;* SIMD-optimized halfpel functions
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
; Constants defined in another object file.
cextern pb_1
cextern pw_2

; pshufb control masks used by SSSE3_PIXELS_XY2.  After packuswb the register
; holds the first half of the results followed by the second half; these masks
; interleave the two halves (0, N/2, 1, N/2+1, ...) for N = 16 and N = 8 bytes.
pb_interleave16: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
pb_interleave8:  db 0, 4, 1, 5, 2, 6, 3, 7

cextern pw_8192

SECTION .text

; void ff_put_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
;
; Horizontal half-pel "put": every output byte is the rounded average
; (pavgb: (a + b + 1) >> 1) of a source byte and its right-hand neighbour.
;
; Registers (x86inc argument order):
;   r0  = block      destination, written with mova
;   r1  = pixels     source, read with movu at byte offsets +0 and +1
;   r2  = line_size  stride used for both source and destination
;   r3d = h          remaining row count
;   r4  = 2 * line_size
;
; Four rows are produced per loop iteration and the loop exits only when the
; counter reaches exactly zero, so h has to be a non-zero multiple of 4.
;
; The body is register-width agnostic: expanded without the sse2 cpuflag it
; defines put_pixels8_x2, expanded with it it defines put_pixels16_x2.
%macro PUT_PIXELS8_X2 0
%if cpuflag(sse2)
cglobal put_pixels16_x2, 4,5,4
%else
cglobal put_pixels8_x2, 4,5
%endif
    lea          r4, [r2*2]
.loop:
    ; rows 0 and 1
    movu         m0, [r1+1]
    movu         m1, [r1+r2+1]
%if cpuflag(sse2)
    ; The source rows are fetched with unaligned loads and averaged
    ; register-to-register (presumably because a memory operand of the SSE2
    ; pavgb would have to be 16-byte aligned).
    movu         m2, [r1]
    movu         m3, [r1+r2]
    pavgb        m0, m2
    pavgb        m1, m3
%else
    PAVGB        m0, [r1]
    PAVGB        m1, [r1+r2]
%endif
    mova       [r0], m0
    mova    [r0+r2], m1
    add          r1, r4
    add          r0, r4
    ; rows 2 and 3
    movu         m0, [r1+1]
    movu         m1, [r1+r2+1]
%if cpuflag(sse2)
    movu         m2, [r1]
    movu         m3, [r1+r2]
    pavgb        m0, m2
    pavgb        m1, m3
%else
    PAVGB        m0, [r1]
    PAVGB        m1, [r1+r2]
%endif
    add          r1, r4
    mova       [r0], m0
    mova    [r0+r2], m1
    add          r0, r4
    sub         r3d, 4
    jne .loop
    REP_RET
%endmacro

; 8-pixel-wide MMX register version.
INIT_MMX mmxext
PUT_PIXELS8_X2


; void ff_put_pixels16_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
;
; 16-pixel-wide horizontal half-pel "put".  Each row is handled as two
; independent register-sized halves at byte offsets 0 and 8; each half is
; averaged (PAVGB) with the same row read one byte further to the right.
;
; Registers (x86inc argument order):
;   r0 = block, r1 = pixels, r2 = line_size, r3d = h, r4 = 2 * line_size
;
; Four rows are produced per loop iteration; the loop exits only when the
; counter reaches exactly zero, so h has to be a non-zero multiple of 4.
%macro PUT_PIXELS_16 0
cglobal put_pixels16_x2, 4,5
    lea          r4, [r2*2]
.loop:
    ; rows 0 and 1: m0/m1 = left halves, m2/m3 = right halves
    mova         m0, [r1]
    mova         m1, [r1+r2]
    mova         m2, [r1+8]
    mova         m3, [r1+r2+8]
    PAVGB        m0, [r1+1]
    PAVGB        m1, [r1+r2+1]
    PAVGB        m2, [r1+9]
    PAVGB        m3, [r1+r2+9]
    mova       [r0], m0
    mova    [r0+r2], m1
    mova     [r0+8], m2
    mova  [r0+r2+8], m3
    add          r1, r4
    add          r0, r4
    ; rows 2 and 3
    mova         m0, [r1]
    mova         m1, [r1+r2]
    mova         m2, [r1+8]
    mova         m3, [r1+r2+8]
    PAVGB        m0, [r1+1]
    PAVGB        m1, [r1+r2+1]
    PAVGB        m2, [r1+9]
    PAVGB        m3, [r1+r2+9]
    add          r1, r4
    mova       [r0], m0
    mova    [r0+r2], m1
    mova     [r0+8], m2
    mova  [r0+r2+8], m3
    add          r0, r4
    sub         r3d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_PIXELS_16
; The 8_X2 macro can easily be used here: expanded with XMM registers and the
; sse2 cpuflag it defines put_pixels16_x2.
INIT_XMM sse2
PUT_PIXELS8_X2


; void ff_put_no_rnd_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
;
; Horizontal half-pel "put" without the upward rounding of a plain pavgb.
; m6 (loaded from pb_1, presumably the byte value 1 replicated) is subtracted
; with unsigned saturation from the left pixel before averaging:
;   avg(a - 1, b) = (a + b) >> 1        for a > 0
; For a == 0 the subtraction saturates at 0, so that case still rounds up.
;
; Registers (x86inc argument order):
;   r0 = block, r1 = pixels, r2 = line_size, r3d = h, r4 = 2 * line_size
;
; Four rows are produced per loop iteration; the loop exits only when the
; counter reaches exactly zero, so h has to be a non-zero multiple of 4.
INIT_MMX mmxext
cglobal put_no_rnd_pixels8_x2, 4,5
    mova         m6, [pb_1]
    lea          r4, [r2*2]
.loop:
    ; rows 0 and 1: m0/m2 = pixels at x, m1/m3 = pixels at x+1
    mova         m0, [r1]
    mova         m2, [r1+r2]
    mova         m1, [r1+1]
    mova         m3, [r1+r2+1]
    add          r1, r4
    psubusb      m0, m6
    psubusb      m2, m6
    PAVGB        m0, m1
    PAVGB        m2, m3
    mova       [r0], m0
    mova    [r0+r2], m2
    ; rows 2 and 3
    mova         m0, [r1]
    mova         m1, [r1+1]
    mova         m2, [r1+r2]
    mova         m3, [r1+r2+1]
    add          r0, r4
    add          r1, r4
    psubusb      m0, m6
    psubusb      m2, m6
    PAVGB        m0, m1
    PAVGB        m2, m3
    mova       [r0], m0
    mova    [r0+r2], m2
    add          r0, r4
    sub         r3d, 4
    jne .loop
    REP_RET


; void ff_put_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
;
; Vertical half-pel "put": output row n is the rounded average (PAVGB) of
; source rows n and n+1.
;
; Registers (x86inc argument order):
;   r0  = block - line_size (biased once before the loop so that the two
;         rows of a pair are addressed as [r0+r2] and [r0+r4])
;   r1  = pixels, r2 = line_size, r3d = h, r4 = 2 * line_size
;   m0, m1, m2 = rolling window of source rows; every source row is loaded
;         exactly once and the last row of an iteration stays in m0 for the
;         next one.
;
; Four rows are produced per loop iteration; the loop exits only when the
; counter reaches exactly zero, so h has to be a non-zero multiple of 4.
;
; Expanded without the sse2 cpuflag this defines put_pixels8_y2, with it
; put_pixels16_y2.
%macro PUT_PIXELS8_Y2 0
%if cpuflag(sse2)
cglobal put_pixels16_y2, 4,5,3
%else
cglobal put_pixels8_y2, 4,5
%endif
    lea          r4, [r2*2]
    movu         m0, [r1]           ; m0 = source row 0
    sub          r0, r2
.loop:
    movu         m1, [r1+r2]        ; m1 = row 1
    movu         m2, [r1+r4]        ; m2 = row 2
    add          r1, r4
    PAVGB        m0, m1             ; out 0 = avg(row 0, row 1)
    PAVGB        m1, m2             ; out 1 = avg(row 1, row 2); m2 keeps row 2
    mova    [r0+r2], m0
    mova    [r0+r4], m1
    movu         m1, [r1+r2]        ; m1 = row 3
    movu         m0, [r1+r4]        ; m0 = row 4, reused as row 0 next time
    add          r0, r4
    add          r1, r4
    PAVGB        m2, m1             ; out 2 = avg(row 2, row 3)
    PAVGB        m1, m0             ; out 3 = avg(row 3, row 4)
    mova    [r0+r2], m2
    mova    [r0+r4], m1
    add          r0, r4
    sub         r3d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_PIXELS8_Y2
; actually, put_pixels16_y2_sse2
INIT_XMM sse2
PUT_PIXELS8_Y2


; void ff_put_no_rnd_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
;
; Vertical half-pel "put" without the upward rounding of a plain pavgb.
; Same rolling-row scheme as the rounding y2 kernel, but the odd source rows
; (rows 1 and 3 of each iteration) have m6 subtracted with unsigned
; saturation before they are used.  Every output row averages exactly one
; decremented row with one untouched row, i.e. avg(a - 1, b) = (a + b) >> 1
; whenever the decremented byte was non-zero.  m6 is loaded from pb_1
; (presumably the byte value 1 replicated).
;
; Registers (x86inc argument order):
;   r0  = block - line_size (biased once before the loop; rows are stored
;         at [r0+r2] and [r0+r4])
;   r1  = pixels, r2 = line_size, r3d = h, r4 = 2 * line_size
;
; Four rows are produced per loop iteration; the loop exits only when the
; counter reaches exactly zero, so h has to be a non-zero multiple of 4.
INIT_MMX mmxext
cglobal put_no_rnd_pixels8_y2, 4,5
    mova         m6, [pb_1]
    lea          r4, [r2+r2]
    mova         m0, [r1]           ; m0 = source row 0
    sub          r0, r2
.loop:
    mova         m1, [r1+r2]        ; m1 = row 1
    mova         m2, [r1+r4]        ; m2 = row 2
    add          r1, r4
    psubusb      m1, m6             ; row 1 - 1 (saturating)
    PAVGB        m0, m1             ; out 0 = avg(row 0, row 1 - 1)
    PAVGB        m1, m2             ; out 1 = avg(row 1 - 1, row 2)
    mova    [r0+r2], m0
    mova    [r0+r4], m1
    mova         m1, [r1+r2]        ; m1 = row 3
    mova         m0, [r1+r4]        ; m0 = row 4, reused as row 0 next time
    add          r0, r4
    add          r1, r4
    psubusb      m1, m6             ; row 3 - 1 (saturating)
    PAVGB        m2, m1             ; out 2 = avg(row 2, row 3 - 1)
    PAVGB        m1, m0             ; out 3 = avg(row 3 - 1, row 4)
    mova    [r0+r2], m2
    mova    [r0+r4], m1
    add          r0, r4
    sub         r3d, 4
    jne .loop
    REP_RET


; void ff_avg_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
;
; Horizontal half-pel "avg": the rounded average of each source byte and its
; right-hand neighbour is averaged once more with the byte already present
; in the destination, and the result is written back to the destination.
;
; Registers (x86inc argument order):
;   r0 = block (read and written), r1 = pixels, r2 = line_size, r3d = h,
;   r4 = 2 * line_size
;
; The 4-operand PAVGB form hands two extra registers to the macro from the
; include file (presumably scratch/mask registers for targets without a
; native pavgb; verify against x86util.asm).
;
; Four rows are produced per loop iteration; the loop exits only when the
; counter reaches exactly zero, so h has to be a non-zero multiple of 4.
;
; Expanded without the sse2 cpuflag this defines avg_pixels8_x2, with it
; avg_pixels16_x2.
%macro AVG_PIXELS8_X2 0
%if cpuflag(sse2)
cglobal avg_pixels16_x2, 4,5,4
%else
cglobal avg_pixels8_x2, 4,5
%endif
    lea          r4, [r2*2]
.loop:
    ; rows 0 and 1: horizontal average ...
    movu         m0, [r1]
    movu         m2, [r1+r2]
%if cpuflag(sse2)
    movu         m1, [r1+1]
    movu         m3, [r1+r2+1]
    pavgb        m0, m1
    pavgb        m2, m3
%else
    PAVGB        m0, [r1+1], m3, m5
    PAVGB        m2, [r1+r2+1], m4, m5
%endif
    ; ... then blend with the existing destination rows
    PAVGB        m0, [r0], m3, m5
    PAVGB        m2, [r0+r2], m4, m5
    add          r1, r4
    mova       [r0], m0
    mova    [r0+r2], m2
    ; rows 2 and 3
    movu         m0, [r1]
    movu         m2, [r1+r2]
%if cpuflag(sse2)
    movu         m1, [r1+1]
    movu         m3, [r1+r2+1]
    pavgb        m0, m1
    pavgb        m2, m3
%else
    PAVGB        m0, [r1+1], m3, m5
    PAVGB        m2, [r1+r2+1], m4, m5
%endif
    add          r0, r4
    add          r1, r4
    PAVGB        m0, [r0], m3, m5
    PAVGB        m2, [r0+r2], m4, m5
    mova       [r0], m0
    mova    [r0+r2], m2
    add          r0, r4
    sub         r3d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
AVG_PIXELS8_X2
; actually avg_pixels16_x2
INIT_XMM sse2
AVG_PIXELS8_X2


; void ff_avg_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
;
; Vertical half-pel "avg": the rounded average of source rows n and n+1 is
; averaged once more with destination row n and written back.
;
; Registers (x86inc argument order):
;   r0  = block - line_size (biased once before the loop; destination rows
;         are read and written at [r0+r2] and [r0+r4])
;   r1  = pixels, r2 = line_size, r3d = h, r4 = 2 * line_size
;   m0, m1, m2 = rolling window of source rows; the last row loaded in an
;         iteration stays in m0 for the next one.
;
; Four rows are produced per loop iteration; the loop exits only when the
; counter reaches exactly zero, so h has to be a non-zero multiple of 4.
;
; Expanded without the sse2 cpuflag this defines avg_pixels8_y2, with it
; avg_pixels16_y2.
%macro AVG_PIXELS8_Y2 0
%if cpuflag(sse2)
cglobal avg_pixels16_y2, 4,5,3
%else
cglobal avg_pixels8_y2, 4,5
%endif
    lea          r4, [r2*2]
    movu         m0, [r1]           ; m0 = source row 0
    sub          r0, r2
.loop:
    movu         m1, [r1+r2]        ; m1 = row 1
    movu         m2, [r1+r4]        ; m2 = row 2
    add          r1, r4
    PAVGB        m0, m1             ; avg(row 0, row 1)
    PAVGB        m1, m2             ; avg(row 1, row 2); m2 keeps row 2
    PAVGB        m0, [r0+r2]        ; blend with destination rows 0 and 1
    PAVGB        m1, [r0+r4]
    mova    [r0+r2], m0
    mova    [r0+r4], m1
    movu         m1, [r1+r2]        ; m1 = row 3
    movu         m0, [r1+r4]        ; m0 = row 4, reused as row 0 next time
    PAVGB        m2, m1             ; avg(row 2, row 3)
    PAVGB        m1, m0             ; avg(row 3, row 4)
    add          r0, r4
    add          r1, r4
    PAVGB        m2, [r0+r2]        ; blend with destination rows 2 and 3
    PAVGB        m1, [r0+r4]
    mova    [r0+r2], m2
    mova    [r0+r4], m1
    add          r0, r4
    sub         r3d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
AVG_PIXELS8_Y2
; actually avg_pixels16_y2
INIT_XMM sse2
AVG_PIXELS8_Y2


; void ff_avg_approx_pixels8_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
; Note this is not correctly rounded, and is therefore used for
; not-bitexact output
;
; Diagonal half-pel "avg" built from cascaded byte averages instead of a
; widened 4-tap sum.  With H(n) = PAVGB(row n at x, row n at x+1):
;   out n = PAVGB(PAVGB(H(n), H(n+1)), destination row n)
; The row loaded into m2 at the top of each iteration (every fourth source
; row) has m6 subtracted with unsigned saturation before its H() is formed;
; presumably this offsets part of the round-up bias of the stacked averages.
; m6 is loaded from pb_1 (presumably the byte value 1 replicated).
;
; Registers (x86inc argument order):
;   r0 = block (read and written), r1 = pixels, r2 = line_size, r3d = h,
;   r4 = 2 * line_size
;   m0 carries H() of the last source row into the next iteration.
;
; Four rows are produced per loop iteration; the loop exits only when the
; counter reaches exactly zero, so h has to be a non-zero multiple of 4.
INIT_MMX mmxext
cglobal avg_approx_pixels8_xy2, 4,5
    mova         m6, [pb_1]
    lea          r4, [r2*2]
    mova         m0, [r1]
    PAVGB        m0, [r1+1]         ; m0 = H(row 0)
.loop:
    mova         m2, [r1+r4]        ; row 2
    mova         m1, [r1+r2]        ; row 1
    psubusb      m2, m6             ; row 2 - 1 (saturating)
    PAVGB        m1, [r1+r2+1]      ; m1 = H(row 1)
    PAVGB        m2, [r1+r4+1]      ; m2 = H(row 2), from the decremented row
    add          r1, r4
    PAVGB        m0, m1             ; rows 0/1 combined
    PAVGB        m1, m2             ; rows 1/2 combined; m2 is kept
    PAVGB        m0, [r0]           ; blend with destination rows 0 and 1
    PAVGB        m1, [r0+r2]
    mova       [r0], m0
    mova    [r0+r2], m1
    mova         m1, [r1+r2]        ; row 3
    mova         m0, [r1+r4]        ; row 4
    PAVGB        m1, [r1+r2+1]      ; m1 = H(row 3)
    PAVGB        m0, [r1+r4+1]      ; m0 = H(row 4), reused next iteration
    add          r0, r4
    add          r1, r4
    PAVGB        m2, m1             ; rows 2/3 combined
    PAVGB        m1, m0             ; rows 3/4 combined
    PAVGB        m2, [r0]           ; blend with destination rows 2 and 3
    PAVGB        m1, [r0+r2]
    mova       [r0], m2
    mova    [r0+r2], m1
    add          r0, r4
    sub         r3d, 4
    jne .loop
    REP_RET


; void ff_{put,avg}_pixels{8,16}_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
;
; Diagonal half-pel filter computed at 16-bit precision:
;   t = (a + b + c + d + m6) >> 2
; where a, b are horizontally adjacent bytes of one source row and c, d the
; bytes below them.  m6 is loaded from pw_2 (presumably the word value 2
; replicated, i.e. the rounding term).
;
; %1 selects the operation:
;   put - t is stored to the destination
;   avg - PAVGB(t, existing destination byte) is stored
; With the sse2 cpuflag the macro defines %1_pixels16_xy2, otherwise
; %1_pixels8_xy2.
;
; Registers (x86inc argument order):
;   r0  = block, r2 = line_size, r3d = h
;   r1  = pixels + line_size (advanced once before the loop, so [r1+r4] is
;         the source row below output row [r0+r4])
;   r4  = running byte offset of the current output row, starts at 0
;   m7  = zero, used to widen bytes to words with punpck{l,h}bw
;   m4/m5 and m0/m1 = low/high word halves of the horizontal pair sums; the
;         two register pairs swap roles between the two half-iterations so
;         the sums of every source row are computed only once.
;
; Two rows are produced per loop iteration; the loop exits only when the
; counter reaches exactly zero, so h has to be a non-zero multiple of 2.
%macro SET_PIXELS_XY2 1
%if cpuflag(sse2)
cglobal %1_pixels16_xy2, 4,5,8
%else
cglobal %1_pixels8_xy2, 4,5
%endif
    pxor        m7, m7
    mova        m6, [pw_2]
    ; m4/m5 = horizontal pair sums of source row 0
    movu        m0, [r1]
    movu        m4, [r1+1]
    mova        m1, m0
    mova        m5, m4
    punpcklbw   m0, m7
    punpcklbw   m4, m7
    punpckhbw   m1, m7
    punpckhbw   m5, m7
    paddusw     m4, m0
    paddusw     m5, m1
    xor         r4, r4
    add         r1, r2
.loop:
    ; m0/m1 = horizontal pair sums of the next source row
    movu        m0, [r1+r4]
    movu        m2, [r1+r4+1]
    mova        m1, m0
    mova        m3, m2
    punpcklbw   m0, m7
    punpcklbw   m2, m7
    punpckhbw   m1, m7
    punpckhbw   m3, m7
    paddusw     m0, m2
    paddusw     m1, m3
    ; m4/m5 = (previous row sums + m6 + this row sums) >> 2
    paddusw     m4, m6
    paddusw     m5, m6
    paddusw     m4, m0
    paddusw     m5, m1
    psrlw       m4, 2
    psrlw       m5, 2
%ifidn %1, avg
    mova        m3, [r0+r4]
    packuswb    m4, m5
    PAVGB       m4, m3
%else
    packuswb    m4, m5
%endif
    mova   [r0+r4], m4
    add         r4, r2

    ; second row of the pair: m4/m5 receive the new row sums, m0/m1 (the
    ; sums computed above) become the "previous row" term
    movu        m2, [r1+r4]
    movu        m4, [r1+r4+1]
    mova        m3, m2
    mova        m5, m4
    punpcklbw   m2, m7
    punpcklbw   m4, m7
    punpckhbw   m3, m7
    punpckhbw   m5, m7
    paddusw     m4, m2
    paddusw     m5, m3
    paddusw     m0, m6
    paddusw     m1, m6
    paddusw     m0, m4
    paddusw     m1, m5
    psrlw       m0, 2
    psrlw       m1, 2
%ifidn %1, avg
    mova        m3, [r0+r4]
    packuswb    m0, m1
    PAVGB       m0, m3
%else
    packuswb    m0, m1
%endif
    mova   [r0+r4], m0
    add         r4, r2
    sub        r3d, 2
    jnz .loop
    REP_RET
%endmacro

; Only the avg flavour is expanded for 8-pixel MMX registers here; both
; flavours are expanded for 16-pixel XMM registers.
INIT_MMX mmxext
SET_PIXELS_XY2 avg
INIT_XMM sse2
SET_PIXELS_XY2 put
SET_PIXELS_XY2 avg

; void ff_{put,avg}_pixels{8,16}_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
;
; SSSE3 version of the diagonal half-pel filter.
;
; Macro arguments:
;   %1 = put | avg   put stores the filtered row, avg stores
;                    pavgb(filtered row, existing destination row)
;   %2 = optional    when present, the 16-pixel variant %1_pixels16_xy2 is
;                    defined and %2 is passed to cglobal as its XMM register
;                    count; when absent, %1_pixels8_xy2 is defined.
;
; Method:
;   * pmaddubsw against m5 (loaded from pb_1, presumably the byte value 1
;     replicated) adds each pair of adjacent bytes into one word.  Applied
;     to the row at offset +0 this gives the horizontal sums for the even
;     output pixels, applied to the row at offset +1 those for the odd ones.
;   * The sums of two consecutive rows are added and scaled with pmulhrsw
;     by pw_8192 (presumably the word value 8192 replicated, which makes the
;     operation equal to (x + 2) >> 2).
;   * packuswb leaves the even results in the first half of the register
;     and the odd results in the second half; pshufb with the interleave
;     mask in m4 restores pixel order.
;
; Registers (x86inc argument order):
;   r0  = block, r2 = line_size, r3d = h
;   r1  = pixels + line_size (advanced once before the loop, so [r1+r4] is
;         the source row below output row [r0+r4])
;   r4  = running byte offset of the current output row, starts at 0
;   m0/m1 and m2/m3 = even/odd pair sums of two consecutive source rows;
;         the pairs swap roles between the two half-iterations.
;
; Two rows are produced per loop iteration; the loop exits only when the
; counter reaches exactly zero, so h has to be a non-zero multiple of 2.
%macro SSSE3_PIXELS_XY2 1-2
%if %0 == 2 ; second argument given: 16-pixel XMM variant
cglobal %1_pixels16_xy2, 4,5,%2
    mova        m4, [pb_interleave16]
%else
cglobal %1_pixels8_xy2, 4,5
    mova        m4, [pb_interleave8]
%endif
    mova        m5, [pb_1]
    ; m0/m1 = even/odd pair sums of source row 0
    movu        m0, [r1]
    movu        m1, [r1+1]
    pmaddubsw   m0, m5
    pmaddubsw   m1, m5
    xor         r4, r4
    add         r1, r2
.loop:
    ; m2/m3 = even/odd pair sums of the next source row
    movu        m2, [r1+r4]
    movu        m3, [r1+r4+1]
    pmaddubsw   m2, m5
    pmaddubsw   m3, m5
    paddusw     m0, m2
    paddusw     m1, m3
    pmulhrsw    m0, [pw_8192]
    pmulhrsw    m1, [pw_8192]
%ifidn %1, avg
    mova        m6, [r0+r4]
    packuswb    m0, m1
    pshufb      m0, m4
    pavgb       m0, m6
%else
    packuswb    m0, m1
    pshufb      m0, m4
%endif
    mova   [r0+r4], m0
    add         r4, r2

    ; second row of the pair: m0/m1 receive the new sums, m2/m3 (the sums
    ; computed above) become the "previous row" term
    movu        m0, [r1+r4]
    movu        m1, [r1+r4+1]
    pmaddubsw   m0, m5
    pmaddubsw   m1, m5
    paddusw     m2, m0
    paddusw     m3, m1
    pmulhrsw    m2, [pw_8192]
    pmulhrsw    m3, [pw_8192]
%ifidn %1, avg
    mova        m6, [r0+r4]
    packuswb    m2, m3
    pshufb      m2, m4
    pavgb       m2, m6
%else
    packuswb    m2, m3
    pshufb      m2, m4
%endif
    mova   [r0+r4], m2
    add         r4, r2
    sub        r3d, 2
    jnz .loop
    REP_RET
%endmacro

INIT_MMX ssse3
SSSE3_PIXELS_XY2 put
SSSE3_PIXELS_XY2 avg
INIT_XMM ssse3
SSSE3_PIXELS_XY2 put, 6
SSSE3_PIXELS_XY2 avg, 7
526