1cabdff1aSopenharmony_ci;******************************************************************************
2cabdff1aSopenharmony_ci;* Core video DSP functions
3cabdff1aSopenharmony_ci;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
4cabdff1aSopenharmony_ci;*
5cabdff1aSopenharmony_ci;* This file is part of FFmpeg.
6cabdff1aSopenharmony_ci;*
7cabdff1aSopenharmony_ci;* FFmpeg is free software; you can redistribute it and/or
8cabdff1aSopenharmony_ci;* modify it under the terms of the GNU Lesser General Public
9cabdff1aSopenharmony_ci;* License as published by the Free Software Foundation; either
10cabdff1aSopenharmony_ci;* version 2.1 of the License, or (at your option) any later version.
11cabdff1aSopenharmony_ci;*
12cabdff1aSopenharmony_ci;* FFmpeg is distributed in the hope that it will be useful,
13cabdff1aSopenharmony_ci;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14cabdff1aSopenharmony_ci;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15cabdff1aSopenharmony_ci;* Lesser General Public License for more details.
16cabdff1aSopenharmony_ci;*
17cabdff1aSopenharmony_ci;* You should have received a copy of the GNU Lesser General Public
18cabdff1aSopenharmony_ci;* License along with FFmpeg; if not, write to the Free Software
19cabdff1aSopenharmony_ci;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20cabdff1aSopenharmony_ci;******************************************************************************
21cabdff1aSopenharmony_ci
22cabdff1aSopenharmony_ci%include "libavutil/x86/x86util.asm"
23cabdff1aSopenharmony_ci
24cabdff1aSopenharmony_ciSECTION .text
25cabdff1aSopenharmony_ci
26cabdff1aSopenharmony_ci; slow vertical extension loop function. Works with variable-width, and
27cabdff1aSopenharmony_ci; does per-line reading/writing of source data
28cabdff1aSopenharmony_ci
29cabdff1aSopenharmony_ci%macro V_COPY_ROW 2 ; type (top/body/bottom), h
; Emits a two-level loop that writes %2 rows to dst, one row per outer pass.
;   %1 - row type; prefixes the local labels so the macro can be expanded
;        several times inside one function, and selects whether src moves:
;        only `body` adds src_stride to src after each row
;   %2 - register holding the row count; decremented until it reaches zero
; w is reloaded from r7mp for every row. It is then stepped upward by mmsize
; for as long as it is still below -mmsize (signed compare), i.e. it is used
; as a negative byte offset from srcq/dstq; emu_edge_vvar does `neg r7mp`
; and adds w to src/dst before expanding this macro.
; After the inner loop one more mmsize-byte copy is done at offset -mmsize,
; so the bytes directly before srcq/dstq are always written; that store can
; overlap bytes the inner loop already copied.
; NOTE(review): presumably callers guarantee a row of at least mmsize bytes,
; otherwise the first/last accesses fall outside the row - confirm.
30cabdff1aSopenharmony_ci.%1_y_loop:                                     ; do {
31cabdff1aSopenharmony_ci    mov              wq, r7mp                   ;   initialize w (r7mp = wmp)
32cabdff1aSopenharmony_ci.%1_x_loop:                                     ;   do {
33cabdff1aSopenharmony_ci    movu             m0, [srcq+wq]              ;     m0 = read($mmsize)
34cabdff1aSopenharmony_ci    movu      [dstq+wq], m0                     ;     write(m0, $mmsize)
35cabdff1aSopenharmony_ci    add              wq, mmsize                 ;     w += $mmsize
36cabdff1aSopenharmony_ci    cmp              wq, -mmsize                ;   } while (w < -$mmsize);
37cabdff1aSopenharmony_ci    jl .%1_x_loop
38cabdff1aSopenharmony_ci    movu             m0, [srcq-mmsize]          ;   m0 = read($mmsize) ending at src
39cabdff1aSopenharmony_ci    movu  [dstq-mmsize], m0                     ;   write(m0, $mmsize) ending at dst
40cabdff1aSopenharmony_ci%ifidn %1, body                                 ;   if ($type == body) {
41cabdff1aSopenharmony_ci    add            srcq, src_strideq            ;     src += src_stride
42cabdff1aSopenharmony_ci%endif                                          ;   }
43cabdff1aSopenharmony_ci    add            dstq, dst_strideq            ;   dst += dst_stride
44cabdff1aSopenharmony_ci    dec              %2                         ; } while (--$h);
45cabdff1aSopenharmony_ci    jnz .%1_y_loop
46cabdff1aSopenharmony_ci%endmacro
47cabdff1aSopenharmony_ci
48cabdff1aSopenharmony_ci; .----. <- zero
49cabdff1aSopenharmony_ci; |    |    <- top is copied from first line in body of source
50cabdff1aSopenharmony_ci; |----| <- start_y
51cabdff1aSopenharmony_ci; |    |    <- body is copied verbatim (line-by-line) from source
52cabdff1aSopenharmony_ci; |----| <- end_y
53cabdff1aSopenharmony_ci; |    |    <- bottom is copied from last line in body of source
54cabdff1aSopenharmony_ci; '----' <- bh
55cabdff1aSopenharmony_ciINIT_XMM sse
; emu_edge_vvar(dst, dst_stride, src, src_stride, start_y, end_y, bh, w)
; Variable-width vertical edge emulation built from three V_COPY_ROW
; expansions:
;   top    - start_y rows, each a copy of the row src points at on entry
;            (src is not advanced in this section); skipped if start_y == 0
;   body   - (end_y - start_y) rows copied one-to-one, src advancing by
;            src_stride per row
;   bottom - (bh - end_y) rows, each a copy of the last body row; skipped
;            if that count is zero
; Before the loops, src and dst are advanced by w and w is negated in place
; (r7mp), so V_COPY_ROW addresses every row with negative offsets.
; The body section is a bottom-tested loop and is entered unconditionally.
; NOTE(review): presumably callers guarantee end_y > start_y - confirm.
56cabdff1aSopenharmony_ci%if ARCH_X86_64
57cabdff1aSopenharmony_cicglobal emu_edge_vvar, 7, 8, 1, dst, dst_stride, src, src_stride, \
58cabdff1aSopenharmony_ci                                start_y, end_y, bh, w
59cabdff1aSopenharmony_ci%else ; x86-32
; Fewer registers here: the strides are used through their argument
; operands (r1mp/r3mp) and the remaining arguments are fetched by hand.
60cabdff1aSopenharmony_cicglobal emu_edge_vvar, 1, 6, 1, dst, src, start_y, end_y, bh, w
61cabdff1aSopenharmony_ci%define src_strideq r3mp
62cabdff1aSopenharmony_ci%define dst_strideq r1mp
63cabdff1aSopenharmony_ci    mov            srcq, r2mp
64cabdff1aSopenharmony_ci    mov        start_yq, r4mp
65cabdff1aSopenharmony_ci    mov          end_yq, r5mp
66cabdff1aSopenharmony_ci    mov             bhq, r6mp
67cabdff1aSopenharmony_ci%endif
68cabdff1aSopenharmony_ci    sub             bhq, end_yq                 ; bh    -= end_y   (rows below body)
69cabdff1aSopenharmony_ci    sub          end_yq, start_yq               ; end_y -= start_y (rows in body)
70cabdff1aSopenharmony_ci    add            srcq, r7mp                   ; src += w (r7mp = wmp)
71cabdff1aSopenharmony_ci    add            dstq, r7mp                   ; dst += w (r7mp = wmp)
72cabdff1aSopenharmony_ci    neg            r7mp                         ; w = -w   (r7mp = wmp)
73cabdff1aSopenharmony_ci    test       start_yq, start_yq               ; if (start_y) {
74cabdff1aSopenharmony_ci    jz .body
75cabdff1aSopenharmony_ci    V_COPY_ROW      top, start_yq               ;   v_copy_row(top, start_yq)
76cabdff1aSopenharmony_ci.body:                                          ; }
77cabdff1aSopenharmony_ci    V_COPY_ROW     body, end_yq                 ; v_copy_row(body, end_yq)
78cabdff1aSopenharmony_ci    test            bhq, bhq                    ; if (bh) {
79cabdff1aSopenharmony_ci    jz .end
80cabdff1aSopenharmony_ci    sub            srcq, src_strideq            ;   src -= src_stride (back to last body row)
81cabdff1aSopenharmony_ci    V_COPY_ROW   bottom, bhq                    ;   v_copy_row(bottom, bh)
82cabdff1aSopenharmony_ci.end:                                           ; }
83cabdff1aSopenharmony_ci    RET
84cabdff1aSopenharmony_ci
85cabdff1aSopenharmony_ci%macro hvar_fn 0
; Defines emu_edge_hvar(dst, dst_stride, start_x, n_words, h) for the
; instruction set selected before the expansion; w is declared as a sixth
; register and serves as the inner-loop counter.
; For each of the h rows, the byte at dst[start_x] is replicated into every
; byte of m0, and m0 is then stored over the 2*n_words bytes that start at
; dst, in mmsize-byte steps plus one final store ending at dst+2*n_words.
; Setup: dst is moved to the end of that span and n_words is negated, so the
; inner loop counts a negative word offset up towards zero; start_x is
; rebased by the same amount so [dstq+start_xq] still addresses the original
; dst[start_x].
; NOTE(review): presumably callers guarantee 2*n_words >= mmsize, otherwise
; the stores reach outside the span - confirm.
86cabdff1aSopenharmony_cicglobal emu_edge_hvar, 5, 6, 1, dst, dst_stride, start_x, n_words, h, w
87cabdff1aSopenharmony_ci    lea            dstq, [dstq+n_wordsq*2]
88cabdff1aSopenharmony_ci    neg        n_wordsq
89cabdff1aSopenharmony_ci    lea        start_xq, [start_xq+n_wordsq*2]
90cabdff1aSopenharmony_ci.y_loop:                                        ; do {
91cabdff1aSopenharmony_ci%if cpuflag(avx2)
92cabdff1aSopenharmony_ci    vpbroadcastb     m0, [dstq+start_xq]
93cabdff1aSopenharmony_ci    mov              wq, n_wordsq               ;   initialize w
94cabdff1aSopenharmony_ci%else
95cabdff1aSopenharmony_ci    movzx            wd, byte [dstq+start_xq]   ;   w = read(1)
96cabdff1aSopenharmony_ci    imul             wd, 0x01010101             ;   w *= 0x01010101
97cabdff1aSopenharmony_ci    movd             m0, wd
98cabdff1aSopenharmony_ci    mov              wq, n_wordsq               ;   initialize w
99cabdff1aSopenharmony_ci    pshufd           m0, m0, q0000              ;   splat
100cabdff1aSopenharmony_ci%endif ; avx2
101cabdff1aSopenharmony_ci.x_loop:                                        ;   do {
102cabdff1aSopenharmony_ci    movu    [dstq+wq*2], m0                     ;     write($reg, $mmsize)
103cabdff1aSopenharmony_ci    add              wq, mmsize/2               ;     w += $mmsize/2
104cabdff1aSopenharmony_ci    cmp              wq, -(mmsize/2)            ;   } while (w < -$mmsize/2)
105cabdff1aSopenharmony_ci    jl .x_loop
106cabdff1aSopenharmony_ci    movu  [dstq-mmsize], m0                     ;   write($reg, $mmsize), ends at dst
107cabdff1aSopenharmony_ci    add            dstq, dst_strideq            ;   dst += dst_stride
108cabdff1aSopenharmony_ci    dec              hq                         ; } while (--h)
109cabdff1aSopenharmony_ci    jnz .y_loop
110cabdff1aSopenharmony_ci    RET
111cabdff1aSopenharmony_ci%endmacro
112cabdff1aSopenharmony_ci
113cabdff1aSopenharmony_ciINIT_XMM sse2
; Expand hvar_fn once for SSE2 and, only when HAVE_AVX2_EXTERNAL is set,
; once more for AVX2. Both expansions are made under INIT_XMM; the macro
; itself picks the vpbroadcastb path via cpuflag(avx2).
114cabdff1aSopenharmony_cihvar_fn
115cabdff1aSopenharmony_ci
116cabdff1aSopenharmony_ci%if HAVE_AVX2_EXTERNAL
117cabdff1aSopenharmony_ciINIT_XMM avx2
118cabdff1aSopenharmony_cihvar_fn
119cabdff1aSopenharmony_ci%endif
120cabdff1aSopenharmony_ci
121cabdff1aSopenharmony_ci; macro to read/write a horizontal number of pixels (%2) to/from registers
122cabdff1aSopenharmony_ci; on sse, - fills xmm0-15 for consecutive sets of 16 pixels
123cabdff1aSopenharmony_ci;         - if (%2 & 8)  fills 8 bytes into xmm$next
124cabdff1aSopenharmony_ci;         - if (%2 & 4)  fills 4 bytes into xmm$next
125cabdff1aSopenharmony_ci;         - if (%2 & 3)  fills 1, 2 or 4 bytes in eax
126cabdff1aSopenharmony_ci; on mmx, - fills mm0-7 for consecutive sets of 8 pixels
127cabdff1aSopenharmony_ci;         - if (%2 & 4)  fills 4 bytes into mm$next
128cabdff1aSopenharmony_ci;         - if (%2 & 3)  fills 1, 2 or 4 bytes in eax
129cabdff1aSopenharmony_ci; writing data out is in the same way
130cabdff1aSopenharmony_ci%macro READ_NUM_BYTES 2
; READ_NUM_BYTES type, n
;   %1 - row type (top/body/bottom); not referenced inside this macro
;   %2 - number of bytes to load from srcq; an assemble-time constant, since
;        it drives %rep/%if below
; Emits straight-line loads, largest first. %%off tracks how many leading
; bytes are already covered. Where a tail is not a whole power-of-two chunk
; and the row is long enough, a single load that ENDS at byte %2 is used
; instead (it overlaps bytes already loaded), and %%off jumps to %2.
; WRITE_NUM_BYTES walks the same decision tree, so register N written there
; is register N loaded here.
; NOTE(review): the 8- and 4-byte tails are emitted with `mm` register names
; and the mmx index even when mmsize == 16, while the header comment above
; this macro says "xmm$next" - confirm which wording is intended.
131cabdff1aSopenharmony_ci%assign %%off 0     ; offset in source buffer
132cabdff1aSopenharmony_ci%assign %%mmx_idx 0 ; mmx register index
133cabdff1aSopenharmony_ci%assign %%xmm_idx 0 ; xmm register index
134cabdff1aSopenharmony_ci
; whole mmsize-byte registers at offsets 0, mmsize, 2*mmsize, ...
135cabdff1aSopenharmony_ci%rep %2/mmsize
136cabdff1aSopenharmony_ci%if mmsize == 16
137cabdff1aSopenharmony_ci    movu   xmm %+ %%xmm_idx, [srcq+%%off]
138cabdff1aSopenharmony_ci%assign %%xmm_idx %%xmm_idx+1
139cabdff1aSopenharmony_ci%else ; mmx
140cabdff1aSopenharmony_ci    movu    mm %+ %%mmx_idx, [srcq+%%off]
141cabdff1aSopenharmony_ci%assign %%mmx_idx %%mmx_idx+1
142cabdff1aSopenharmony_ci%endif
143cabdff1aSopenharmony_ci%assign %%off %%off+mmsize
144cabdff1aSopenharmony_ci%endrep ; %2/mmsize
145cabdff1aSopenharmony_ci
; 16-byte mode only: 8..15 bytes left. More than 8 left and n > 16 -> one
; overlapping 16-byte load ending at n; otherwise one 8-byte load.
146cabdff1aSopenharmony_ci%if mmsize == 16
147cabdff1aSopenharmony_ci%if (%2-%%off) >= 8
148cabdff1aSopenharmony_ci%if %2 > 16 && (%2-%%off) > 8
149cabdff1aSopenharmony_ci    movu   xmm %+ %%xmm_idx, [srcq+%2-16]
150cabdff1aSopenharmony_ci%assign %%xmm_idx %%xmm_idx+1
151cabdff1aSopenharmony_ci%assign %%off %2
152cabdff1aSopenharmony_ci%else
153cabdff1aSopenharmony_ci    movq    mm %+ %%mmx_idx, [srcq+%%off]
154cabdff1aSopenharmony_ci%assign %%mmx_idx %%mmx_idx+1
155cabdff1aSopenharmony_ci%assign %%off %%off+8
156cabdff1aSopenharmony_ci%endif
157cabdff1aSopenharmony_ci%endif ; (%2-%%off) >= 8
158cabdff1aSopenharmony_ci%endif
159cabdff1aSopenharmony_ci
; 4..7 bytes left. More than 4 left and n > 8 -> one overlapping 8-byte load
; ending at n; otherwise one 4-byte load.
160cabdff1aSopenharmony_ci%if (%2-%%off) >= 4
161cabdff1aSopenharmony_ci%if %2 > 8 && (%2-%%off) > 4
162cabdff1aSopenharmony_ci    movq    mm %+ %%mmx_idx, [srcq+%2-8]
163cabdff1aSopenharmony_ci%assign %%off %2
164cabdff1aSopenharmony_ci%else
165cabdff1aSopenharmony_ci    movd    mm %+ %%mmx_idx, [srcq+%%off]
166cabdff1aSopenharmony_ci%assign %%off %%off+4
167cabdff1aSopenharmony_ci%endif
168cabdff1aSopenharmony_ci%assign %%mmx_idx %%mmx_idx+1
169cabdff1aSopenharmony_ci%endif ; (%2-%%off) >= 4
170cabdff1aSopenharmony_ci
; 1..3 bytes left. n >= 4 -> one overlapping 4-byte load ending at n.
; Otherwise the bytes go through the `val` general-purpose register:
; 1 -> valb, 2 -> valw, 3 -> last byte rotated up into bits 16..23 of vald
; with the first two bytes in valw.
171cabdff1aSopenharmony_ci%if (%2-%%off) >= 1
172cabdff1aSopenharmony_ci%if %2 >= 4
173cabdff1aSopenharmony_ci    movd mm %+ %%mmx_idx, [srcq+%2-4]
174cabdff1aSopenharmony_ci%elif (%2-%%off) == 1
175cabdff1aSopenharmony_ci    mov            valb, [srcq+%2-1]
176cabdff1aSopenharmony_ci%elif (%2-%%off) == 2
177cabdff1aSopenharmony_ci    mov            valw, [srcq+%2-2]
178cabdff1aSopenharmony_ci%else
179cabdff1aSopenharmony_ci    mov            valb, [srcq+%2-1]
180cabdff1aSopenharmony_ci    ror            vald, 16
181cabdff1aSopenharmony_ci    mov            valw, [srcq+%2-3]
182cabdff1aSopenharmony_ci%endif
183cabdff1aSopenharmony_ci%endif ; (%2-%%off) >= 1
184cabdff1aSopenharmony_ci%endmacro ; READ_NUM_BYTES
185cabdff1aSopenharmony_ci
186cabdff1aSopenharmony_ci%macro WRITE_NUM_BYTES 2
; WRITE_NUM_BYTES type, n
;   %1 - row type (top/body/bottom); only consulted in the 3-byte tail, where
;        anything other than `body` gets vald rotated back after the store
;   %2 - number of bytes to store to dstq; an assemble-time constant
; Store-side mirror of READ_NUM_BYTES: the same sequence of size decisions,
; with each register written to the offset it was loaded from. Overlapping
; stores that end at byte %2 are used for tails exactly where the reader
; used overlapping loads.
187cabdff1aSopenharmony_ci%assign %%off 0     ; offset in destination buffer
188cabdff1aSopenharmony_ci%assign %%mmx_idx 0 ; mmx register index
189cabdff1aSopenharmony_ci%assign %%xmm_idx 0 ; xmm register index
190cabdff1aSopenharmony_ci
; whole mmsize-byte registers at offsets 0, mmsize, 2*mmsize, ...
191cabdff1aSopenharmony_ci%rep %2/mmsize
192cabdff1aSopenharmony_ci%if mmsize == 16
193cabdff1aSopenharmony_ci    movu   [dstq+%%off], xmm %+ %%xmm_idx
194cabdff1aSopenharmony_ci%assign %%xmm_idx %%xmm_idx+1
195cabdff1aSopenharmony_ci%else ; mmx
196cabdff1aSopenharmony_ci    movu   [dstq+%%off], mm %+ %%mmx_idx
197cabdff1aSopenharmony_ci%assign %%mmx_idx %%mmx_idx+1
198cabdff1aSopenharmony_ci%endif
199cabdff1aSopenharmony_ci%assign %%off %%off+mmsize
200cabdff1aSopenharmony_ci%endrep ; %2/mmsize
201cabdff1aSopenharmony_ci
; 16-byte mode only: 8..15 bytes left -> overlapping 16-byte store ending at
; n, or a plain 8-byte store
202cabdff1aSopenharmony_ci%if mmsize == 16
203cabdff1aSopenharmony_ci%if (%2-%%off) >= 8
204cabdff1aSopenharmony_ci%if %2 > 16 && (%2-%%off) > 8
205cabdff1aSopenharmony_ci    movu   [dstq+%2-16], xmm %+ %%xmm_idx
206cabdff1aSopenharmony_ci%assign %%xmm_idx %%xmm_idx+1
207cabdff1aSopenharmony_ci%assign %%off %2
208cabdff1aSopenharmony_ci%else
209cabdff1aSopenharmony_ci    movq   [dstq+%%off], mm %+ %%mmx_idx
210cabdff1aSopenharmony_ci%assign %%mmx_idx %%mmx_idx+1
211cabdff1aSopenharmony_ci%assign %%off %%off+8
212cabdff1aSopenharmony_ci%endif
213cabdff1aSopenharmony_ci%endif ; (%2-%%off) >= 8
214cabdff1aSopenharmony_ci%endif
215cabdff1aSopenharmony_ci
; 4..7 bytes left -> overlapping 8-byte store ending at n, or a plain
; 4-byte store
216cabdff1aSopenharmony_ci%if (%2-%%off) >= 4
217cabdff1aSopenharmony_ci%if %2 > 8 && (%2-%%off) > 4
218cabdff1aSopenharmony_ci    movq    [dstq+%2-8], mm %+ %%mmx_idx
219cabdff1aSopenharmony_ci%assign %%off %2
220cabdff1aSopenharmony_ci%else
221cabdff1aSopenharmony_ci    movd   [dstq+%%off], mm %+ %%mmx_idx
222cabdff1aSopenharmony_ci%assign %%off %%off+4
223cabdff1aSopenharmony_ci%endif
224cabdff1aSopenharmony_ci%assign %%mmx_idx %%mmx_idx+1
225cabdff1aSopenharmony_ci%endif ; (%2-%%off) >= 4
226cabdff1aSopenharmony_ci
; 1..3 bytes left -> overlapping 4-byte store ending at n when n >= 4,
; otherwise byte/word stores from the `val` register
227cabdff1aSopenharmony_ci%if (%2-%%off) >= 1
228cabdff1aSopenharmony_ci%if %2 >= 4
229cabdff1aSopenharmony_ci    movd    [dstq+%2-4], mm %+ %%mmx_idx
230cabdff1aSopenharmony_ci%elif (%2-%%off) == 1
231cabdff1aSopenharmony_ci    mov     [dstq+%2-1], valb
232cabdff1aSopenharmony_ci%elif (%2-%%off) == 2
233cabdff1aSopenharmony_ci    mov     [dstq+%2-2], valw
234cabdff1aSopenharmony_ci%else
; 3 bytes: low word first, then rotate the third byte down into valb
235cabdff1aSopenharmony_ci    mov     [dstq+%2-3], valw
236cabdff1aSopenharmony_ci    ror            vald, 16
237cabdff1aSopenharmony_ci    mov     [dstq+%2-1], valb
; top/bottom rows reuse the same vald for every row without re-reading it,
; so undo the rotation; body rows are re-read before each write
238cabdff1aSopenharmony_ci%ifnidn %1, body
239cabdff1aSopenharmony_ci    ror            vald, 16
240cabdff1aSopenharmony_ci%endif
241cabdff1aSopenharmony_ci%endif
242cabdff1aSopenharmony_ci%endif ; (%2-%%off) >= 1
243cabdff1aSopenharmony_ci%endmacro ; WRITE_NUM_BYTES
244cabdff1aSopenharmony_ci
245cabdff1aSopenharmony_ci; vertical top/bottom extend and body copy fast loops
246cabdff1aSopenharmony_ci; these are function pointers to set-width line copy functions, i.e.
247cabdff1aSopenharmony_ci; they read a fixed number of pixels into set registers, and write
248cabdff1aSopenharmony_ci; those out into the destination buffer
249cabdff1aSopenharmony_ci%macro VERTICAL_EXTEND 2
; VERTICAL_EXTEND first_n, last_n
; Generates one function emu_edge_vfixN for every width N in the inclusive
; range %1..%2. Each function takes
;   (dst, dst_stride, src, src_stride, start_y, end_y, bh)
; and writes bh rows of N bytes to dst:
;   - start_y rows that repeat the row src points at on entry
;   - (end_y - start_y) rows copied from src, src advancing by src_stride
;   - (bh - end_y) rows that repeat the last copied row
; Widths 1..3 additionally declare a `val` general-purpose register, because
; READ_/WRITE_NUM_BYTES move such widths through valb/valw/vald.
; The body loop is bottom-tested and entered unconditionally.
; NOTE(review): presumably callers guarantee end_y > start_y - confirm.
250cabdff1aSopenharmony_ci%assign %%n %1
251cabdff1aSopenharmony_ci%rep 1+%2-%1
252cabdff1aSopenharmony_ci%if %%n <= 3
253cabdff1aSopenharmony_ci%if ARCH_X86_64
254cabdff1aSopenharmony_cicglobal emu_edge_vfix %+ %%n, 6, 8, 0, dst, dst_stride, src, src_stride, \
255cabdff1aSopenharmony_ci                                       start_y, end_y, val, bh
256cabdff1aSopenharmony_ci    mov             bhq, r6mp                   ; r6mp = bhmp
257cabdff1aSopenharmony_ci%else ; x86-32
; val is named first here and dst is then loaded by hand from r0mp.
; NOTE(review): presumably so that valb is an encodable byte register on
; x86-32 - confirm.
258cabdff1aSopenharmony_cicglobal emu_edge_vfix %+ %%n, 0, 6, 0, val, dst, src, start_y, end_y, bh
259cabdff1aSopenharmony_ci    mov            dstq, r0mp
260cabdff1aSopenharmony_ci    mov            srcq, r2mp
261cabdff1aSopenharmony_ci    mov        start_yq, r4mp
262cabdff1aSopenharmony_ci    mov          end_yq, r5mp
263cabdff1aSopenharmony_ci    mov             bhq, r6mp
264cabdff1aSopenharmony_ci%define dst_strideq r1mp
265cabdff1aSopenharmony_ci%define src_strideq r3mp
266cabdff1aSopenharmony_ci%endif ; x86-64/32
267cabdff1aSopenharmony_ci%else
268cabdff1aSopenharmony_ci%if ARCH_X86_64
269cabdff1aSopenharmony_cicglobal emu_edge_vfix %+ %%n, 7, 7, 1, dst, dst_stride, src, src_stride, \
270cabdff1aSopenharmony_ci                                       start_y, end_y, bh
271cabdff1aSopenharmony_ci%else ; x86-32
272cabdff1aSopenharmony_cicglobal emu_edge_vfix %+ %%n, 1, 5, 1, dst, src, start_y, end_y, bh
273cabdff1aSopenharmony_ci    mov            srcq, r2mp
274cabdff1aSopenharmony_ci    mov        start_yq, r4mp
275cabdff1aSopenharmony_ci    mov          end_yq, r5mp
276cabdff1aSopenharmony_ci    mov             bhq, r6mp
277cabdff1aSopenharmony_ci%define dst_strideq r1mp
278cabdff1aSopenharmony_ci%define src_strideq r3mp
279cabdff1aSopenharmony_ci%endif ; x86-64/32
280cabdff1aSopenharmony_ci%endif
281cabdff1aSopenharmony_ci    ; FIXME move this to c wrapper?
282cabdff1aSopenharmony_ci    sub             bhq, end_yq                 ; bh    -= end_y   (rows below body)
283cabdff1aSopenharmony_ci    sub          end_yq, start_yq               ; end_y -= start_y (rows in body)
284cabdff1aSopenharmony_ci
285cabdff1aSopenharmony_ci    ; extend pixels above body
286cabdff1aSopenharmony_ci    test       start_yq, start_yq               ; if (start_y) {
287cabdff1aSopenharmony_ci    jz .body_loop
288cabdff1aSopenharmony_ci    READ_NUM_BYTES  top, %%n                    ;   $variable_regs = read($n), once
289cabdff1aSopenharmony_ci.top_loop:                                      ;   do {
290cabdff1aSopenharmony_ci    WRITE_NUM_BYTES top, %%n                    ;     write($variable_regs, $n)
291cabdff1aSopenharmony_ci    add            dstq, dst_strideq            ;     dst += dst_stride
292cabdff1aSopenharmony_ci    dec        start_yq                         ;   } while (--start_y)
293cabdff1aSopenharmony_ci    jnz .top_loop                               ; }
294cabdff1aSopenharmony_ci
295cabdff1aSopenharmony_ci    ; copy body pixels
296cabdff1aSopenharmony_ci.body_loop:                                     ; do {
297cabdff1aSopenharmony_ci    READ_NUM_BYTES  body, %%n                   ;   $variable_regs = read($n)
298cabdff1aSopenharmony_ci    WRITE_NUM_BYTES body, %%n                   ;   write($variable_regs, $n)
299cabdff1aSopenharmony_ci    add            dstq, dst_strideq            ;   dst += dst_stride
300cabdff1aSopenharmony_ci    add            srcq, src_strideq            ;   src += src_stride
301cabdff1aSopenharmony_ci    dec          end_yq                         ; } while (--end_y)
302cabdff1aSopenharmony_ci    jnz .body_loop
303cabdff1aSopenharmony_ci
304cabdff1aSopenharmony_ci    ; copy bottom pixels
305cabdff1aSopenharmony_ci    test            bhq, bhq                    ; if (bh) {
306cabdff1aSopenharmony_ci    jz .end
307cabdff1aSopenharmony_ci    sub            srcq, src_strideq            ;   src -= src_stride (back to last body row)
308cabdff1aSopenharmony_ci    READ_NUM_BYTES  bottom, %%n                 ;   $variable_regs = read($n), once
309cabdff1aSopenharmony_ci.bottom_loop:                                   ;   do {
310cabdff1aSopenharmony_ci    WRITE_NUM_BYTES bottom, %%n                 ;     write($variable_regs, $n)
311cabdff1aSopenharmony_ci    add            dstq, dst_strideq            ;     dst += dst_stride
312cabdff1aSopenharmony_ci    dec             bhq                         ;   } while (--bh)
313cabdff1aSopenharmony_ci    jnz .bottom_loop                            ; }
314cabdff1aSopenharmony_ci
315cabdff1aSopenharmony_ci.end:
316cabdff1aSopenharmony_ci    RET
317cabdff1aSopenharmony_ci%assign %%n %%n+1
318cabdff1aSopenharmony_ci%endrep ; 1+%2-%1
319cabdff1aSopenharmony_ci%endmacro ; VERTICAL_EXTEND
320cabdff1aSopenharmony_ci
321cabdff1aSopenharmony_ciINIT_MMX mmx
; fixed-width vertical functions: widths 1..15 are generated under INIT_MMX,
; widths 16..22 under INIT_XMM sse
322cabdff1aSopenharmony_ciVERTICAL_EXTEND 1, 15
323cabdff1aSopenharmony_ci
324cabdff1aSopenharmony_ciINIT_XMM sse
325cabdff1aSopenharmony_ciVERTICAL_EXTEND 16, 22
326cabdff1aSopenharmony_ci
327cabdff1aSopenharmony_ci; left/right (horizontal) fast extend functions
328cabdff1aSopenharmony_ci; these are essentially identical to the vertical extend ones above,
329cabdff1aSopenharmony_ci; just left/right separated because number of pixels to extend is
330cabdff1aSopenharmony_ci; obviously not the same on both sides.
331cabdff1aSopenharmony_ci
332cabdff1aSopenharmony_ci%macro READ_V_PIXEL 2
; READ_V_PIXEL n, mem
;   %1 - number of bytes WRITE_V_PIXEL will store afterwards
;   %2 - memory operand of the single source byte
; Replicates that byte for the following WRITE_V_PIXEL:
;   avx2      - broadcast straight into m0; vald is not used
;   otherwise - vald = byte * 0x01010101 (byte repeated in all four bytes);
;               for n >= 8 the dword is also moved to m0 and repeated across
;               the register (pshufd for 16-byte, punpckldq for 8-byte regs)
333cabdff1aSopenharmony_ci%if cpuflag(avx2)
334cabdff1aSopenharmony_ci    vpbroadcastb     m0, %2
335cabdff1aSopenharmony_ci%else
336cabdff1aSopenharmony_ci    movzx          vald, byte %2
337cabdff1aSopenharmony_ci    imul           vald, 0x01010101
338cabdff1aSopenharmony_ci%if %1 >= 8
339cabdff1aSopenharmony_ci    movd             m0, vald
340cabdff1aSopenharmony_ci%if mmsize == 16
341cabdff1aSopenharmony_ci    pshufd           m0, m0, q0000
342cabdff1aSopenharmony_ci%else
343cabdff1aSopenharmony_ci    punpckldq        m0, m0
344cabdff1aSopenharmony_ci%endif ; mmsize == 16
345cabdff1aSopenharmony_ci%endif ; %1 >= 8
346cabdff1aSopenharmony_ci%endif ; avx2
347cabdff1aSopenharmony_ci%endmacro ; READ_V_PIXEL
348cabdff1aSopenharmony_ci
349cabdff1aSopenharmony_ci%macro WRITE_V_PIXEL 2
; WRITE_V_PIXEL n, base
;   %1 - number of bytes to fill; an assemble-time constant
;   %2 - base address expression of the destination
; Stores the value prepared by READ_V_PIXEL over %1 bytes starting at %2.
; %%off tracks how many leading bytes are already written.
;   n >= 8 - whole m0 stores, then 8/4-byte tails from m0; where a tail is
;            not a whole chunk, one overlapping store ending at byte n
;   n <  8 - n/4 dword stores from vald
; A final 2-byte remainder is written from valw, or under avx2 (where no
; val register is declared by H_EXTEND) by a 4-byte store from m0 placed two
; bytes earlier, overlapping bytes already written.
; Only remainders of 0 or 2 are handled at the end; H_EXTEND in this file
; steps n by 2.
350cabdff1aSopenharmony_ci%assign %%off 0
351cabdff1aSopenharmony_ci
352cabdff1aSopenharmony_ci%if %1 >= 8
353cabdff1aSopenharmony_ci
354cabdff1aSopenharmony_ci%rep %1/mmsize
355cabdff1aSopenharmony_ci    movu     [%2+%%off], m0
356cabdff1aSopenharmony_ci%assign %%off %%off+mmsize
357cabdff1aSopenharmony_ci%endrep ; %1/mmsize
358cabdff1aSopenharmony_ci
359cabdff1aSopenharmony_ci%if mmsize == 16
360cabdff1aSopenharmony_ci%if %1-%%off >= 8
361cabdff1aSopenharmony_ci%if %1 > 16 && %1-%%off > 8
362cabdff1aSopenharmony_ci    movu     [%2+%1-16], m0
363cabdff1aSopenharmony_ci%assign %%off %1
364cabdff1aSopenharmony_ci%else
365cabdff1aSopenharmony_ci    movq     [%2+%%off], m0
366cabdff1aSopenharmony_ci%assign %%off %%off+8
367cabdff1aSopenharmony_ci%endif
368cabdff1aSopenharmony_ci%endif ; %1-%%off >= 8
369cabdff1aSopenharmony_ci%endif ; mmsize == 16
370cabdff1aSopenharmony_ci
371cabdff1aSopenharmony_ci%if %1-%%off >= 4
372cabdff1aSopenharmony_ci%if %1 > 8 && %1-%%off > 4
373cabdff1aSopenharmony_ci    movq      [%2+%1-8], m0
374cabdff1aSopenharmony_ci%assign %%off %1
375cabdff1aSopenharmony_ci%else
376cabdff1aSopenharmony_ci    movd     [%2+%%off], m0
377cabdff1aSopenharmony_ci%assign %%off %%off+4
378cabdff1aSopenharmony_ci%endif
379cabdff1aSopenharmony_ci%endif ; %1-%%off >= 4
380cabdff1aSopenharmony_ci
381cabdff1aSopenharmony_ci%else ; %1 < 8
382cabdff1aSopenharmony_ci
383cabdff1aSopenharmony_ci%rep %1/4
384cabdff1aSopenharmony_ci    mov      [%2+%%off], vald
385cabdff1aSopenharmony_ci%assign %%off %%off+4
386cabdff1aSopenharmony_ci%endrep ; %1/4
387cabdff1aSopenharmony_ci
388cabdff1aSopenharmony_ci%endif ; %1 >=/< 8
389cabdff1aSopenharmony_ci
390cabdff1aSopenharmony_ci%if %1-%%off == 2
391cabdff1aSopenharmony_ci%if cpuflag(avx2)
392cabdff1aSopenharmony_ci    movd     [%2+%%off-2], m0
393cabdff1aSopenharmony_ci%else
394cabdff1aSopenharmony_ci    mov      [%2+%%off], valw
395cabdff1aSopenharmony_ci%endif ; avx2
396cabdff1aSopenharmony_ci%endif ; %1-%%off == 2
397cabdff1aSopenharmony_ci%endmacro ; WRITE_V_PIXEL
398cabdff1aSopenharmony_ci
399cabdff1aSopenharmony_ci%macro H_EXTEND 2
; H_EXTEND first_n, last_n
; Generates emu_edge_hfixN(dst, dst_stride, start_x, bh) for
; N = %1, %1+2, %1+4, ... (1+(%2-%1)/2 functions in total).
; Each function runs over bh rows; per row it reads the single byte at
; dst[start_x] and writes it over the N bytes starting at dst.
; Non-avx2 builds declare a fifth register, val, which READ_V_PIXEL and
; WRITE_V_PIXEL use; the avx2 build keeps the value in m0 only.
; The row loop is bottom-tested, so one row is always written.
; NOTE(review): presumably callers never pass bh == 0 - confirm.
400cabdff1aSopenharmony_ci%assign %%n %1
401cabdff1aSopenharmony_ci%rep 1+(%2-%1)/2
402cabdff1aSopenharmony_ci%if cpuflag(avx2)
403cabdff1aSopenharmony_cicglobal emu_edge_hfix %+ %%n, 4, 4, 1, dst, dst_stride, start_x, bh
404cabdff1aSopenharmony_ci%else
405cabdff1aSopenharmony_cicglobal emu_edge_hfix %+ %%n, 4, 5, 1, dst, dst_stride, start_x, bh, val
406cabdff1aSopenharmony_ci%endif
407cabdff1aSopenharmony_ci.loop_y:                                        ; do {
408cabdff1aSopenharmony_ci    READ_V_PIXEL    %%n, [dstq+start_xq]        ;   $variable_regs = splat(dst[start_x])
409cabdff1aSopenharmony_ci    WRITE_V_PIXEL   %%n, dstq                   ;   write($variable_regs, $n)
410cabdff1aSopenharmony_ci    add            dstq, dst_strideq            ;   dst += dst_stride
411cabdff1aSopenharmony_ci    dec             bhq                         ; } while (--bh)
412cabdff1aSopenharmony_ci    jnz .loop_y
413cabdff1aSopenharmony_ci    RET
414cabdff1aSopenharmony_ci%assign %%n %%n+2
415cabdff1aSopenharmony_ci%endrep ; 1+(%2-%1)/2
416cabdff1aSopenharmony_ci%endmacro ; H_EXTEND
417cabdff1aSopenharmony_ci
418cabdff1aSopenharmony_ciINIT_MMX mmx
; fixed-width horizontal functions, even widths only:
;   mmx  - widths 2..14
;   sse2 - widths 16..22
;   avx2 - widths 8..22, only when HAVE_AVX2_EXTERNAL is set
419cabdff1aSopenharmony_ciH_EXTEND 2, 14
420cabdff1aSopenharmony_ci
421cabdff1aSopenharmony_ciINIT_XMM sse2
422cabdff1aSopenharmony_ciH_EXTEND 16, 22
423cabdff1aSopenharmony_ci
424cabdff1aSopenharmony_ci%if HAVE_AVX2_EXTERNAL
425cabdff1aSopenharmony_ciINIT_XMM avx2
426cabdff1aSopenharmony_ciH_EXTEND 8, 22
427cabdff1aSopenharmony_ci%endif
428cabdff1aSopenharmony_ci
429cabdff1aSopenharmony_ciINIT_MMX mmxext
; prefetch(buf, stride, h)
; Issues one prefetcht0 per row, at buf, buf+stride, buf+2*stride, ...
; The loop is bottom-tested on the 32-bit view of h with a signed
; greater-than check: the first prefetch is always issued, and the loop
; stops as soon as the decremented h is zero or negative.
430cabdff1aSopenharmony_cicglobal prefetch, 3, 3, 0, buf, stride, h
431cabdff1aSopenharmony_ci.loop:                                          ; do {
432cabdff1aSopenharmony_ci    prefetcht0 [bufq]                           ;   prefetch(buf)
433cabdff1aSopenharmony_ci    add      bufq, strideq                      ;   buf += stride
434cabdff1aSopenharmony_ci    dec        hd                               ; } while (--h > 0)
435cabdff1aSopenharmony_ci    jg .loop
436cabdff1aSopenharmony_ci    REP_RET
437