;******************************************************************************
;* Core video DSP functions
;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

; slow vertical extension loop function. Works with variable-width, and
; does per-line reading/writing of source data
;
; V_COPY_ROW type, h
;   %1 = row type (top/body/bottom). It makes the local labels unique and
;        selects whether src advances: only "body" adds src_stride per row,
;        so top/bottom keep re-reading the same source row.
;   %2 = register holding the number of rows; decremented down to zero.
;        The row loop is a do/while, so it runs at least once.
; Expects the state prepared by emu_edge_vvar: srcq/dstq point at the END of
; the current row and r7mp holds the negated width, so [srcq+wq] walks the
; row while wq counts up towards zero.
; A row is copied in mmsize-byte chunks; afterwards the last mmsize bytes of
; the row are copied once more, which covers any tail and may overlap bytes
; that were already written.
; NOTE(review): this presumably requires width >= mmsize, otherwise the
;               first chunk would run past the row - confirm with the caller.
; Clobbers: wq, m0, flags.

%macro V_COPY_ROW 2 ; type (top/body/bottom), h
.%1_y_loop:                                     ; do {
    mov              wq, r7mp                   ;   w = -width (r7mp = wmp)
.%1_x_loop:                                     ;   do {
    movu             m0, [srcq+wq]              ;     m0 = read($mmsize)
    movu      [dstq+wq], m0                     ;     write(m0, $mmsize)
    add              wq, mmsize                 ;     w += $mmsize
    cmp              wq, -mmsize                ;   } while (w < -$mmsize);
    jl .%1_x_loop
    movu             m0, [srcq-mmsize]          ;   m0 = read(last $mmsize bytes)
    movu  [dstq-mmsize], m0                     ;   write(m0, $mmsize)
%ifidn %1, body                                 ;   if ($type == body) {
    add            srcq, src_strideq            ;     src += src_stride
%endif                                          ;   }
    add            dstq, dst_strideq            ;   dst += dst_stride
    dec              %2                         ; } while (--$h);
    jnz .%1_y_loop
%endmacro

;               .----. <- zero
;               |    |    <- top is copied from first line in body of source
;               |----| <- start_y
;               |    |    <- body is copied verbatim (line-by-line) from source
;               |----| <- end_y
;               |    |    <- bottom is copied from last line in body of source
;               '----' <- bh
;
; emu_edge_vvar(dst, dst_stride, src, src_stride, start_y, end_y, bh, w)
;   Writes bh rows of w bytes to dst: start_y rows replicating the first
;   source row, (end_y - start_y) rows copied from consecutive source rows,
;   and (bh - end_y) rows replicating the last source row that was copied.
;   w is never loaded into a register as an argument; it is used (and
;   negated in place) through its argument slot r7mp, while the wq register
;   serves as the running column offset.
;   On x86-32 only dst is loaded by cglobal; the strides are used directly
;   from their argument slots via the src_strideq/dst_strideq defines.
;   NOTE(review): the body loop is unconditional, so this presumably relies
;                 on end_y > start_y - confirm with the caller.
INIT_XMM sse
%if ARCH_X86_64
cglobal emu_edge_vvar, 7, 8, 1, dst, dst_stride, src, src_stride, \
                                start_y, end_y, bh, w
%else ; x86-32
cglobal emu_edge_vvar, 1, 6, 1, dst, src, start_y, end_y, bh, w
%define src_strideq r3mp
%define dst_strideq r1mp
    mov            srcq, r2mp
    mov        start_yq, r4mp
    mov          end_yq, r5mp
    mov             bhq, r6mp
%endif
    sub             bhq, end_yq                 ; bh    -= end_y   (bottom rows)
    sub          end_yq, start_yq               ; end_y -= start_y (body rows)
    add            srcq, r7mp                   ; src += w (r7mp = wmp)
    add            dstq, r7mp                   ; dst += w (r7mp = wmp)
    neg            r7mp                         ; w = -w   (r7mp = wmp)
    test       start_yq, start_yq               ; if (start_y) {
    jz .body
    V_COPY_ROW      top, start_yq               ;   v_copy_row(top, start_y)
.body:                                          ; }
    V_COPY_ROW     body, end_yq                 ; v_copy_row(body, end_y)
    test            bhq, bhq                    ; if (bh) {
    jz .end
    sub            srcq, src_strideq            ;   src -= src_stride (back to last body row)
    V_COPY_ROW   bottom, bhq                    ;   v_copy_row(bottom, bh)
.end:                                           ; }
    RET

; emu_edge_hvar(dst, dst_stride, start_x, n_words, h)
;   For each of h rows, reads the byte at dst[start_x] and replicates it over
;   the 2*n_words bytes starting at dst.
;   Setup: dst is advanced to the end of the fill area and n_words negated so
;   that [dstq+wq*2] walks the area while wq counts up towards zero; start_x
;   is rebased by the same amount so [dstq+start_xq] still addresses the
;   original dst+start_x.
;   As in V_COPY_ROW, the last mmsize bytes are stored once more after the
;   loop to cover the tail.
;   NOTE(review): presumably requires 2*n_words >= mmsize - confirm with the
;                 caller.
;   The 6th register (w) is scratch; it is not an argument.
%macro hvar_fn 0
cglobal emu_edge_hvar, 5, 6, 1, dst, dst_stride, start_x, n_words, h, w
    lea            dstq, [dstq+n_wordsq*2]
    neg        n_wordsq
    lea        start_xq, [start_xq+n_wordsq*2]
.y_loop:                                        ; do {
%if cpuflag(avx2)
    vpbroadcastb     m0, [dstq+start_xq]        ;   m0 = splat(read(1))
    mov              wq, n_wordsq               ;   initialize w
%else
    movzx            wd, byte [dstq+start_xq]   ;   w = read(1) (w used as temp)
    imul             wd, 0x01010101             ;   w *= 0x01010101 (4 copies)
    movd             m0, wd
    mov              wq, n_wordsq               ;   initialize w
    pshufd           m0, m0, q0000              ;   splat
%endif ; avx2
.x_loop:                                        ;   do {
    movu    [dstq+wq*2], m0                     ;     write($reg, $mmsize)
    add              wq, mmsize/2               ;     w += $mmsize/2
    cmp              wq, -(mmsize/2)            ;   } while (w < -$mmsize/2)
    jl .x_loop
    movu  [dstq-mmsize], m0                     ;   write($reg, $mmsize)
    add            dstq, dst_strideq            ;   dst += dst_stride
    dec              hq                         ; } while (--h)
    jnz .y_loop
    RET
%endmacro

INIT_XMM sse2
hvar_fn

%if HAVE_AVX2_EXTERNAL
INIT_XMM avx2
hvar_fn
%endif

; macro to read/write a horizontal number of pixels (%2) to/from registers
; on sse, - fills xmm0-15 for consecutive sets of 16 pixels
;         - if (%2 & 8)  fills 8 bytes into xmm$next
;         - if (%2 & 4)  fills 4 bytes into xmm$next
;         - if (%2 & 3)  fills 1, 2 or 4 bytes in eax
; on mmx, - fills mm0-7 for consecutive sets of 8 pixels
;         - if (%2 & 4)  fills 4 bytes into mm$next
;         - if (%2 & 3)  fills 1, 2 or 4 bytes in eax
; writing data out is in the same way
;
; READ_NUM_BYTES type, n
;   %1 = row type (top/body/bottom); not used by the read side.
;   %2 = number of bytes to load from [srcq].
;   Whenever a remainder can be covered by a wider load that ends exactly at
;   byte %2 (and so overlaps bytes already loaded), that load is used instead
;   of several narrower ones. Only for %2 <= 3 is the "val" GPR used; the
;   3-byte case keeps byte 2 in bits 16-23 and bytes 0-1 in the low word.
;   WRITE_NUM_BYTES must be given the same %2 so both macros assign the same
;   registers.
%macro READ_NUM_BYTES 2
%assign %%off 0     ; offset in source buffer
%assign %%mmx_idx 0 ; mmx register index
%assign %%xmm_idx 0 ; xmm register index

%rep %2/mmsize
%if mmsize == 16
    movu   xmm %+ %%xmm_idx, [srcq+%%off]
%assign %%xmm_idx %%xmm_idx+1
%else ; mmx
    movu    mm %+ %%mmx_idx, [srcq+%%off]
%assign %%mmx_idx %%mmx_idx+1
%endif
%assign %%off %%off+mmsize
%endrep ; %2/mmsize

%if mmsize == 16
%if (%2-%%off) >= 8
%if %2 > 16 && (%2-%%off) > 8
    movu   xmm %+ %%xmm_idx, [srcq+%2-16]       ; overlapping load of last 16 bytes
%assign %%xmm_idx %%xmm_idx+1
%assign %%off %2
%else
    movq    mm %+ %%mmx_idx, [srcq+%%off]
%assign %%mmx_idx %%mmx_idx+1
%assign %%off %%off+8
%endif
%endif ; (%2-%%off) >= 8
%endif

%if (%2-%%off) >= 4
%if %2 > 8 && (%2-%%off) > 4
    movq    mm %+ %%mmx_idx, [srcq+%2-8]        ; overlapping load of last 8 bytes
%assign %%off %2
%else
    movd    mm %+ %%mmx_idx, [srcq+%%off]
%assign %%off %%off+4
%endif
%assign %%mmx_idx %%mmx_idx+1
%endif ; (%2-%%off) >= 4

%if (%2-%%off) >= 1
%if %2 >= 4
    movd mm %+ %%mmx_idx, [srcq+%2-4]           ; overlapping load of last 4 bytes
%elif (%2-%%off) == 1
    mov            valb, [srcq+%2-1]
%elif (%2-%%off) == 2
    mov            valw, [srcq+%2-2]
%else
    mov            valb, [srcq+%2-1]            ; byte 2 ...
    ror            vald, 16                     ; ... moved to bits 16-23
    mov            valw, [srcq+%2-3]            ; bytes 0-1 in the low word
%endif
%endif ; (%2-%%off) >= 1
%endmacro ; READ_NUM_BYTES

; WRITE_NUM_BYTES type, n
;   Mirror image of READ_NUM_BYTES: stores the %2 bytes held in the registers
;   filled by READ_NUM_BYTES (same %2) to [dstq], using the same register
;   assignment and the same overlapping tail accesses.
;   %1 = row type. It only matters in the 3-byte case: for top/bottom the
;        registers are loaded once and written on every loop iteration, so
;        val is rotated back to its loaded layout after the store; for body
;        the data is re-read before every write and the rotate is skipped.
%macro WRITE_NUM_BYTES 2
%assign %%off 0     ; offset in destination buffer
%assign %%mmx_idx 0 ; mmx register index
%assign %%xmm_idx 0 ; xmm register index

%rep %2/mmsize
%if mmsize == 16
    movu   [dstq+%%off], xmm %+ %%xmm_idx
%assign %%xmm_idx %%xmm_idx+1
%else ; mmx
    movu   [dstq+%%off], mm %+ %%mmx_idx
%assign %%mmx_idx %%mmx_idx+1
%endif
%assign %%off %%off+mmsize
%endrep ; %2/mmsize

%if mmsize == 16
%if (%2-%%off) >= 8
%if %2 > 16 && (%2-%%off) > 8
    movu   [dstq+%2-16], xmm %+ %%xmm_idx       ; overlapping store of last 16 bytes
%assign %%xmm_idx %%xmm_idx+1
%assign %%off %2
%else
    movq   [dstq+%%off], mm %+ %%mmx_idx
%assign %%mmx_idx %%mmx_idx+1
%assign %%off %%off+8
%endif
%endif ; (%2-%%off) >= 8
%endif

%if (%2-%%off) >= 4
%if %2 > 8 && (%2-%%off) > 4
    movq    [dstq+%2-8], mm %+ %%mmx_idx        ; overlapping store of last 8 bytes
%assign %%off %2
%else
    movd   [dstq+%%off], mm %+ %%mmx_idx
%assign %%off %%off+4
%endif
%assign %%mmx_idx %%mmx_idx+1
%endif ; (%2-%%off) >= 4

%if (%2-%%off) >= 1
%if %2 >= 4
    movd    [dstq+%2-4], mm %+ %%mmx_idx        ; overlapping store of last 4 bytes
%elif (%2-%%off) == 1
    mov     [dstq+%2-1], valb
%elif (%2-%%off) == 2
    mov     [dstq+%2-2], valw
%else
    mov     [dstq+%2-3], valw                   ; bytes 0-1
    ror            vald, 16                     ; bring byte 2 into valb
    mov     [dstq+%2-1], valb                   ; byte 2
%ifnidn %1, body
    ror            vald, 16                     ; restore layout for next iteration
%endif
%endif
%endif ; (%2-%%off) >= 1
%endmacro ; WRITE_NUM_BYTES

; vertical top/bottom extend and body copy fast loops
; these are function pointers to set-width line copy functions, i.e.
; they read a fixed number of pixels into set registers, and write
; those out into the destination buffer
;
; VERTICAL_EXTEND first, last
;   Emits one function emu_edge_vfix<n> for every width n in [%1, %2].
;   Arguments (by position): dst, dst_stride, src, src_stride, start_y,
;   end_y, bh - same row layout as emu_edge_vvar, with the width fixed to n.
;   Widths n <= 3 are moved through the "val" GPR, so those variants declare
;   an extra register for it and load the displaced argument(s) by hand from
;   their rNmp slots (bh on x86-64; every argument on x86-32, where val is
;   given the first register name).
;   On x86-32 the strides are used directly from their argument slots.
;   NOTE(review): as in emu_edge_vvar the body loop is unconditional, so
;                 end_y > start_y is presumably guaranteed by the caller.
%macro VERTICAL_EXTEND 2
%assign %%n %1
%rep 1+%2-%1
%if %%n <= 3
%if ARCH_X86_64
cglobal emu_edge_vfix %+ %%n, 6, 8, 0, dst, dst_stride, src, src_stride, \
                                       start_y, end_y, val, bh
    mov             bhq, r6mp                   ; r6mp = bhmp
%else ; x86-32
cglobal emu_edge_vfix %+ %%n, 0, 6, 0, val, dst, src, start_y, end_y, bh
    mov            dstq, r0mp
    mov            srcq, r2mp
    mov        start_yq, r4mp
    mov          end_yq, r5mp
    mov             bhq, r6mp
%define dst_strideq r1mp
%define src_strideq r3mp
%endif ; x86-64/32
%else
%if ARCH_X86_64
cglobal emu_edge_vfix %+ %%n, 7, 7, 1, dst, dst_stride, src, src_stride, \
                                       start_y, end_y, bh
%else ; x86-32
cglobal emu_edge_vfix %+ %%n, 1, 5, 1, dst, src, start_y, end_y, bh
    mov            srcq, r2mp
    mov        start_yq, r4mp
    mov          end_yq, r5mp
    mov             bhq, r6mp
%define dst_strideq r1mp
%define src_strideq r3mp
%endif ; x86-64/32
%endif
    ; FIXME move this to c wrapper?
    sub             bhq, end_yq                 ; bh    -= end_y
    sub          end_yq, start_yq               ; end_y -= start_y

    ; extend pixels above body
    test       start_yq, start_yq               ; if (start_y) {
    jz .body_loop
    READ_NUM_BYTES  top, %%n                    ;   $variable_regs = read($n)
.top_loop:                                      ;   do {
    WRITE_NUM_BYTES top, %%n                    ;     write($variable_regs, $n)
    add            dstq, dst_strideq            ;     dst += linesize
    dec        start_yq                         ;   } while (--start_y)
    jnz .top_loop                               ; }

    ; copy body pixels
.body_loop:                                     ; do {
    READ_NUM_BYTES  body, %%n                   ;   $variable_regs = read($n)
    WRITE_NUM_BYTES body, %%n                   ;   write($variable_regs, $n)
    add            dstq, dst_strideq            ;   dst += dst_stride
    add            srcq, src_strideq            ;   src += src_stride
    dec          end_yq                         ; } while (--end_y)
    jnz .body_loop

    ; copy bottom pixels
    test            bhq, bhq                    ; if (block_h) {
    jz .end
    sub            srcq, src_strideq            ;   src -= linesize (back to last body row)
    READ_NUM_BYTES  bottom, %%n                 ;   $variable_regs = read($n)
.bottom_loop:                                   ;   do {
    WRITE_NUM_BYTES bottom, %%n                 ;     write($variable_regs, $n)
    add            dstq, dst_strideq            ;     dst += linesize
    dec             bhq                         ;   } while (--bh)
    jnz .bottom_loop                            ; }

.end:
    RET
%assign %%n %%n+1
%endrep ; 1+%2-%1
%endmacro ; VERTICAL_EXTEND

INIT_MMX mmx
VERTICAL_EXTEND 1, 15

INIT_XMM sse
VERTICAL_EXTEND 16, 22

; left/right (horizontal) fast extend functions
; these are essentially identical to the vertical extend ones above,
; just left/right separated because number of pixels to extend is
; obviously not the same on both sides.

; READ_V_PIXEL n, mem
;   %1 = number of bytes that will be written, %2 = memory operand holding
;   the pixel to replicate.
;   avx2: broadcasts the byte straight into m0 (val is not used).
;   else: vald = pixel replicated into all 4 bytes; for %1 >= 8 the value is
;         additionally splatted across m0.
%macro READ_V_PIXEL 2
%if cpuflag(avx2)
    vpbroadcastb     m0, %2
%else
    movzx          vald, byte %2
    imul           vald, 0x01010101
%if %1 >= 8
    movd             m0, vald
%if mmsize == 16
    pshufd           m0, m0, q0000
%else
    punpckldq        m0, m0
%endif ; mmsize == 16
%endif ; %1 >= 8
%endif ; avx2
%endmacro ; READ_V_PIXEL

; WRITE_V_PIXEL n, dst
;   Stores %1 copies of the pixel prepared by READ_V_PIXEL to [%2].
;   %1 >= 8: vector stores from m0, finishing with overlapping tail stores
;            that end at byte %1 where possible.
;   %1 <  8: 4-byte stores from vald.
;   A final 2-byte remainder is written from valw, except on avx2 where val
;   is not loaded: there a 4-byte store from m0 placed 2 bytes earlier is
;   used instead, overlapping bytes that were already written.
%macro WRITE_V_PIXEL 2
%assign %%off 0

%if %1 >= 8

%rep %1/mmsize
    movu     [%2+%%off], m0
%assign %%off %%off+mmsize
%endrep ; %1/mmsize

%if mmsize == 16
%if %1-%%off >= 8
%if %1 > 16 && %1-%%off > 8
    movu     [%2+%1-16], m0
%assign %%off %1
%else
    movq     [%2+%%off], m0
%assign %%off %%off+8
%endif
%endif ; %1-%%off >= 8
%endif ; mmsize == 16

%if %1-%%off >= 4
%if %1 > 8 && %1-%%off > 4
    movq      [%2+%1-8], m0
%assign %%off %1
%else
    movd     [%2+%%off], m0
%assign %%off %%off+4
%endif
%endif ; %1-%%off >= 4

%else ; %1 < 8

%rep %1/4
    mov      [%2+%%off], vald
%assign %%off %%off+4
%endrep ; %1/4

%endif ; %1 >=/< 8

%if %1-%%off == 2
%if cpuflag(avx2)
    movd   [%2+%%off-2], m0
%else
    mov      [%2+%%off], valw
%endif ; avx2
%endif ; %1-%%off == 2
%endmacro ; WRITE_V_PIXEL

; H_EXTEND first, last
;   Emits one function emu_edge_hfix<n> for every n = %1, %1+2, ..., %2.
;   emu_edge_hfix<n>(dst, dst_stride, start_x, bh): for each of bh rows,
;   replicates the byte at dst[start_x] over the n bytes starting at dst.
;   The non-avx2 variants allocate one extra register (val) for the
;   replicated pixel; the avx2 variants keep it in m0 only.
%macro H_EXTEND 2
%assign %%n %1
%rep 1+(%2-%1)/2
%if cpuflag(avx2)
cglobal emu_edge_hfix %+ %%n, 4, 4, 1, dst, dst_stride, start_x, bh
%else
cglobal emu_edge_hfix %+ %%n, 4, 5, 1, dst, dst_stride, start_x, bh, val
%endif
.loop_y:                                        ; do {
    READ_V_PIXEL    %%n, [dstq+start_xq]        ;   $variable_regs = read($n)
    WRITE_V_PIXEL   %%n, dstq                   ;   write($variable_regs, $n)
    add            dstq, dst_strideq            ;   dst += dst_stride
    dec             bhq                         ; } while (--bh)
    jnz .loop_y
    RET
%assign %%n %%n+2
%endrep ; 1+(%2-%1)/2
%endmacro ; H_EXTEND

INIT_MMX mmx
H_EXTEND 2, 14

INIT_XMM sse2
H_EXTEND 16, 22

%if HAVE_AVX2_EXTERNAL
INIT_XMM avx2
H_EXTEND 8, 22
%endif

; prefetch(buf, stride, h)
;   Issues a prefetcht0 for the address of each of h lines, starting at buf
;   and stepping by stride. The loop is a do/while that exits once the
;   32-bit counter hd is no longer positive, so it executes at least once.
INIT_MMX mmxext
cglobal prefetch, 3, 3, 0, buf, stride, h
.loop:
    prefetcht0 [bufq]
    add      bufq, strideq
    dec        hd
    jg .loop
    REP_RET