;************************************************************************
;* SIMD-optimized lossless video encoding functions
;* Copyright (c) 2000, 2001 Fabrice Bellard
;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
;*
;* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
;* Conversion to NASM format by Tiancheng "Timothy" Gu <timothygu99@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

cextern pb_80

SECTION .text

; void ff_diff_bytes(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
;                    intptr_t w);
;
; Computes dst[k] = src1[k] - src2[k] (byte-wise, wrapping) for k in [0, w).
; The function is assembled from the macros below: a vector main loop that
; handles 2 * regsize bytes per iteration, an optional narrower vector loop
; (only when mmsize > 16) and a byte-at-a-time tail loop.

; Emits the cglobal declaration for diff_bytes and sets up the names used by
; the other macros:
;   wq - the width argument
;   i  - scratch index register (alias of t0q)
; On x86-32 only the three pointers are declared as register arguments; w is
; copied from r3mp into r4 and register 3 is selected as the temporary.
; Otherwise all four arguments are declared and register 4 is the temporary.
%macro DIFF_BYTES_PROLOGUE 0
%if ARCH_X86_32
cglobal diff_bytes, 3,5,2, dst, src1, src2
%define wq r4q
    DECLARE_REG_TMP 3
    mov               wq, r3mp
%else
cglobal diff_bytes, 4,5,2, dst, src1, src2, w
    DECLARE_REG_TMP 4
%endif ; ARCH_X86_32
%define i t0q
%endmacro

; Prepares a vector loop that consumes 2 * regsize bytes per iteration.
; Arguments:
;   first  - label to jump to if w < 2 * regsize (no full iteration to run)
;   second - label to jump to if w < 0
; On fall-through the three pointers have been advanced past the part the
; loop will process and i holds the negated byte count, so the loop can index
; with [ptr + i] and count i up towards zero.
%macro DIFF_BYTES_LOOP_PREP 2
    mov                i, wq
    and                i, -2 * regsize   ; round w down to a multiple of 2 * regsize
    js                %2                 ; sign set: w is negative
    jz                %1                 ; zero: fewer than 2 * regsize bytes
    add             dstq, i
    add            src1q, i
    add            src2q, i
    neg                i
%endmacro

; One loop iteration: subtracts 2 * regsize bytes of src2 from src1 and stores
; the result to dst, all addressed relative to i.
; Arguments:
;   first  - mov suffix used for src1q loads (a = aligned, u = unaligned)
;   second - mov suffix used for dstq stores
;   third  - first vector register
;   fourth - second vector register
; src2q is never assumed to be aligned: it is either a direct psubb memory
; operand (mmsize != 16) or loaded with movu (mmsize == 16).
%macro DIFF_BYTES_LOOP_CORE 4
%if mmsize != 16
    mov%1             %3, [src1q + i]
    mov%1             %4, [src1q + i + regsize]
    psubb             %3, [src2q + i]
    psubb             %4, [src2q + i + regsize]
    mov%2           [dstq + i], %3
    mov%2 [regsize + dstq + i], %4
%else
    ; SSE enforces alignment of psubb operand
    mov%1             %3, [src1q + i]
    movu              %4, [src2q + i]
    psubb             %3, %4
    mov%2     [dstq + i], %3
    mov%1             %3, [src1q + i + regsize]
    movu              %4, [src2q + i + regsize]
    psubb             %3, %4
    mov%2 [regsize + dstq + i], %3
%endif
%endmacro

; Emits one complete variant of the function body (main loop, optional
; narrower loop, byte tail loop and return). All labels are suffixed with the
; two macro arguments so that several variants can coexist in one function.
; Expects DIFF_BYTES_LOOP_PREP to have been run for the main loop already.
%macro DIFF_BYTES_BODY 2 ; mov type used for src1q, for dstq
    %define regsize mmsize
.loop_%1%2:
    DIFF_BYTES_LOOP_CORE %1, %2, m0, m1
    add                i, 2 * regsize
    jl        .loop_%1%2                 ; i counts up from a negative offset to 0
.skip_main_%1%2:
    and               wq, 2 * regsize - 1 ; bytes left over after the main loop
    jz         .end_%1%2
%if mmsize > 16
    ; fall back to narrower xmm
    %define regsize (mmsize / 2)
    ; NOTE(review): the two labels below are hard-coded to the "aa" variant
    ; rather than using this variant's own suffix. The code at those labels
    ; (byte tail loop setup and return) is the same in every variant, so this
    ; looks equivalent, but it requires the a, a body to be instantiated in
    ; the same function -- confirm this is intentional.
    DIFF_BYTES_LOOP_PREP .setup_loop_gpr_aa, .end_aa
.loop2_%1%2:
    DIFF_BYTES_LOOP_CORE %1, %2, xm0, xm1
    add                i, 2 * regsize
    jl       .loop2_%1%2
.setup_loop_gpr_%1%2:
    and               wq, 2 * regsize - 1 ; bytes left over after the xmm loop
    jz         .end_%1%2
%endif
    ; Byte tail: same negative-index scheme as the vector loops, with wq
    ; itself used as the index.
    add             dstq, wq
    add            src1q, wq
    add            src2q, wq
    neg               wq
.loop_gpr_%1%2:
    ; t0 is the register behind i; i is no longer needed at this point
    mov              t0b, [src1q + wq]
    sub              t0b, [src2q + wq]
    mov      [dstq + wq], t0b
    inc               wq
    jl     .loop_gpr_%1%2
.end_%1%2:
    REP_RET
%endmacro

; SSE2 version. Dispatches on pointer alignment:
;   dstq misaligned                -> .loop_uu (unaligned loads and stores)
;   dstq aligned, src1q misaligned -> .loop_ua (unaligned loads, aligned stores)
;   both aligned                   -> falls through into the a, a body
; Each body ends in its own return, so the bodies do not fall into each other.
INIT_XMM sse2
DIFF_BYTES_PROLOGUE
    %define regsize mmsize
    DIFF_BYTES_LOOP_PREP .skip_main_aa, .end_aa
    test            dstq, regsize - 1
    jnz         .loop_uu
    test           src1q, regsize - 1
    jnz         .loop_ua
    DIFF_BYTES_BODY    a, a
    DIFF_BYTES_BODY    u, a
    DIFF_BYTES_BODY    u, u
%undef i

; AVX2 version, only assembled when HAVE_AVX2_EXTERNAL is set. Same structure
; as the SSE2 version, except that short or negative widths are routed to the
; labels of the u, u body.
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
DIFF_BYTES_PROLOGUE
    %define regsize mmsize
    ; Directly using unaligned SSE2 version is marginally faster than
    ; branching based on arguments.
    ; NOTE(review): the code below still branches on dstq/src1q alignment for
    ; the main loop; the remark above presumably refers only to the
    ; short-width path taken by the next line -- confirm.
    DIFF_BYTES_LOOP_PREP .skip_main_uu, .end_uu
    test            dstq, regsize - 1
    jnz         .loop_uu
    test           src1q, regsize - 1
    jnz         .loop_ua
    DIFF_BYTES_BODY    a, a
    DIFF_BYTES_BODY    u, a
    DIFF_BYTES_BODY    u, u
%undef i
%endif


;--------------------------------------------------------------------------------------------------
;void sub_left_predict(uint8_t *dst, uint8_t *src, ptrdiff_t stride, ptrdiff_t width, int height)
;--------------------------------------------------------------------------------------------------
; For every row, writes to dst the byte-wise difference between each source
; byte and the byte to its left. The "left" value for the first byte of a row
; is carried over in m1: it starts as byte 15 of pb_80 and, after each row, is
; replaced by a byte picked from the last vectors loaded for that row.
; src advances by stride per row; dst advances by width per row.
;
; NOTE(review): the inner loop always loads and stores 32 bytes per iteration,
; so when width is not a multiple of 32 it touches bytes past the end of the
; row in both src and dst. Callers presumably provide suitably padded
; buffers -- confirm.

INIT_XMM avx
cglobal sub_left_predict, 5,6,5, dst, src, stride, width, height, x
    mova             m1, [pb_80] ; prev initial
    ; Point dst/src at the end of the row and index with a negative offset.
    add            dstq, widthq
    add            srcq, widthq
    lea              xd, [widthq-1]
    neg          widthq
    and              xd, 15              ; (width - 1) & 15
    ; m4 = pb_80 with byte 15 replaced by (width - 1) & 15; used as the pshufb
    ; mask at the end of every row.
    pinsrb           m4, m1, xd, 15
    mov              xq, widthq          ; keep -width to restart each row

    .loop:
        movu             m0, [srcq + widthq]
        palignr          m2, m0, m1, 15  ; m2 = last byte of m1, then m0[0..14]
        movu             m1, [srcq + widthq + 16]
        palignr          m3, m1, m0, 15  ; m3 = last byte of m0, then m1[0..14]
        psubb            m2, m0, m2      ; current - left
        psubb            m3, m1, m3
        movu [dstq + widthq], m2
        movu [dstq + widthq + 16], m3
        add          widthq, 2 * 16
        jl .loop

    add            srcq, strideq
    sub            dstq, xq ; dst + width
    ; Choose which vector of the final iteration supplies the carried-over
    ; byte: m0 when bit 4 of xd (the low bits of -width) is set, m1 otherwise.
    test             xd, 16
    jz .mod32
    mova             m1, m0

.mod32:
    ; Byte 15 of the mask selects byte (width - 1) & 15 of m1 into byte 15 of
    ; the result; the remaining mask bytes come from pb_80 (presumably 0x80,
    ; which pshufb turns into zero).
    pshufb           m1, m4
    mov          widthq, xq
    dec         heightd
    jg .loop
    RET