;******************************************************************************
;* SIMD-optimized JPEG2000 DSP functions
;* Copyright (c) 2014 Nicolas Bertrand
;* Copyright (c) 2015 James Almer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

; Coefficients used by ict_float. Each one is replicated over 8 floats
; (32 bytes) so the same label can be loaded into an XMM or a YMM register.
pf_ict0: times 8 dd 1.402       ; scales src2 in the new src0 value
pf_ict1: times 8 dd 0.34413     ; scales src1 in the new src1 value
pf_ict2: times 8 dd 0.71414     ; scales src2 in the new src1 value
pf_ict3: times 8 dd 1.772       ; scales src1 in the new src2 value

SECTION .text

;***********************************************************************
; ff_ict_float_<opt>(float *src0, float *src1, float *src2, int csize)
;***********************************************************************
; In-place transform of three float planes. For every element i:
;     src0'[i] = src0[i] + pf_ict0 * src2[i]
;     src1'[i] = src0[i] - pf_ict1 * src1[i] - pf_ict2 * src2[i]
;     src2'[i] = src0[i] + pf_ict3 * src1[i]
;
; Macro argument %1 is the number of vector registers handed to cglobal.
;
; All loads and stores use movaps, so the three buffers have to be aligned
; to mmsize. The loop is bottom-tested and advances by mmsize bytes: it runs
; at least once and always handles whole vectors.
; NOTE(review): this implies csize > 0 and buffers padded to a multiple of
; mmsize/4 elements -- confirm against the callers.
%macro ICT_FLOAT 1
cglobal ict_float, 4, 4, %1, src0, src1, src2, csize
    ; Turn the element count into a byte length, move each pointer to the
    ; end of its buffer and walk a negative offset up towards zero. This
    ; lets a single add both advance the index and set the loop flags.
    shl  csized, 2
    add   src0q, csizeq
    add   src1q, csizeq
    add   src2q, csizeq
    neg  csizeq
    movaps   m6, [pf_ict0]
    movaps   m7, [pf_ict1]
    %define ICT0 m6
    %define ICT1 m7

    ; ICT2/ICT3 placement depends on how many vector registers are usable:
    ;  - x86-64: both live in registers. The SSE body uses m3 as scratch,
    ;    so there ICT3 goes to m9; the AVX/FMA bodies leave m3 free.
    ;  - x86-32: ICT2 is always a memory operand; ICT3 sits in m3 for the
    ;    AVX/FMA bodies and is a memory operand for SSE.
%if ARCH_X86_64
    movaps   m8, [pf_ict2]
    %define ICT2 m8
%if cpuflag(avx)
    movaps   m3, [pf_ict3]
    %define ICT3 m3
%else
    movaps   m9, [pf_ict3]
    %define ICT3 m9
%endif

%else ; ARCH_X86_32
    %define ICT2 [pf_ict2]
%if cpuflag(avx)
    movaps   m3, [pf_ict3]
    %define ICT3 m3
%else
    %define ICT3 [pf_ict3]
%endif

%endif ; ARCH

align 16
.loop:
    movaps   m0, [src0q+csizeq]
    movaps   m1, [src1q+csizeq]
    movaps   m2, [src2q+csizeq]

    ; Every variant below ends with:
    ;     m4 = new src0, m5 = new src1, m0 = new src2
%if cpuflag(fma4) || cpuflag(fma3)
%if cpuflag(fma4)
    fnmaddps m5, m1, ICT1, m0       ; m5 = m0 - m1 * ICT1
    fmaddps  m4, m2, ICT0, m0       ; m4 = m0 + m2 * ICT0
%else ; fma3
    ; The destination is first loaded with one of the sources so that the
    ; 4-operand form can be expressed with destructive FMA3 instructions.
    movaps   m5, m1
    movaps   m4, m2
    fnmaddps m5, m5, ICT1, m0       ; m5 = m0 - m1 * ICT1
    fmaddps  m4, m4, ICT0, m0       ; m4 = m0 + m2 * ICT0
%endif
    fmaddps  m0, m1, ICT3, m0       ; m0 = m0 + m1 * ICT3
    fnmaddps m5, m2, ICT2, m5       ; m5 = m5 - m2 * ICT2
%else ; non FMA
%if cpuflag(avx)
    mulps    m5, m1, ICT1
    mulps    m4, m2, ICT0
    mulps    m1, m1, ICT3
    mulps    m2, m2, ICT2
    subps    m5, m0, m5             ; m5 = m0 - m1 * ICT1
%else ; sse
    ; Two-operand SSE: copy the inputs first because mulps/subps overwrite
    ; their destination.
    movaps   m3, m1
    movaps   m4, m2
    movaps   m5, m0
    mulps    m3, ICT1
    mulps    m4, ICT0
    mulps    m1, ICT3
    mulps    m2, ICT2
    subps    m5, m3                 ; m5 = m0 - m1 * ICT1
%endif
    addps    m4, m4, m0             ; m4 = m0 + m2 * ICT0
    addps    m0, m0, m1             ; m0 = m0 + m1 * ICT3
    subps    m5, m5, m2             ; m5 = m5 - m2 * ICT2
%endif

    movaps   [src0q+csizeq], m4
    movaps   [src2q+csizeq], m0
    movaps   [src1q+csizeq], m5
    add  csizeq, mmsize             ; offset reaches 0 at the end of the data
    jl .loop
    REP_RET
%endmacro

; The SSE version needs m0-m9 (ICT3 in m9 on x86-64); the others use m0-m8.
INIT_XMM sse
ICT_FLOAT 10
INIT_YMM avx
ICT_FLOAT 9
%if HAVE_FMA4_EXTERNAL
INIT_XMM fma4
ICT_FLOAT 9
%endif
INIT_YMM fma3
ICT_FLOAT 9

;***************************************************************************
; ff_rct_int_<opt>(int32_t *src0, int32_t *src1, int32_t *src2, int csize)
;***************************************************************************
; In-place transform of three int32 planes. For every element i, with
;     t = src0[i] - ((src1[i] + src2[i]) >> 2)      (arithmetic shift)
; the results are:
;     src0'[i] = src2[i] + t
;     src1'[i] = t
;     src2'[i] = src1[i] + t
;
; All loads and stores use mova, so the three buffers have to be aligned to
; mmsize. The loop is bottom-tested and advances by mmsize bytes: it runs at
; least once and always handles whole vectors.
; NOTE(review): this implies csize > 0 and buffers padded to a multiple of
; mmsize/4 elements -- confirm against the callers.
%macro RCT_INT 0
cglobal rct_int, 4, 4, 4, src0, src1, src2, csize
    ; Same addressing scheme as ict_float: byte length, end pointers and a
    ; negative offset that counts up to zero.
    shl  csized, 2
    add   src0q, csizeq
    add   src1q, csizeq
    add   src2q, csizeq
    neg  csizeq

align 16
.loop:
    mova   m1, [src1q+csizeq]
    mova   m2, [src2q+csizeq]
    mova   m0, [src0q+csizeq]
    paddd  m3, m1, m2               ; m3 = src1 + src2
    psrad  m3, 2                    ; m3 = (src1 + src2) >> 2
    psubd  m0, m3                   ; m0 = t
    paddd  m1, m0                   ; m1 = src1 + t
    paddd  m2, m0                   ; m2 = src2 + t
    mova   [src1q+csizeq], m0
    mova   [src2q+csizeq], m1
    mova   [src0q+csizeq], m2
    add  csizeq, mmsize             ; offset reaches 0 at the end of the data
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
RCT_INT
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
RCT_INT
%endif