;******************************************************************************
;* FFT transform with SSE/AVX optimizations
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2011 Vitor Sessak
;*
;* This algorithm (though not any of the implementation details) is
;* based on libdjbfft by D. J. Bernstein.
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; These functions are not individually interchangeable with the C versions.
; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results
; in blocks as convenient to the vector size.
; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively)

%include "libavutil/x86/x86util.asm"

; Inside the struc below, "pointer" reserves one pointer-sized slot.
%if ARCH_X86_64
%define pointer resq
%else
%define pointer resd
%endif

; Field offsets used by the functions in this file when they receive an
; FFTContext pointer as their first argument.
; NOTE(review): presumably has to mirror the C-side FFTContext layout --
; verify against the C header whenever either side changes.
struc FFTContext
    .nbits:    resd 1
    .reverse:  resd 1
    .revtab:   pointer 1
    .tmpbuf:   pointer 1
    .mdctsize: resd 1
    .mdctbits: resd 1
    .tcos:     pointer 1
    .tsin:     pointer 1
    .fftperm:  pointer 1
    .fftcalc:  pointer 1
    .imdctcalc:pointer 1
    .imdcthalf:pointer 1
endstruc

SECTION_RODATA 32

%define M_SQRT1_2 0.70710678118654752440
%define M_COS_PI_1_8 0.923879532511287
%define M_COS_PI_3_8 0.38268343236509

; Twiddle factors multiplied in by fft16_avx.
ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8
ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8

; sqrt(1/2) broadcast, and the same value with a -,+,+,- sign pattern (T8_SSE).
ps_root2: times 8 dd M_SQRT1_2
ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
; Sign-bit masks (1<<31 flips the sign of that lane when XORed in),
; vpermilps index vectors and mixed scale factors used by the macros below.
ps_p1p1m1p1: dd 0, 0, 1<<31, 0, 0, 0, 1<<31, 0

perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01
perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03
ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31
ps_m1p1: dd 1<<31, 0

cextern ps_neg

; Import the external twiddle tables cos_16, cos_32, ... (14 tables, each
; index double the previous one).
%assign i 16
%rep 14
cextern cos_ %+ i
%assign i i<<1
%endrep

; From here on "pointer" emits an initialised pointer-sized datum
; (used for the dispatch tables) instead of reserving struc space.
%if ARCH_X86_64
    %define pointer dq
%else
    %define pointer dd
%endif

; IF0 swallows its argument, IF1 emits it: "IF%1 insn" makes an instruction
; conditional on a 0/1 macro parameter.
%macro IF0 1+
%endmacro
%macro IF1 1+
    %1
%endmacro

SECTION .text

; in:  %1 = {r0,i0,r2,i2,r4,i4,r6,i6}
;      %2 = {r1,i1,r3,i3,r5,i5,r7,i7}
;      %3, %4, %5 tmp
; out: %1 = {r0,r1,r2,r3,i0,i1,i2,i3}
;      %2 = {r4,r5,r6,r7,i4,i5,i6,i7}
%macro T8_AVX 5
    vsubps     %5, %1, %2       ; v  = %1 - %2
    vaddps     %3, %1, %2       ; w  = %1 + %2
    vmulps     %2, %5, [ps_p1p1m1p1root2]  ; v *= vals1
    vpermilps  %2, %2, [perm1]
    vblendps   %1, %2, %3, 0x33 ; q = {w1,w2,v4,v2,w5,w6,v7,v6}
    vshufps    %5, %3, %2, 0x4e ; r = {w3,w4,v1,v3,w7,w8,v8,v5}
    vsubps     %4, %5, %1       ; s = r - q
    vaddps     %1, %5, %1       ; u = r + q
    vpermilps  %1, %1, [perm2]  ; k  = {u1,u2,u3,u4,u6,u5,u7,u8}
    vshufps    %5, %4, %1, 0xbb
    vshufps    %3, %4, %1, 0xee
    vperm2f128 %3, %3, %5, 0x13
    vxorps     %4, %4, [ps_m1m1p1m1p1m1m1m1]  ; s *= {-1,-1,1,-1,1,-1,-1,-1}
    vshufps    %2, %1, %4, 0xdd
    vshufps    %1, %1, %4, 0x88
    vperm2f128 %4, %2, %1, 0x02 ; v  = {k1,k3,s1,s3,k2,k4,s2,s4}
    vperm2f128 %1, %1, %2, 0x13 ; w  = {k6,k8,s6,s8,k5,k7,s5,s7}
    vsubps     %5, %1, %3
    vblendps   %1, %5, %1, 0x55 ; w -= {0,s7,0,k7,0,s8,0,k8}
    vsubps     %2, %4, %1       ; %2 = v - w
    vaddps     %1, %4, %1       ; %1 = v + w
%endmacro

; In SSE mode do one fft4 transform
; in:  %1={r0,i0,r2,i2} %2={r1,i1,r3,i3}
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
;
; In AVX mode do two fft4 transforms
; in:  %1={r0,i0,r2,i2,r4,i4,r6,i6} %2={r1,i1,r3,i3,r5,i5,r7,i7}
; out: %1={r0,r1,r2,r3,r4,r5,r6,r7} %2={i0,i1,i2,i3,i4,i5,i6,i7}
;
; %3 is a temporary.
%macro T4_SSE 3
    subps    %3, %1, %2       ; {t3,t4,-t8,t7}
    addps    %1, %1, %2       ; {t1,t2,t6,t5}
    xorps    %3, %3, [ps_p1p1m1p1]
    shufps   %2, %1, %3, 0xbe ; {t6,t5,t7,t8}
    shufps   %1, %1, %3, 0x44 ; {t1,t2,t3,t4}
    subps    %3, %1, %2       ; {r2,i2,r3,i3}
    addps    %1, %1, %2       ; {r0,i0,r1,i1}
    shufps   %2, %1, %3, 0xdd ; {i0,i1,i2,i3}
    shufps   %1, %1, %3, 0x88 ; {r0,r1,r2,r3}
%endmacro

; In SSE mode do one FFT8
; in:  %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7}
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,r5,r6,r7} %4={i4,i5,i6,i7}
;
; In AVX mode do two FFT8
; in:  %1={r0,i0,r2,i2,r8, i8, r10,i10} %2={r1,i1,r3,i3,r9, i9, r11,i11}
;      %3={r4,i4,r6,i6,r12,i12,r14,i14} %4={r5,i5,r7,i7,r13,i13,r15,i15}
; out: %1={r0,r1,r2,r3,r8, r9, r10,r11} %2={i0,i1,i2,i3,i8, i9, i10,i11}
;      %3={r4,r5,r6,r7,r12,r13,r14,r15} %4={i4,i5,i6,i7,i12,i13,i14,i15}
;
; %5 and %6 are temporaries.
%macro T8_SSE 6
    addps    %6, %3, %4       ; {t1,t2,t3,t4}
    subps    %3, %3, %4       ; {r5,i5,r7,i7}
    shufps   %4, %3, %3, 0xb1 ; {i5,r5,i7,r7}
    mulps    %3, %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
    mulps    %4, %4, [ps_root2]
    addps    %3, %3, %4       ; {t8,t7,ta,t9}
    shufps   %4, %6, %3, 0x9c ; {t1,t4,t7,ta}
    shufps   %6, %6, %3, 0x36 ; {t3,t2,t9,t8}
    subps    %3, %6, %4       ; {t6,t5,tc,tb}
    addps    %6, %6, %4       ; {t1,t2,t9,ta}
    shufps   %5, %6, %3, 0x8d ; {t2,ta,t6,tc}
    shufps   %6, %6, %3, 0xd8 ; {t1,t9,t5,tb}
    subps    %3, %1, %6       ; {r4,r5,r6,r7}
    addps    %1, %1, %6       ; {r0,r1,r2,r3}
    subps    %4, %2, %5       ; {i4,i5,i6,i7}
    addps    %2, %2, %5       ; {i0,i1,i2,i3}
%endmacro

; Interleave the elements of %2 and %1 with unpcklps/unpckhps and store the
; result to the two consecutive vector slots %4(%5) and %4(%5+1).
; %3 is a temporary; %4 is the name of an addressing macro (Z or Z2).
; The AVX path writes four 128-bit halves, using the matching "<%4>H" macro
; for the upper half of each slot.
%macro INTERL 5
%if cpuflag(avx)
    vunpckhps      %3, %2, %1
    vunpcklps      %2, %2, %1
    vextractf128   %4(%5), %2, 0
    vextractf128  %4 %+ H(%5), %3, 0
    vextractf128   %4(%5 + 1), %2, 1
    vextractf128  %4 %+ H(%5 + 1), %3, 1
%elif cpuflag(sse)
    mova     %3, %2
    unpcklps %2, %1
    unpckhps %3, %1
    mova  %4(%5), %2
    mova  %4(%5+1), %3
%endif
%endmacro

; scheduled for cpu-bound sizes
; %1: 1 = load m4-m7 from Z(4), Z(5), Z2(6), Z2(7); 0 = caller preloaded them
; %2: wre operand, %3: wim operand
; Results are written back to Z(0)..Z(5), Z2(6), Z2(7); clobbers m0-m7.
%macro PASS_SMALL 3 ; (to load m4-m7), wre, wim
IF%1 mova    m4, Z(4)
IF%1 mova    m5, Z(5)
    mova     m0, %2 ; wre
    mova     m1, %3 ; wim
    mulps    m2, m4, m0 ; r2*wre
IF%1 mova    m6, Z2(6)
    mulps    m3, m5, m1 ; i2*wim
IF%1 mova    m7, Z2(7)
    mulps    m4, m4, m1 ; r2*wim
    mulps    m5, m5, m0 ; i2*wre
    addps    m2, m2, m3 ; r2*wre + i2*wim
    mulps    m3, m1, m7 ; i3*wim
    subps    m5, m5, m4 ; i2*wre - r2*wim
    mulps    m1, m1, m6 ; r3*wim
    mulps    m4, m0, m6 ; r3*wre
    mulps    m0, m0, m7 ; i3*wre
    subps    m4, m4, m3 ; r3*wre - i3*wim
    mova     m3, Z(0)
    addps    m0, m0, m1 ; i3*wre + r3*wim
    subps    m1, m4, m2 ; t3
    addps    m4, m4, m2 ; t5
    subps    m3, m3, m4 ; r2
    addps    m4, m4, Z(0) ; r0
    mova     m6, Z(2)
    mova   Z(4), m3
    mova   Z(0), m4
    subps    m3, m5, m0 ; t4
    subps    m4, m6, m3 ; r3
    addps    m3, m3, m6 ; r1
    mova  Z2(6), m4
    mova   Z(2), m3
    mova     m2, Z(3)
    addps    m3, m5, m0 ; t6
    subps    m2, m2, m1 ; i3
    mova     m7, Z(1)
    addps    m1, m1, Z(3) ; i1
    mova  Z2(7), m2
    mova   Z(3), m1
    subps    m4, m7, m3 ; i2
    addps    m3, m3, m7 ; i0
    mova   Z(5), m4
    mova   Z(1), m3
%endmacro

; scheduled to avoid store->load aliasing
; %1: 1 = store the results in split re/im blocks,
;     0 = interleave re/im with INTERL before storing.
; Twiddles are read from [wq] (wre) and [wq+o1q] (wim); clobbers m0-m7.
%macro PASS_BIG 1 ; (!interleave)
    mova     m4, Z(4) ; r2
    mova     m5, Z(5) ; i2
    mova     m0, [wq] ; wre
    mova     m1, [wq+o1q] ; wim
    mulps    m2, m4, m0 ; r2*wre
    mova     m6, Z2(6) ; r3
    mulps    m3, m5, m1 ; i2*wim
    mova     m7, Z2(7) ; i3
    mulps    m4, m4, m1 ; r2*wim
    mulps    m5, m5, m0 ; i2*wre
    addps    m2, m2, m3 ; r2*wre + i2*wim
    mulps    m3, m1, m7 ; i3*wim
    mulps    m1, m1, m6 ; r3*wim
    subps    m5, m5, m4 ; i2*wre - r2*wim
    mulps    m4, m0, m6 ; r3*wre
    mulps    m0, m0, m7 ; i3*wre
    subps    m4, m4, m3 ; r3*wre - i3*wim
    mova     m3, Z(0)
    addps    m0, m0, m1 ; i3*wre + r3*wim
    subps    m1, m4, m2 ; t3
    addps    m4, m4, m2 ; t5
    subps    m3, m3, m4 ; r2
    addps    m4, m4, Z(0) ; r0
    mova     m6, Z(2)
    mova   Z(4), m3
    mova   Z(0), m4
    subps    m3, m5, m0 ; t4
    subps    m4, m6, m3 ; r3
    addps    m3, m3, m6 ; r1
IF%1 mova Z2(6), m4
IF%1 mova  Z(2), m3
    mova     m2, Z(3)
    addps    m5, m5, m0 ; t6
    subps    m2, m2, m1 ; i3
    mova     m7, Z(1)
    addps    m1, m1, Z(3) ; i1
IF%1 mova Z2(7), m2
IF%1 mova  Z(3), m1
    subps    m6, m7, m5 ; i2
    addps    m5, m5, m7 ; i0
IF%1 mova  Z(5), m6
IF%1 mova  Z(1), m5
%if %1==0
    INTERL m1, m3, m7, Z, 2
    INTERL m2, m4, m0, Z2, 6

    mova     m1, Z(0)
    mova     m2, Z(4)

    INTERL m5, m1, m3, Z, 0
    INTERL m6, m2, m7, Z, 4
%endif
%endmacro

; Addressing for the fixed-size transforms below: vector slot x of the
; buffer at r0, laid out contiguously. ZH(x) is the upper half of slot x.
%define Z(x) [r0+mmsize*x]
%define Z2(x) [r0+mmsize*x]
%define ZH(x) [r0+mmsize*x+mmsize/2]

INIT_YMM avx

%if HAVE_AVX_EXTERNAL
; 8-point transform, in place on the two ymm slots at r0.
; Clobbers m0-m4.
align 16
fft8_avx:
    mova      m0, Z(0)
    mova      m1, Z(1)
    T8_AVX    m0, m1, m2, m3, m4
    mova      Z(0), m0
    mova      Z(1), m1
    ret


; 16-point transform, in place on the four ymm slots at r0.
; Slots 2-3 go through T4_SSE (two fft4 in AVX mode), slots 0-1 through
; T8_AVX; the halves are then combined using ps_cos16_1 / ps_cos16_2 and
; written back as 128-bit halves. Clobbers m0-m7.
align 16
fft16_avx:
    mova       m2, Z(2)
    mova       m3, Z(3)
    T4_SSE     m2, m3, m7

    mova       m0, Z(0)
    mova       m1, Z(1)
    T8_AVX     m0, m1, m4, m5, m7

    mova       m4, [ps_cos16_1]
    mova       m5, [ps_cos16_2]
    vmulps     m6, m2, m4
    vmulps     m7, m3, m5
    vaddps     m7, m7, m6
    vmulps     m2, m2, m5
    vmulps     m3, m3, m4
    vsubps     m3, m3, m2
    vblendps   m2, m7, m3, 0xf0
    vperm2f128 m3, m7, m3, 0x21
    vaddps     m4, m2, m3
    vsubps     m2, m3, m2
    vperm2f128 m2, m2, m2, 0x01
    vsubps     m3, m1, m2
    vaddps     m1, m1, m2
    vsubps     m5, m0, m4
    vaddps     m0, m0, m4
    vextractf128   Z(0), m0, 0
    vextractf128  ZH(0), m1, 0
    vextractf128   Z(1), m0, 1
    vextractf128  ZH(1), m1, 1
    vextractf128   Z(2), m5, 0
    vextractf128  ZH(2), m3, 0
    vextractf128   Z(3), m5, 1
    vextractf128  ZH(3), m3, 1
    ret

; 32-point transform, in place on the eight ymm slots at r0.
; Runs fft16_avx on slots 0-3, transforms slots 4-7 in registers, then
; merges everything with PASS_SMALL using the cos_32 table. Clobbers m0-m7.
align 16
fft32_avx:
    call fft16_avx

    mova m0, Z(4)
    mova m1, Z(5)

    T4_SSE      m0, m1, m4

    mova m2, Z(6)
    mova m3, Z(7)

    T8_SSE      m0, m1, m2, m3, m4, m6
    ; m0={r0,r1,r2,r3,r8, r9, r10,r11} m1={i0,i1,i2,i3,i8, i9, i10,i11}
    ; m2={r4,r5,r6,r7,r12,r13,r14,r15} m3={i4,i5,i6,i7,i12,i13,i14,i15}

    vperm2f128  m4, m0, m2, 0x20
    vperm2f128  m5, m1, m3, 0x20
    vperm2f128  m6, m0, m2, 0x31
    vperm2f128  m7, m1, m3, 0x31

    ; m4-m7 are already in registers, hence the 0 (no reload from memory)
    PASS_SMALL 0, [cos_32], [cos_32+32]

    ret

; fft32_avx followed by an in-place re/im interleave of the whole buffer.
; The loop handles two ymm slots per iteration and counts r2d down from 32
; in steps of mmsize/4. Note that r0 is advanced past the buffer on return.
fft32_interleave_avx:
    call fft32_avx
    mov r2d, 32
.deint_loop:
    mova     m2, Z(0)
    mova     m3, Z(1)
    vunpcklps      m0, m2, m3
    vunpckhps      m1, m2, m3
    vextractf128   Z(0), m0, 0
    vextractf128  ZH(0), m1, 0
    vextractf128   Z(1), m0, 1
    vextractf128  ZH(1), m1, 1
    add r0, mmsize*2
    sub r2d, mmsize/4
    jg .deint_loop
    ret

%endif

INIT_XMM sse

; 4-point transform, in place on the two xmm slots at r0.
; The same code is exported under both the _avx and _sse names.
align 16
fft4_avx:
fft4_sse:
    mova     m0, Z(0)
    mova     m1, Z(1)
    T4_SSE   m0, m1, m2
    mova   Z(0), m0
    mova   Z(1), m1
    ret

; 8-point transform, in place on the four xmm slots at r0. Clobbers m0-m5.
align 16
fft8_sse:
    mova     m0, Z(0)
    mova     m1, Z(1)
    T4_SSE   m0, m1, m2
    mova     m2, Z(2)
    mova     m3, Z(3)
    T8_SSE   m0, m1, m2, m3, m4, m5
    mova   Z(0), m0
    mova   Z(1), m1
    mova   Z(2), m2
    mova   Z(3), m3
    ret

; 16-point transform, in place on the eight xmm slots at r0.
; Slots 0-3 get an fft8 (T4_SSE + T8_SSE), slots 4-5 and 6-7 an fft4 each;
; PASS_SMALL then merges them using the cos_16 table. Clobbers m0-m7.
align 16
fft16_sse:
    mova     m0, Z(0)
    mova     m1, Z(1)
    T4_SSE   m0, m1, m2
    mova     m2, Z(2)
    mova     m3, Z(3)
    T8_SSE   m0, m1, m2, m3, m4, m5
    mova     m4, Z(4)
    mova     m5, Z(5)
    mova   Z(0), m0
    mova   Z(1), m1
    mova   Z(2), m2
    mova   Z(3), m3
    T4_SSE   m4, m5, m6
    mova     m6, Z2(6)
    mova     m7, Z2(7)
    T4_SSE   m6, m7, m0
    ; m4-m7 are already in registers, hence the 0 (no reload from memory)
    PASS_SMALL 0, [cos_16], [cos_16+16]
    ret


; Addressing for the looped passes: slots are no longer contiguous but
; strided by o1q (slots 0-5) with slots 6/7 at offset o3q, all relative to zcq.
%define Z(x) [zcq + o1q*(x&6) + mmsize*(x&1)]
%define Z2(x) [zcq + o3q + mmsize*(x&1)]
%define ZH(x) [zcq + o1q*(x&6) + mmsize*(x&1) + mmsize/2]
%define Z2H(x) [zcq + o3q + mmsize*(x&1) + mmsize/2]

; Emit a pass function named %1 whose loop body is the payload %2.
; Register arguments: zc = data pointer, w = twiddle pointer, n = count.
; o1 = n*8 and o3 = n*48 are the byte strides consumed by Z()/Z2() above.
; Each iteration advances zc by two vectors and w by one vector, and
; decrements n by mmsize/8.
%macro DECL_PASS 2+ ; name, payload
align 16
%1:
DEFINE_ARGS zc, w, n, o1, o3
    lea o3q, [nq*3]
    lea o1q, [nq*8]
    shl o3q, 4
.loop:
    %2
    add zcq, mmsize*2
    add  wq, mmsize
    sub  nd, mmsize/8
    jg .loop
    rep ret
%endmacro

; Call entry (%2 - 2) of the table dispatch_tab%1, i.e. the table is indexed
; by nbits with the first entry corresponding to nbits == 2.
; In PIC builds the table holds section-relative offsets, so the section
; base ($$) is added back before the call.
%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
    lea r2, [dispatch_tab%1]
    mov r2, [r2 + (%2q-2)*gprsize]
%ifdef PIC
    lea r3, [$$]
    add r2, r3
%endif
    call r2
%endmacro ; FFT_DISPATCH

INIT_YMM avx

%if HAVE_AVX_EXTERNAL
DECL_PASS pass_avx, PASS_BIG 1
DECL_PASS pass_interleave_avx, PASS_BIG 0

; fft_calc (AVX): r0 = FFTContext pointer, r1 = data pointer.
; Reads nbits from the context, moves the data pointer into r0 and nbits
; into r1, then dispatches through dispatch_tab_interleave_avx.
cglobal fft_calc, 2,5,8
    mov     r3d, [r0 + FFTContext.nbits]
    mov     r0, r1
    mov     r1, r3
    FFT_DISPATCH _interleave %+ SUFFIX, r1
    REP_RET

%endif

INIT_XMM sse

DECL_PASS pass_sse, PASS_BIG 1
DECL_PASS pass_interleave_sse, PASS_BIG 0

; fft_calc (SSE): r0 = FFTContext pointer, r1 = data pointer.
; Dispatches on nbits through dispatch_tab_interleave_sse. The table entries
; for nbits 2..4 are the plain (non-interleaving) fft4/fft8/fft16 routines,
; so for nbits <= 3+(mmsize/16) the re/im interleave is done here afterwards.
; The data pointer and nbits are kept on the stack across the dispatch call.
INIT_XMM sse
cglobal fft_calc, 2,5,8
    mov     r3d, [r0 + FFTContext.nbits]
    PUSH    r1
    PUSH    r3
    mov     r0, r1
    mov     r1, r3
    FFT_DISPATCH _interleave %+ SUFFIX, r1
    POP     rcx
    POP     r4
    cmp     rcx, 3+(mmsize/16)
    jg      .end
    mov     r2, -1
    add     rcx, 3
    shl     r2, cl                  ; r2 = -(8 << nbits): negative byte count
    sub     r4, r2                  ; r4 = end of buffer; r2 counts up to 0
.loop:
    movaps   xmm0, [r4 + r2]
    movaps   xmm1, xmm0
    unpcklps xmm0, [r4 + r2 + 16]
    unpckhps xmm1, [r4 + r2 + 16]
    movaps   [r4 + r2],      xmm0
    movaps   [r4 + r2 + 16], xmm1
    add      r2, mmsize*2
    jl       .loop
.end:
    REP_RET

; fft_permute: r0 = FFTContext pointer, r1 = data pointer.
; First loop: scatter the (1 << nbits) 8-byte elements of the data into
; tmpbuf, element i going to index revtab[i] (revtab entries are 16-bit);
; two elements are handled per iteration.
; Second loop: copy tmpbuf back over the data, 32 bytes per iteration.
cglobal fft_permute, 2,7,1
    mov     r4,  [r0 + FFTContext.revtab]
    mov     r5,  [r0 + FFTContext.tmpbuf]
    mov     ecx, [r0 + FFTContext.nbits]
    mov     r2, 1
    shl     r2, cl                  ; r2 = number of elements
    xor     r0, r0                  ; r0 = element index
%if ARCH_X86_32
    mov     r1, r1m
%endif
.loop:
    movaps  xmm0, [r1 + 8*r0]
    movzx   r6, word [r4 + 2*r0]
    movzx   r3, word [r4 + 2*r0 + 2]
    movlps  [r5 + 8*r6], xmm0
    movhps  [r5 + 8*r3], xmm0
    add     r0, 2
    cmp     r0, r2
    jl      .loop
    shl     r2, 3                   ; element count -> byte count
    add     r1, r2
    add     r5, r2
    neg     r2                      ; index both buffers from their ends
; nbits >= 2 (FFT4) and sizeof(FFTComplex)=8 => at least 32B
.loopcopy:
    movaps  xmm0, [r5 + r2]
    movaps  xmm1, [r5 + r2 + 16]
    movaps  [r1 + r2], xmm0
    movaps  [r1 + r2 + 16], xmm1
    add     r2, 32
    jl      .loopcopy
    REP_RET

; imdct_calc: r0 = FFTContext pointer, r1 = output, r2 = input.
; Calls the context's imdcthalf function pointer with the output pointer
; advanced by mdctsize bytes, then fills the regions on either side of that
; half-size result: each vector is copied with its four floats reversed
; (shufps 0x1b), and the copy written through [r1 + r2] is additionally
; XORed with ps_neg.
INIT_XMM sse
cglobal imdct_calc, 3,5,3
    mov     r3d, [r0 + FFTContext.mdctsize]
    mov     r4,  [r0 + FFTContext.imdcthalf]
    add     r1,  r3
    PUSH    r3
    PUSH    r1
%if ARCH_X86_32
    push    r2
    push    r1
    push    r0
%else
    sub     rsp, 8+32*WIN64 ; allocate win64 shadow space
%endif
    call    r4
%if ARCH_X86_32
    add     esp, 12
%else
    add     rsp, 8+32*WIN64
%endif
    POP     r1
    POP     r3
    lea     r0, [r1 + 2*r3]
    mov     r2, r3
    sub     r3, mmsize
    neg     r2
    mova    m2, [ps_neg]
.loop:
    mova    m0, [r1 + r3]
    mova    m1, [r0 + r2]
    shufps  m0, m0, 0x1b
    shufps  m1, m1, 0x1b
    xorps   m0, m2
    mova [r0 + r3], m1
    mova [r1 + r2], m0
    sub     r3, mmsize
    add     r2, mmsize
    jl      .loop
    REP_RET

; In PIC builds the dispatch tables store offsets relative to the section
; start; FFT_DISPATCH adds the base back at run time.
%ifdef PIC
%define SECTION_REL - $$
%else
%define SECTION_REL
%endif

; Generate the recursive fftN routines and the matching dispatch table.
; %1 = log2 of the first generated size, %2 = optional name suffix
; (e.g. _interleave), which selects the pass function jumped to.
; Sizes from 1<<%1 up to 1<<17 are emitted (18-%1 iterations). Each fftN
; runs an N/2 transform followed by two N/4 transforms on successive parts
; of the buffer, rewinds r0, and tail-jumps into the pass with
; r1 = cos_N and r2d = N/8.
; The table starts with fft4 and fft8, followed by the hand-written fft16
; (and fft32 when %1 >= 6) routines, then the generated ones.
%macro DECL_FFT 1-2 ; nbits, suffix
%ifidn %0, 1
%xdefine fullsuffix SUFFIX
%else
%xdefine fullsuffix %2 %+ SUFFIX
%endif
%xdefine list_of_fft fft4 %+ SUFFIX SECTION_REL, fft8 %+ SUFFIX SECTION_REL
%if %1>=5
%xdefine list_of_fft list_of_fft, fft16 %+ SUFFIX SECTION_REL
%endif
%if %1>=6
%xdefine list_of_fft list_of_fft, fft32 %+ fullsuffix SECTION_REL
%endif

%assign n 1<<%1
%rep 18-%1
%assign n2 n/2
%assign n4 n/4
%xdefine list_of_fft list_of_fft, fft %+ n %+ fullsuffix SECTION_REL

align 16
fft %+ n %+ fullsuffix:
    call fft %+ n2 %+ SUFFIX
    add r0, n*4 - (n&(-2<<%1))
    call fft %+ n4 %+ SUFFIX
    add r0, n*2 - (n2&(-2<<%1))
    call fft %+ n4 %+ SUFFIX
    sub r0, n*6 + (n2&(-2<<%1))
    lea r1, [cos_ %+ n]
    mov r2d, n4/2
    jmp pass %+ fullsuffix

%assign n n*2
%endrep
%undef n

align 8
dispatch_tab %+ fullsuffix: pointer list_of_fft
%endmacro ; DECL_FFT

%if HAVE_AVX_EXTERNAL
INIT_YMM avx
DECL_FFT 6
DECL_FFT 6, _interleave
%endif
INIT_XMM sse
DECL_FFT 5
DECL_FFT 5, _interleave

INIT_XMM sse
%undef mulps
%undef addps
%undef subps
%undef unpcklps
%undef unpckhps

; Load two vectors of input from opposite ends (indices %2 and %1), split
; them with shufps, and multiply by the matching tcos/tsin entries.
; Results are left in xmm0 and xmm1; clobbers xmm0-xmm5.
%macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8
    movaps   xmm0, [%3+%2*4]
    movaps   xmm1, [%3+%1*4-0x10]
    movaps   xmm2, xmm0
    shufps   xmm0, xmm1, 0x88
    shufps   xmm1, xmm2, 0x77
    movlps   xmm4, [%4+%2*2]
    movlps   xmm5, [%5+%2*2+0x0]
    movhps   xmm4, [%4+%1*2-0x8]
    movhps   xmm5, [%5+%1*2-0x8]
    movaps   xmm2, xmm0
    movaps   xmm3, xmm1
    mulps    xmm0, xmm5
    mulps    xmm1, xmm4
    mulps    xmm2, xmm4
    mulps    xmm3, xmm5
    subps    xmm1, xmm0
    addps    xmm2, xmm3
    movaps   xmm0, xmm1
    unpcklps xmm1, xmm2
    unpckhps xmm0, xmm2
%endmacro

; Multiply the register pair (%2, %3) by the table values at [%5+%1] and
; [%6+%1]:  %2 = %2*[%6+%1] - %3*[%5+%1],  %3 = %3*[%6+%1] + %2*[%5+%1]
; (using the incoming %2 and %3). %4 is unused; clobbers m6 and m7.
%macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5
    mulps      m6, %3, [%5+%1]
    mulps      m7, %2, [%5+%1]
    mulps      %2, %2, [%6+%1]
    mulps      %3, %3, [%6+%1]
    subps      %2, %2, m6
    addps      %3, %3, m7
%endmacro

; Post-rotation loop: for index %1 counting up towards 0 and %2 counting
; down, rotate the data at [%3+%1*2] and [%3+%2*2] with CMUL, reverse one
; register of each pair, interleave, and store back to both positions.
; Defines the local label .post and loops while %1 stays negative.
%macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8
.post:
%if cpuflag(avx)
    vmovaps      ymm1,   [%3+%1*2]
    vmovaps      ymm0,   [%3+%1*2+0x20]
    vmovaps      ymm3,   [%3+%2*2]
    vmovaps      ymm2,   [%3+%2*2+0x20]

    CMUL         %1, ymm0, ymm1, %3, %4, %5
    CMUL         %2, ymm2, ymm3, %3, %4, %5
    vshufps      ymm1, ymm1, ymm1, 0x1b
    vshufps      ymm3, ymm3, ymm3, 0x1b
    vperm2f128   ymm1, ymm1, ymm1, 0x01
    vperm2f128   ymm3, ymm3, ymm3, 0x01
    vunpcklps    ymm6, ymm2, ymm1
    vunpckhps    ymm4, ymm2, ymm1
    vunpcklps    ymm7, ymm0, ymm3
    vunpckhps    ymm5, ymm0, ymm3

    vextractf128 [%3+%1*2],      ymm7, 0
    vextractf128 [%3+%1*2+0x10], ymm5, 0
    vextractf128 [%3+%1*2+0x20], ymm7, 1
    vextractf128 [%3+%1*2+0x30], ymm5, 1

    vextractf128 [%3+%2*2],      ymm6, 0
    vextractf128 [%3+%2*2+0x10], ymm4, 0
    vextractf128 [%3+%2*2+0x20], ymm6, 1
    vextractf128 [%3+%2*2+0x30], ymm4, 1
    sub      %2,   0x20
    add      %1,   0x20
    jl       .post
%else
    movaps   xmm1, [%3+%1*2]
    movaps   xmm0, [%3+%1*2+0x10]
    CMUL     %1,   xmm0, xmm1, %3, %4, %5
    movaps   xmm5, [%3+%2*2]
    movaps   xmm4, [%3+%2*2+0x10]
    CMUL     %2,   xmm4, xmm5, %3, %4, %5
    shufps   xmm1, xmm1, 0x1b
    shufps   xmm5, xmm5, 0x1b
    movaps   xmm6, xmm4
    unpckhps xmm4, xmm1
    unpcklps xmm6, xmm1
    movaps   xmm2, xmm0
    unpcklps xmm0, xmm5
    unpckhps xmm2, xmm5
    movaps   [%3+%2*2],      xmm6
    movaps   [%3+%2*2+0x10], xmm4
    movaps   [%3+%1*2],      xmm0
    movaps   [%3+%1*2+0x10], xmm2
    sub      %2,   0x10
    add      %1,   0x10
    jl       .post
%endif
%endmacro

%macro DECL_IMDCT 0
cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input
%if ARCH_X86_64
%define rrevtab r7
%define rtcos   r8
%define rtsin   r9
%else
%define rrevtab r6
%define rtsin   r6
%define rtcos   r5
%endif
    mov   r3d, [r0+FFTContext.mdctsize]
    add   r2, r3
    shr   r3, 1
    mov   rtcos, [r0+FFTContext.tcos]
    mov   rtsin, [r0+FFTContext.tsin]
    add   rtcos, r3
    add   rtsin, r3
%if ARCH_X86_64 == 0
    push  rtcos
    push  rtsin
%endif
    shr   r3, 1
    mov   rrevtab, [r0+FFTContext.revtab]
    add   rrevtab, r3
%if ARCH_X86_64 == 0
    push  rrevtab
%endif

    sub   r3, 4
%if ARCH_X86_64
    xor   r4, r4
    sub   r4, r3
%endif
.pre:
%if ARCH_X86_64 == 0
;unspill
    xor   r4, r4
    sub   r4, r3
    mov   rtcos, [esp+8]
    mov   rtsin, [esp+4]
%endif

    PREROTATER r4, r3, r2, rtcos, rtsin
%if ARCH_X86_64
    movzx  r5,  word [rrevtab+r4-4]
    movzx  r6,  word [rrevtab+r4-2]
    movzx  r10, word [rrevtab+r3]
    movzx  r11, word [rrevtab+r3+2]
    movlps [r1+r5 *8], xmm0
    movhps [r1+r6 *8], xmm0
    movlps [r1+r10*8], xmm1
790cabdff1aSopenharmony_ci movhps [r1+r11*8], xmm1 791cabdff1aSopenharmony_ci add r4, 4 792cabdff1aSopenharmony_ci%else 793cabdff1aSopenharmony_ci mov r6, [esp] 794cabdff1aSopenharmony_ci movzx r5, word [r6+r4-4] 795cabdff1aSopenharmony_ci movzx r4, word [r6+r4-2] 796cabdff1aSopenharmony_ci movlps [r1+r5*8], xmm0 797cabdff1aSopenharmony_ci movhps [r1+r4*8], xmm0 798cabdff1aSopenharmony_ci movzx r5, word [r6+r3] 799cabdff1aSopenharmony_ci movzx r4, word [r6+r3+2] 800cabdff1aSopenharmony_ci movlps [r1+r5*8], xmm1 801cabdff1aSopenharmony_ci movhps [r1+r4*8], xmm1 802cabdff1aSopenharmony_ci%endif 803cabdff1aSopenharmony_ci sub r3, 4 804cabdff1aSopenharmony_ci jns .pre 805cabdff1aSopenharmony_ci 806cabdff1aSopenharmony_ci mov r5, r0 807cabdff1aSopenharmony_ci mov r6, r1 808cabdff1aSopenharmony_ci mov r0, r1 809cabdff1aSopenharmony_ci mov r1d, [r5+FFTContext.nbits] 810cabdff1aSopenharmony_ci 811cabdff1aSopenharmony_ci FFT_DISPATCH SUFFIX, r1 812cabdff1aSopenharmony_ci 813cabdff1aSopenharmony_ci mov r0d, [r5+FFTContext.mdctsize] 814cabdff1aSopenharmony_ci add r6, r0 815cabdff1aSopenharmony_ci shr r0, 1 816cabdff1aSopenharmony_ci%if ARCH_X86_64 == 0 817cabdff1aSopenharmony_ci%define rtcos r2 818cabdff1aSopenharmony_ci%define rtsin r3 819cabdff1aSopenharmony_ci mov rtcos, [esp+8] 820cabdff1aSopenharmony_ci mov rtsin, [esp+4] 821cabdff1aSopenharmony_ci%endif 822cabdff1aSopenharmony_ci neg r0 823cabdff1aSopenharmony_ci mov r1, -mmsize 824cabdff1aSopenharmony_ci sub r1, r0 825cabdff1aSopenharmony_ci POSROTATESHUF r0, r1, r6, rtcos, rtsin 826cabdff1aSopenharmony_ci%if ARCH_X86_64 == 0 827cabdff1aSopenharmony_ci add esp, 12 828cabdff1aSopenharmony_ci%endif 829cabdff1aSopenharmony_ci RET 830cabdff1aSopenharmony_ci%endmacro 831cabdff1aSopenharmony_ci 832cabdff1aSopenharmony_ciDECL_IMDCT 833cabdff1aSopenharmony_ci 834cabdff1aSopenharmony_ciINIT_YMM avx 835cabdff1aSopenharmony_ci 836cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL 837cabdff1aSopenharmony_ciDECL_IMDCT 
838cabdff1aSopenharmony_ci%endif 839