;******************************************************************************
;* Copyright (c) Lynne
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; Open `doc/transforms.md` to see the code on which the transforms here were
; based and compare.

; TODO:
;       carry over registers from smaller transforms to save on ~8 loads/stores
;       check if vinsertf could be faster than vperm2f128 for duplication
;       even faster FFT8 (current one is very #instructions optimized)
;       replace some xors with blends + addsubs?
;       replace some shuffles with vblends?
;       avx512 split-radix

%include "libavutil/x86/x86util.asm"

%define private_prefix ff_tx

; Pointer-sized struc reservation: 8 bytes on x86-64, 4 bytes otherwise
%if ARCH_X86_64
%define ptr resq
%else
%define ptr resd
%endif

; Declare the external twiddle tables for 14 sizes, starting at 16 and
; doubling each iteration
%assign i 16
%rep 14
cextern tab_ %+ i %+ _float ; ff_tab_i_float...
%assign i (i << 1)
%endrep

struc AVTXContext
    .len:          resd 1 ; Length
    .inv:          resd 1 ; Inverse flag
    .map:           ptr 1 ; Lookup table(s)
    .exp:           ptr 1 ; Exponentiation factors
    .tmp:           ptr 1 ; Temporary data

    .sub:           ptr 1 ; Subcontexts
    .fn:            ptr 4 ; Subcontext functions
    .nb_sub:       resd 1 ; Subcontext count

    ; Everything else is inaccessible
endstruc

SECTION_RODATA 32

; Sign-bit masks for single-precision floats; XORing with NEG flips the sign
%define POS 0x00000000
%define NEG 0x80000000

%define M_SQRT1_2 0.707106781186547524401
%define COS16_1   0.92387950420379638671875
%define COS16_3   0.3826834261417388916015625

d8_mult_odd:   dd M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2,  M_SQRT1_2, \
                  M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2,  M_SQRT1_2

s8_mult_odd:   dd 1.0, 1.0, -1.0, 1.0, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
s8_perm_even:  dd 1, 3, 0, 2, 1, 3, 2, 0
s8_perm_odd1:  dd 3, 3, 1, 1, 1, 1, 3, 3
s8_perm_odd2:  dd 1, 2, 0, 3, 1, 0, 0, 1

s16_mult_even: dd 1.0, 1.0, M_SQRT1_2, M_SQRT1_2, 1.0, -1.0, M_SQRT1_2, -M_SQRT1_2
s16_mult_odd1: dd COS16_1, COS16_1, COS16_3, COS16_3, COS16_1, -COS16_1, COS16_3, -COS16_3
s16_mult_odd2: dd COS16_3, -COS16_3, COS16_1, -COS16_1, -COS16_3, -COS16_3, -COS16_1, -COS16_1
s16_perm:      dd 0, 1, 2, 3, 1, 0, 3, 2

; Sign masks, named after the per-lane sign pattern (m = NEG, p = POS)
mask_mmmmpppm: dd NEG, NEG, NEG, NEG, POS, POS, POS, NEG
mask_ppmpmmpm: dd POS, POS, NEG, POS, NEG, NEG, POS, NEG
mask_mppmmpmp: dd NEG, POS, POS, NEG, NEG, POS, NEG, POS
mask_mpmppmpm: dd NEG, POS, NEG, POS, POS, NEG, POS, NEG
mask_pmmppmmp: dd POS, NEG, NEG, POS, POS, NEG, NEG, POS
mask_pmpmpmpm: times 4 dd POS, NEG

SECTION .text

; Load complex values (64 bits) via a lookup table
; %1 - output register
; %2 - GPR of base input memory address
; %3 - GPR of LUT (int32_t indices) address
; %4 - LUT offset
; %5 - temporary GPR (only used if vgather is not used)
; %6 - temporary register (for avx only; holds the all-ones gather mask when
;      vgatherdpd is used, the upper 128-bit half otherwise)
; %7 - temporary register (optional; passing it selects the vgatherdpd path
;      when the AVX2 cpuflag is set, and it then holds the loaded indices)
%macro LOAD64_LUT 5-7
%if %0 > 6 && cpuflag(avx2)
    pcmpeqd %6, %6                    ; pcmpeqq has a 0.5 throughput on Zen 3, this has 0.25
    movapd xmm%7, [%3 + %4]           ; float mov since vgatherdpd is a float instruction
    vgatherdpd %1, [%2 + xmm%7*8], %6 ; must use separate registers for args
%else
    ; Scalar fallback: fetch each index, then load the 8-byte element it selects
    mov      %5d, [%3 + %4 + 0]
    movsd  xmm%1, [%2 + %5q*8]
%if mmsize == 32
    mov      %5d, [%3 + %4 + 8]
    movsd  xmm%6, [%2 + %5q*8]
%endif
    mov      %5d, [%3 + %4 + 4]
    movhps xmm%1, [%2 + %5q*8]
%if mmsize == 32
    mov      %5d, [%3 + %4 + 12]
    movhps xmm%6, [%2 + %5q*8]
    vinsertf128 %1, %1, xmm%6, 1      ; merge elements 2,3 into the upper half
%endif
%endif
%endmacro

; Single 2-point in-place complex FFT (will do 2 transforms at once in AVX mode)
; %1 - coefficients (r0.reim, r1.reim)
; %2 - temporary
%macro FFT2 2
    shufps   %2, %1, %1, q3322
    shufps   %1, %1, %1, q1100

    addsubps %1, %1, %2

    shufps   %1, %1, %1, q2031
%endmacro

; Single 4-point in-place complex FFT (will do 2 transforms at once in [AVX] mode)
; %1 - even coefficients (r0.reim, r2.reim, r4.reim, r6.reim)
; %2 - odd coefficients  (r1.reim, r3.reim, r5.reim, r7.reim)
; %3 - temporary
%macro FFT4 3
    subps  %3, %1, %2         ; r1234, [r5678]
    addps  %1, %1, %2         ; t1234, [t5678]

    shufps %2, %1, %3, q1010  ; t12, r12
    shufps %1, %1, %3, q2332  ; t34, r43

    subps  %3, %2, %1         ; a34, b32
    addps  %2, %2, %1         ; a12, b14

    shufps %1, %2, %3, q1010  ; a1234     even

    shufps %2, %2, %3, q2332  ; b1423
    shufps %2, %2, %2, q1320  ; b1234     odd
%endmacro

; Single/Dual 8-point in-place complex FFT (will do 2 transforms in [AVX] mode)
; %1 - even coefficients (a0.reim, a2.reim, [b0.reim, b2.reim])
; %2 - even coefficients (a4.reim, a6.reim, [b4.reim, b6.reim])
; %3 - odd coefficients  (a1.reim, a3.reim, [b1.reim, b3.reim])
; %4 - odd coefficients  (a5.reim, a7.reim, [b5.reim, b7.reim])
; %5 - temporary
; %6 - temporary
%macro FFT8 6
    addps    %5, %1, %3               ; q1-8
    addps    %6, %2, %4               ; k1-8

    subps    %1, %1, %3               ; r1-8
    subps    %2, %2, %4               ; j1-8

    shufps   %4, %1, %1, q2323        ; r4343
    shufps   %3, %5, %6, q3032        ; q34, k14

    shufps   %1, %1, %1, q1010        ; r1212
    shufps   %5, %5, %6, q1210        ; q12, k32

    xorps    %4, %4, [mask_pmmppmmp]  ; r4343 * pmmp
    addps    %6, %5, %3               ; s12, g12

    mulps    %2, %2, [d8_mult_odd]    ; r8 * d8_mult_odd
    subps    %5, %5, %3               ; s34, g43

    addps    %3, %1, %4               ; z1234
    unpcklpd %1, %6, %5               ; s1234

    shufps   %4, %2, %2, q2301        ; j2143
    shufps   %6, %6, %5, q2332        ; g1234

    addsubps %2, %2, %4               ; l2143
    shufps   %5, %2, %2, q0123        ; l3412
    addsubps %5, %5, %2               ; t1234

    subps    %2, %1, %6               ; h1234 even
    subps    %4, %3, %5               ; u1234 odd

    addps    %1, %1, %6               ; w1234 even
    addps    %3, %3, %5               ; o1234 odd
%endmacro

; Single 8-point in-place complex FFT in 20 instructions
; %1 - even coefficients (r0.reim, r2.reim, r4.reim, r6.reim)
; %2 - odd coefficients  (r1.reim, r3.reim, r5.reim, r7.reim)
; %3 - temporary
; %4 - temporary
%macro FFT8_AVX 4
    subps      %3, %1, %2               ; r1234, r5678
    addps      %1, %1, %2               ; q1234, q5678

    vpermilps  %2, %3, [s8_perm_odd1]   ; r4422, r6688
    shufps     %4, %1, %1, q3322        ; q1122, q5566

    movsldup   %3, %3                   ; r1133, r5577
    shufps     %1, %1, %1, q1100        ; q3344, q7788

    addsubps   %3, %3, %2               ; z1234, z5678
    addsubps   %1, %1, %4               ; s3142, s7586

    mulps      %3, %3, [s8_mult_odd]    ; z * s8_mult_odd
    vpermilps  %1, %1, [s8_perm_even]   ; s1234, s5687 !

    shufps     %2, %3, %3, q2332        ; junk, z7887
    xorps      %4, %1, [mask_mmmmpppm]  ; e1234, e5687 !

    vpermilps  %3, %3, [s8_perm_odd2]   ; z2314, z6556
    vperm2f128 %1, %1, %4, 0x03         ; e5687, s1234

    addsubps   %2, %2, %3               ; junk, t5678
    subps      %1, %1, %4               ; w1234, w5678 even

    vperm2f128 %2, %2, %2, 0x11         ; t5678, t5678
    vperm2f128 %3, %3, %3, 0x00         ; z2314, z2314

    xorps      %2, %2, [mask_ppmpmmpm]  ; t * ppmpmmpm
    addps      %2, %3, %2               ; u1234, u5678 odd
%endmacro

; Single 16-point in-place complex FFT
; %1 - even coefficients (r0.reim, r2.reim, r4.reim, r6.reim)
; %2 - even coefficients (r8.reim, r10.reim, r12.reim, r14.reim)
; %3 - odd coefficients  (r1.reim, r3.reim, r5.reim, r7.reim)
; %4 - odd coefficients  (r9.reim, r11.reim, r13.reim, r15.reim)
; %5, %6 - temporary
; %7, %8 - temporary (optional)
; With %7 given, s16_perm is kept in a register; with %8 also given,
; mask_mpmppmpm is kept in a register too. Otherwise both are used as memory
; operands.
%macro FFT16 6-8
    FFT4       %3, %4, %5
%if %0 > 7
    FFT8_AVX   %1, %2, %6, %7
    movaps     %8, [mask_mpmppmpm]
    movaps     %7, [s16_perm]
%define mask %8
%define perm %7
%elif %0 > 6
    FFT8_AVX   %1, %2, %6, %7
    movaps     %7, [s16_perm]
%define mask [mask_mpmppmpm]
%define perm %7
%else
    FFT8_AVX   %1, %2, %6, %5
%define mask [mask_mpmppmpm]
%define perm [s16_perm]
%endif
    xorps      %5, %5, %5                  ; 0

    shufps     %6, %4, %4, q2301           ; z12.imre, z13.imre...
    shufps     %5, %5, %3, q2301           ; 0, 0, z8.imre...

    mulps      %4, %4, [s16_mult_odd1]     ; z.reim * costab
    xorps      %5, %5, [mask_mppmmpmp]
%if cpuflag(fma3)
    fmaddps    %6, %6, [s16_mult_odd2], %4 ; s[8..15]
    addps      %5, %3, %5                  ; s[0...7]
%else
    mulps      %6, %6, [s16_mult_odd2]     ; z.imre * costab

    addps      %5, %3, %5                  ; s[0...7]
    addps      %6, %4, %6                  ; s[8..15]
%endif
    mulps      %5, %5, [s16_mult_even]     ; s[0...7]*costab

    xorps      %4, %6, mask                ; s[8..15]*mpmppmpm
    xorps      %3, %5, mask                ; s[0...7]*mpmppmpm

    vperm2f128 %4, %4, %4, 0x01            ; s[12..15, 8..11]
    vperm2f128 %3, %3, %3, 0x01            ; s[4..7, 0..3]

    addps      %6, %6, %4                  ; y56, u56, y34, u34
    addps      %5, %5, %3                  ; w56, x56, w34, x34

    vpermilps  %6, %6, perm                ; y56, u56, y43, u43
    vpermilps  %5, %5, perm                ; w56, x56, w43, x43

    subps      %4, %2, %6                  ; odd  part 2
    addps      %3, %2, %6                  ; odd  part 1

    subps      %2, %1, %5                  ; even part 2
    addps      %1, %1, %5                  ; even part 1
%undef mask
%undef perm
%endmacro

; Combines m0...m7 (tx1[even, even, odd, odd], tx2,3[even], tx2,3[odd]) coeffs
; Uses all 16 of registers.
; Output is slightly permuted such that tx2,3's coefficients are interleaved
; on a 2-point basis (look at `doc/transforms.md`)
; %1        - if non-zero (and mmsize == 32), tx2,3 are first split across
;             128-bit lanes with vperm2f128
; %2..%5    - tx1 coefficients, updated in place (m0 even, m1 even, m0 odd, m1 odd)
; %6..%9    - tx2,3 coefficients, updated in place
; %10, %11  - twiddle registers (cos, wim); both are overwritten
; %12, %13  - temporary (%13 holds mask_pmpmpmpm on exit)
; %14..%17  - temporary
%macro SPLIT_RADIX_COMBINE 17
%if %1 && mmsize == 32
    vperm2f128 %14, %6, %7, 0x20     ; m2[0], m2[1], m3[0], m3[1] even
    vperm2f128 %16, %9, %8, 0x20     ; m2[0], m2[1], m3[0], m3[1] odd
    vperm2f128 %15, %6, %7, 0x31     ; m2[2], m2[3], m3[2], m3[3] even
    vperm2f128 %17, %9, %8, 0x31     ; m2[2], m2[3], m3[2], m3[3] odd
%endif

    shufps     %12, %10, %10, q2200  ; cos00224466
    shufps     %13, %11, %11, q1133  ; wim77553311
    movshdup   %10, %10              ; cos11335577
    shufps     %11, %11, %11, q0022  ; wim66442200

%if %1 && mmsize == 32
    shufps     %6, %14, %14, q2301   ; m2[0].imre, m2[1].imre, m2[2].imre, m2[3].imre even
    shufps     %8, %16, %16, q2301   ; m2[0].imre, m2[1].imre, m2[2].imre, m2[3].imre odd
    shufps     %7, %15, %15, q2301   ; m3[0].imre, m3[1].imre, m3[2].imre, m3[3].imre even
    shufps     %9, %17, %17, q2301   ; m3[0].imre, m3[1].imre, m3[2].imre, m3[3].imre odd

    mulps      %14, %14, %13         ; m2[0123]reim * wim7531 even
    mulps      %16, %16, %11         ; m2[0123]reim * wim7531 odd
    mulps      %15, %15, %13         ; m3[0123]reim * wim7531 even
    mulps      %17, %17, %11         ; m3[0123]reim * wim7531 odd
%else
    mulps      %14, %6, %13          ; m2,3[01]reim * wim7531 even
    mulps      %16, %8, %11          ; m2,3[01]reim * wim7531 odd
    mulps      %15, %7, %13          ; m2,3[23]reim * wim7531 even
    mulps      %17, %9, %11          ; m2,3[23]reim * wim7531 odd
    ; reorder the multiplies to save movs reg, reg in the %if above
    shufps     %6, %6, %6, q2301     ; m2[0].imre, m2[1].imre, m3[0].imre, m3[1].imre even
    shufps     %8, %8, %8, q2301     ; m2[0].imre, m2[1].imre, m3[0].imre, m3[1].imre odd
    shufps     %7, %7, %7, q2301     ; m2[2].imre, m2[3].imre, m3[2].imre, m3[3].imre even
    shufps     %9, %9, %9, q2301     ; m2[2].imre, m2[3].imre, m3[2].imre, m3[3].imre odd
%endif

%if cpuflag(fma3) ; 11 - 5 = 6 instructions saved through FMA!
    fmaddsubps %6, %6, %12, %14      ; w[0..8] even
    fmaddsubps %8, %8, %10, %16      ; w[0..8] odd
    fmsubaddps %7, %7, %12, %15      ; j[0..8] even
    fmsubaddps %9, %9, %10, %17      ; j[0..8] odd
    movaps     %13, [mask_pmpmpmpm]  ; "subaddps? pfft, who needs that!"
%else
    mulps      %6, %6, %12           ; m2,3[01]imre * cos0246
    mulps      %8, %8, %10           ; m2,3[01]imre * cos0246
    movaps     %13, [mask_pmpmpmpm]  ; "subaddps? pfft, who needs that!"
    mulps      %7, %7, %12           ; m2,3[23]reim * cos0246
    mulps      %9, %9, %10           ; m2,3[23]reim * cos0246
    addsubps   %6, %6, %14           ; w[0..8]
    addsubps   %8, %8, %16           ; w[0..8]
    xorps      %15, %15, %13         ; +-m2,3[23]imre * wim7531
    xorps      %17, %17, %13         ; +-m2,3[23]imre * wim7531
    addps      %7, %7, %15           ; j[0..8]
    addps      %9, %9, %17           ; j[0..8]
%endif

    addps      %14, %6, %7           ; t10235476 even
    addps      %16, %8, %9           ; t10235476 odd
    subps      %15, %6, %7           ; +-r[0..7] even
    subps      %17, %8, %9           ; +-r[0..7] odd

    shufps     %14, %14, %14, q2301  ; t[0..7] even
    shufps     %16, %16, %16, q2301  ; t[0..7] odd
    xorps      %15, %15, %13         ; r[0..7] even
    xorps      %17, %17, %13         ; r[0..7] odd

    subps      %6, %2, %14           ; m2,3[01] even
    subps      %8, %4, %16           ; m2,3[01] odd
    subps      %7, %3, %15           ; m2,3[23] even
    subps      %9, %5, %17           ; m2,3[23] odd

    addps      %2, %2, %14           ; m0 even
    addps      %4, %4, %16           ; m0 odd
    addps      %3, %3, %15           ; m1 even
    addps      %5, %5, %17           ; m1 odd
%endmacro

; Same as above, only does one parity at a time, takes 3 temporary registers,
; however, if the twiddles aren't needed after this, the registers they use
; can be used as any of the temporary registers.
; %1       - parity selector: non-zero picks cos00224466/wim77553311,
;            zero picks cos11335577/wim66442200
; %2, %3   - tx1 coefficients (m0, m1), updated in place
; %4, %5   - tx2,3 coefficients, updated in place
; %6, %7   - twiddle registers (cos, wim); only read
; %8..%10  - temporary (%10 holds mask_pmpmpmpm on exit)
%macro SPLIT_RADIX_COMBINE_HALF 10
%if %1
    shufps     %8, %6, %6, q2200        ; cos00224466
    shufps     %9, %7, %7, q1133        ; wim77553311
%else
    shufps     %8, %6, %6, q3311        ; cos11335577
    shufps     %9, %7, %7, q0022        ; wim66442200
%endif

    mulps      %10, %4, %9              ; m2,3[01]reim * wim7531 even
    mulps      %9, %9, %5               ; m2,3[23]reim * wim7531 even

    shufps     %4, %4, %4, q2301        ; m2[0].imre, m2[1].imre, m3[0].imre, m3[1].imre even
    shufps     %5, %5, %5, q2301        ; m2[2].imre, m2[3].imre, m3[2].imre, m3[3].imre even

%if cpuflag(fma3)
    fmaddsubps %4, %4, %8, %10          ; w[0..8] even
    fmsubaddps %5, %5, %8, %9           ; j[0..8] even
    movaps     %10, [mask_pmpmpmpm]
%else
    mulps      %4, %4, %8               ; m2,3[01]imre * cos0246
    mulps      %5, %5, %8               ; m2,3[23]reim * cos0246
    addsubps   %4, %4, %10              ; w[0..8]
    movaps     %10, [mask_pmpmpmpm]
    xorps      %9, %9, %10              ; +-m2,3[23]imre * wim7531
    addps      %5, %5, %9               ; j[0..8]
%endif

    addps      %8, %4, %5               ; t10235476
    subps      %9, %4, %5               ; +-r[0..7]

    shufps     %8, %8, %8, q2301        ; t[0..7]
    xorps      %9, %9, %10              ; r[0..7]

    subps      %4, %2, %8               ; m2,3[01]
    subps      %5, %3, %9               ; m2,3[23]

    addps      %2, %2, %8               ; m0
    addps      %3, %3, %9               ; m1
%endmacro

; Same as above, tries REALLY hard to use 2 temporary registers.
; Takes the same arguments as SPLIT_RADIX_COMBINE_HALF minus the third
; temporary: the wim shuffle is recomputed and the sign mask is used straight
; from memory instead of being kept in a register.
; %1       - parity selector
; %2, %3   - tx1 coefficients (m0, m1), updated in place
; %4, %5   - tx2,3 coefficients, updated in place
; %6, %7   - twiddle registers (cos, wim); only read
; %8, %9   - temporary
%macro SPLIT_RADIX_COMBINE_LITE 9
%if %1
    shufps     %8, %6, %6, q2200        ; cos00224466
    shufps     %9, %7, %7, q1133        ; wim77553311
%else
    shufps     %8, %6, %6, q3311        ; cos11335577
    shufps     %9, %7, %7, q0022        ; wim66442200
%endif

    mulps      %9, %9, %4               ; m2,3[01]reim * wim7531 even
    shufps     %4, %4, %4, q2301        ; m2[0].imre, m2[1].imre, m3[0].imre, m3[1].imre even

%if cpuflag(fma3)
    fmaddsubps %4, %4, %8, %9           ; w[0..8] even
%else
    mulps      %4, %4, %8               ; m2,3[01]imre * cos0246
    addsubps   %4, %4, %9               ; w[0..8]
%endif

    ; %9 was consumed above, so rebuild the wim shuffle for the second half
%if %1
    shufps     %9, %7, %7, q1133        ; wim77553311
%else
    shufps     %9, %7, %7, q0022        ; wim66442200
%endif

    mulps      %9, %9, %5               ; m2,3[23]reim * wim7531 even
    shufps     %5, %5, %5, q2301        ; m2[2].imre, m2[3].imre, m3[2].imre, m3[3].imre even
%if cpuflag(fma3)
    fmsubaddps %5, %5, %8, %9           ; j[0..8] even
%else
    mulps      %5, %5, %8               ; m2,3[23]reim * cos0246
    xorps      %9, %9, [mask_pmpmpmpm]  ; +-m2,3[23]imre * wim7531
    addps      %5, %5, %9               ; j[0..8]
%endif

    addps      %8, %4, %5               ; t10235476
    subps      %9, %4, %5               ; +-r[0..7]

    shufps     %8, %8, %8, q2301        ; t[0..7]
    xorps      %9, %9, [mask_pmpmpmpm]  ; r[0..7]

    subps      %4, %2, %8               ; m2,3[01]
    subps      %5, %3, %9               ; m2,3[23]

    addps      %2, %2, %8               ; m0
    addps      %3, %3, %9               ; m1
%endmacro

; Split-radix combine for a 64-point transform, storing results to outq.
; Takes no arguments: it operates on m0..m3 and on the register aliases
; tx1_e0/e1, tx1_o0/o1, tx2_e0/e1, tx2_o0/o1, tw_e, tw_o, tmp1 and tmp2, which
; the including code must have defined before expanding this macro.
; The first two combines use the twiddles already in tw_e/tw_o; the second
; set is then loaded from tab_64_float for the final full combine.
%macro SPLIT_RADIX_COMBINE_64 0
    SPLIT_RADIX_COMBINE_LITE 1, m0, m1, tx1_e0, tx2_e0, tw_e, tw_o, tmp1, tmp2

    movaps [outq +  0*mmsize], m0
    movaps [outq +  4*mmsize], m1
    movaps [outq +  8*mmsize], tx1_e0
    movaps [outq + 12*mmsize], tx2_e0

    SPLIT_RADIX_COMBINE_HALF 0, m2, m3, tx1_o0, tx2_o0, tw_e, tw_o, tmp1, tmp2, m0

    movaps [outq +  2*mmsize], m2
    movaps [outq +  6*mmsize], m3
    movaps [outq + 10*mmsize], tx1_o0
    movaps [outq + 14*mmsize], tx2_o0

    movaps tw_e,           [tab_64_float + mmsize]
    vperm2f128 tw_o, tw_o, [tab_64_float + 64 - 4*7 - mmsize], 0x23 ; load with 128-bit halves swapped

    movaps m0, [outq +  1*mmsize]
    movaps m1, [outq +  3*mmsize]
    movaps m2, [outq +  5*mmsize]
    movaps m3, [outq +  7*mmsize]

    SPLIT_RADIX_COMBINE 0, m0, m2, m1, m3, tx1_e1, tx2_e1, tx1_o1, tx2_o1, tw_e, tw_o, \
                           tmp1, tmp2, tx2_o0, tx1_o0, tx2_e0, tx1_e0 ; temporary registers

    movaps [outq +  1*mmsize], m0
    movaps [outq +  3*mmsize], m1
    movaps [outq +  5*mmsize], m2
    movaps [outq +  7*mmsize], m3

    movaps [outq +  9*mmsize], tx1_e1
    movaps [outq + 11*mmsize], tx1_o1
    movaps [outq + 13*mmsize], tx2_e1
    movaps [outq + 15*mmsize], tx2_o1
%endmacro

; Perform a single even/odd split radix combination with loads and stores
; The _4 indicates this is a quarter of the iterations required to complete a full
; combine loop
; %1 must contain len*2, %2 must contain len*4, %3 must contain len*6
; %4 - index (in units of mmsize) of the first register-sized slot in outq
; %5 - twiddle index (in units of mmsize): added to rtabq, subtracted from itabq
; %6 - byte offset added to every outq address
; %7 - byte offset added to the rtabq address
; %8 - byte offset added to the itabq address
; Uses m0..m15 and the outq, rtabq and itabq registers set up by the caller.
%macro SPLIT_RADIX_LOAD_COMBINE_4 8
    movaps m8,         [rtabq + (%5)*mmsize + %7]
    vperm2f128 m9, m9, [itabq - (%5)*mmsize + %8], 0x23 ; load with 128-bit halves swapped

    movaps m0, [outq +      (0 + %4)*mmsize + %6]
    movaps m2, [outq +      (2 + %4)*mmsize + %6]
    movaps m1, [outq + %1 + (0 + %4)*mmsize + %6]
    movaps m3, [outq + %1 + (2 + %4)*mmsize + %6]

    movaps m4, [outq + %2 + (0 + %4)*mmsize + %6]
    movaps m6, [outq + %2 + (2 + %4)*mmsize + %6]
    movaps m5, [outq + %3 + (0 + %4)*mmsize + %6]
    movaps m7, [outq + %3 + (2 + %4)*mmsize + %6]

    SPLIT_RADIX_COMBINE 0, m0, m1, m2, m3, \
                           m4, m5, m6, m7, \
                           m8, m9, \
                           m10, m11, m12, m13, m14, m15

    movaps [outq +      (0 + %4)*mmsize + %6], m0
    movaps [outq +      (2 + %4)*mmsize + %6], m2
    movaps [outq + %1 + (0 + %4)*mmsize + %6], m1
    movaps [outq + %1 + (2 + %4)*mmsize + %6], m3

    movaps [outq + %2 + (0 + %4)*mmsize + %6], m4
    movaps [outq + %2 + (2 + %4)*mmsize + %6], m6
    movaps [outq + %3 + (0 + %4)*mmsize + %6], m5
    movaps [outq + %3 + (2 + %4)*mmsize + %6], m7
%endmacro

; Four SPLIT_RADIX_LOAD_COMBINE_4 passes (outq slot indices 0, 1, 4, 5 paired
; with twiddle indices 0..3).
; %1 - forwarded as the first stride; 2*%1 is forwarded as the second
; %2 - forwarded as the third stride
; %3 - byte offset for outq accesses (optional, defaults to 0)
; %4 - byte offset for rtabq accesses (optional, defaults to 0)
; %5 - byte offset for itabq accesses (optional, defaults to 0)
; Note: offset_c, offset_r and offset_i stay defined after the macro expands.
%macro SPLIT_RADIX_LOAD_COMBINE_FULL 2-5
%if %0 > 2
%define offset_c %3
%else
%define offset_c 0
%endif
%if %0 > 3
%define offset_r %4
%else
%define offset_r 0
%endif
%if %0 > 4
%define offset_i %5
%else
%define offset_i 0
%endif

    SPLIT_RADIX_LOAD_COMBINE_4 %1, 2*%1, %2, 0, 0, offset_c, offset_r, offset_i
    SPLIT_RADIX_LOAD_COMBINE_4 %1, 2*%1, %2, 1, 1, offset_c, offset_r, offset_i
    SPLIT_RADIX_LOAD_COMBINE_4 %1, 2*%1, %2, 4, 2, offset_c, offset_r, offset_i
    SPLIT_RADIX_LOAD_COMBINE_4 %1, 2*%1, %2, 5, 3, offset_c, offset_r, offset_i
%endmacro
; Perform a single even/odd split radix combination with loads, deinterleaves and
; stores. The _2 indicates this is a half of the iterations required to complete
; a full combine+deinterleave loop
;
; %1 - index, in units of mmsize, of the first of the four consecutive register
;      slots handled in every region (slots %1+0 .. %1+3 are read and rewritten)
; %2 - index, in units of mmsize, into the twiddle tables: rtabq is read at
;      +(%2+0) and +(%2+1), itabq at -(%2+0) and -(%2+1)
; %3 must contain len*2, %4 must contain len*4, %5 must contain len*6
;      (byte offsets from outq to the second, third and fourth region)
; %6 - constant byte offset added to every outq address
;
; Reads outq, rtabq and itabq without modifying them. Overwrites m0-m13
; directly; m14 and m15 are handed to SPLIT_RADIX_COMBINE as temporaries.
%macro SPLIT_RADIX_COMBINE_DEINTERLEAVE_2 6
    ; The 0x23 selector takes both lanes from the memory operand, swapped,
    ; so the previous contents of m9 never reach the result
    movaps m8,         [rtabq + (0 + %2)*mmsize]
    vperm2f128 m9, m9, [itabq - (0 + %2)*mmsize], 0x23

    movaps m0, [outq +      (0 + 0 + %1)*mmsize + %6]
    movaps m2, [outq +      (2 + 0 + %1)*mmsize + %6]
    movaps m1, [outq + %3 + (0 + 0 + %1)*mmsize + %6]
    movaps m3, [outq + %3 + (2 + 0 + %1)*mmsize + %6]

    movaps m4, [outq + %4 + (0 + 0 + %1)*mmsize + %6]
    movaps m6, [outq + %4 + (2 + 0 + %1)*mmsize + %6]
    movaps m5, [outq + %5 + (0 + 0 + %1)*mmsize + %6]
    movaps m7, [outq + %5 + (2 + 0 + %1)*mmsize + %6]

    SPLIT_RADIX_COMBINE 0, m0, m1, m2, m3, \
                           m4, m5, m6, m7, \
                           m8, m9, \
                           m10, m11, m12, m13, m14, m15

    ; Interleave the 64-bit elements of each register pair
    unpckhpd m10, m0, m2
    unpckhpd m11, m1, m3
    unpckhpd m12, m4, m6
    unpckhpd m13, m5, m7
    unpcklpd m0,  m0, m2
    unpcklpd m1,  m1, m3
    unpcklpd m4,  m4, m6
    unpcklpd m5,  m5, m7

    ; The low lanes complete the first slot of every region and can be
    ; written out right away
    vextractf128 [outq +      (0 + 0 + %1)*mmsize + %6 +  0], m0,  0
    vextractf128 [outq +      (0 + 0 + %1)*mmsize + %6 + 16], m10, 0
    vextractf128 [outq + %3 + (0 + 0 + %1)*mmsize + %6 +  0], m1,  0
    vextractf128 [outq + %3 + (0 + 0 + %1)*mmsize + %6 + 16], m11, 0

    vextractf128 [outq + %4 + (0 + 0 + %1)*mmsize + %6 +  0], m4,  0
    vextractf128 [outq + %4 + (0 + 0 + %1)*mmsize + %6 + 16], m12, 0
    vextractf128 [outq + %5 + (0 + 0 + %1)*mmsize + %6 +  0], m5,  0
    vextractf128 [outq + %5 + (0 + 0 + %1)*mmsize + %6 + 16], m13, 0

    ; Selector 0x13: low lane = high lane of the unpcklpd result,
    ; high lane = high lane of the unpckhpd result. These belong in the
    ; second slot, which still holds input that has not been loaded yet
    vperm2f128 m10, m10, m0, 0x13
    vperm2f128 m11, m11, m1, 0x13
    vperm2f128 m12, m12, m4, 0x13
    vperm2f128 m13, m13, m5, 0x13

    movaps m8,         [rtabq + (1 + %2)*mmsize]
    vperm2f128 m9, m9, [itabq - (1 + %2)*mmsize], 0x23

    movaps m0, [outq +      (0 + 1 + %1)*mmsize + %6]
    movaps m2, [outq +      (2 + 1 + %1)*mmsize + %6]
    movaps m1, [outq + %3 + (0 + 1 + %1)*mmsize + %6]
    movaps m3, [outq + %3 + (2 + 1 + %1)*mmsize + %6]

    ; Each pending store overwrites a slot that was only just loaded above
    movaps [outq +      (0 + 1 + %1)*mmsize + %6], m10 ; m0 conflict
    movaps [outq + %3 + (0 + 1 + %1)*mmsize + %6], m11 ; m1 conflict

    movaps m4, [outq + %4 + (0 + 1 + %1)*mmsize + %6]
    movaps m6, [outq + %4 + (2 + 1 + %1)*mmsize + %6]
    movaps m5, [outq + %5 + (0 + 1 + %1)*mmsize + %6]
    movaps m7, [outq + %5 + (2 + 1 + %1)*mmsize + %6]

    movaps [outq + %4 + (0 + 1 + %1)*mmsize + %6], m12 ; m4 conflict
    movaps [outq + %5 + (0 + 1 + %1)*mmsize + %6], m13 ; m5 conflict

    SPLIT_RADIX_COMBINE 0, m0, m1, m2, m3, \
                           m4, m5, m6, m7, \
                           m8, m9, \
                           m10, m11, m12, m13, m14, m15 ; temporary registers

    unpcklpd m8,  m0, m2
    unpcklpd m9,  m1, m3
    unpcklpd m10, m4, m6
    unpcklpd m11, m5, m7
    unpckhpd m0,  m0, m2
    unpckhpd m1,  m1, m3
    unpckhpd m4,  m4, m6
    unpckhpd m5,  m5, m7

    ; The second half fills the third and fourth slot of every region; both
    ; have been fully consumed by now, so all four lanes are stored directly
    vextractf128 [outq +      (2 + 0 + %1)*mmsize + %6 +  0], m8,  0
    vextractf128 [outq +      (2 + 0 + %1)*mmsize + %6 + 16], m0,  0
    vextractf128 [outq +      (2 + 1 + %1)*mmsize + %6 +  0], m8,  1
    vextractf128 [outq +      (2 + 1 + %1)*mmsize + %6 + 16], m0,  1

    vextractf128 [outq + %3 + (2 + 0 + %1)*mmsize + %6 +  0], m9,  0
    vextractf128 [outq + %3 + (2 + 0 + %1)*mmsize + %6 + 16], m1,  0
    vextractf128 [outq + %3 + (2 + 1 + %1)*mmsize + %6 +  0], m9,  1
    vextractf128 [outq + %3 + (2 + 1 + %1)*mmsize + %6 + 16], m1,  1

    vextractf128 [outq + %4 + (2 + 0 + %1)*mmsize + %6 +  0], m10, 0
    vextractf128 [outq + %4 + (2 + 0 + %1)*mmsize + %6 + 16], m4,  0
    vextractf128 [outq + %4 + (2 + 1 + %1)*mmsize + %6 +  0], m10, 1
    vextractf128 [outq + %4 + (2 + 1 + %1)*mmsize + %6 + 16], m4,  1

    vextractf128 [outq + %5 + (2 + 0 + %1)*mmsize + %6 +  0], m11, 0
    vextractf128 [outq + %5 + (2 + 0 + %1)*mmsize + %6 + 16], m5,  0
    vextractf128 [outq + %5 + (2 + 1 + %1)*mmsize + %6 +  0], m11, 1
    vextractf128 [outq + %5 + (2 + 1 + %1)*mmsize + %6 + 16], m5,  1
%endmacro

; One complete combine+deinterleave iteration: two expansions of the _2 macro,
; covering output slots 0-3 with table index 0 and slots 4-7 with table index 2.
;
; %1 - byte offset to the second region (len*2); doubled for the third region
; %2 - byte offset to the fourth region (len*6)
; %3 - optional constant byte offset for every outq access, default 0
;
; "offset" is a single-line define and stays defined after expansion.
%macro SPLIT_RADIX_COMBINE_DEINTERLEAVE_FULL 2-3
%if %0 > 2
%define offset %3
%else
%define offset 0
%endif
    SPLIT_RADIX_COMBINE_DEINTERLEAVE_2 0, 0, %1, %1*2, %2, offset
    SPLIT_RADIX_COMBINE_DEINTERLEAVE_2 4, 2, %1, %1*2, %2, offset
%endmacro

; 2-point transform: one aligned 16-byte load from in, one aligned 16-byte store
; to out. ctx and stride are declared but not used. m1 is scratch for FFT2.
INIT_XMM sse3
cglobal fft2_float, 4, 4, 2, ctx, out, in, stride
    movaps m0, [inq]
    FFT2 m0, m1
    movaps [outq], m0
    RET

; Emits the function fft4_<%1>_float (4-point transform, SSE2).
;
; %1 - direction infix used in the function name (fwd or inv)
; %2 - nonzero to permute the input before transforming: the upper 64-bit
;      halves of the two input registers are exchanged (this assumes q3210 is
;      the identity shuffle selector from x86util.asm)
;
; This macro takes 2 parameters; the "FFT4 m0, m1, m2" line in its body has 3
; and therefore refers to a different macro, which NASM selects by parameter
; count and which is defined outside this part of the file.
%macro FFT4 2
INIT_XMM sse2
cglobal fft4_ %+ %1 %+ _float, 4, 4, 3, ctx, out, in, stride
    movaps m0, [inq + 0*mmsize]
    movaps m1, [inq + 1*mmsize]

%if %2
    shufps m2, m1, m0, q3210
    shufps m0, m0, m1, q3210
    movaps m1, m2
%endif

    FFT4 m0, m1, m2

    unpcklpd m2, m0, m1
    unpckhpd m0, m0, m1

    movaps [outq + 0*mmsize], m2
    movaps [outq + 1*mmsize], m0

    RET
%endmacro

FFT4 fwd, 0
FFT4 inv, 1

; Emits the function fft8_<%1> (8-point transform, SSE3, four XMM registers of
; data plus m4/m5 as scratch).
;
; %1 - function name suffix
; %2 - input mode: nonzero gathers the input through the lookup table found at
;      AVTXContext.map (ctxq is overwritten with the table pointer and tmpq is
;      used as scratch by LOAD64_LUT); zero reads four aligned vectors from in
%macro FFT8_SSE_FN 2
INIT_XMM sse3
cglobal fft8_ %+ %1, 4, 4, 6, ctx, out, in, tmp
%if %2
    mov ctxq, [ctxq + AVTXContext.map]
    LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq
    LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq
    LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq
    LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq
%else
    movaps m0, [inq + 0*mmsize]
    movaps m1, [inq + 1*mmsize]
    movaps m2, [inq + 2*mmsize]
    movaps m3, [inq + 3*mmsize]
%endif

    FFT8 m0, m1, m2, m3, m4, m5

    unpcklpd m4, m0, m3
    unpcklpd m5, m1, m2
    unpckhpd m0, m0, m3
    unpckhpd m1, m1, m2

    ; Unlike the loads above, the stores do not require an aligned pointer
    movups [outq + 0*mmsize], m4
    movups [outq + 1*mmsize], m0
    movups [outq + 2*mmsize], m5
    movups [outq + 3*mmsize], m1

    RET
%endmacro

FFT8_SSE_FN float,    1
FFT8_SSE_FN ns_float, 0

; Emits the function fft8_<%1> (8-point transform, AVX, two YMM registers of
; data plus m2/m3 as scratch).
;
; %1 - function name suffix
; %2 - input mode: nonzero gathers the input through the lookup table found at
;      AVTXContext.map (ctxq is overwritten with the table pointer, tmpq and
;      m2/m3 are handed to LOAD64_LUT as scratch); zero reads two aligned
;      vectors from in
%macro FFT8_AVX_FN 2
INIT_YMM avx
cglobal fft8_ %+ %1, 4, 4, 4, ctx, out, in, tmp
%if %2
    mov ctxq, [ctxq + AVTXContext.map]
    LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m2
    LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m3
%else
    movaps m0, [inq + 0*mmsize]
    movaps m1, [inq + 1*mmsize]
%endif

    FFT8_AVX m0, m1, m2, m3

    unpcklpd m2, m0, m1
    unpckhpd m0, m0, m1

    ; Around 2% faster than 2x vperm2f128 + 2x movapd
    vextractf128 [outq + 16*0], m2, 0
    vextractf128 [outq + 16*1], m0, 0
    vextractf128 [outq + 16*2], m2, 1
    vextractf128 [outq + 16*3], m0, 1

    RET
%endmacro

FFT8_AVX_FN float,    1
FFT8_AVX_FN ns_float, 0

; Emits the function fft16_<%2> (16-point transform, YMM registers).
;
; %1 - instruction set handed to INIT_YMM
; %2 - function name suffix
; %3 - input mode: nonzero reads four aligned vectors straight from in; zero
;      gathers the input through the lookup table found at AVTXContext.map.
;      Note that this is the opposite polarity of the flag of FFT8_AVX_FN.
%macro FFT16_FN 3
INIT_YMM %1
cglobal fft16_ %+ %2, 4, 4, 8, ctx, out, in, tmp
%if %3
    movaps m0, [inq + 0*mmsize]
    movaps m1, [inq + 1*mmsize]
    movaps m2, [inq + 2*mmsize]
    movaps m3, [inq + 3*mmsize]
%else
    mov ctxq, [ctxq + AVTXContext.map]
    LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m4
    LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m5
    LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq, m6
    LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq, m7
%endif

    FFT16 m0, m1, m2, m3, m4, m5, m6, m7

    unpcklpd m5, m1, m3
    unpcklpd m4, m0, m2
    unpckhpd m1, m1, m3
    unpckhpd m0, m0, m2

    ; Write the result as eight 16-byte pieces, alternating between the
    ; unpcklpd and unpckhpd results, low lanes before high lanes
    vextractf128 [outq + 16*0], m4, 0
    vextractf128 [outq + 16*1], m0, 0
    vextractf128 [outq + 16*2], m4, 1
    vextractf128 [outq + 16*3], m0, 1
    vextractf128 [outq + 16*4], m5, 0
    vextractf128 [outq + 16*5], m1, 0
    vextractf128 [outq + 16*6], m5, 1
    vextractf128 [outq + 16*7], m1, 1

    RET
%endmacro

FFT16_FN avx,  float,    0
FFT16_FN avx,  ns_float, 1
FFT16_FN fma3, float,    0
FFT16_FN fma3, ns_float, 1

; Emits the function fft32_<%2> (32-point transform, all 16 YMM registers).
; The upper half of the input goes through FFT8, the lower half through FFT16,
; and both are then merged with one SPLIT_RADIX_COMBINE using tab_32_float.
;
; %1 - instruction set handed to INIT_YMM
; %2 - function name suffix
; %3 - input mode: nonzero reads aligned vectors straight from in; zero gathers
;      the input through the lookup table found at AVTXContext.map
%macro FFT32_FN 3
INIT_YMM %1
cglobal fft32_ %+ %2, 4, 4, 16, ctx, out, in, tmp
%if %3
    movaps m4, [inq + 4*mmsize]
    movaps m5, [inq + 5*mmsize]
    movaps m6, [inq + 6*mmsize]
    movaps m7, [inq + 7*mmsize]
%else
    mov ctxq, [ctxq + AVTXContext.map]
    LOAD64_LUT m4, inq, ctxq, (mmsize/2)*4, tmpq, m8,  m12
    LOAD64_LUT m5, inq, ctxq, (mmsize/2)*5, tmpq, m9,  m13
    LOAD64_LUT m6, inq, ctxq, (mmsize/2)*6, tmpq, m10, m14
    LOAD64_LUT m7, inq, ctxq, (mmsize/2)*7, tmpq, m11, m15
%endif

    FFT8 m4, m5, m6, m7, m8, m9

%if %3
    movaps m0, [inq + 0*mmsize]
    movaps m1, [inq + 1*mmsize]
    movaps m2, [inq + 2*mmsize]
    movaps m3, [inq + 3*mmsize]
%else
    LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m8,  m12
    LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m9,  m13
    LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq, m10, m14
    LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq, m11, m15
%endif

    ; Selector 0x23 takes both lanes from the memory operand, swapped
    movaps m8,         [tab_32_float]
    vperm2f128 m9, m9, [tab_32_float + 4*8 - 4*7], 0x23

    FFT16 m0, m1, m2, m3, m10, m11, m12, m13

    SPLIT_RADIX_COMBINE 1, m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, \
                           m10, m11, m12, m13, m14, m15 ; temporary registers

    unpcklpd m9,  m1, m3
    unpcklpd m10, m5, m7
    unpcklpd m8,  m0, m2
    unpcklpd m11, m4, m6
    unpckhpd m1,  m1, m3
    unpckhpd m5,  m5, m7
    unpckhpd m0,  m0, m2
    unpckhpd m4,  m4, m6

    vextractf128 [outq + 16* 0], m8,  0
    vextractf128 [outq + 16* 1], m0,  0
    vextractf128 [outq + 16* 2], m8,  1
    vextractf128 [outq + 16* 3], m0,  1
    vextractf128 [outq + 16* 4], m9,  0
    vextractf128 [outq + 16* 5], m1,  0
    vextractf128 [outq + 16* 6], m9,  1
    vextractf128 [outq + 16* 7], m1,  1

    vextractf128 [outq + 16* 8], m11, 0
    vextractf128 [outq + 16* 9], m4,  0
    vextractf128 [outq + 16*10], m11, 1
    vextractf128 [outq + 16*11], m4,  1
    vextractf128 [outq + 16*12], m10, 0
    vextractf128 [outq + 16*13], m5,  0
    vextractf128 [outq + 16*14], m10, 1
    vextractf128 [outq + 16*15], m5,  1

    RET
%endmacro

%if ARCH_X86_64
FFT32_FN avx,  float,    0
FFT32_FN avx,  ns_float, 1
FFT32_FN fma3, float,    0
FFT32_FN fma3, ns_float, 1
%endif

; Emits the code for one transform size inside FFT_SPLIT_RADIX_FN. It relies
; on that function's labels (.32pt, .deinterleave) and register names.
;
; %1 - transform size; defines the local label .<%1>pt
; %2 - optional label to continue at when lenq is larger than %1
;
; Generated code, in order:
;   1. lenq is saved and replaced by %1/4, then .32pt is called twice with outq
;      moved forward in between; lenq is restored afterwards. With lenq = %1/4
;      each call returns once a transform of that size has been completed.
;   2. outq is moved back and rtabq/itabq are pointed into tab_<%1>_float.
;   3. Only when %2 is given: if tgtq equals %1 this is the last stage and
;      control goes to .deinterleave. Otherwise a combine loop runs with tmpq
;      as a counter that starts at %1 and drops by 4*mmsize per iteration,
;      followed by a jump to %2 (lenq > %1) or a return to the caller.
;
; When %2 is omitted nothing is emitted after step 2, so execution falls
; through into whatever follows the expansion.
%macro FFT_SPLIT_RADIX_DEF 1-2
ALIGN 16
.%1 %+ pt:
    PUSH lenq
    mov lenq, (%1/4)

    add outq, (%1*4) - (%1/1)
    call .32pt

    add outq, (%1*2) - (%1/2) ; the synth loops also increment outq
    call .32pt

    POP lenq
    sub outq, (%1*4) + (%1*2) + (%1/2)

    lea rtabq, [tab_ %+ %1 %+ _float]
    lea itabq, [tab_ %+ %1 %+ _float + %1 - 4*7]

%if %0 > 1
    cmp tgtq, %1
    je .deinterleave

    mov tmpq, %1

.synth_ %+ %1:
    SPLIT_RADIX_LOAD_COMBINE_FULL 2*%1, 6*%1, 0, 0, 0
    add outq,  8*mmsize
    add rtabq, 4*mmsize
    sub itabq, 4*mmsize
    sub tmpq,  4*mmsize
    jg .synth_ %+ %1

    cmp lenq, %1
    jg %2 ; can't do math here, nasm doesn't get it
    ret
%endif
%endmacro

; Emits the function fft_sr_<%2>: split-radix transform for every power-of-two
; length from 64 up to 131072.
;
; %1 - instruction set handed to INIT_YMM
; %2 - function name suffix
; %3 - input mode: nonzero reads the input linearly from in (inq advances by
;      8*mmsize per 32 points); zero gathers it through the lookup table found
;      at AVTXContext.map (lutq advances instead)
;
; Register roles, as set up and used below:
;   lutq  - context pointer on entry, then the lookup table pointer; reused as
;           a plain scratch register by the final .deinterleave stage
;   lenq  - size of the transform currently being built; temporarily replaced
;           (PUSH/POP) while a larger stage calls .32pt for its two quarters
;   tgtq  - the length read from AVTXContext.len, never modified afterwards;
;           compared against each stage size to detect the last stage
;   tmpq  - scratch for LOAD64_LUT, loop counter in the synth loops
;   rtabq/itabq - pointers into the twiddle table of the current stage, walked
;           in opposite directions
;
; Every "ret" (lower case) belongs to one of the internal "call .32pt"; the
; function itself is left through the two "RET" in the deinterleave code.
; NOTE(review): a context length of 32 would reach the "ret" of the 32-point
; code without a preceding call - presumably the dedicated fft32 functions are
; always selected for that length; confirm on the C side.
; NOTE(review): cglobal reserves 272 bytes of stack, but the code visible here
; only touches the stack through PUSH, POP and call.
%macro FFT_SPLIT_RADIX_FN 3
INIT_YMM %1
cglobal fft_sr_ %+ %2, 4, 8, 16, 272, lut, out, in, len, tmp, itab, rtab, tgt
    movsxd lenq, dword [lutq + AVTXContext.len]
    mov lutq, [lutq + AVTXContext.map]
    mov tgtq, lenq

; Bottom-most/32-point transform ===============================================
ALIGN 16
.32pt:
%if %3
    movaps m4, [inq + 4*mmsize]
    movaps m5, [inq + 5*mmsize]
    movaps m6, [inq + 6*mmsize]
    movaps m7, [inq + 7*mmsize]
%else
    LOAD64_LUT m4, inq, lutq, (mmsize/2)*4, tmpq, m8,  m12
    LOAD64_LUT m5, inq, lutq, (mmsize/2)*5, tmpq, m9,  m13
    LOAD64_LUT m6, inq, lutq, (mmsize/2)*6, tmpq, m10, m14
    LOAD64_LUT m7, inq, lutq, (mmsize/2)*7, tmpq, m11, m15
%endif

    FFT8 m4, m5, m6, m7, m8, m9

%if %3
    movaps m0, [inq + 0*mmsize]
    movaps m1, [inq + 1*mmsize]
    movaps m2, [inq + 2*mmsize]
    movaps m3, [inq + 3*mmsize]
%else
    LOAD64_LUT m0, inq, lutq, (mmsize/2)*0, tmpq, m8,  m12
    LOAD64_LUT m1, inq, lutq, (mmsize/2)*1, tmpq, m9,  m13
    LOAD64_LUT m2, inq, lutq, (mmsize/2)*2, tmpq, m10, m14
    LOAD64_LUT m3, inq, lutq, (mmsize/2)*3, tmpq, m11, m15
%endif

    ; Selector 0x23 takes both lanes from the memory operand, swapped
    movaps m8,         [tab_32_float]
    vperm2f128 m9, m9, [tab_32_float + 32 - 4*7], 0x23

    FFT16 m0, m1, m2, m3, m10, m11, m12, m13

    SPLIT_RADIX_COMBINE 1, m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, \
                           m10, m11, m12, m13, m14, m15 ; temporary registers

    ; The odd slots are always written; the even ones stay in m0, m2, m4 and
    ; m6 and are only written below when this is a plain 32-point pass
    movaps [outq + 1*mmsize], m1
    movaps [outq + 3*mmsize], m3
    movaps [outq + 5*mmsize], m5
    movaps [outq + 7*mmsize], m7

%if %3
    add inq, 8*mmsize
%else
    add lutq, (mmsize/2)*8
%endif
    cmp lenq, 32
    jg .64pt

    movaps [outq + 0*mmsize], m0
    movaps [outq + 2*mmsize], m2
    movaps [outq + 4*mmsize], m4
    movaps [outq + 6*mmsize], m6

    ret

; 64-point transform ===========================================================
ALIGN 16
.64pt:
; Helper defines, these make it easier to track what's happening
%define tx1_e0 m4
%define tx1_e1 m5
%define tx1_o0 m6
%define tx1_o1 m7
%define tx2_e0 m8
%define tx2_e1 m9
%define tx2_o0 m10
%define tx2_o1 m11
%define tw_e m12
%define tw_o m13
%define tmp1 m14
%define tmp2 m15

    ; Rename registers so that the even results kept by the 32-point code
    ; are addressed as m0-m3, which frees the names m4-m7 for the loads below
    SWAP m4, m1
    SWAP m6, m3

%if %3
    movaps tx1_e0, [inq + 0*mmsize]
    movaps tx1_e1, [inq + 1*mmsize]
    movaps tx1_o0, [inq + 2*mmsize]
    movaps tx1_o1, [inq + 3*mmsize]
%else
    LOAD64_LUT tx1_e0, inq, lutq, (mmsize/2)*0, tmpq, tw_e, tmp1
    LOAD64_LUT tx1_e1, inq, lutq, (mmsize/2)*1, tmpq, tw_o, tmp2
    LOAD64_LUT tx1_o0, inq, lutq, (mmsize/2)*2, tmpq, tw_e, tmp1
    LOAD64_LUT tx1_o1, inq, lutq, (mmsize/2)*3, tmpq, tw_o, tmp2
%endif

    FFT16 tx1_e0, tx1_e1, tx1_o0, tx1_o1, tw_e, tw_o, tx2_o0, tx2_o1

%if %3
    movaps tx2_e0, [inq + 4*mmsize]
    movaps tx2_e1, [inq + 5*mmsize]
    movaps tx2_o0, [inq + 6*mmsize]
    movaps tx2_o1, [inq + 7*mmsize]
%else
    LOAD64_LUT tx2_e0, inq, lutq, (mmsize/2)*4, tmpq, tw_e, tmp1
    LOAD64_LUT tx2_e1, inq, lutq, (mmsize/2)*5, tmpq, tw_o, tmp2
    LOAD64_LUT tx2_o0, inq, lutq, (mmsize/2)*6, tmpq, tw_e, tmp1
    LOAD64_LUT tx2_o1, inq, lutq, (mmsize/2)*7, tmpq, tw_o, tmp2
%endif

    FFT16 tx2_e0, tx2_e1, tx2_o0, tx2_o1, tmp1, tmp2, tw_e, tw_o

    movaps tw_e,           [tab_64_float]
    vperm2f128 tw_o, tw_o, [tab_64_float + 64 - 4*7], 0x23

%if %3
    add inq, 8*mmsize
%else
    add lutq, (mmsize/2)*8
%endif
    ; A 64-point target finishes in the combined synthesis+deinterleave code
    cmp tgtq, 64
    je .deinterleave

    SPLIT_RADIX_COMBINE_64

    cmp lenq, 64
    jg .128pt
    ret

; 128-point transform ==========================================================
ALIGN 16
.128pt:
    PUSH lenq
    mov lenq, 32

    add outq, 16*mmsize
    call .32pt

    add outq, 8*mmsize
    call .32pt

    POP lenq
    sub outq, 24*mmsize

    lea rtabq, [tab_128_float]
    lea itabq, [tab_128_float + 128 - 4*7]

    cmp tgtq, 128
    je .deinterleave

    SPLIT_RADIX_LOAD_COMBINE_FULL 2*128, 6*128

    cmp lenq, 128
    jg .256pt
    ret

; 256-point transform ==========================================================
ALIGN 16
.256pt:
    PUSH lenq
    mov lenq, 64

    add outq, 32*mmsize
    call .32pt

    add outq, 16*mmsize
    call .32pt

    POP lenq
    sub outq, 48*mmsize

    lea rtabq, [tab_256_float]
    lea itabq, [tab_256_float + 256 - 4*7]

    cmp tgtq, 256
    je .deinterleave

    ; Two unrolled iterations; the second one carries the pointer steps that a
    ; loop would apply as constant offsets
    SPLIT_RADIX_LOAD_COMBINE_FULL 2*256, 6*256
    SPLIT_RADIX_LOAD_COMBINE_FULL 2*256, 6*256, 8*mmsize, 4*mmsize, -4*mmsize

    cmp lenq, 256
    jg .512pt
    ret

; 512-point transform ==========================================================
ALIGN 16
.512pt:
    PUSH lenq
    mov lenq, 128

    add outq, 64*mmsize
    call .32pt

    add outq, 32*mmsize
    call .32pt

    POP lenq
    sub outq, 96*mmsize

    lea rtabq, [tab_512_float]
    lea itabq, [tab_512_float + 512 - 4*7]

    cmp tgtq, 512
    je .deinterleave

    mov tmpq, 4

    ; outq is not rewound after the loop, so it leaves this stage 32*mmsize
    ; further than it entered
.synth_512:
    SPLIT_RADIX_LOAD_COMBINE_FULL 2*512, 6*512
    add outq,  8*mmsize
    add rtabq, 4*mmsize
    sub itabq, 4*mmsize
    sub tmpq,  1
    jg .synth_512

    cmp lenq, 512
    jg .1024pt
    ret

; 1024-point transform =========================================================
ALIGN 16
.1024pt:
    PUSH lenq
    mov lenq, 256

    add outq, 96*mmsize
    call .32pt

    add outq, 64*mmsize
    call .32pt

    POP lenq
    sub outq, 192*mmsize

    lea rtabq, [tab_1024_float]
    lea itabq, [tab_1024_float + 1024 - 4*7]

    cmp tgtq, 1024
    je .deinterleave

    mov tmpq, 8

.synth_1024:
    SPLIT_RADIX_LOAD_COMBINE_FULL 2*1024, 6*1024
    add outq,  8*mmsize
    add rtabq, 4*mmsize
    sub itabq, 4*mmsize
    sub tmpq,  1
    jg .synth_1024

    cmp lenq, 1024
    jg .2048pt
    ret

; 2048 to 131072-point transforms ==============================================
FFT_SPLIT_RADIX_DEF 2048,  .4096pt
FFT_SPLIT_RADIX_DEF 4096,  .8192pt
FFT_SPLIT_RADIX_DEF 8192,  .16384pt
FFT_SPLIT_RADIX_DEF 16384, .32768pt
FFT_SPLIT_RADIX_DEF 32768, .65536pt
FFT_SPLIT_RADIX_DEF 65536, .131072pt
; No continuation label: the largest size falls straight through into the
; final deinterleave code below
FFT_SPLIT_RADIX_DEF 131072

;===============================================================================
; Final synthesis + deinterleaving code
;===============================================================================
.deinterleave:
    cmp lenq, 64
    je .64pt_deint

    ; tmpq = len*2 and lutq = len*6, the region offsets the macro expects;
    ; the lookup table pointer in lutq is given up here
    imul tmpq, lenq, 2
    lea lutq, [4*lenq + tmpq]

    ; lenq doubles as the loop counter and drops by 4*mmsize per iteration
.synth_deinterleave:
    SPLIT_RADIX_COMBINE_DEINTERLEAVE_FULL tmpq, lutq
    add outq,  8*mmsize
    add rtabq, 4*mmsize
    sub itabq, 4*mmsize
    sub lenq,  4*mmsize
    jg .synth_deinterleave

    RET

; 64-point deinterleave which only has to load 4 registers =====================
.64pt_deint:
    SPLIT_RADIX_COMBINE_LITE 1, m0, m1, tx1_e0, tx2_e0, tw_e, tw_o, tmp1, tmp2
    SPLIT_RADIX_COMBINE_HALF 0, m2, m3, tx1_o0, tx2_o0, tw_e, tw_o, tmp1, tmp2, tw_e

    unpcklpd tmp1, m0, m2
    unpcklpd tmp2, m1, m3
    unpcklpd tw_o, tx1_e0, tx1_o0
    unpcklpd tw_e, tx2_e0, tx2_o0
    unpckhpd m0, m0, m2
    unpckhpd m1, m1, m3
    unpckhpd tx1_e0, tx1_e0, tx1_o0
    unpckhpd tx2_e0, tx2_e0, tx2_o0

    ; Slots 0 and 4 receive the low lanes now; the high lanes belong in slots
    ; 1 and 5, which still hold data stored by the 32-point code
    vextractf128 [outq +  0*mmsize +  0], tmp1,   0
    vextractf128 [outq +  0*mmsize + 16], m0,     0
    vextractf128 [outq +  4*mmsize +  0], tmp2,   0
    vextractf128 [outq +  4*mmsize + 16], m1,     0

    vextractf128 [outq +  8*mmsize +  0], tw_o,   0
    vextractf128 [outq +  8*mmsize + 16], tx1_e0, 0
    vextractf128 [outq +  9*mmsize +  0], tw_o,   1
    vextractf128 [outq +  9*mmsize + 16], tx1_e0, 1

    ; Selector 0x31: low lane = high lane of the first source, high lane =
    ; high lane of the second source
    vperm2f128 tmp1, tmp1, m0, 0x31
    vperm2f128 tmp2, tmp2, m1, 0x31

    vextractf128 [outq + 12*mmsize +  0], tw_e,   0
    vextractf128 [outq + 12*mmsize + 16], tx2_e0, 0
    vextractf128 [outq + 13*mmsize +  0], tw_e,   1
    vextractf128 [outq + 13*mmsize + 16], tx2_e0, 1

    movaps tw_e,           [tab_64_float + mmsize]
    vperm2f128 tw_o, tw_o, [tab_64_float + 64 - 4*7 - mmsize], 0x23

    ; Fetch the four odd slots written by the 32-point code before two of
    ; them are overwritten with the pending high lanes
    movaps m0, [outq +  1*mmsize]
    movaps m1, [outq +  3*mmsize]
    movaps m2, [outq +  5*mmsize]
    movaps m3, [outq +  7*mmsize]

    movaps [outq +  1*mmsize], tmp1
    movaps [outq +  5*mmsize], tmp2

    SPLIT_RADIX_COMBINE 0, m0, m2, m1, m3, tx1_e1, tx2_e1, tx1_o1, tx2_o1, tw_e, tw_o, \
                           tmp1, tmp2, tx2_o0, tx1_o0, tx2_e0, tx1_e0 ; temporary registers

    unpcklpd tmp1, m0, m1
    unpcklpd tmp2, m2, m3
    unpcklpd tw_e, tx1_e1, tx1_o1
    unpcklpd tw_o, tx2_e1, tx2_o1
    unpckhpd m0, m0, m1
    unpckhpd m2, m2, m3
    unpckhpd tx1_e1, tx1_e1, tx1_o1
    unpckhpd tx2_e1, tx2_e1, tx2_o1

    vextractf128 [outq +  2*mmsize +  0], tmp1,   0
    vextractf128 [outq +  2*mmsize + 16], m0,     0
    vextractf128 [outq +  3*mmsize +  0], tmp1,   1
    vextractf128 [outq +  3*mmsize + 16], m0,     1

    vextractf128 [outq +  6*mmsize +  0], tmp2,   0
    vextractf128 [outq +  6*mmsize + 16], m2,     0
    vextractf128 [outq +  7*mmsize +  0], tmp2,   1
    vextractf128 [outq +  7*mmsize + 16], m2,     1

    vextractf128 [outq + 10*mmsize +  0], tw_e,   0
    vextractf128 [outq + 10*mmsize + 16], tx1_e1, 0
    vextractf128 [outq + 11*mmsize +  0], tw_e,   1
    vextractf128 [outq + 11*mmsize + 16], tx1_e1, 1

    vextractf128 [outq + 14*mmsize +  0], tw_o,   0
    vextractf128 [outq + 14*mmsize + 16], tx2_e1, 0
    vextractf128 [outq + 15*mmsize +  0], tw_o,   1
    vextractf128 [outq + 15*mmsize + 16], tx2_e1, 1

    RET
%endmacro

; The split-radix functions use m8-m15 and eight general purpose registers, so
; they are only built for x86-64
%if ARCH_X86_64
FFT_SPLIT_RADIX_FN fma3, float,    0
FFT_SPLIT_RADIX_FN fma3, ns_float, 1
%if HAVE_AVX2_EXTERNAL
FFT_SPLIT_RADIX_FN avx2, float,    0
FFT_SPLIT_RADIX_FN avx2, ns_float, 1
%endif
%endif