1cabdff1aSopenharmony_ci;******************************************************************************
2cabdff1aSopenharmony_ci;* Copyright (c) Lynne
3cabdff1aSopenharmony_ci;*
4cabdff1aSopenharmony_ci;* This file is part of FFmpeg.
5cabdff1aSopenharmony_ci;*
6cabdff1aSopenharmony_ci;* FFmpeg is free software; you can redistribute it and/or
7cabdff1aSopenharmony_ci;* modify it under the terms of the GNU Lesser General Public
8cabdff1aSopenharmony_ci;* License as published by the Free Software Foundation; either
9cabdff1aSopenharmony_ci;* version 2.1 of the License, or (at your option) any later version.
10cabdff1aSopenharmony_ci;*
11cabdff1aSopenharmony_ci;* FFmpeg is distributed in the hope that it will be useful,
12cabdff1aSopenharmony_ci;* but WITHOUT ANY WARRANTY; without even the implied warranty of
13cabdff1aSopenharmony_ci;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14cabdff1aSopenharmony_ci;* Lesser General Public License for more details.
15cabdff1aSopenharmony_ci;*
16cabdff1aSopenharmony_ci;* You should have received a copy of the GNU Lesser General Public
17cabdff1aSopenharmony_ci;* License along with FFmpeg; if not, write to the Free Software
18cabdff1aSopenharmony_ci;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19cabdff1aSopenharmony_ci;******************************************************************************
20cabdff1aSopenharmony_ci
21cabdff1aSopenharmony_ci; Open `doc/transforms.md` to see the code on which the transforms here are
22cabdff1aSopenharmony_ci; based, and compare against it.
23cabdff1aSopenharmony_ci
24cabdff1aSopenharmony_ci; TODO:
25cabdff1aSopenharmony_ci;       carry over registers from smaller transforms to save on ~8 loads/stores
26cabdff1aSopenharmony_ci;       check if vinsertf128 could be faster than vperm2f128 for duplication
27cabdff1aSopenharmony_ci;       even faster FFT8 (current one is very #instructions optimized)
28cabdff1aSopenharmony_ci;       replace some xors with blends + addsubs?
29cabdff1aSopenharmony_ci;       replace some shuffles with vblends?
30cabdff1aSopenharmony_ci;       avx512 split-radix
31cabdff1aSopenharmony_ci
32cabdff1aSopenharmony_ci%include "libavutil/x86/x86util.asm"
33cabdff1aSopenharmony_ci
; Symbol prefix for this file; presumably consumed by the cglobal/cextern
; helpers pulled in through x86util.asm -- confirm against x86inc.
34cabdff1aSopenharmony_ci%define private_prefix ff_tx
35cabdff1aSopenharmony_ci
; `ptr` reserves pointer-sized struc fields: resq (8-byte units) when
; ARCH_X86_64 is nonzero, resd (4-byte units) otherwise.
36cabdff1aSopenharmony_ci%if ARCH_X86_64
37cabdff1aSopenharmony_ci%define ptr resq
38cabdff1aSopenharmony_ci%else
39cabdff1aSopenharmony_ci%define ptr resd
40cabdff1aSopenharmony_ci%endif
41cabdff1aSopenharmony_ci
; Declare the external float tables tab_16_float, tab_32_float, ...,
; tab_131072_float: 14 iterations, with i doubling from 16 each time.
; The assembler variable i is left at 262144 once the loop finishes.
; NOTE(review): the trailing comment below says ff_tab_i_float, but with
; private_prefix = ff_tx the final symbol is presumably ff_tx_tab_<i>_float
; -- confirm against the cextern definition in x86inc.
42cabdff1aSopenharmony_ci%assign i 16
43cabdff1aSopenharmony_ci%rep 14
44cabdff1aSopenharmony_cicextern tab_ %+ i %+ _float ; ff_tab_i_float...
45cabdff1aSopenharmony_ci%assign i (i << 1)
46cabdff1aSopenharmony_ci%endrep
47cabdff1aSopenharmony_ci
; Assembly-side layout of the leading fields of AVTXContext.
; Presumably mirrors the C structure of the same name: field order and sizes
; must match it exactly -- verify against the C definition when either changes.
; `ptr` is the pointer-sized reservation defined earlier in this file
; (resq on x86-64, resd otherwise).
48cabdff1aSopenharmony_cistruc AVTXContext
49cabdff1aSopenharmony_ci    .len:          resd 1 ; Length
50cabdff1aSopenharmony_ci    .inv:          resd 1 ; Inverse flag
51cabdff1aSopenharmony_ci    .map:           ptr 1 ; Lookup table(s)
52cabdff1aSopenharmony_ci    .exp:           ptr 1 ; Exponentiation factors
53cabdff1aSopenharmony_ci    .tmp:           ptr 1 ; Temporary data
54cabdff1aSopenharmony_ci
55cabdff1aSopenharmony_ci    .sub:           ptr 1 ; Subcontexts
56cabdff1aSopenharmony_ci    .fn:            ptr 4 ; Subcontext functions (room for 4 pointers)
57cabdff1aSopenharmony_ci    .nb_sub:       resd 1 ; Subcontext count
58cabdff1aSopenharmony_ci
59cabdff1aSopenharmony_ci    ; Everything else is inaccessible
60cabdff1aSopenharmony_ciendstruc
61cabdff1aSopenharmony_ci
; Read-only constants used by the FFT macros below.
; Every table here is exactly 8 dwords (32 bytes), so as long as the section
; start is 32-byte aligned each label stays 32-byte aligned as well
; (SECTION_RODATA 32 presumably requests that alignment -- see x86inc).
62cabdff1aSopenharmony_ciSECTION_RODATA 32
63cabdff1aSopenharmony_ci
; Per-lane sign masks for single-precision floats: XORing a lane with NEG
; flips its sign bit, XORing with POS leaves it unchanged.
64cabdff1aSopenharmony_ci%define POS 0x00000000
65cabdff1aSopenharmony_ci%define NEG 0x80000000
66cabdff1aSopenharmony_ci
; M_SQRT1_2 = 1/sqrt(2); COS16_1 = cos(pi/8) and COS16_3 = cos(3*pi/8),
; both written out as their single-precision roundings.
67cabdff1aSopenharmony_ci%define M_SQRT1_2 0.707106781186547524401
68cabdff1aSopenharmony_ci%define COS16_1   0.92387950420379638671875
69cabdff1aSopenharmony_ci%define COS16_3   0.3826834261417388916015625
70cabdff1aSopenharmony_ci
; Multiplier applied to the j1-8 terms in FFT8.
71cabdff1aSopenharmony_cid8_mult_odd:   dd M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, \
72cabdff1aSopenharmony_ci                  M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2
73cabdff1aSopenharmony_ci
; FFT8_AVX constants. The s8_perm_* tables are vpermilps control vectors:
; each dword selects a source element within its own 128-bit lane.
74cabdff1aSopenharmony_cis8_mult_odd:   dd 1.0, 1.0, -1.0, 1.0, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
75cabdff1aSopenharmony_cis8_perm_even:  dd 1, 3, 0, 2, 1, 3, 2, 0
76cabdff1aSopenharmony_cis8_perm_odd1:  dd 3, 3, 1, 1, 1, 1, 3, 3
77cabdff1aSopenharmony_cis8_perm_odd2:  dd 1, 2, 0, 3, 1, 0, 0, 1
78cabdff1aSopenharmony_ci
; FFT16 constants: twiddle multipliers and a vpermilps control vector.
79cabdff1aSopenharmony_cis16_mult_even: dd 1.0, 1.0, M_SQRT1_2, M_SQRT1_2, 1.0, -1.0, M_SQRT1_2, -M_SQRT1_2
80cabdff1aSopenharmony_cis16_mult_odd1: dd COS16_1,  COS16_1,  COS16_3,  COS16_3,  COS16_1, -COS16_1,  COS16_3, -COS16_3
81cabdff1aSopenharmony_cis16_mult_odd2: dd COS16_3, -COS16_3,  COS16_1, -COS16_1, -COS16_3, -COS16_3, -COS16_1, -COS16_1
82cabdff1aSopenharmony_cis16_perm:      dd 0, 1, 2, 3, 1, 0, 3, 2
83cabdff1aSopenharmony_ci
; Sign masks. The letters after mask_ spell the sign of lanes 0..7 in order
; (p = POS / keep, m = NEG / negate).
84cabdff1aSopenharmony_cimask_mmmmpppm: dd NEG, NEG, NEG, NEG, POS, POS, POS, NEG
85cabdff1aSopenharmony_cimask_ppmpmmpm: dd POS, POS, NEG, POS, NEG, NEG, POS, NEG
86cabdff1aSopenharmony_cimask_mppmmpmp: dd NEG, POS, POS, NEG, NEG, POS, NEG, POS
87cabdff1aSopenharmony_cimask_mpmppmpm: dd NEG, POS, NEG, POS, POS, NEG, POS, NEG
88cabdff1aSopenharmony_cimask_pmmppmmp: dd POS, NEG, NEG, POS, POS, NEG, NEG, POS
89cabdff1aSopenharmony_cimask_pmpmpmpm: times 4 dd POS, NEG
90cabdff1aSopenharmony_ci
91cabdff1aSopenharmony_ciSECTION .text
92cabdff1aSopenharmony_ci
93cabdff1aSopenharmony_ci; Load complex values (64 bits) via a lookup table
; Element n of the output is the 8-byte value at %2 + lut[n]*8, where lut is
; the int32_t index table at %3 + %4. The scalar path reads 2 indices, or 4
; when mmsize == 32; the gather path fills %1 from the indices held in xmm%7.
94cabdff1aSopenharmony_ci; %1 - output register
95cabdff1aSopenharmony_ci; %2 - GPR of base input memory address
96cabdff1aSopenharmony_ci; %3 - GPR of LUT (int32_t indices) address
97cabdff1aSopenharmony_ci; %4 - LUT offset
98cabdff1aSopenharmony_ci; %5 - temporary GPR (only used if vgather is not used)
99cabdff1aSopenharmony_ci; %6 - temporary register (gather mask on the vgatherdpd path; upper-half staging when mmsize == 32 otherwise)
100cabdff1aSopenharmony_ci; %7 - temporary register (optional; supplying it enables vgatherdpd when the avx2 cpuflag is set; receives the LUT indices)
; NOTE(review): movapd is an aligned load, so [%3 + %4] must be 16-byte
; aligned on the gather path -- confirm every caller guarantees this.
101cabdff1aSopenharmony_ci%macro LOAD64_LUT 5-7
102cabdff1aSopenharmony_ci%if %0 > 6 && cpuflag(avx2)
103cabdff1aSopenharmony_ci    pcmpeqd %6, %6 ; all-ones gather mask; pcmpeqq has a 0.5 throughput on Zen 3, this has 0.25
104cabdff1aSopenharmony_ci    movapd xmm%7, [%3 + %4] ; float mov since vgatherdpd is a float instruction
105cabdff1aSopenharmony_ci    vgatherdpd %1, [%2 + xmm%7*8], %6 ; must use separate registers for args
106cabdff1aSopenharmony_ci%else
107cabdff1aSopenharmony_ci    mov      %5d, [%3 + %4 + 0]             ; index of element 0
108cabdff1aSopenharmony_ci    movsd  xmm%1, [%2 + %5q*8]              ; element 0 -> low 64 bits of %1
109cabdff1aSopenharmony_ci%if mmsize == 32
110cabdff1aSopenharmony_ci    mov      %5d, [%3 + %4 + 8]             ; index of element 2
111cabdff1aSopenharmony_ci    movsd  xmm%6, [%2 + %5q*8]              ; element 2 -> low 64 bits of %6
112cabdff1aSopenharmony_ci%endif
113cabdff1aSopenharmony_ci    mov      %5d, [%3 + %4 + 4]             ; index of element 1
114cabdff1aSopenharmony_ci    movhps xmm%1, [%2 + %5q*8]              ; element 1 -> high 64 bits of xmm%1
115cabdff1aSopenharmony_ci%if mmsize == 32
116cabdff1aSopenharmony_ci    mov      %5d, [%3 + %4 + 12]            ; index of element 3
117cabdff1aSopenharmony_ci    movhps xmm%6, [%2 + %5q*8]              ; element 3 -> high 64 bits of xmm%6
118cabdff1aSopenharmony_ci    vinsertf128 %1, %1, xmm%6, 1            ; elements 2,3 -> upper 128 bits of %1
119cabdff1aSopenharmony_ci%endif
120cabdff1aSopenharmony_ci%endif
121cabdff1aSopenharmony_ci%endmacro
122cabdff1aSopenharmony_ci
123cabdff1aSopenharmony_ci; Single 2-point in-place complex FFT (will do 2 transforms at once in AVX mode)
124cabdff1aSopenharmony_ci; %1 - coefficients (r0.reim, r1.reim)
125cabdff1aSopenharmony_ci; %2 - temporary
; On exit %1 holds (r0 + r1).reim, (r0 - r1).reim per 128-bit lane; %2 is
; clobbered. Lane notes below assume the qDCBA shuffle constants list the
; source elements from the highest destination element down to the lowest,
; which matches the annotations used in the other macros of this file.
126cabdff1aSopenharmony_ci%macro FFT2 2
127cabdff1aSopenharmony_ci    shufps   %2, %1, %1, q3322          ; r1.re, r1.re, r1.im, r1.im
128cabdff1aSopenharmony_ci    shufps   %1, %1, %1, q1100          ; r0.re, r0.re, r0.im, r0.im
129cabdff1aSopenharmony_ci
130cabdff1aSopenharmony_ci    addsubps %1, %1, %2                 ; re0-re1, re0+re1, im0-im1, im0+im1
131cabdff1aSopenharmony_ci
132cabdff1aSopenharmony_ci    shufps   %1, %1, %1, q2031          ; sum.re, sum.im, diff.re, diff.im
133cabdff1aSopenharmony_ci%endmacro
134cabdff1aSopenharmony_ci
135cabdff1aSopenharmony_ci; Single 4-point in-place complex FFT (will do 2 transforms at once in [AVX] mode)
136cabdff1aSopenharmony_ci; %1 - even coefficients (r0.reim, r2.reim, r4.reim, r6.reim)
137cabdff1aSopenharmony_ci; %2 - odd coefficients  (r1.reim, r3.reim, r5.reim, r7.reim)
138cabdff1aSopenharmony_ci; %3 - temporary
; On exit %1 holds the even outputs (a1234) and %2 the odd outputs (b1234);
; %3 is clobbered. No memory operands are used.
139cabdff1aSopenharmony_ci%macro FFT4 3
140cabdff1aSopenharmony_ci    subps  %3, %1, %2         ;  r1234, [r5678]
141cabdff1aSopenharmony_ci    addps  %1, %1, %2         ;  t1234, [t5678]
142cabdff1aSopenharmony_ci
143cabdff1aSopenharmony_ci    shufps %2, %1, %3, q1010  ;  t12, r12
144cabdff1aSopenharmony_ci    shufps %1, %1, %3, q2332  ;  t34, r43
145cabdff1aSopenharmony_ci
146cabdff1aSopenharmony_ci    subps  %3, %2, %1         ;  a34, b32
147cabdff1aSopenharmony_ci    addps  %2, %2, %1         ;  a12, b14
148cabdff1aSopenharmony_ci
149cabdff1aSopenharmony_ci    shufps %1, %2, %3, q1010  ;  a1234     even
150cabdff1aSopenharmony_ci
151cabdff1aSopenharmony_ci    shufps %2, %2, %3, q2332  ;  b1423
152cabdff1aSopenharmony_ci    shufps %2, %2, %2, q1320  ;  b1234     odd (reorder b1423 -> b1234)
153cabdff1aSopenharmony_ci%endmacro
154cabdff1aSopenharmony_ci
155cabdff1aSopenharmony_ci; Single/Dual 8-point in-place complex FFT (will do 2 transforms in [AVX] mode)
156cabdff1aSopenharmony_ci; %1 - even coefficients (a0.reim, a2.reim, [b0.reim, b2.reim])
157cabdff1aSopenharmony_ci; %2 - even coefficients (a4.reim, a6.reim, [b4.reim, b6.reim])
158cabdff1aSopenharmony_ci; %3 - odd coefficients  (a1.reim, a3.reim, [b1.reim, b3.reim])
159cabdff1aSopenharmony_ci; %4 - odd coefficients  (a5.reim, a7.reim, [b5.reim, b7.reim])
160cabdff1aSopenharmony_ci; %5 - temporary
161cabdff1aSopenharmony_ci; %6 - temporary
; On exit: %1 = w1234 (even), %2 = h1234 (even), %3 = o1234 (odd),
; %4 = u1234 (odd); %5 and %6 are clobbered.
; Reads mask_pmmppmmp and d8_mult_odd as memory operands.
162cabdff1aSopenharmony_ci%macro FFT8 6
163cabdff1aSopenharmony_ci    addps    %5, %1, %3               ; q1-8
164cabdff1aSopenharmony_ci    addps    %6, %2, %4               ; k1-8
165cabdff1aSopenharmony_ci
166cabdff1aSopenharmony_ci    subps    %1, %1, %3               ; r1-8
167cabdff1aSopenharmony_ci    subps    %2, %2, %4               ; j1-8
168cabdff1aSopenharmony_ci
169cabdff1aSopenharmony_ci    shufps   %4, %1, %1, q2323        ; r4343
170cabdff1aSopenharmony_ci    shufps   %3, %5, %6, q3032        ; q34, k14
171cabdff1aSopenharmony_ci
172cabdff1aSopenharmony_ci    shufps   %1, %1, %1, q1010        ; r1212
173cabdff1aSopenharmony_ci    shufps   %5, %5, %6, q1210        ; q12, k32
174cabdff1aSopenharmony_ci
175cabdff1aSopenharmony_ci    xorps    %4, %4, [mask_pmmppmmp]  ; r4343 * pmmp
176cabdff1aSopenharmony_ci    addps    %6, %5, %3               ; s12, g12
177cabdff1aSopenharmony_ci
178cabdff1aSopenharmony_ci    mulps    %2, %2, [d8_mult_odd]    ; j1-8 * d8_mult_odd
179cabdff1aSopenharmony_ci    subps    %5, %5, %3               ; s34, g43
180cabdff1aSopenharmony_ci
181cabdff1aSopenharmony_ci    addps    %3, %1, %4               ; z1234
182cabdff1aSopenharmony_ci    unpcklpd %1, %6, %5               ; s1234
183cabdff1aSopenharmony_ci
184cabdff1aSopenharmony_ci    shufps   %4, %2, %2, q2301        ; j2143
185cabdff1aSopenharmony_ci    shufps   %6, %6, %5, q2332        ; g1234
186cabdff1aSopenharmony_ci
187cabdff1aSopenharmony_ci    addsubps %2, %2, %4               ; l2143
188cabdff1aSopenharmony_ci    shufps   %5, %2, %2, q0123        ; l3412
189cabdff1aSopenharmony_ci    addsubps %5, %5, %2               ; t1234
190cabdff1aSopenharmony_ci
191cabdff1aSopenharmony_ci    subps    %2, %1, %6               ; h1234 even
192cabdff1aSopenharmony_ci    subps    %4, %3, %5               ; u1234 odd
193cabdff1aSopenharmony_ci
194cabdff1aSopenharmony_ci    addps    %1, %1, %6               ; w1234 even
195cabdff1aSopenharmony_ci    addps    %3, %3, %5               ; o1234 odd
196cabdff1aSopenharmony_ci%endmacro
197cabdff1aSopenharmony_ci
198cabdff1aSopenharmony_ci; Single 8-point in-place complex FFT in 20 instructions
199cabdff1aSopenharmony_ci; %1 - even coefficients (r0.reim, r2.reim, r4.reim, r6.reim)
200cabdff1aSopenharmony_ci; %2 - odd coefficients  (r1.reim, r3.reim, r5.reim, r7.reim)
201cabdff1aSopenharmony_ci; %3 - temporary
202cabdff1aSopenharmony_ci; %4 - temporary
; On exit: %1 = even outputs (w1234, w5678), %2 = odd outputs (u1234, u5678);
; %3 and %4 are clobbered.
; Needs 256-bit registers: it shuffles across 128-bit halves with vperm2f128.
; Reads s8_perm_odd1, s8_perm_even, s8_perm_odd2, s8_mult_odd, mask_mmmmpppm
; and mask_ppmpmmpm as memory operands.
203cabdff1aSopenharmony_ci%macro FFT8_AVX 4
204cabdff1aSopenharmony_ci    subps      %3, %1, %2               ;  r1234, r5678
205cabdff1aSopenharmony_ci    addps      %1, %1, %2               ;  q1234, q5678
206cabdff1aSopenharmony_ci
207cabdff1aSopenharmony_ci    vpermilps  %2, %3, [s8_perm_odd1]   ;  r4422, r6688
208cabdff1aSopenharmony_ci    shufps     %4, %1, %1, q3322        ;  q1122, q5566
209cabdff1aSopenharmony_ci
210cabdff1aSopenharmony_ci    movsldup   %3, %3                   ;  r1133, r5577
211cabdff1aSopenharmony_ci    shufps     %1, %1, %1, q1100        ;  q3344, q7788
212cabdff1aSopenharmony_ci
213cabdff1aSopenharmony_ci    addsubps   %3, %3, %2               ;  z1234, z5678
214cabdff1aSopenharmony_ci    addsubps   %1, %1, %4               ;  s3142, s7586
215cabdff1aSopenharmony_ci
216cabdff1aSopenharmony_ci    mulps      %3, %3, [s8_mult_odd]    ;  z * s8_mult_odd
217cabdff1aSopenharmony_ci    vpermilps  %1, %1, [s8_perm_even]   ;  s1234, s5687 !
218cabdff1aSopenharmony_ci
219cabdff1aSopenharmony_ci    shufps     %2, %3, %3, q2332        ;   junk, z7887
220cabdff1aSopenharmony_ci    xorps      %4, %1, [mask_mmmmpppm]  ;  e1234, e5687 !
221cabdff1aSopenharmony_ci
222cabdff1aSopenharmony_ci    vpermilps  %3, %3, [s8_perm_odd2]   ;  z2314, z6556
223cabdff1aSopenharmony_ci    vperm2f128 %1, %1, %4, 0x03         ;  e5687, s1234 (low = high half of %4, high = low half of %1)
224cabdff1aSopenharmony_ci
225cabdff1aSopenharmony_ci    addsubps   %2, %2, %3               ;   junk, t5678
226cabdff1aSopenharmony_ci    subps      %1, %1, %4               ;  w1234, w5678 even
227cabdff1aSopenharmony_ci
228cabdff1aSopenharmony_ci    vperm2f128 %2, %2, %2, 0x11         ;  t5678, t5678
229cabdff1aSopenharmony_ci    vperm2f128 %3, %3, %3, 0x00         ;  z2314, z2314
230cabdff1aSopenharmony_ci
231cabdff1aSopenharmony_ci    xorps      %2, %2, [mask_ppmpmmpm]  ;  t * ppmpmmpm
232cabdff1aSopenharmony_ci    addps      %2, %3, %2               ;  u1234, u5678 odd
233cabdff1aSopenharmony_ci%endmacro
234cabdff1aSopenharmony_ci
235cabdff1aSopenharmony_ci; Single 16-point in-place complex FFT
236cabdff1aSopenharmony_ci; %1 - even coefficients (r0.reim, r2.reim,  r4.reim,  r6.reim)
237cabdff1aSopenharmony_ci; %2 - even coefficients (r8.reim, r10.reim, r12.reim, r14.reim)
238cabdff1aSopenharmony_ci; %3 - odd coefficients  (r1.reim, r3.reim,  r5.reim,  r7.reim)
239cabdff1aSopenharmony_ci; %4 - odd coefficients  (r9.reim, r11.reim, r13.reim, r15.reim)
240cabdff1aSopenharmony_ci; %5, %6 - temporary
241cabdff1aSopenharmony_ci; %7, %8 - temporary (optional)
; Runs FFT4 on %3/%4 and FFT8_AVX on %1/%2, then combines the two results.
; On exit: %1 = even part 1, %2 = even part 2, %3 = odd part 1,
; %4 = odd part 2; every temporary that was passed is clobbered.
; Arity selects where the constants live:
;   8 args - %7 caches s16_perm and %8 caches mask_mpmppmpm in registers
;   7 args - %7 caches s16_perm, the mask stays a memory operand
;   6 args - both stay memory operands; %5 doubles as FFT8_AVX's second temp
; `mask` and `perm` are defined as preprocessor names for the duration of the
; macro and undefined at its end, so callers must not rely on those names.
242cabdff1aSopenharmony_ci%macro FFT16 6-8
243cabdff1aSopenharmony_ci    FFT4       %3, %4, %5
244cabdff1aSopenharmony_ci%if %0 > 7
245cabdff1aSopenharmony_ci    FFT8_AVX   %1, %2, %6, %7
246cabdff1aSopenharmony_ci    movaps     %8, [mask_mpmppmpm]
247cabdff1aSopenharmony_ci    movaps     %7, [s16_perm]
248cabdff1aSopenharmony_ci%define mask %8
249cabdff1aSopenharmony_ci%define perm %7
250cabdff1aSopenharmony_ci%elif %0 > 6
251cabdff1aSopenharmony_ci    FFT8_AVX   %1, %2, %6, %7
252cabdff1aSopenharmony_ci    movaps     %7, [s16_perm]
253cabdff1aSopenharmony_ci%define mask [mask_mpmppmpm]
254cabdff1aSopenharmony_ci%define perm %7
255cabdff1aSopenharmony_ci%else
256cabdff1aSopenharmony_ci    FFT8_AVX   %1, %2, %6, %5
257cabdff1aSopenharmony_ci%define mask [mask_mpmppmpm]
258cabdff1aSopenharmony_ci%define perm [s16_perm]
259cabdff1aSopenharmony_ci%endif
260cabdff1aSopenharmony_ci    xorps      %5, %5, %5                   ; 0
261cabdff1aSopenharmony_ci
262cabdff1aSopenharmony_ci    shufps     %6, %4, %4, q2301            ; z12.imre, z13.imre...
263cabdff1aSopenharmony_ci    shufps     %5, %5, %3, q2301            ; 0, 0, z8.imre...
264cabdff1aSopenharmony_ci
265cabdff1aSopenharmony_ci    mulps      %4, %4, [s16_mult_odd1]      ; z.reim * costab
266cabdff1aSopenharmony_ci    xorps      %5, %5, [mask_mppmmpmp]
267cabdff1aSopenharmony_ci%if cpuflag(fma3)
268cabdff1aSopenharmony_ci    fmaddps    %6, %6, [s16_mult_odd2], %4  ; s[8..15]
269cabdff1aSopenharmony_ci    addps      %5, %3, %5                   ; s[0...7]
270cabdff1aSopenharmony_ci%else
271cabdff1aSopenharmony_ci    mulps      %6, %6, [s16_mult_odd2]      ; z.imre * costab
272cabdff1aSopenharmony_ci
273cabdff1aSopenharmony_ci    addps      %5, %3, %5                   ; s[0...7]
274cabdff1aSopenharmony_ci    addps      %6, %4, %6                   ; s[8..15]
275cabdff1aSopenharmony_ci%endif
276cabdff1aSopenharmony_ci    mulps      %5, %5, [s16_mult_even]      ; s[0...7]*costab
277cabdff1aSopenharmony_ci
278cabdff1aSopenharmony_ci    xorps      %4, %6, mask                 ; s[8..15]*mpmppmpm
279cabdff1aSopenharmony_ci    xorps      %3, %5, mask                 ; s[0...7]*mpmppmpm
280cabdff1aSopenharmony_ci
281cabdff1aSopenharmony_ci    vperm2f128 %4, %4, %4, 0x01             ; s[12..15, 8..11]
282cabdff1aSopenharmony_ci    vperm2f128 %3, %3, %3, 0x01             ; s[4..7, 0..3]
283cabdff1aSopenharmony_ci
284cabdff1aSopenharmony_ci    addps      %6, %6, %4                   ; y56, u56, y34, u34
285cabdff1aSopenharmony_ci    addps      %5, %5, %3                   ; w56, x56, w34, x34
286cabdff1aSopenharmony_ci
287cabdff1aSopenharmony_ci    vpermilps  %6, %6, perm                 ; y56, u56, y43, u43
288cabdff1aSopenharmony_ci    vpermilps  %5, %5, perm                 ; w56, x56, w43, x43
289cabdff1aSopenharmony_ci
290cabdff1aSopenharmony_ci    subps      %4, %2, %6                   ; odd  part 2
291cabdff1aSopenharmony_ci    addps      %3, %2, %6                   ; odd  part 1
292cabdff1aSopenharmony_ci
293cabdff1aSopenharmony_ci    subps      %2, %1, %5                   ; even part 2
294cabdff1aSopenharmony_ci    addps      %1, %1, %5                   ; even part 1
295cabdff1aSopenharmony_ci%undef mask
296cabdff1aSopenharmony_ci%undef perm
297cabdff1aSopenharmony_ci%endmacro
298cabdff1aSopenharmony_ci
299cabdff1aSopenharmony_ci; Combines m0...m8 (tx1[even, even, odd, odd], tx2,3[even], tx2,3[odd]) coeffs
300cabdff1aSopenharmony_ci; Uses all 16 registers (operands %2 through %17).
301cabdff1aSopenharmony_ci; Output is slightly permuted such that tx2,3's coefficients are interleaved
302cabdff1aSopenharmony_ci; on a 2-point basis (look at `doc/transforms.md`)
; %1       - when nonzero and mmsize == 32, first regroup %6..%9 with
;            vperm2f128 so the m2 and m3 halves are paired before combining
; %2,  %3  - tx1 even (m0, m1), updated in place
; %4,  %5  - tx1 odd  (m0, m1), updated in place
; %6,  %7  - tx2,3 even, replaced by the combined m2,3[01] / m2,3[23] even
; %8,  %9  - tx2,3 odd,  replaced by the combined m2,3[01] / m2,3[23] odd
; %10      - cos twiddles (overwritten)
; %11      - wim twiddles (overwritten)
; %12..%17 - temporaries; %13 holds mask_pmpmpmpm on exit
303cabdff1aSopenharmony_ci%macro SPLIT_RADIX_COMBINE 17
304cabdff1aSopenharmony_ci%if %1 && mmsize == 32
305cabdff1aSopenharmony_ci    vperm2f128 %14, %6, %7, 0x20     ; m2[0], m2[1], m3[0], m3[1] even
306cabdff1aSopenharmony_ci    vperm2f128 %16, %9, %8, 0x20     ; m2[0], m2[1], m3[0], m3[1] odd
307cabdff1aSopenharmony_ci    vperm2f128 %15, %6, %7, 0x31     ; m2[2], m2[3], m3[2], m3[3] even
308cabdff1aSopenharmony_ci    vperm2f128 %17, %9, %8, 0x31     ; m2[2], m2[3], m3[2], m3[3] odd
309cabdff1aSopenharmony_ci%endif
310cabdff1aSopenharmony_ci
311cabdff1aSopenharmony_ci    shufps     %12, %10, %10, q2200  ; cos00224466
312cabdff1aSopenharmony_ci    shufps     %13, %11, %11, q1133  ; wim77553311
313cabdff1aSopenharmony_ci    movshdup   %10, %10              ; cos11335577
314cabdff1aSopenharmony_ci    shufps     %11, %11, %11, q0022  ; wim66442200
315cabdff1aSopenharmony_ci
316cabdff1aSopenharmony_ci%if %1 && mmsize == 32
317cabdff1aSopenharmony_ci    shufps     %6, %14, %14, q2301   ; m2[0].imre, m2[1].imre, m2[2].imre, m2[3].imre even
318cabdff1aSopenharmony_ci    shufps     %8, %16, %16, q2301   ; m2[0].imre, m2[1].imre, m2[2].imre, m2[3].imre odd
319cabdff1aSopenharmony_ci    shufps     %7, %15, %15, q2301   ; m3[0].imre, m3[1].imre, m3[2].imre, m3[3].imre even
320cabdff1aSopenharmony_ci    shufps     %9, %17, %17, q2301   ; m3[0].imre, m3[1].imre, m3[2].imre, m3[3].imre odd
321cabdff1aSopenharmony_ci
322cabdff1aSopenharmony_ci    mulps      %14, %14, %13         ; m2[0123]reim * wim7531 even
323cabdff1aSopenharmony_ci    mulps      %16, %16, %11         ; m2[0123]reim * wim7531 odd
324cabdff1aSopenharmony_ci    mulps      %15, %15, %13         ; m3[0123]reim * wim7531 even
325cabdff1aSopenharmony_ci    mulps      %17, %17, %11         ; m3[0123]reim * wim7531 odd
326cabdff1aSopenharmony_ci%else
327cabdff1aSopenharmony_ci    mulps      %14, %6, %13          ; m2,3[01]reim * wim7531 even
328cabdff1aSopenharmony_ci    mulps      %16, %8, %11          ; m2,3[01]reim * wim7531 odd
329cabdff1aSopenharmony_ci    mulps      %15, %7, %13          ; m2,3[23]reim * wim7531 even
330cabdff1aSopenharmony_ci    mulps      %17, %9, %11          ; m2,3[23]reim * wim7531 odd
331cabdff1aSopenharmony_ci    ; reorder the multiplies to save movs reg, reg in the %if above
332cabdff1aSopenharmony_ci    shufps     %6, %6, %6, q2301     ; m2[0].imre, m2[1].imre, m3[0].imre, m3[1].imre even
333cabdff1aSopenharmony_ci    shufps     %8, %8, %8, q2301     ; m2[0].imre, m2[1].imre, m3[0].imre, m3[1].imre odd
334cabdff1aSopenharmony_ci    shufps     %7, %7, %7, q2301     ; m2[2].imre, m2[3].imre, m3[2].imre, m3[3].imre even
335cabdff1aSopenharmony_ci    shufps     %9, %9, %9, q2301     ; m2[2].imre, m2[3].imre, m3[2].imre, m3[3].imre odd
336cabdff1aSopenharmony_ci%endif
337cabdff1aSopenharmony_ci
338cabdff1aSopenharmony_ci%if cpuflag(fma3) ; 11 - 5 = 6 instructions saved through FMA!
339cabdff1aSopenharmony_ci    fmaddsubps %6, %6, %12, %14      ; w[0..8] even
340cabdff1aSopenharmony_ci    fmaddsubps %8, %8, %10, %16      ; w[0..8] odd
341cabdff1aSopenharmony_ci    fmsubaddps %7, %7, %12, %15      ; j[0..8] even
342cabdff1aSopenharmony_ci    fmsubaddps %9, %9, %10, %17      ; j[0..8] odd
343cabdff1aSopenharmony_ci    movaps     %13, [mask_pmpmpmpm]  ; "subaddps? pfft, who needs that!"
344cabdff1aSopenharmony_ci%else
345cabdff1aSopenharmony_ci    mulps      %6, %6, %12           ; m2,3[01]imre * cos0246
346cabdff1aSopenharmony_ci    mulps      %8, %8, %10           ; m2,3[01]imre * cos0246
347cabdff1aSopenharmony_ci    movaps     %13, [mask_pmpmpmpm]  ; "subaddps? pfft, who needs that!"
348cabdff1aSopenharmony_ci    mulps      %7, %7, %12           ; m2,3[23]reim * cos0246
349cabdff1aSopenharmony_ci    mulps      %9, %9, %10           ; m2,3[23]reim * cos0246
350cabdff1aSopenharmony_ci    addsubps   %6, %6, %14           ; w[0..8]
351cabdff1aSopenharmony_ci    addsubps   %8, %8, %16           ; w[0..8]
352cabdff1aSopenharmony_ci    xorps      %15, %15, %13         ; +-m2,3[23]imre * wim7531
353cabdff1aSopenharmony_ci    xorps      %17, %17, %13         ; +-m2,3[23]imre * wim7531
354cabdff1aSopenharmony_ci    addps      %7, %7, %15           ; j[0..8]
355cabdff1aSopenharmony_ci    addps      %9, %9, %17           ; j[0..8]
356cabdff1aSopenharmony_ci%endif
357cabdff1aSopenharmony_ci
358cabdff1aSopenharmony_ci    addps      %14, %6, %7           ; t10235476 even
359cabdff1aSopenharmony_ci    addps      %16, %8, %9           ; t10235476 odd
360cabdff1aSopenharmony_ci    subps      %15, %6, %7           ; +-r[0..7] even
361cabdff1aSopenharmony_ci    subps      %17, %8, %9           ; +-r[0..7] odd
362cabdff1aSopenharmony_ci
363cabdff1aSopenharmony_ci    shufps     %14, %14, %14, q2301  ; t[0..7] even
364cabdff1aSopenharmony_ci    shufps     %16, %16, %16, q2301  ; t[0..7] odd
365cabdff1aSopenharmony_ci    xorps      %15, %15, %13         ; r[0..7] even
366cabdff1aSopenharmony_ci    xorps      %17, %17, %13         ; r[0..7] odd
367cabdff1aSopenharmony_ci
368cabdff1aSopenharmony_ci    subps      %6, %2, %14           ; m2,3[01] even
369cabdff1aSopenharmony_ci    subps      %8, %4, %16           ; m2,3[01] odd
370cabdff1aSopenharmony_ci    subps      %7, %3, %15           ; m2,3[23] even
371cabdff1aSopenharmony_ci    subps      %9, %5, %17           ; m2,3[23] odd
372cabdff1aSopenharmony_ci
373cabdff1aSopenharmony_ci    addps      %2, %2, %14           ; m0 even
374cabdff1aSopenharmony_ci    addps      %4, %4, %16           ; m0 odd
375cabdff1aSopenharmony_ci    addps      %3, %3, %15           ; m1 even
376cabdff1aSopenharmony_ci    addps      %5, %5, %17           ; m1 odd
377cabdff1aSopenharmony_ci%endmacro
378cabdff1aSopenharmony_ci
379cabdff1aSopenharmony_ci; Same as above, only does one parity at a time, takes 3 temporary registers,
380cabdff1aSopenharmony_ci; however, if the twiddles aren't needed after this, the registers they use
381cabdff1aSopenharmony_ci; can be used as any of the temporary registers.
; %1          - parity selector: nonzero uses the even twiddle shuffles
;               (q2200 / q1133), zero uses the odd ones (q3311 / q0022)
; %2, %3      - tx1 (m0, m1), updated in place
; %4, %5      - tx2,3 [01] and [23], replaced by the combined values
; %6          - cos twiddles (only read)
; %7          - wim twiddles (only read)
; %8, %9, %10 - temporaries; %10 holds mask_pmpmpmpm on exit
382cabdff1aSopenharmony_ci%macro SPLIT_RADIX_COMBINE_HALF 10
383cabdff1aSopenharmony_ci%if %1
384cabdff1aSopenharmony_ci    shufps     %8, %6, %6, q2200     ; cos00224466
385cabdff1aSopenharmony_ci    shufps     %9, %7, %7, q1133     ; wim77553311
386cabdff1aSopenharmony_ci%else
387cabdff1aSopenharmony_ci    shufps     %8, %6, %6, q3311     ; cos11335577
388cabdff1aSopenharmony_ci    shufps     %9, %7, %7, q0022     ; wim66442200
389cabdff1aSopenharmony_ci%endif
390cabdff1aSopenharmony_ci
391cabdff1aSopenharmony_ci    mulps      %10, %4, %9           ; m2,3[01]reim * wim7531 even
392cabdff1aSopenharmony_ci    mulps      %9, %9, %5            ; m2,3[23]reim * wim7531 even
393cabdff1aSopenharmony_ci
394cabdff1aSopenharmony_ci    shufps     %4, %4, %4, q2301     ; m2[0].imre, m2[1].imre, m3[0].imre, m3[1].imre even
395cabdff1aSopenharmony_ci    shufps     %5, %5, %5, q2301     ; m2[2].imre, m2[3].imre, m3[2].imre, m3[3].imre even
396cabdff1aSopenharmony_ci
397cabdff1aSopenharmony_ci%if cpuflag(fma3)
398cabdff1aSopenharmony_ci    fmaddsubps %4, %4, %8, %10       ; w[0..8] even
399cabdff1aSopenharmony_ci    fmsubaddps %5, %5, %8, %9        ; j[0..8] even
400cabdff1aSopenharmony_ci    movaps     %10, [mask_pmpmpmpm]
401cabdff1aSopenharmony_ci%else
402cabdff1aSopenharmony_ci    mulps      %4, %4, %8            ; m2,3[01]imre * cos0246
403cabdff1aSopenharmony_ci    mulps      %5, %5, %8            ; m2,3[23]reim * cos0246
404cabdff1aSopenharmony_ci    addsubps   %4, %4, %10           ; w[0..8]
405cabdff1aSopenharmony_ci    movaps     %10, [mask_pmpmpmpm]
406cabdff1aSopenharmony_ci    xorps      %9, %9, %10           ; +-m2,3[23]imre * wim7531
407cabdff1aSopenharmony_ci    addps      %5, %5, %9            ; j[0..8]
408cabdff1aSopenharmony_ci%endif
409cabdff1aSopenharmony_ci
410cabdff1aSopenharmony_ci    addps      %8, %4, %5            ; t10235476
411cabdff1aSopenharmony_ci    subps      %9, %4, %5            ; +-r[0..7]
412cabdff1aSopenharmony_ci
413cabdff1aSopenharmony_ci    shufps     %8, %8, %8, q2301     ; t[0..7]
414cabdff1aSopenharmony_ci    xorps      %9, %9, %10           ; r[0..7]
415cabdff1aSopenharmony_ci
416cabdff1aSopenharmony_ci    subps      %4, %2, %8            ; m2,3[01]
417cabdff1aSopenharmony_ci    subps      %5, %3, %9            ; m2,3[23]
418cabdff1aSopenharmony_ci
419cabdff1aSopenharmony_ci    addps      %2, %2, %8            ; m0
420cabdff1aSopenharmony_ci    addps      %3, %3, %9            ; m1
421cabdff1aSopenharmony_ci%endmacro
422cabdff1aSopenharmony_ci
423cabdff1aSopenharmony_ci; Same as above, tries REALLY hard to use 2 temporary registers.
; %1     - parity selector: nonzero uses the even twiddle shuffles
;          (q2200 / q1133), zero uses the odd ones (q3311 / q0022)
; %2, %3 - tx1 (m0, m1), updated in place
; %4, %5 - tx2,3 [01] and [23], replaced by the combined values
; %6     - cos twiddles (only read)
; %7     - wim twiddles (only read)
; %8, %9 - temporaries
; With only two temporaries the wim shuffle in %9 is consumed by the first
; product, so it is rebuilt from %7 before the second one, and
; mask_pmpmpmpm is used as a memory operand instead of being cached.
424cabdff1aSopenharmony_ci%macro SPLIT_RADIX_COMBINE_LITE 9
425cabdff1aSopenharmony_ci%if %1
426cabdff1aSopenharmony_ci    shufps     %8, %6, %6, q2200        ; cos00224466
427cabdff1aSopenharmony_ci    shufps     %9, %7, %7, q1133        ; wim77553311
428cabdff1aSopenharmony_ci%else
429cabdff1aSopenharmony_ci    shufps     %8, %6, %6, q3311        ; cos11335577
430cabdff1aSopenharmony_ci    shufps     %9, %7, %7, q0022        ; wim66442200
431cabdff1aSopenharmony_ci%endif
432cabdff1aSopenharmony_ci
433cabdff1aSopenharmony_ci    mulps      %9, %9, %4               ; m2,3[01]reim * wim7531 even
434cabdff1aSopenharmony_ci    shufps     %4, %4, %4, q2301        ; m2[0].imre, m2[1].imre, m3[0].imre, m3[1].imre even
435cabdff1aSopenharmony_ci
436cabdff1aSopenharmony_ci%if cpuflag(fma3)
437cabdff1aSopenharmony_ci    fmaddsubps %4, %4, %8, %9           ; w[0..8] even
438cabdff1aSopenharmony_ci%else
439cabdff1aSopenharmony_ci    mulps      %4, %4, %8               ; m2,3[01]imre * cos0246
440cabdff1aSopenharmony_ci    addsubps   %4, %4, %9               ; w[0..8]
441cabdff1aSopenharmony_ci%endif
442cabdff1aSopenharmony_ci
    ; %9 was overwritten above: rebuild the wim shuffle for the second product
443cabdff1aSopenharmony_ci%if %1
444cabdff1aSopenharmony_ci    shufps     %9, %7, %7, q1133        ; wim77553311
445cabdff1aSopenharmony_ci%else
446cabdff1aSopenharmony_ci    shufps     %9, %7, %7, q0022        ; wim66442200
447cabdff1aSopenharmony_ci%endif
448cabdff1aSopenharmony_ci
449cabdff1aSopenharmony_ci    mulps      %9, %9, %5               ; m2,3[23]reim * wim7531 even
450cabdff1aSopenharmony_ci    shufps     %5, %5, %5, q2301        ; m2[2].imre, m2[3].imre, m3[2].imre, m3[3].imre even
451cabdff1aSopenharmony_ci%if cpuflag(fma3)
452cabdff1aSopenharmony_ci    fmsubaddps %5, %5, %8, %9           ; j[0..8] even
453cabdff1aSopenharmony_ci%else
454cabdff1aSopenharmony_ci    mulps      %5, %5, %8               ; m2,3[23]reim * cos0246
455cabdff1aSopenharmony_ci    xorps      %9, %9, [mask_pmpmpmpm]  ; +-m2,3[23]imre * wim7531
456cabdff1aSopenharmony_ci    addps      %5, %5, %9               ; j[0..8]
457cabdff1aSopenharmony_ci%endif
458cabdff1aSopenharmony_ci
459cabdff1aSopenharmony_ci    addps      %8, %4, %5               ; t10235476
460cabdff1aSopenharmony_ci    subps      %9, %4, %5               ; +-r[0..7]
461cabdff1aSopenharmony_ci
462cabdff1aSopenharmony_ci    shufps     %8, %8, %8, q2301        ; t[0..7]
463cabdff1aSopenharmony_ci    xorps      %9, %9, [mask_pmpmpmpm]  ; r[0..7]
464cabdff1aSopenharmony_ci
465cabdff1aSopenharmony_ci    subps      %4, %2, %8               ; m2,3[01]
466cabdff1aSopenharmony_ci    subps      %5, %3, %9               ; m2,3[23]
467cabdff1aSopenharmony_ci
468cabdff1aSopenharmony_ci    addps      %2, %2, %8               ; m0
469cabdff1aSopenharmony_ci    addps      %3, %3, %9               ; m1
470cabdff1aSopenharmony_ci%endmacro
471cabdff1aSopenharmony_ci
; Split-radix combine for the 64-point case, operating on outq in place.
; Takes no arguments: it works on the named registers m0..m3, tx1_e0/o0/e1/o1,
; tx2_e0/o0/e1/o1, tw_e, tw_o, tmp1 and tmp2 plus the outq pointer, all of
; which must be set up by the invoking code (their definitions and expected
; contents are not visible in this part of the file -- check the call site).
472cabdff1aSopenharmony_ci%macro SPLIT_RADIX_COMBINE_64 0
473cabdff1aSopenharmony_ci    SPLIT_RADIX_COMBINE_LITE 1, m0, m1, tx1_e0, tx2_e0, tw_e, tw_o, tmp1, tmp2
474cabdff1aSopenharmony_ci
475cabdff1aSopenharmony_ci    movaps [outq +  0*mmsize], m0
476cabdff1aSopenharmony_ci    movaps [outq +  4*mmsize], m1
477cabdff1aSopenharmony_ci    movaps [outq +  8*mmsize], tx1_e0
478cabdff1aSopenharmony_ci    movaps [outq + 12*mmsize], tx2_e0
479cabdff1aSopenharmony_ci
    ; m0 has already been stored above, so it is reused as the third temporary
480cabdff1aSopenharmony_ci    SPLIT_RADIX_COMBINE_HALF 0, m2, m3, tx1_o0, tx2_o0, tw_e, tw_o, tmp1, tmp2, m0
481cabdff1aSopenharmony_ci
482cabdff1aSopenharmony_ci    movaps [outq +  2*mmsize], m2
483cabdff1aSopenharmony_ci    movaps [outq +  6*mmsize], m3
484cabdff1aSopenharmony_ci    movaps [outq + 10*mmsize], tx1_o0
485cabdff1aSopenharmony_ci    movaps [outq + 14*mmsize], tx2_o0
486cabdff1aSopenharmony_ci
    ; Load the next twiddles; imm 0x23 takes both halves from the memory
    ; operand with its 128-bit halves swapped (the old tw_o is not used)
487cabdff1aSopenharmony_ci    movaps tw_e,           [tab_64_float + mmsize]
488cabdff1aSopenharmony_ci    vperm2f128 tw_o, tw_o, [tab_64_float + 64 - 4*7 - mmsize], 0x23
489cabdff1aSopenharmony_ci
490cabdff1aSopenharmony_ci    movaps m0, [outq +  1*mmsize]
491cabdff1aSopenharmony_ci    movaps m1, [outq +  3*mmsize]
492cabdff1aSopenharmony_ci    movaps m2, [outq +  5*mmsize]
493cabdff1aSopenharmony_ci    movaps m3, [outq +  7*mmsize]
494cabdff1aSopenharmony_ci
495cabdff1aSopenharmony_ci    SPLIT_RADIX_COMBINE 0, m0, m2, m1, m3, tx1_e1, tx2_e1, tx1_o1, tx2_o1, tw_e, tw_o, \
496cabdff1aSopenharmony_ci                           tmp1, tmp2, tx2_o0, tx1_o0, tx2_e0, tx1_e0 ; temporary registers
497cabdff1aSopenharmony_ci
498cabdff1aSopenharmony_ci    movaps [outq +  1*mmsize], m0
499cabdff1aSopenharmony_ci    movaps [outq +  3*mmsize], m1
500cabdff1aSopenharmony_ci    movaps [outq +  5*mmsize], m2
501cabdff1aSopenharmony_ci    movaps [outq +  7*mmsize], m3
502cabdff1aSopenharmony_ci
503cabdff1aSopenharmony_ci    movaps [outq +  9*mmsize], tx1_e1
504cabdff1aSopenharmony_ci    movaps [outq + 11*mmsize], tx1_o1
505cabdff1aSopenharmony_ci    movaps [outq + 13*mmsize], tx2_e1
506cabdff1aSopenharmony_ci    movaps [outq + 15*mmsize], tx2_o1
507cabdff1aSopenharmony_ci%endmacro
508cabdff1aSopenharmony_ci
509cabdff1aSopenharmony_ci; Perform a single even/odd split radix combination with loads and stores
510cabdff1aSopenharmony_ci; The _4 indicates this is a quarter of the iterations required to complete a full
511cabdff1aSopenharmony_ci; combine loop
512cabdff1aSopenharmony_ci; %1 must contain len*2, %2 must contain len*4, %3 must contain len*6
; (these three are byte offsets from outq selecting the 2nd, 3rd and 4th region;
;  the 1st region starts at outq itself)
; %4 is the slot index, in units of mmsize, inside every region; slots %4 and
;    %4 + 2 of each region are processed
; %5 is the slot index into the twiddle tables: rtabq is indexed forwards,
;    itabq backwards
; %6 is an extra byte offset applied to every outq access
; %7 is an extra byte offset for the rtabq load, %8 for the itabq load
; Registers: m0-m7 hold the data, m8/m9 the twiddles, m10-m15 are handed to
; SPLIT_RADIX_COMBINE as temporaries, so all 16 vector registers are used
513cabdff1aSopenharmony_ci%macro SPLIT_RADIX_LOAD_COMBINE_4 8
    ; m8 is a plain aligned load; m9 receives 32 bytes with its two 128-bit
    ; halves exchanged (imm 0x23 takes both halves from the memory operand, the
    ; register source is ignored)
514cabdff1aSopenharmony_ci    movaps m8,         [rtabq + (%5)*mmsize + %7]
515cabdff1aSopenharmony_ci    vperm2f128 m9, m9, [itabq - (%5)*mmsize + %8], 0x23
516cabdff1aSopenharmony_ci
    ; Two registers from each of the first two regions
517cabdff1aSopenharmony_ci    movaps m0, [outq +      (0 + %4)*mmsize + %6]
518cabdff1aSopenharmony_ci    movaps m2, [outq +      (2 + %4)*mmsize + %6]
519cabdff1aSopenharmony_ci    movaps m1, [outq + %1 + (0 + %4)*mmsize + %6]
520cabdff1aSopenharmony_ci    movaps m3, [outq + %1 + (2 + %4)*mmsize + %6]
521cabdff1aSopenharmony_ci
    ; Two registers from each of the last two regions
522cabdff1aSopenharmony_ci    movaps m4, [outq + %2 + (0 + %4)*mmsize + %6]
523cabdff1aSopenharmony_ci    movaps m6, [outq + %2 + (2 + %4)*mmsize + %6]
524cabdff1aSopenharmony_ci    movaps m5, [outq + %3 + (0 + %4)*mmsize + %6]
525cabdff1aSopenharmony_ci    movaps m7, [outq + %3 + (2 + %4)*mmsize + %6]
526cabdff1aSopenharmony_ci
    ; NOTE(review): the meaning of the leading 0 is defined by
    ; SPLIT_RADIX_COMBINE, which is outside this part of the file
527cabdff1aSopenharmony_ci    SPLIT_RADIX_COMBINE 0, m0, m1, m2, m3, \
528cabdff1aSopenharmony_ci                           m4, m5, m6, m7, \
529cabdff1aSopenharmony_ci                           m8, m9, \
530cabdff1aSopenharmony_ci                           m10, m11, m12, m13, m14, m15
531cabdff1aSopenharmony_ci
    ; Results go back in place: every register is stored to the address it
    ; was loaded from
532cabdff1aSopenharmony_ci    movaps [outq +      (0 + %4)*mmsize + %6], m0
533cabdff1aSopenharmony_ci    movaps [outq +      (2 + %4)*mmsize + %6], m2
534cabdff1aSopenharmony_ci    movaps [outq + %1 + (0 + %4)*mmsize + %6], m1
535cabdff1aSopenharmony_ci    movaps [outq + %1 + (2 + %4)*mmsize + %6], m3
536cabdff1aSopenharmony_ci
537cabdff1aSopenharmony_ci    movaps [outq + %2 + (0 + %4)*mmsize + %6], m4
538cabdff1aSopenharmony_ci    movaps [outq + %2 + (2 + %4)*mmsize + %6], m6
539cabdff1aSopenharmony_ci    movaps [outq + %3 + (0 + %4)*mmsize + %6], m5
540cabdff1aSopenharmony_ci    movaps [outq + %3 + (2 + %4)*mmsize + %6], m7
541cabdff1aSopenharmony_ci%endmacro
542cabdff1aSopenharmony_ci
; Run SPLIT_RADIX_LOAD_COMBINE_4 four times, covering output slots 0/2, 1/3,
; 4/6 and 5/7 of every region with twiddle slots 0, 1, 2 and 3 respectively
; %1 is the byte offset of the 2nd region (doubled here to reach the 3rd),
; %2 is the byte offset of the 4th region; callers in this file pass 2*len and
;    6*len
; %3, %4, %5 are optional extra byte offsets for outq, rtabq and itabq; each
;    defaults to 0 when omitted
; Arguments are substituted textually, so pass a single product or a
; parenthesised expression. offset_c, offset_r and offset_i are single-line
; macros that remain (re)defined after this macro has been expanded.
543cabdff1aSopenharmony_ci%macro SPLIT_RADIX_LOAD_COMBINE_FULL 2-5
544cabdff1aSopenharmony_ci%if %0 > 2
545cabdff1aSopenharmony_ci%define offset_c %3
546cabdff1aSopenharmony_ci%else
547cabdff1aSopenharmony_ci%define offset_c 0
548cabdff1aSopenharmony_ci%endif
549cabdff1aSopenharmony_ci%if %0 > 3
550cabdff1aSopenharmony_ci%define offset_r %4
551cabdff1aSopenharmony_ci%else
552cabdff1aSopenharmony_ci%define offset_r 0
553cabdff1aSopenharmony_ci%endif
554cabdff1aSopenharmony_ci%if %0 > 4
555cabdff1aSopenharmony_ci%define offset_i %5
556cabdff1aSopenharmony_ci%else
557cabdff1aSopenharmony_ci%define offset_i 0
558cabdff1aSopenharmony_ci%endif
559cabdff1aSopenharmony_ci
    ; 4th and 5th arguments: output slot index, twiddle slot index
560cabdff1aSopenharmony_ci    SPLIT_RADIX_LOAD_COMBINE_4 %1, 2*%1, %2, 0, 0, offset_c, offset_r, offset_i
561cabdff1aSopenharmony_ci    SPLIT_RADIX_LOAD_COMBINE_4 %1, 2*%1, %2, 1, 1, offset_c, offset_r, offset_i
562cabdff1aSopenharmony_ci    SPLIT_RADIX_LOAD_COMBINE_4 %1, 2*%1, %2, 4, 2, offset_c, offset_r, offset_i
563cabdff1aSopenharmony_ci    SPLIT_RADIX_LOAD_COMBINE_4 %1, 2*%1, %2, 5, 3, offset_c, offset_r, offset_i
564cabdff1aSopenharmony_ci%endmacro
565cabdff1aSopenharmony_ci
566cabdff1aSopenharmony_ci; Perform a single even/odd split radix combination with loads, deinterleaves and
567cabdff1aSopenharmony_ci; stores. The _2 indicates this is a half of the iterations required to complete
568cabdff1aSopenharmony_ci; a full combine+deinterleave loop
569cabdff1aSopenharmony_ci; %3 must contain len*2, %4 must contain len*4, %5 must contain len*6
; (byte offsets from outq selecting the 2nd, 3rd and 4th region)
; %1 is the base slot index, in units of mmsize, inside every region
; %2 is the base slot index into the twiddle tables (rtabq forwards, itabq
;    backwards)
; %6 is an extra byte offset applied to every outq access
; Two combines are done per expansion: the first on slots base+0 and base+2,
; the second on slots base+1 and base+3. After each combine the 64-bit elements
; of the register pairs are regrouped with unpcklpd/unpckhpd and written back
; in 128-bit pieces over the same four slots.
; Registers: m0-m13 are written here and m10-m15 are handed to
; SPLIT_RADIX_COMBINE as temporaries
570cabdff1aSopenharmony_ci%macro SPLIT_RADIX_COMBINE_DEINTERLEAVE_2 6
    ; First combine: twiddles for the base twiddle slot; m9 is loaded with its
    ; 128-bit halves exchanged (imm 0x23 reads only the memory operand)
571cabdff1aSopenharmony_ci    movaps m8,         [rtabq + (0 + %2)*mmsize]
572cabdff1aSopenharmony_ci    vperm2f128 m9, m9, [itabq - (0 + %2)*mmsize], 0x23
573cabdff1aSopenharmony_ci
574cabdff1aSopenharmony_ci    movaps m0, [outq +      (0 + 0 + %1)*mmsize + %6]
575cabdff1aSopenharmony_ci    movaps m2, [outq +      (2 + 0 + %1)*mmsize + %6]
576cabdff1aSopenharmony_ci    movaps m1, [outq + %3 + (0 + 0 + %1)*mmsize + %6]
577cabdff1aSopenharmony_ci    movaps m3, [outq + %3 + (2 + 0 + %1)*mmsize + %6]
578cabdff1aSopenharmony_ci
579cabdff1aSopenharmony_ci    movaps m4, [outq + %4 + (0 + 0 + %1)*mmsize + %6]
580cabdff1aSopenharmony_ci    movaps m6, [outq + %4 + (2 + 0 + %1)*mmsize + %6]
581cabdff1aSopenharmony_ci    movaps m5, [outq + %5 + (0 + 0 + %1)*mmsize + %6]
582cabdff1aSopenharmony_ci    movaps m7, [outq + %5 + (2 + 0 + %1)*mmsize + %6]
583cabdff1aSopenharmony_ci
584cabdff1aSopenharmony_ci    SPLIT_RADIX_COMBINE 0, m0, m1, m2, m3, \
585cabdff1aSopenharmony_ci       m4, m5, m6, m7, \
586cabdff1aSopenharmony_ci       m8, m9, \
587cabdff1aSopenharmony_ci       m10, m11, m12, m13, m14, m15
588cabdff1aSopenharmony_ci
    ; Regroup per 128-bit lane: m10-m13 collect the high 64-bit elements of
    ; each register pair, m0/m1/m4/m5 the low ones. The high-element results
    ; must be formed first because the low-element results overwrite a source.
589cabdff1aSopenharmony_ci    unpckhpd m10, m0, m2
590cabdff1aSopenharmony_ci    unpckhpd m11, m1, m3
591cabdff1aSopenharmony_ci    unpckhpd m12, m4, m6
592cabdff1aSopenharmony_ci    unpckhpd m13, m5, m7
593cabdff1aSopenharmony_ci    unpcklpd m0, m0, m2
594cabdff1aSopenharmony_ci    unpcklpd m1, m1, m3
595cabdff1aSopenharmony_ci    unpcklpd m4, m4, m6
596cabdff1aSopenharmony_ci    unpcklpd m5, m5, m7
597cabdff1aSopenharmony_ci
    ; The low lanes fill the first slot of every region (already consumed as
    ; input above, so it is safe to overwrite)
598cabdff1aSopenharmony_ci    vextractf128 [outq +      (0 + 0 + %1)*mmsize + %6 +  0], m0,  0
599cabdff1aSopenharmony_ci    vextractf128 [outq +      (0 + 0 + %1)*mmsize + %6 + 16], m10, 0
600cabdff1aSopenharmony_ci    vextractf128 [outq + %3 + (0 + 0 + %1)*mmsize + %6 +  0], m1,  0
601cabdff1aSopenharmony_ci    vextractf128 [outq + %3 + (0 + 0 + %1)*mmsize + %6 + 16], m11, 0
602cabdff1aSopenharmony_ci
603cabdff1aSopenharmony_ci    vextractf128 [outq + %4 + (0 + 0 + %1)*mmsize + %6 +  0], m4,  0
604cabdff1aSopenharmony_ci    vextractf128 [outq + %4 + (0 + 0 + %1)*mmsize + %6 + 16], m12, 0
605cabdff1aSopenharmony_ci    vextractf128 [outq + %5 + (0 + 0 + %1)*mmsize + %6 +  0], m5,  0
606cabdff1aSopenharmony_ci    vextractf128 [outq + %5 + (0 + 0 + %1)*mmsize + %6 + 16], m13, 0
607cabdff1aSopenharmony_ci
    ; Pack the remaining high lanes into m10-m13: imm 0x13 puts the high lane
    ; of the second source in the low half and keeps the destination's own
    ; high lane in the high half. These registers belong in the second slot,
    ; which still holds unread input, so the stores are deferred.
608cabdff1aSopenharmony_ci    vperm2f128 m10, m10, m0, 0x13
609cabdff1aSopenharmony_ci    vperm2f128 m11, m11, m1, 0x13
610cabdff1aSopenharmony_ci    vperm2f128 m12, m12, m4, 0x13
611cabdff1aSopenharmony_ci    vperm2f128 m13, m13, m5, 0x13
612cabdff1aSopenharmony_ci
    ; Second combine: next twiddle slot, data slots base+1 and base+3
613cabdff1aSopenharmony_ci    movaps m8,         [rtabq + (1 + %2)*mmsize]
614cabdff1aSopenharmony_ci    vperm2f128 m9, m9, [itabq - (1 + %2)*mmsize], 0x23
615cabdff1aSopenharmony_ci
616cabdff1aSopenharmony_ci    movaps m0, [outq +      (0 + 1 + %1)*mmsize + %6]
617cabdff1aSopenharmony_ci    movaps m2, [outq +      (2 + 1 + %1)*mmsize + %6]
618cabdff1aSopenharmony_ci    movaps m1, [outq + %3 + (0 + 1 + %1)*mmsize + %6]
619cabdff1aSopenharmony_ci    movaps m3, [outq + %3 + (2 + 1 + %1)*mmsize + %6]
620cabdff1aSopenharmony_ci
    ; The pending first-combine results may only be written once the input
    ; sharing their address has been loaded (m0/m1 just above)
621cabdff1aSopenharmony_ci    movaps [outq +      (0 + 1 + %1)*mmsize + %6], m10 ; m0 conflict
622cabdff1aSopenharmony_ci    movaps [outq + %3 + (0 + 1 + %1)*mmsize + %6], m11 ; m1 conflict
623cabdff1aSopenharmony_ci
624cabdff1aSopenharmony_ci    movaps m4, [outq + %4 + (0 + 1 + %1)*mmsize + %6]
625cabdff1aSopenharmony_ci    movaps m6, [outq + %4 + (2 + 1 + %1)*mmsize + %6]
626cabdff1aSopenharmony_ci    movaps m5, [outq + %5 + (0 + 1 + %1)*mmsize + %6]
627cabdff1aSopenharmony_ci    movaps m7, [outq + %5 + (2 + 1 + %1)*mmsize + %6]
628cabdff1aSopenharmony_ci
629cabdff1aSopenharmony_ci    movaps [outq + %4 + (0 + 1 + %1)*mmsize + %6], m12 ; m4 conflict
630cabdff1aSopenharmony_ci    movaps [outq + %5 + (0 + 1 + %1)*mmsize + %6], m13 ; m5 conflict
631cabdff1aSopenharmony_ci
632cabdff1aSopenharmony_ci    SPLIT_RADIX_COMBINE 0, m0, m1, m2, m3, \
633cabdff1aSopenharmony_ci                           m4, m5, m6, m7, \
634cabdff1aSopenharmony_ci                           m8, m9, \
635cabdff1aSopenharmony_ci                           m10, m11, m12, m13, m14, m15 ; temporary registers
636cabdff1aSopenharmony_ci
    ; Regroup again; the twiddle registers m8/m9 are free by now and are
    ; reused, together with m10/m11, for the low-element results
637cabdff1aSopenharmony_ci    unpcklpd m8,  m0, m2
638cabdff1aSopenharmony_ci    unpcklpd m9,  m1, m3
639cabdff1aSopenharmony_ci    unpcklpd m10, m4, m6
640cabdff1aSopenharmony_ci    unpcklpd m11, m5, m7
641cabdff1aSopenharmony_ci    unpckhpd m0, m0, m2
642cabdff1aSopenharmony_ci    unpckhpd m1, m1, m3
643cabdff1aSopenharmony_ci    unpckhpd m4, m4, m6
644cabdff1aSopenharmony_ci    unpckhpd m5, m5, m7
645cabdff1aSopenharmony_ci
    ; All inputs have been read, so both lanes can be stored directly: lane 0
    ; of each pair fills slot base+2, lane 1 fills slot base+3
646cabdff1aSopenharmony_ci    vextractf128 [outq +      (2 + 0 + %1)*mmsize + %6 +  0], m8,  0
647cabdff1aSopenharmony_ci    vextractf128 [outq +      (2 + 0 + %1)*mmsize + %6 + 16], m0,  0
648cabdff1aSopenharmony_ci    vextractf128 [outq +      (2 + 1 + %1)*mmsize + %6 +  0], m8,  1
649cabdff1aSopenharmony_ci    vextractf128 [outq +      (2 + 1 + %1)*mmsize + %6 + 16], m0,  1
650cabdff1aSopenharmony_ci
651cabdff1aSopenharmony_ci    vextractf128 [outq + %3 + (2 + 0 + %1)*mmsize + %6 +  0], m9,  0
652cabdff1aSopenharmony_ci    vextractf128 [outq + %3 + (2 + 0 + %1)*mmsize + %6 + 16], m1,  0
653cabdff1aSopenharmony_ci    vextractf128 [outq + %3 + (2 + 1 + %1)*mmsize + %6 +  0], m9,  1
654cabdff1aSopenharmony_ci    vextractf128 [outq + %3 + (2 + 1 + %1)*mmsize + %6 + 16], m1,  1
655cabdff1aSopenharmony_ci
656cabdff1aSopenharmony_ci    vextractf128 [outq + %4 + (2 + 0 + %1)*mmsize + %6 +  0], m10, 0
657cabdff1aSopenharmony_ci    vextractf128 [outq + %4 + (2 + 0 + %1)*mmsize + %6 + 16], m4,  0
658cabdff1aSopenharmony_ci    vextractf128 [outq + %4 + (2 + 1 + %1)*mmsize + %6 +  0], m10, 1
659cabdff1aSopenharmony_ci    vextractf128 [outq + %4 + (2 + 1 + %1)*mmsize + %6 + 16], m4,  1
660cabdff1aSopenharmony_ci
661cabdff1aSopenharmony_ci    vextractf128 [outq + %5 + (2 + 0 + %1)*mmsize + %6 +  0], m11, 0
662cabdff1aSopenharmony_ci    vextractf128 [outq + %5 + (2 + 0 + %1)*mmsize + %6 + 16], m5,  0
663cabdff1aSopenharmony_ci    vextractf128 [outq + %5 + (2 + 1 + %1)*mmsize + %6 +  0], m11, 1
664cabdff1aSopenharmony_ci    vextractf128 [outq + %5 + (2 + 1 + %1)*mmsize + %6 + 16], m5,  1
665cabdff1aSopenharmony_ci%endmacro
666cabdff1aSopenharmony_ci
; Run SPLIT_RADIX_COMBINE_DEINTERLEAVE_2 twice: once for output slots 0-3 with
; twiddle slots 0-1, once for output slots 4-7 with twiddle slots 2-3
; %1 is the byte offset of the 2nd region (doubled here to reach the 3rd),
; %2 is the byte offset of the 4th region
; %3 is an optional extra byte offset for every outq access, 0 when omitted
; Arguments are substituted textually, so pass a single product or a
; parenthesised expression. The single-line macro `offset` remains (re)defined
; after this macro has been expanded.
667cabdff1aSopenharmony_ci%macro SPLIT_RADIX_COMBINE_DEINTERLEAVE_FULL 2-3
668cabdff1aSopenharmony_ci%if %0 > 2
669cabdff1aSopenharmony_ci%define offset %3
670cabdff1aSopenharmony_ci%else
671cabdff1aSopenharmony_ci%define offset 0
672cabdff1aSopenharmony_ci%endif
    ; 1st and 2nd arguments: base output slot, base twiddle slot
673cabdff1aSopenharmony_ci    SPLIT_RADIX_COMBINE_DEINTERLEAVE_2 0, 0, %1, %1*2, %2, offset
674cabdff1aSopenharmony_ci    SPLIT_RADIX_COMBINE_DEINTERLEAVE_2 4, 2, %1, %1*2, %2, offset
675cabdff1aSopenharmony_ci%endmacro
676cabdff1aSopenharmony_ci
; 2-point transform: one 16-byte load from in, FFT2 on it (m1 is scratch), one
; 16-byte store to out. ctx and stride are declared but not read.
; Both pointers are accessed with movaps and so must be 16-byte aligned.
677cabdff1aSopenharmony_ciINIT_XMM sse3
678cabdff1aSopenharmony_cicglobal fft2_float, 4, 4, 2, ctx, out, in, stride
679cabdff1aSopenharmony_ci    movaps m0, [inq]
680cabdff1aSopenharmony_ci    FFT2 m0, m1
681cabdff1aSopenharmony_ci    movaps [outq], m0
682cabdff1aSopenharmony_ci    RET
683cabdff1aSopenharmony_ci
; Emit the 4-point function fft4_<%1>_float
; %1 is the name fragment (fwd / inv)
; %2, when non-zero, exchanges the upper 64 bits of the two input registers
;    before the transform
; This two-parameter FFT4 only shares its name with the three-parameter FFT4
; invoked in the body; NASM tells multi-line macros apart by parameter count.
; ctx and stride are declared but not read; in and out are accessed with movaps
; and so must be 16-byte aligned.
684cabdff1aSopenharmony_ci%macro FFT4 2
685cabdff1aSopenharmony_ciINIT_XMM sse2
686cabdff1aSopenharmony_cicglobal fft4_ %+ %1 %+ _float, 4, 4, 3, ctx, out, in, stride
687cabdff1aSopenharmony_ci    movaps m0, [inq + 0*mmsize]
688cabdff1aSopenharmony_ci    movaps m1, [inq + 1*mmsize]
689cabdff1aSopenharmony_ci
690cabdff1aSopenharmony_ci%if %2
    ; q3210 keeps elements 0-1 of the first source and 2-3 of the second:
    ; m2 = { m1 low half, m0 high half }, m0 = { m0 low half, m1 high half }
691cabdff1aSopenharmony_ci    shufps m2, m1, m0, q3210
692cabdff1aSopenharmony_ci    shufps m0, m0, m1, q3210
693cabdff1aSopenharmony_ci    movaps m1, m2
694cabdff1aSopenharmony_ci%endif
695cabdff1aSopenharmony_ci
    ; Three-parameter transform macro defined elsewhere in the file; m2 is
    ; passed as its third operand
696cabdff1aSopenharmony_ci    FFT4 m0, m1, m2
697cabdff1aSopenharmony_ci
    ; Interleave the 64-bit elements of the two result registers for output
698cabdff1aSopenharmony_ci    unpcklpd m2, m0, m1
699cabdff1aSopenharmony_ci    unpckhpd m0, m0, m1
700cabdff1aSopenharmony_ci
701cabdff1aSopenharmony_ci    movaps [outq + 0*mmsize], m2
702cabdff1aSopenharmony_ci    movaps [outq + 1*mmsize], m0
703cabdff1aSopenharmony_ci
704cabdff1aSopenharmony_ci    RET
705cabdff1aSopenharmony_ci%endmacro
706cabdff1aSopenharmony_ci
; Only the inverse variant performs the input exchange
707cabdff1aSopenharmony_ciFFT4 fwd, 0
708cabdff1aSopenharmony_ciFFT4 inv, 1
709cabdff1aSopenharmony_ci
; Emit the SSE3 8-point function fft8_<%1>
; %1 is the name suffix
; %2, when non-zero, makes the function fetch its input through the lookup
;    table at AVTXContext.map; when zero the input is read linearly with
;    aligned loads
; Note the polarity: FFT16_FN and FFT32_FN use their flag the other way round.
710cabdff1aSopenharmony_ci%macro FFT8_SSE_FN 2
711cabdff1aSopenharmony_ciINIT_XMM sse3
712cabdff1aSopenharmony_cicglobal fft8_ %+ %1, 4, 4, 6, ctx, out, in, tmp
713cabdff1aSopenharmony_ci%if %2
    ; ctxq is repurposed to hold the map pointer from here on
    ; NOTE(review): LOAD64_LUT is defined elsewhere; presumably it gathers
    ; 64-bit elements from inq using indices read from the map - confirm
714cabdff1aSopenharmony_ci    mov ctxq, [ctxq + AVTXContext.map]
715cabdff1aSopenharmony_ci    LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq
716cabdff1aSopenharmony_ci    LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq
717cabdff1aSopenharmony_ci    LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq
718cabdff1aSopenharmony_ci    LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq
719cabdff1aSopenharmony_ci%else
720cabdff1aSopenharmony_ci    movaps m0, [inq + 0*mmsize]
721cabdff1aSopenharmony_ci    movaps m1, [inq + 1*mmsize]
722cabdff1aSopenharmony_ci    movaps m2, [inq + 2*mmsize]
723cabdff1aSopenharmony_ci    movaps m3, [inq + 3*mmsize]
724cabdff1aSopenharmony_ci%endif
725cabdff1aSopenharmony_ci
    ; m4 and m5 are passed as the last two operands (scratch) of the transform
726cabdff1aSopenharmony_ci    FFT8 m0, m1, m2, m3, m4, m5
727cabdff1aSopenharmony_ci
    ; Interleave the 64-bit elements of the result pairs (m0, m3) and (m1, m2)
728cabdff1aSopenharmony_ci    unpcklpd m4, m0, m3
729cabdff1aSopenharmony_ci    unpcklpd m5, m1, m2
730cabdff1aSopenharmony_ci    unpckhpd m0, m0, m3
731cabdff1aSopenharmony_ci    unpckhpd m1, m1, m2
732cabdff1aSopenharmony_ci
    ; NOTE(review): the stores are unaligned (movups) although the linear
    ; loads above are aligned; the reason is not visible in this part of the
    ; file
733cabdff1aSopenharmony_ci    movups [outq + 0*mmsize], m4
734cabdff1aSopenharmony_ci    movups [outq + 1*mmsize], m0
735cabdff1aSopenharmony_ci    movups [outq + 2*mmsize], m5
736cabdff1aSopenharmony_ci    movups [outq + 3*mmsize], m1
737cabdff1aSopenharmony_ci
738cabdff1aSopenharmony_ci    RET
739cabdff1aSopenharmony_ci%endmacro
740cabdff1aSopenharmony_ci
; float reads through the map, ns_float reads linearly
741cabdff1aSopenharmony_ciFFT8_SSE_FN float,    1
742cabdff1aSopenharmony_ciFFT8_SSE_FN ns_float, 0
743cabdff1aSopenharmony_ci
; Emit the AVX 8-point function fft8_<%1>, holding the input in two 256-bit
; registers
; %1 is the name suffix
; %2, when non-zero, makes the function fetch its input through the lookup
;    table at AVTXContext.map; when zero the input is read linearly with
;    aligned loads
; Note the polarity: FFT16_FN and FFT32_FN use their flag the other way round.
744cabdff1aSopenharmony_ci%macro FFT8_AVX_FN 2
745cabdff1aSopenharmony_ciINIT_YMM avx
746cabdff1aSopenharmony_cicglobal fft8_ %+ %1, 4, 4, 4, ctx, out, in, tmp
747cabdff1aSopenharmony_ci%if %2
    ; ctxq is repurposed to hold the map pointer; m2 and m3 are handed to
    ; LOAD64_LUT as an extra operand (presumably scratch - macro not visible
    ; here)
748cabdff1aSopenharmony_ci    mov ctxq, [ctxq + AVTXContext.map]
749cabdff1aSopenharmony_ci    LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m2
750cabdff1aSopenharmony_ci    LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m3
751cabdff1aSopenharmony_ci%else
752cabdff1aSopenharmony_ci    movaps m0, [inq + 0*mmsize]
753cabdff1aSopenharmony_ci    movaps m1, [inq + 1*mmsize]
754cabdff1aSopenharmony_ci%endif
755cabdff1aSopenharmony_ci
756cabdff1aSopenharmony_ci    FFT8_AVX m0, m1, m2, m3
757cabdff1aSopenharmony_ci
    ; Interleave the 64-bit elements of m0 and m1 within each 128-bit lane
758cabdff1aSopenharmony_ci    unpcklpd m2, m0, m1
759cabdff1aSopenharmony_ci    unpckhpd m0, m0, m1
760cabdff1aSopenharmony_ci
    ; Output order is lane 0 of m2, lane 0 of m0, lane 1 of m2, lane 1 of m0
761cabdff1aSopenharmony_ci    ; Around 2% faster than 2x vperm2f128 + 2x movapd
762cabdff1aSopenharmony_ci    vextractf128 [outq + 16*0], m2, 0
763cabdff1aSopenharmony_ci    vextractf128 [outq + 16*1], m0, 0
764cabdff1aSopenharmony_ci    vextractf128 [outq + 16*2], m2, 1
765cabdff1aSopenharmony_ci    vextractf128 [outq + 16*3], m0, 1
766cabdff1aSopenharmony_ci
767cabdff1aSopenharmony_ci    RET
768cabdff1aSopenharmony_ci%endmacro
769cabdff1aSopenharmony_ci
; float reads through the map, ns_float reads linearly
770cabdff1aSopenharmony_ciFFT8_AVX_FN float,    1
771cabdff1aSopenharmony_ciFFT8_AVX_FN ns_float, 0
772cabdff1aSopenharmony_ci
; Emit the 16-point function fft16_<%2> for instruction set %1
; %1 is the cpu flag passed to INIT_YMM (avx / fma3)
; %2 is the name suffix
; %3, when non-zero, makes the function read its input linearly with aligned
;    loads; when zero the input is fetched through the lookup table at
;    AVTXContext.map
; Note the polarity: this is the opposite of the flag taken by FFT8_SSE_FN and
; FFT8_AVX_FN.
773cabdff1aSopenharmony_ci%macro FFT16_FN 3
774cabdff1aSopenharmony_ciINIT_YMM %1
775cabdff1aSopenharmony_cicglobal fft16_ %+ %2, 4, 4, 8, ctx, out, in, tmp
776cabdff1aSopenharmony_ci%if %3
777cabdff1aSopenharmony_ci    movaps m0, [inq + 0*mmsize]
778cabdff1aSopenharmony_ci    movaps m1, [inq + 1*mmsize]
779cabdff1aSopenharmony_ci    movaps m2, [inq + 2*mmsize]
780cabdff1aSopenharmony_ci    movaps m3, [inq + 3*mmsize]
781cabdff1aSopenharmony_ci%else
    ; ctxq is repurposed to hold the map pointer; m4-m7 are handed to
    ; LOAD64_LUT as an extra operand (presumably scratch - macro not visible
    ; here)
782cabdff1aSopenharmony_ci    mov ctxq, [ctxq + AVTXContext.map]
783cabdff1aSopenharmony_ci    LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m4
784cabdff1aSopenharmony_ci    LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m5
785cabdff1aSopenharmony_ci    LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq, m6
786cabdff1aSopenharmony_ci    LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq, m7
787cabdff1aSopenharmony_ci%endif
788cabdff1aSopenharmony_ci
789cabdff1aSopenharmony_ci    FFT16 m0, m1, m2, m3, m4, m5, m6, m7
790cabdff1aSopenharmony_ci
    ; Interleave the 64-bit elements of the result pairs (m0, m2) and (m1, m3)
791cabdff1aSopenharmony_ci    unpcklpd m5, m1, m3
792cabdff1aSopenharmony_ci    unpcklpd m4, m0, m2
793cabdff1aSopenharmony_ci    unpckhpd m1, m1, m3
794cabdff1aSopenharmony_ci    unpckhpd m0, m0, m2
795cabdff1aSopenharmony_ci
    ; Eight 128-bit stores: for each pair, lane 0 of the low- and high-element
    ; registers is written before lane 1
796cabdff1aSopenharmony_ci    vextractf128 [outq + 16*0], m4, 0
797cabdff1aSopenharmony_ci    vextractf128 [outq + 16*1], m0, 0
798cabdff1aSopenharmony_ci    vextractf128 [outq + 16*2], m4, 1
799cabdff1aSopenharmony_ci    vextractf128 [outq + 16*3], m0, 1
800cabdff1aSopenharmony_ci    vextractf128 [outq + 16*4], m5, 0
801cabdff1aSopenharmony_ci    vextractf128 [outq + 16*5], m1, 0
802cabdff1aSopenharmony_ci    vextractf128 [outq + 16*6], m5, 1
803cabdff1aSopenharmony_ci    vextractf128 [outq + 16*7], m1, 1
804cabdff1aSopenharmony_ci
805cabdff1aSopenharmony_ci    RET
806cabdff1aSopenharmony_ci%endmacro
807cabdff1aSopenharmony_ci
; float reads through the map (flag 0), ns_float reads linearly (flag 1)
808cabdff1aSopenharmony_ciFFT16_FN avx,  float,    0
809cabdff1aSopenharmony_ciFFT16_FN avx,  ns_float, 1
810cabdff1aSopenharmony_ciFFT16_FN fma3, float,    0
811cabdff1aSopenharmony_ciFFT16_FN fma3, ns_float, 1
812cabdff1aSopenharmony_ci
; Emit the 32-point function fft32_<%2> for instruction set %1
; %1 is the cpu flag passed to INIT_YMM (avx / fma3)
; %2 is the name suffix
; %3, when non-zero, makes the function read its input linearly with aligned
;    loads; when zero the input is fetched through the lookup table at
;    AVTXContext.map (same polarity as FFT16_FN)
; Structure: FFT8 on input registers 4-7, FFT16 on input registers 0-3, one
; SPLIT_RADIX_COMBINE over both, then regrouping and sixteen 128-bit stores.
; All 16 vector registers are requested, and m8-m15 only exist on x86-64, which
; matches the ARCH_X86_64 guard around the instantiations below.
813cabdff1aSopenharmony_ci%macro FFT32_FN 3
814cabdff1aSopenharmony_ciINIT_YMM %1
815cabdff1aSopenharmony_cicglobal fft32_ %+ %2, 4, 4, 16, ctx, out, in, tmp
816cabdff1aSopenharmony_ci%if %3
817cabdff1aSopenharmony_ci    movaps m4, [inq + 4*mmsize]
818cabdff1aSopenharmony_ci    movaps m5, [inq + 5*mmsize]
819cabdff1aSopenharmony_ci    movaps m6, [inq + 6*mmsize]
820cabdff1aSopenharmony_ci    movaps m7, [inq + 7*mmsize]
821cabdff1aSopenharmony_ci%else
    ; ctxq is repurposed to hold the map pointer for both load groups
822cabdff1aSopenharmony_ci    mov ctxq, [ctxq + AVTXContext.map]
823cabdff1aSopenharmony_ci    LOAD64_LUT m4, inq, ctxq, (mmsize/2)*4, tmpq,  m8, m12
824cabdff1aSopenharmony_ci    LOAD64_LUT m5, inq, ctxq, (mmsize/2)*5, tmpq,  m9, m13
825cabdff1aSopenharmony_ci    LOAD64_LUT m6, inq, ctxq, (mmsize/2)*6, tmpq, m10, m14
826cabdff1aSopenharmony_ci    LOAD64_LUT m7, inq, ctxq, (mmsize/2)*7, tmpq, m11, m15
827cabdff1aSopenharmony_ci%endif
828cabdff1aSopenharmony_ci
829cabdff1aSopenharmony_ci    FFT8 m4, m5, m6, m7, m8, m9
830cabdff1aSopenharmony_ci
831cabdff1aSopenharmony_ci%if %3
832cabdff1aSopenharmony_ci    movaps m0, [inq + 0*mmsize]
833cabdff1aSopenharmony_ci    movaps m1, [inq + 1*mmsize]
834cabdff1aSopenharmony_ci    movaps m2, [inq + 2*mmsize]
835cabdff1aSopenharmony_ci    movaps m3, [inq + 3*mmsize]
836cabdff1aSopenharmony_ci%else
837cabdff1aSopenharmony_ci    LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq,  m8, m12
838cabdff1aSopenharmony_ci    LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq,  m9, m13
839cabdff1aSopenharmony_ci    LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq, m10, m14
840cabdff1aSopenharmony_ci    LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq, m11, m15
841cabdff1aSopenharmony_ci%endif
842cabdff1aSopenharmony_ci
    ; Twiddles for the combine, loaded before FFT16 (which uses m10-m13 as its
    ; extra operands and leaves m8/m9 alone). m9 is read from byte offset
    ; 4*8 - 4*7 of the table with its 128-bit halves exchanged (imm 0x23).
843cabdff1aSopenharmony_ci    movaps m8,         [tab_32_float]
844cabdff1aSopenharmony_ci    vperm2f128 m9, m9, [tab_32_float + 4*8 - 4*7], 0x23
845cabdff1aSopenharmony_ci
846cabdff1aSopenharmony_ci    FFT16 m0, m1, m2, m3, m10, m11, m12, m13
847cabdff1aSopenharmony_ci
    ; NOTE(review): the leading 1 differs from the 0 used by the load/combine
    ; macros earlier in the file; its meaning is defined by
    ; SPLIT_RADIX_COMBINE, which is not visible here
848cabdff1aSopenharmony_ci    SPLIT_RADIX_COMBINE 1, m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, \
849cabdff1aSopenharmony_ci                           m10, m11, m12, m13, m14, m15 ; temporary registers
850cabdff1aSopenharmony_ci
    ; Interleave the 64-bit elements of the pairs (m0, m2), (m1, m3),
    ; (m4, m6) and (m5, m7); low-element results go to m8-m11
851cabdff1aSopenharmony_ci    unpcklpd  m9, m1, m3
852cabdff1aSopenharmony_ci    unpcklpd m10, m5, m7
853cabdff1aSopenharmony_ci    unpcklpd  m8, m0, m2
854cabdff1aSopenharmony_ci    unpcklpd m11, m4, m6
855cabdff1aSopenharmony_ci    unpckhpd  m1, m1, m3
856cabdff1aSopenharmony_ci    unpckhpd  m5, m5, m7
857cabdff1aSopenharmony_ci    unpckhpd  m0, m0, m2
858cabdff1aSopenharmony_ci    unpckhpd  m4, m4, m6
859cabdff1aSopenharmony_ci
    ; First half of the output: pairs (m8, m0) and (m9, m1)
860cabdff1aSopenharmony_ci    vextractf128 [outq + 16* 0],  m8, 0
861cabdff1aSopenharmony_ci    vextractf128 [outq + 16* 1],  m0, 0
862cabdff1aSopenharmony_ci    vextractf128 [outq + 16* 2],  m8, 1
863cabdff1aSopenharmony_ci    vextractf128 [outq + 16* 3],  m0, 1
864cabdff1aSopenharmony_ci    vextractf128 [outq + 16* 4],  m9, 0
865cabdff1aSopenharmony_ci    vextractf128 [outq + 16* 5],  m1, 0
866cabdff1aSopenharmony_ci    vextractf128 [outq + 16* 6],  m9, 1
867cabdff1aSopenharmony_ci    vextractf128 [outq + 16* 7],  m1, 1
868cabdff1aSopenharmony_ci
    ; Second half of the output: pairs (m11, m4) and (m10, m5)
869cabdff1aSopenharmony_ci    vextractf128 [outq + 16* 8], m11, 0
870cabdff1aSopenharmony_ci    vextractf128 [outq + 16* 9],  m4, 0
871cabdff1aSopenharmony_ci    vextractf128 [outq + 16*10], m11, 1
872cabdff1aSopenharmony_ci    vextractf128 [outq + 16*11],  m4, 1
873cabdff1aSopenharmony_ci    vextractf128 [outq + 16*12], m10, 0
874cabdff1aSopenharmony_ci    vextractf128 [outq + 16*13],  m5, 0
875cabdff1aSopenharmony_ci    vextractf128 [outq + 16*14], m10, 1
876cabdff1aSopenharmony_ci    vextractf128 [outq + 16*15],  m5, 1
877cabdff1aSopenharmony_ci
878cabdff1aSopenharmony_ci    RET
879cabdff1aSopenharmony_ci%endmacro
880cabdff1aSopenharmony_ci
; float reads through the map (flag 0), ns_float reads linearly (flag 1)
881cabdff1aSopenharmony_ci%if ARCH_X86_64
882cabdff1aSopenharmony_ciFFT32_FN avx,  float,    0
883cabdff1aSopenharmony_ciFFT32_FN avx,  ns_float, 1
884cabdff1aSopenharmony_ciFFT32_FN fma3, float,    0
885cabdff1aSopenharmony_ciFFT32_FN fma3, ns_float, 1
886cabdff1aSopenharmony_ci%endif
887cabdff1aSopenharmony_ci
; Emit one stage of the split-radix function, starting at a local label named
; after the stage length followed by "pt"
; %1 is the length handled by this stage; it is used as a label and table-name
;    fragment and in the byte-offset arithmetic
; %2 (optional) is the label to continue at when lenq is greater than %1
; With a single argument only the recursion and the rtabq/itabq setup are
; emitted, and execution falls through into whatever follows the invocation.
; Must be expanded inside a function that provides the .32pt and .deinterleave
; labels and the lenq, outq, tmpq, tgtq, rtabq and itabq registers.
888cabdff1aSopenharmony_ci%macro FFT_SPLIT_RADIX_DEF 1-2
889cabdff1aSopenharmony_ciALIGN 16
890cabdff1aSopenharmony_ci.%1 %+ pt:
    ; Re-enter the function body at .32pt twice with lenq temporarily set to a
    ; quarter of this stage's length; the caller's lenq is kept on the stack
891cabdff1aSopenharmony_ci    PUSH lenq
892cabdff1aSopenharmony_ci    mov lenq, (%1/4)
893cabdff1aSopenharmony_ci
894cabdff1aSopenharmony_ci    add outq, (%1*4) - (%1/1)
895cabdff1aSopenharmony_ci    call .32pt
896cabdff1aSopenharmony_ci
897cabdff1aSopenharmony_ci    add outq, (%1*2) - (%1/2) ; the synth loops also increment outq
898cabdff1aSopenharmony_ci    call .32pt
899cabdff1aSopenharmony_ci
    ; Rewind outq. More is subtracted than the two adds above contributed; the
    ; remainder is expected to come from outq increments made inside the calls
900cabdff1aSopenharmony_ci    POP lenq
901cabdff1aSopenharmony_ci    sub outq, (%1*4) + (%1*2) + (%1/2)
902cabdff1aSopenharmony_ci
    ; rtabq starts at the beginning of this stage's table; itabq starts
    ; 4*7 bytes before byte offset %1 of the same table
903cabdff1aSopenharmony_ci    lea rtabq, [tab_ %+ %1 %+ _float]
904cabdff1aSopenharmony_ci    lea itabq, [tab_ %+ %1 %+ _float + %1 - 4*7]
905cabdff1aSopenharmony_ci
906cabdff1aSopenharmony_ci%if %0 > 1
    ; When this stage is the requested length, finish in .deinterleave instead
    ; of running the plain combine loop
907cabdff1aSopenharmony_ci    cmp tgtq, %1
908cabdff1aSopenharmony_ci    je .deinterleave
909cabdff1aSopenharmony_ci
    ; tmpq counts down from the stage length in steps of 4*mmsize, one step
    ; per combine; jg exits once it reaches zero or below
910cabdff1aSopenharmony_ci    mov tmpq, %1
911cabdff1aSopenharmony_ci
912cabdff1aSopenharmony_ci.synth_ %+ %1:
913cabdff1aSopenharmony_ci    SPLIT_RADIX_LOAD_COMBINE_FULL 2*%1, 6*%1, 0, 0, 0
914cabdff1aSopenharmony_ci    add outq, 8*mmsize
915cabdff1aSopenharmony_ci    add rtabq, 4*mmsize
916cabdff1aSopenharmony_ci    sub itabq, 4*mmsize
917cabdff1aSopenharmony_ci    sub tmpq, 4*mmsize
918cabdff1aSopenharmony_ci    jg .synth_ %+ %1
919cabdff1aSopenharmony_ci
    ; Continue with the next larger stage if lenq asks for it, otherwise
    ; return to whoever entered this stage
920cabdff1aSopenharmony_ci    cmp lenq, %1
921cabdff1aSopenharmony_ci    jg %2 ; can't do math here, nasm doesn't get it
922cabdff1aSopenharmony_ci    ret
923cabdff1aSopenharmony_ci%endif
924cabdff1aSopenharmony_ci%endmacro
925cabdff1aSopenharmony_ci
926cabdff1aSopenharmony_ci%macro FFT_SPLIT_RADIX_FN 3
927cabdff1aSopenharmony_ciINIT_YMM %1
928cabdff1aSopenharmony_cicglobal fft_sr_ %+ %2, 4, 8, 16, 272, lut, out, in, len, tmp, itab, rtab, tgt
929cabdff1aSopenharmony_ci    movsxd lenq, dword [lutq + AVTXContext.len]
930cabdff1aSopenharmony_ci    mov lutq, [lutq + AVTXContext.map]
931cabdff1aSopenharmony_ci    mov tgtq, lenq
932cabdff1aSopenharmony_ci
933cabdff1aSopenharmony_ci; Bottom-most/32-point transform ===============================================
934cabdff1aSopenharmony_ciALIGN 16
935cabdff1aSopenharmony_ci.32pt:
936cabdff1aSopenharmony_ci%if %3
937cabdff1aSopenharmony_ci    movaps m4, [inq + 4*mmsize]
938cabdff1aSopenharmony_ci    movaps m5, [inq + 5*mmsize]
939cabdff1aSopenharmony_ci    movaps m6, [inq + 6*mmsize]
940cabdff1aSopenharmony_ci    movaps m7, [inq + 7*mmsize]
941cabdff1aSopenharmony_ci%else
942cabdff1aSopenharmony_ci    LOAD64_LUT m4, inq, lutq, (mmsize/2)*4, tmpq,  m8, m12
943cabdff1aSopenharmony_ci    LOAD64_LUT m5, inq, lutq, (mmsize/2)*5, tmpq,  m9, m13
944cabdff1aSopenharmony_ci    LOAD64_LUT m6, inq, lutq, (mmsize/2)*6, tmpq, m10, m14
945cabdff1aSopenharmony_ci    LOAD64_LUT m7, inq, lutq, (mmsize/2)*7, tmpq, m11, m15
946cabdff1aSopenharmony_ci%endif
947cabdff1aSopenharmony_ci
948cabdff1aSopenharmony_ci    FFT8 m4, m5, m6, m7, m8, m9
949cabdff1aSopenharmony_ci
950cabdff1aSopenharmony_ci%if %3
951cabdff1aSopenharmony_ci    movaps m0, [inq + 0*mmsize]
952cabdff1aSopenharmony_ci    movaps m1, [inq + 1*mmsize]
953cabdff1aSopenharmony_ci    movaps m2, [inq + 2*mmsize]
954cabdff1aSopenharmony_ci    movaps m3, [inq + 3*mmsize]
955cabdff1aSopenharmony_ci%else
956cabdff1aSopenharmony_ci    LOAD64_LUT m0, inq, lutq, (mmsize/2)*0, tmpq,  m8, m12
957cabdff1aSopenharmony_ci    LOAD64_LUT m1, inq, lutq, (mmsize/2)*1, tmpq,  m9, m13
958cabdff1aSopenharmony_ci    LOAD64_LUT m2, inq, lutq, (mmsize/2)*2, tmpq, m10, m14
959cabdff1aSopenharmony_ci    LOAD64_LUT m3, inq, lutq, (mmsize/2)*3, tmpq, m11, m15
960cabdff1aSopenharmony_ci%endif
961cabdff1aSopenharmony_ci
962cabdff1aSopenharmony_ci    movaps m8,         [tab_32_float]
963cabdff1aSopenharmony_ci    vperm2f128 m9, m9, [tab_32_float + 32 - 4*7], 0x23
964cabdff1aSopenharmony_ci
965cabdff1aSopenharmony_ci    FFT16 m0, m1, m2, m3, m10, m11, m12, m13
966cabdff1aSopenharmony_ci
967cabdff1aSopenharmony_ci    SPLIT_RADIX_COMBINE 1, m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, \
968cabdff1aSopenharmony_ci                           m10, m11, m12, m13, m14, m15 ; temporary registers
969cabdff1aSopenharmony_ci
970cabdff1aSopenharmony_ci    movaps [outq + 1*mmsize], m1
971cabdff1aSopenharmony_ci    movaps [outq + 3*mmsize], m3
972cabdff1aSopenharmony_ci    movaps [outq + 5*mmsize], m5
973cabdff1aSopenharmony_ci    movaps [outq + 7*mmsize], m7
974cabdff1aSopenharmony_ci
975cabdff1aSopenharmony_ci%if %3
976cabdff1aSopenharmony_ci    add inq, 8*mmsize
977cabdff1aSopenharmony_ci%else
978cabdff1aSopenharmony_ci    add lutq, (mmsize/2)*8
979cabdff1aSopenharmony_ci%endif
980cabdff1aSopenharmony_ci    cmp lenq, 32
981cabdff1aSopenharmony_ci    jg .64pt
982cabdff1aSopenharmony_ci
983cabdff1aSopenharmony_ci    movaps [outq + 0*mmsize], m0
984cabdff1aSopenharmony_ci    movaps [outq + 2*mmsize], m2
985cabdff1aSopenharmony_ci    movaps [outq + 4*mmsize], m4
986cabdff1aSopenharmony_ci    movaps [outq + 6*mmsize], m6
987cabdff1aSopenharmony_ci
988cabdff1aSopenharmony_ci    ret
989cabdff1aSopenharmony_ci
990cabdff1aSopenharmony_ci; 64-point transform ===========================================================
991cabdff1aSopenharmony_ciALIGN 16
992cabdff1aSopenharmony_ci.64pt:
993cabdff1aSopenharmony_ci; Helper defines, these make it easier to track what's happening
994cabdff1aSopenharmony_ci%define tx1_e0 m4
995cabdff1aSopenharmony_ci%define tx1_e1 m5
996cabdff1aSopenharmony_ci%define tx1_o0 m6
997cabdff1aSopenharmony_ci%define tx1_o1 m7
998cabdff1aSopenharmony_ci%define tx2_e0 m8
999cabdff1aSopenharmony_ci%define tx2_e1 m9
1000cabdff1aSopenharmony_ci%define tx2_o0 m10
1001cabdff1aSopenharmony_ci%define tx2_o1 m11
1002cabdff1aSopenharmony_ci%define tw_e m12
1003cabdff1aSopenharmony_ci%define tw_o m13
1004cabdff1aSopenharmony_ci%define tmp1 m14
1005cabdff1aSopenharmony_ci%define tmp2 m15
1006cabdff1aSopenharmony_ci
1007cabdff1aSopenharmony_ci    SWAP m4, m1
1008cabdff1aSopenharmony_ci    SWAP m6, m3
1009cabdff1aSopenharmony_ci
1010cabdff1aSopenharmony_ci%if %3
1011cabdff1aSopenharmony_ci    movaps tx1_e0, [inq + 0*mmsize]
1012cabdff1aSopenharmony_ci    movaps tx1_e1, [inq + 1*mmsize]
1013cabdff1aSopenharmony_ci    movaps tx1_o0, [inq + 2*mmsize]
1014cabdff1aSopenharmony_ci    movaps tx1_o1, [inq + 3*mmsize]
1015cabdff1aSopenharmony_ci%else
1016cabdff1aSopenharmony_ci    LOAD64_LUT tx1_e0, inq, lutq, (mmsize/2)*0, tmpq, tw_e, tmp1
1017cabdff1aSopenharmony_ci    LOAD64_LUT tx1_e1, inq, lutq, (mmsize/2)*1, tmpq, tw_o, tmp2
1018cabdff1aSopenharmony_ci    LOAD64_LUT tx1_o0, inq, lutq, (mmsize/2)*2, tmpq, tw_e, tmp1
1019cabdff1aSopenharmony_ci    LOAD64_LUT tx1_o1, inq, lutq, (mmsize/2)*3, tmpq, tw_o, tmp2
1020cabdff1aSopenharmony_ci%endif
1021cabdff1aSopenharmony_ci
1022cabdff1aSopenharmony_ci    FFT16 tx1_e0, tx1_e1, tx1_o0, tx1_o1, tw_e, tw_o, tx2_o0, tx2_o1
1023cabdff1aSopenharmony_ci
1024cabdff1aSopenharmony_ci%if %3
1025cabdff1aSopenharmony_ci    movaps tx2_e0, [inq + 4*mmsize]
1026cabdff1aSopenharmony_ci    movaps tx2_e1, [inq + 5*mmsize]
1027cabdff1aSopenharmony_ci    movaps tx2_o0, [inq + 6*mmsize]
1028cabdff1aSopenharmony_ci    movaps tx2_o1, [inq + 7*mmsize]
1029cabdff1aSopenharmony_ci%else
1030cabdff1aSopenharmony_ci    LOAD64_LUT tx2_e0, inq, lutq, (mmsize/2)*4, tmpq, tw_e, tmp1
1031cabdff1aSopenharmony_ci    LOAD64_LUT tx2_e1, inq, lutq, (mmsize/2)*5, tmpq, tw_o, tmp2
1032cabdff1aSopenharmony_ci    LOAD64_LUT tx2_o0, inq, lutq, (mmsize/2)*6, tmpq, tw_e, tmp1
1033cabdff1aSopenharmony_ci    LOAD64_LUT tx2_o1, inq, lutq, (mmsize/2)*7, tmpq, tw_o, tmp2
1034cabdff1aSopenharmony_ci%endif
1035cabdff1aSopenharmony_ci
1036cabdff1aSopenharmony_ci    FFT16 tx2_e0, tx2_e1, tx2_o0, tx2_o1, tmp1, tmp2, tw_e, tw_o
1037cabdff1aSopenharmony_ci
1038cabdff1aSopenharmony_ci    movaps tw_e,           [tab_64_float]
1039cabdff1aSopenharmony_ci    vperm2f128 tw_o, tw_o, [tab_64_float + 64 - 4*7], 0x23
1040cabdff1aSopenharmony_ci
1041cabdff1aSopenharmony_ci%if %3
1042cabdff1aSopenharmony_ci    add inq, 8*mmsize
1043cabdff1aSopenharmony_ci%else
1044cabdff1aSopenharmony_ci    add lutq, (mmsize/2)*8
1045cabdff1aSopenharmony_ci%endif
1046cabdff1aSopenharmony_ci    cmp tgtq, 64
1047cabdff1aSopenharmony_ci    je .deinterleave
1048cabdff1aSopenharmony_ci
1049cabdff1aSopenharmony_ci    SPLIT_RADIX_COMBINE_64
1050cabdff1aSopenharmony_ci
1051cabdff1aSopenharmony_ci    cmp lenq, 64
1052cabdff1aSopenharmony_ci    jg .128pt
1053cabdff1aSopenharmony_ci    ret
1054cabdff1aSopenharmony_ci
1055cabdff1aSopenharmony_ci; 128-point transform ==========================================================
1056cabdff1aSopenharmony_ciALIGN 16
1057cabdff1aSopenharmony_ci.128pt:
1058cabdff1aSopenharmony_ci    PUSH lenq
1059cabdff1aSopenharmony_ci    mov lenq, 32
1060cabdff1aSopenharmony_ci
1061cabdff1aSopenharmony_ci    add outq, 16*mmsize
1062cabdff1aSopenharmony_ci    call .32pt
1063cabdff1aSopenharmony_ci
1064cabdff1aSopenharmony_ci    add outq, 8*mmsize
1065cabdff1aSopenharmony_ci    call .32pt
1066cabdff1aSopenharmony_ci
1067cabdff1aSopenharmony_ci    POP lenq
1068cabdff1aSopenharmony_ci    sub outq, 24*mmsize
1069cabdff1aSopenharmony_ci
1070cabdff1aSopenharmony_ci    lea rtabq, [tab_128_float]
1071cabdff1aSopenharmony_ci    lea itabq, [tab_128_float + 128 - 4*7]
1072cabdff1aSopenharmony_ci
1073cabdff1aSopenharmony_ci    cmp tgtq, 128
1074cabdff1aSopenharmony_ci    je .deinterleave
1075cabdff1aSopenharmony_ci
1076cabdff1aSopenharmony_ci    SPLIT_RADIX_LOAD_COMBINE_FULL 2*128, 6*128
1077cabdff1aSopenharmony_ci
1078cabdff1aSopenharmony_ci    cmp lenq, 128
1079cabdff1aSopenharmony_ci    jg .256pt
1080cabdff1aSopenharmony_ci    ret
1081cabdff1aSopenharmony_ci
1082cabdff1aSopenharmony_ci; 256-point transform ==========================================================
1083cabdff1aSopenharmony_ciALIGN 16
1084cabdff1aSopenharmony_ci.256pt:                                 ; entered through `jg .256pt` when lenq > 128
1085cabdff1aSopenharmony_ci    PUSH lenq                           ; keep the requested length across the nested calls
1086cabdff1aSopenharmony_ci    mov lenq, 64                        ; length seen by each `call .32pt` below
1087cabdff1aSopenharmony_ci    ; Two nested sub-transforms, at outq + 32*mmsize and a further 16*mmsize on.
1088cabdff1aSopenharmony_ci    add outq, 32*mmsize
1089cabdff1aSopenharmony_ci    call .32pt
1090cabdff1aSopenharmony_ci
1091cabdff1aSopenharmony_ci    add outq, 16*mmsize
1092cabdff1aSopenharmony_ci    call .32pt
1093cabdff1aSopenharmony_ci
1094cabdff1aSopenharmony_ci    POP lenq                            ; back to the length this stage was entered with
1095cabdff1aSopenharmony_ci    sub outq, 48*mmsize                 ; undo both adds above (32 + 16)
1096cabdff1aSopenharmony_ci    ; Table pointers consumed by the combine macros (presumably twiddle factors - confirm).
1097cabdff1aSopenharmony_ci    lea rtabq, [tab_256_float]          ; rtabq = start of tab_256_float
1098cabdff1aSopenharmony_ci    lea itabq, [tab_256_float + 256 - 4*7] ; itabq = 228 bytes into the same table
1099cabdff1aSopenharmony_ci
1100cabdff1aSopenharmony_ci    cmp tgtq, 256
1101cabdff1aSopenharmony_ci    je .deinterleave                    ; tgtq == 256: finish in the synthesis + deinterleave path
1102cabdff1aSopenharmony_ci    ; NOTE(review): extra args of the 2nd call look like outq/rtabq/itabq offsets - confirm in macro.
1103cabdff1aSopenharmony_ci    SPLIT_RADIX_LOAD_COMBINE_FULL 2*256, 6*256
1104cabdff1aSopenharmony_ci    SPLIT_RADIX_LOAD_COMBINE_FULL 2*256, 6*256, 8*mmsize, 4*mmsize, -4*mmsize
1105cabdff1aSopenharmony_ci
1106cabdff1aSopenharmony_ci    cmp lenq, 256
1107cabdff1aSopenharmony_ci    jg .512pt                           ; signed compare: lenq > 256 continues with the next stage
1108cabdff1aSopenharmony_ci    ret                                 ; lenq <= 256: nothing larger requested, return
1109cabdff1aSopenharmony_ci
1110cabdff1aSopenharmony_ci; 512-point transform ==========================================================
1111cabdff1aSopenharmony_ciALIGN 16
1112cabdff1aSopenharmony_ci.512pt:                                 ; entered through `jg .512pt` when lenq > 256
1113cabdff1aSopenharmony_ci    PUSH lenq                           ; keep the requested length across the nested calls
1114cabdff1aSopenharmony_ci    mov lenq, 128                       ; length seen by each `call .32pt` below
1115cabdff1aSopenharmony_ci    ; Two nested sub-transforms, at outq + 64*mmsize and a further 32*mmsize on.
1116cabdff1aSopenharmony_ci    add outq, 64*mmsize
1117cabdff1aSopenharmony_ci    call .32pt
1118cabdff1aSopenharmony_ci
1119cabdff1aSopenharmony_ci    add outq, 32*mmsize
1120cabdff1aSopenharmony_ci    call .32pt
1121cabdff1aSopenharmony_ci
1122cabdff1aSopenharmony_ci    POP lenq                            ; back to the length this stage was entered with
1123cabdff1aSopenharmony_ci    sub outq, 96*mmsize                 ; undo both adds above (64 + 32)
1124cabdff1aSopenharmony_ci    ; Table pointers consumed by the combine macro (presumably twiddle factors - confirm).
1125cabdff1aSopenharmony_ci    lea rtabq, [tab_512_float]          ; rtabq = start of tab_512_float, stepped upwards below
1126cabdff1aSopenharmony_ci    lea itabq, [tab_512_float + 512 - 4*7] ; itabq = 484 bytes into the table, stepped downwards below
1127cabdff1aSopenharmony_ci
1128cabdff1aSopenharmony_ci    cmp tgtq, 512
1129cabdff1aSopenharmony_ci    je .deinterleave                    ; tgtq == 512: finish in the synthesis + deinterleave path
1130cabdff1aSopenharmony_ci
1131cabdff1aSopenharmony_ci    mov tmpq, 4                         ; pass counter for the synthesis loop
1132cabdff1aSopenharmony_ci
1133cabdff1aSopenharmony_ci.synth_512:
1134cabdff1aSopenharmony_ci    SPLIT_RADIX_LOAD_COMBINE_FULL 2*512, 6*512
1135cabdff1aSopenharmony_ci    add outq, 8*mmsize                  ; next block of output
1136cabdff1aSopenharmony_ci    add rtabq, 4*mmsize                 ; rtabq walks forwards
1137cabdff1aSopenharmony_ci    sub itabq, 4*mmsize                 ; itabq walks backwards
1138cabdff1aSopenharmony_ci    sub tmpq, 1
1139cabdff1aSopenharmony_ci    jg .synth_512                       ; repeat while tmpq is still > 0: 4 passes in total
1140cabdff1aSopenharmony_ci    ; The loop's own adds leave outq 4 * 8*mmsize = 32*mmsize further on; it is not rewound here.
1141cabdff1aSopenharmony_ci    cmp lenq, 512
1142cabdff1aSopenharmony_ci    jg .1024pt                          ; signed compare: lenq > 512 continues with the next stage
1143cabdff1aSopenharmony_ci    ret                                 ; lenq <= 512: nothing larger requested, return
1144cabdff1aSopenharmony_ci
1145cabdff1aSopenharmony_ci; 1024-point transform ==========================================================
1146cabdff1aSopenharmony_ciALIGN 16
1147cabdff1aSopenharmony_ci.1024pt:                                ; entered through `jg .1024pt` when lenq > 512
1148cabdff1aSopenharmony_ci    PUSH lenq                           ; keep the requested length across the nested calls
1149cabdff1aSopenharmony_ci    mov lenq, 256                       ; length seen by each `call .32pt` below
1150cabdff1aSopenharmony_ci    ; Two nested sub-transforms; outq is moved by 96*mmsize, then by another 64*mmsize.
1151cabdff1aSopenharmony_ci    add outq, 96*mmsize
1152cabdff1aSopenharmony_ci    call .32pt
1153cabdff1aSopenharmony_ci
1154cabdff1aSopenharmony_ci    add outq, 64*mmsize
1155cabdff1aSopenharmony_ci    call .32pt
1156cabdff1aSopenharmony_ci    ; NOTE(review): 96 + 64 = 160 but 192 is subtracted; the extra 32*mmsize equals what the .synth_512 loop leaves on outq - confirm.
1157cabdff1aSopenharmony_ci    POP lenq                            ; back to the length this stage was entered with
1158cabdff1aSopenharmony_ci    sub outq, 192*mmsize
1159cabdff1aSopenharmony_ci    ; Table pointers consumed by the combine macro (presumably twiddle factors - confirm).
1160cabdff1aSopenharmony_ci    lea rtabq, [tab_1024_float]         ; rtabq = start of tab_1024_float, stepped upwards below
1161cabdff1aSopenharmony_ci    lea itabq, [tab_1024_float + 1024 - 4*7] ; itabq = 996 bytes into the table, stepped downwards below
1162cabdff1aSopenharmony_ci
1163cabdff1aSopenharmony_ci    cmp tgtq, 1024
1164cabdff1aSopenharmony_ci    je .deinterleave                    ; tgtq == 1024: finish in the synthesis + deinterleave path
1165cabdff1aSopenharmony_ci
1166cabdff1aSopenharmony_ci    mov tmpq, 8                         ; pass counter for the synthesis loop
1167cabdff1aSopenharmony_ci
1168cabdff1aSopenharmony_ci.synth_1024:
1169cabdff1aSopenharmony_ci    SPLIT_RADIX_LOAD_COMBINE_FULL 2*1024, 6*1024
1170cabdff1aSopenharmony_ci    add outq, 8*mmsize                  ; next block of output
1171cabdff1aSopenharmony_ci    add rtabq, 4*mmsize                 ; rtabq walks forwards
1172cabdff1aSopenharmony_ci    sub itabq, 4*mmsize                 ; itabq walks backwards
1173cabdff1aSopenharmony_ci    sub tmpq, 1
1174cabdff1aSopenharmony_ci    jg .synth_1024                      ; repeat while tmpq is still > 0: 8 passes in total
1175cabdff1aSopenharmony_ci    ; The loop's own adds leave outq 8 * 8*mmsize = 64*mmsize further on; it is not rewound here.
1176cabdff1aSopenharmony_ci    cmp lenq, 1024
1177cabdff1aSopenharmony_ci    jg .2048pt                          ; signed compare: lenq > 1024 continues with the next stage
1178cabdff1aSopenharmony_ci    ret                                 ; lenq <= 1024: nothing larger requested, return
1179cabdff1aSopenharmony_ci
1180cabdff1aSopenharmony_ci; 2048 to 131072-point transforms ==============================================
1181cabdff1aSopenharmony_ciFFT_SPLIT_RADIX_DEF 2048,  .4096pt      ; presumably defines .2048pt (target of `jg .2048pt`) - macro body not shown here
1182cabdff1aSopenharmony_ciFFT_SPLIT_RADIX_DEF 4096,  .8192pt      ; NOTE(review): 2nd argument looks like the label of the next larger stage - confirm
1183cabdff1aSopenharmony_ciFFT_SPLIT_RADIX_DEF 8192,  .16384pt
1184cabdff1aSopenharmony_ciFFT_SPLIT_RADIX_DEF 16384, .32768pt
1185cabdff1aSopenharmony_ciFFT_SPLIT_RADIX_DEF 32768, .65536pt
1186cabdff1aSopenharmony_ciFFT_SPLIT_RADIX_DEF 65536, .131072pt
1187cabdff1aSopenharmony_ciFFT_SPLIT_RADIX_DEF 131072              ; largest size: invoked without a 2nd argument
1188cabdff1aSopenharmony_ci
1189cabdff1aSopenharmony_ci;===============================================================================
1190cabdff1aSopenharmony_ci; Final synthesis + deinterleaving code
1191cabdff1aSopenharmony_ci;===============================================================================
1192cabdff1aSopenharmony_ci.deinterleave:                          ; target of the `cmp tgtq, N` / `je .deinterleave` checks in the stages
1193cabdff1aSopenharmony_ci    cmp lenq, 64
1194cabdff1aSopenharmony_ci    je .64pt_deint                      ; lenq == 64 has its own unrolled path
1195cabdff1aSopenharmony_ci    ; Run-time counterparts of the constant 2*N, 6*N arguments the per-stage combine macros receive.
1196cabdff1aSopenharmony_ci    imul tmpq, lenq, 2                  ; tmpq = 2*lenq
1197cabdff1aSopenharmony_ci    lea lutq, [4*lenq + tmpq]           ; lutq = 6*lenq (lutq only holds this value here)
1198cabdff1aSopenharmony_ci
1199cabdff1aSopenharmony_ci.synth_deinterleave:
1200cabdff1aSopenharmony_ci    SPLIT_RADIX_COMBINE_DEINTERLEAVE_FULL tmpq, lutq
1201cabdff1aSopenharmony_ci    add outq, 8*mmsize                  ; next block of output
1202cabdff1aSopenharmony_ci    add rtabq, 4*mmsize                 ; rtabq walks forwards
1203cabdff1aSopenharmony_ci    sub itabq, 4*mmsize                 ; itabq walks backwards
1204cabdff1aSopenharmony_ci    sub lenq, 4*mmsize                  ; lenq doubles as the loop counter, reduced by 4*mmsize per pass
1205cabdff1aSopenharmony_ci    jg .synth_deinterleave              ; repeat while the remainder is still > 0 (signed)
1206cabdff1aSopenharmony_ci
1207cabdff1aSopenharmony_ci    RET                                 ; upper-case RET here, unlike the lower-case `ret` ending each stage
1208cabdff1aSopenharmony_ci
1209cabdff1aSopenharmony_ci; 64-point deinterleave which only has to load 4 registers =====================
1210cabdff1aSopenharmony_ci.64pt_deint:                            ; reached from .deinterleave when lenq == 64
1211cabdff1aSopenharmony_ci    SPLIT_RADIX_COMBINE_LITE 1, m0, m1, tx1_e0, tx2_e0, tw_e, tw_o, tmp1, tmp2
1212cabdff1aSopenharmony_ci    SPLIT_RADIX_COMBINE_HALF 0, m2, m3, tx1_o0, tx2_o0, tw_e, tw_o, tmp1, tmp2, tw_e
1213cabdff1aSopenharmony_ci    ; Interleave 64-bit elements (presumably one re/im float pair each - confirm) of matching registers.
1214cabdff1aSopenharmony_ci    unpcklpd tmp1, m0, m2               ; low element of every 128-bit lane of both sources
1215cabdff1aSopenharmony_ci    unpcklpd tmp2, m1, m3
1216cabdff1aSopenharmony_ci    unpcklpd tw_o, tx1_e0, tx1_o0       ; tw_o/tw_e are reused as temporaries; reloaded from the table below
1217cabdff1aSopenharmony_ci    unpcklpd tw_e, tx2_e0, tx2_o0
1218cabdff1aSopenharmony_ci    unpckhpd m0, m0, m2                 ; high element of every lane, written back over the first source
1219cabdff1aSopenharmony_ci    unpckhpd m1, m1, m3
1220cabdff1aSopenharmony_ci    unpckhpd tx1_e0, tx1_e0, tx1_o0
1221cabdff1aSopenharmony_ci    unpckhpd tx2_e0, tx2_e0, tx2_o0
1222cabdff1aSopenharmony_ci    ; Slots 0 and 4 (slot k = outq + k*mmsize) only take lane 0 here; lane 1 goes to slots 1 and 5 below.
1223cabdff1aSopenharmony_ci    vextractf128 [outq +  0*mmsize +  0], tmp1,   0
1224cabdff1aSopenharmony_ci    vextractf128 [outq +  0*mmsize + 16], m0,     0
1225cabdff1aSopenharmony_ci    vextractf128 [outq +  4*mmsize +  0], tmp2,   0
1226cabdff1aSopenharmony_ci    vextractf128 [outq +  4*mmsize + 16], m1,     0
1227cabdff1aSopenharmony_ci
1228cabdff1aSopenharmony_ci    vextractf128 [outq +  8*mmsize +  0], tw_o,   0
1229cabdff1aSopenharmony_ci    vextractf128 [outq +  8*mmsize + 16], tx1_e0, 0
1230cabdff1aSopenharmony_ci    vextractf128 [outq +  9*mmsize +  0], tw_o,   1
1231cabdff1aSopenharmony_ci    vextractf128 [outq +  9*mmsize + 16], tx1_e0, 1
1232cabdff1aSopenharmony_ci    ; 0x31: result = { high lane of 1st source, high lane of 2nd source }.
1233cabdff1aSopenharmony_ci    vperm2f128 tmp1, tmp1, m0, 0x31
1234cabdff1aSopenharmony_ci    vperm2f128 tmp2, tmp2, m1, 0x31
1235cabdff1aSopenharmony_ci
1236cabdff1aSopenharmony_ci    vextractf128 [outq + 12*mmsize +  0], tw_e,   0
1237cabdff1aSopenharmony_ci    vextractf128 [outq + 12*mmsize + 16], tx2_e0, 0
1238cabdff1aSopenharmony_ci    vextractf128 [outq + 13*mmsize +  0], tw_e,   1
1239cabdff1aSopenharmony_ci    vextractf128 [outq + 13*mmsize + 16], tx2_e0, 1
1240cabdff1aSopenharmony_ci    ; Reload tw_e/tw_o from tab_64_float; 0x23 takes both lanes from the memory operand, swapped.
1241cabdff1aSopenharmony_ci    movaps tw_e,           [tab_64_float + mmsize]
1242cabdff1aSopenharmony_ci    vperm2f128 tw_o, tw_o, [tab_64_float + 64 - 4*7 - mmsize], 0x23
1243cabdff1aSopenharmony_ci    ; Read slots 1, 3, 5, 7 before slots 1 and 5 are overwritten with tmp1/tmp2.
1244cabdff1aSopenharmony_ci    movaps m0, [outq +  1*mmsize]
1245cabdff1aSopenharmony_ci    movaps m1, [outq +  3*mmsize]
1246cabdff1aSopenharmony_ci    movaps m2, [outq +  5*mmsize]
1247cabdff1aSopenharmony_ci    movaps m3, [outq +  7*mmsize]
1248cabdff1aSopenharmony_ci
1249cabdff1aSopenharmony_ci    movaps [outq +  1*mmsize], tmp1     ; high lanes gathered by the 0x31 permutes above
1250cabdff1aSopenharmony_ci    movaps [outq +  5*mmsize], tmp2
1251cabdff1aSopenharmony_ci
1252cabdff1aSopenharmony_ci    SPLIT_RADIX_COMBINE 0, m0, m2, m1, m3, tx1_e1, tx2_e1, tx1_o1, tx2_o1, tw_e, tw_o, \
1253cabdff1aSopenharmony_ci                           tmp1, tmp2, tx2_o0, tx1_o0, tx2_e0, tx1_e0 ; temporary registers
1254cabdff1aSopenharmony_ci    ; Same interleave-then-store pattern for the second set; fills slots 2, 3, 6, 7, 10, 11, 14, 15.
1255cabdff1aSopenharmony_ci    unpcklpd tmp1, m0, m1
1256cabdff1aSopenharmony_ci    unpcklpd tmp2, m2, m3
1257cabdff1aSopenharmony_ci    unpcklpd tw_e, tx1_e1, tx1_o1
1258cabdff1aSopenharmony_ci    unpcklpd tw_o, tx2_e1, tx2_o1
1259cabdff1aSopenharmony_ci    unpckhpd m0, m0, m1
1260cabdff1aSopenharmony_ci    unpckhpd m2, m2, m3
1261cabdff1aSopenharmony_ci    unpckhpd tx1_e1, tx1_e1, tx1_o1
1262cabdff1aSopenharmony_ci    unpckhpd tx2_e1, tx2_e1, tx2_o1
1263cabdff1aSopenharmony_ci
1264cabdff1aSopenharmony_ci    vextractf128 [outq +  2*mmsize +  0], tmp1,   0
1265cabdff1aSopenharmony_ci    vextractf128 [outq +  2*mmsize + 16], m0,     0
1266cabdff1aSopenharmony_ci    vextractf128 [outq +  3*mmsize +  0], tmp1,   1
1267cabdff1aSopenharmony_ci    vextractf128 [outq +  3*mmsize + 16], m0,     1
1268cabdff1aSopenharmony_ci
1269cabdff1aSopenharmony_ci    vextractf128 [outq +  6*mmsize +  0], tmp2,   0
1270cabdff1aSopenharmony_ci    vextractf128 [outq +  6*mmsize + 16], m2,     0
1271cabdff1aSopenharmony_ci    vextractf128 [outq +  7*mmsize +  0], tmp2,   1
1272cabdff1aSopenharmony_ci    vextractf128 [outq +  7*mmsize + 16], m2,     1
1273cabdff1aSopenharmony_ci
1274cabdff1aSopenharmony_ci    vextractf128 [outq + 10*mmsize +  0], tw_e,   0
1275cabdff1aSopenharmony_ci    vextractf128 [outq + 10*mmsize + 16], tx1_e1, 0
1276cabdff1aSopenharmony_ci    vextractf128 [outq + 11*mmsize +  0], tw_e,   1
1277cabdff1aSopenharmony_ci    vextractf128 [outq + 11*mmsize + 16], tx1_e1, 1
1278cabdff1aSopenharmony_ci
1279cabdff1aSopenharmony_ci    vextractf128 [outq + 14*mmsize +  0], tw_o,   0
1280cabdff1aSopenharmony_ci    vextractf128 [outq + 14*mmsize + 16], tx2_e1, 0
1281cabdff1aSopenharmony_ci    vextractf128 [outq + 15*mmsize +  0], tw_o,   1
1282cabdff1aSopenharmony_ci    vextractf128 [outq + 15*mmsize + 16], tx2_e1, 1
1283cabdff1aSopenharmony_ci    ; Slots 0..15 have now all been written by this path.
1284cabdff1aSopenharmony_ci    RET
1285cabdff1aSopenharmony_ci%endmacro
1286cabdff1aSopenharmony_ci
1287cabdff1aSopenharmony_ci%if ARCH_X86_64
1288cabdff1aSopenharmony_ciFFT_SPLIT_RADIX_FN fma3, float,    0    ; one expansion of the macro body per (instruction set, name) pair
1289cabdff1aSopenharmony_ciFFT_SPLIT_RADIX_FN fma3, ns_float, 1    ; NOTE(review): 3rd argument is 1 only for the ns_float variants - meaning is set in the macro header, confirm there
1290cabdff1aSopenharmony_ci%if HAVE_AVX2_EXTERNAL
1291cabdff1aSopenharmony_ciFFT_SPLIT_RADIX_FN avx2, float,    0
1292cabdff1aSopenharmony_ciFFT_SPLIT_RADIX_FN avx2, ns_float, 1
1293cabdff1aSopenharmony_ci%endif ; HAVE_AVX2_EXTERNAL
1294cabdff1aSopenharmony_ci%endif ; ARCH_X86_64
1295