/*
 * ARM NEON optimised FFT
 *
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2009 Naotoshi Nojiri
 * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
 *
 * This algorithm (though not any of the implementation details) is
 * based on libdjbfft by D. J. Bernstein.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

/* 1/sqrt(2); the mppm constant table is built from it with alternating signs. */
#define M_SQRT1_2 0.70710678118654752440

/*
 * transpose d0, d1, s0, s1
 *
 * d0 = trn1(s0, s1): the even-numbered elements of s0 and s1, interleaved.
 * d1 = trn2(s0, s1): the odd-numbered elements of s0 and s1, interleaved.
 *
 * The operands carry their own arrangement specifier. Every use in this file
 * passes .2d, which gives d0 = {s0[0], s1[0]} and d1 = {s0[1], s1[1]}.
 *
 * d0 is written before the sources are read for the second time, so d0 must
 * not be the same register as s0 or s1.
 */
.macro transpose d0, d1, s0, s1
        trn1            \d0, \s0, \s1
        trn2            \d1, \s0, \s1
.endm


/*
 * fft4_neon: in-place 4-point complex FFT.
 *
 * In:     x0 = pointer to 4 complex values stored as {re, im} float pairs
 *              (32 bytes); read and written in place, x0 is not modified
 * Clobb:  v0-v7, v16, v17
 *
 * With a = z0 + z1, b = z0 - z1, c = z2 + z3, d = z2 - z3 the stores are
 *   z[0] = a + c    z[1] = b - i*d    z[2] = a - c    z[3] = b + i*d
 *
 * AARCH64_VALID_JUMP_TARGET comes from asm.S; presumably it marks the entry
 * as a legal indirect-branch target, since ff_fft_calc_neon dispatches with
 * br -- confirm against asm.S.
 */
function fft4_neon
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.2s,v1.2s,v2.2s,v3.2s}, [x0]    // v0..v3 = z0..z3

        fadd            v4.2s,  v0.2s,  v1.2s   // r0+r1,i0+i1
        fsub            v6.2s,  v0.2s,  v1.2s   // r0-r1,i0-i1

        ext             v16.8b, v2.8b,  v3.8b,  #4      // i2,r3
        ext             v17.8b, v3.8b,  v2.8b,  #4      // i3,r2

        fadd            v5.2s,  v2.2s,  v3.2s   // r2+r3,i2+i3
        fsub            v7.2s,  v16.2s, v17.2s  // i2-i3,r3-r2  (= -i * (z2-z3))

        fadd            v0.2s,  v4.2s,  v5.2s
        fsub            v2.2s,  v4.2s,  v5.2s
        fadd            v1.2s,  v6.2s,  v7.2s
        fsub            v3.2s,  v6.2s,  v7.2s

        st1             {v0.2s,v1.2s,v2.2s,v3.2s}, [x0]

        ret
endfunc

/*
 * fft8_neon: in-place 8-point complex FFT.
 *
 * In:     x0  = pointer to 8 complex {re, im} float pairs (64 bytes)
 *         v28 = {-w, +w, +w, -w} with w = M_SQRT1_2 (the mppm table, which
 *               ff_fft_calc_neon loads before dispatching)
 * Out:    x0  = input pointer + 32, x1 = input pointer
 * Clobb:  v0-v7, v16-v27
 *
 * z[0..3] are kept in v0-v3 and z[4..7] in v16-v19; the instructions for the
 * two halves are interleaved.
 */
function fft8_neon
        AARCH64_VALID_JUMP_TARGET
        mov             x1,  x0                          // keep &z[0] for the final store
        ld1             {v0.2s, v1.2s, v2.2s, v3.2s},  [x0], #32
        ld1             {v16.2s,v17.2s,v18.2s,v19.2s}, [x0]
        ext             v22.8b, v2.8b,  v3.8b,  #4
        ext             v23.8b, v3.8b,  v2.8b,  #4
        fadd            v4.2s,  v16.2s, v17.2s           // r4+r5,i4+i5
        fadd            v5.2s,  v18.2s, v19.2s           // r6+r7,i6+i7
        fsub            v17.2s, v16.2s, v17.2s           // r4-r5,i4-i5
        fsub            v19.2s, v18.2s, v19.2s           // r6-r7,i6-i7
        rev64           v27.2s, v28.2s                   // v27 = {+w,-w}: low half of v28, lanes swapped
        fadd            v20.2s, v0.2s,  v1.2s            // r0+r1,i0+i1
        fadd            v21.2s, v2.2s,  v3.2s            // r2+r3,i2+i3
        fmul            v26.2s, v17.2s, v28.2s           // -a2r*w,a2i*w
        ext             v6.8b,  v4.8b,  v5.8b,  #4
        ext             v7.8b,  v5.8b,  v4.8b,  #4
        fmul            v27.2s, v19.2s, v27.2s           // a3r*w,-a3i*w
        fsub            v23.2s, v22.2s, v23.2s           // i2-i3,r3-r2
        fsub            v22.2s, v0.2s,  v1.2s            // r0-r1,i0-i1
        fmul            v24.2s, v17.2s, v28.s[1]         // a2r*w,a2i*w
        fmul            v25.2s, v19.2s, v28.s[1]         // a3r*w,a3i*w
        fadd            v0.2s,  v20.2s, v21.2s
        fsub            v2.2s,  v20.2s, v21.2s
        fadd            v1.2s,  v22.2s, v23.2s
        rev64           v26.2s, v26.2s
        rev64           v27.2s, v27.2s
        fsub            v3.2s,  v22.2s, v23.2s
        fsub            v6.2s,  v6.2s,  v7.2s
        fadd            v24.2s, v24.2s, v26.2s  // a2r+a2i,a2i-a2r   t1,t2
        fadd            v25.2s, v25.2s, v27.2s  // a3r-a3i,a3i+a3r   t5,t6
        fadd            v7.2s,  v4.2s,  v5.2s
        fsub            v18.2s, v2.2s,  v6.2s
        ext             v26.8b, v24.8b, v25.8b, #4
        ext             v27.8b, v25.8b, v24.8b, #4
        fadd            v2.2s,  v2.2s,  v6.2s
        fsub            v16.2s, v0.2s,  v7.2s
        fadd            v5.2s,  v25.2s, v24.2s
        fsub            v4.2s,  v26.2s, v27.2s
        fadd            v0.2s,  v0.2s,  v7.2s
        fsub            v17.2s, v1.2s,  v5.2s
        fsub            v19.2s, v3.2s,  v4.2s
        fadd            v3.2s,  v3.2s,  v4.2s
        fadd            v1.2s,  v1.2s,  v5.2s

        st1             {v16.2s,v17.2s,v18.2s,v19.2s}, [x0]     // z[4..7]
        st1             {v0.2s, v1.2s, v2.2s, v3.2s},  [x1]     // z[0..3]

        ret
endfunc

/*
 * fft16_neon: in-place 16-point complex FFT.
 *
 * In:     x0  = pointer to 16 complex {re, im} float pairs (128 bytes)
 *         x14 = address of ff_cos_16; four floats are loaded from it
 *         v28 = mppm, v29 = pmmp, v30 = trans4_float, v31 = trans8_float
 *               (all set up by ff_fft_calc_neon before dispatching)
 * Out:    x0 and x1 both end at input pointer + 128
 * Clobb:  v0-v7, v16-v27
 *
 * z[0..7] go through the same instruction sequence as fft8_neon. The loads
 * and shuffles for z[8..15] are interleaved with it; those eight values then
 * go through two 4-point transforms computed side by side in .4s registers,
 * and the final section combines everything using the ff_cos_16 values.
 */
function fft16_neon
        AARCH64_VALID_JUMP_TARGET
        mov             x1,  x0                          // keep &z[0] for the final stores
        ld1             {v0.2s, v1.2s, v2.2s, v3.2s},  [x0], #32
        ld1             {v16.2s,v17.2s,v18.2s,v19.2s}, [x0], #32
        ext             v22.8b, v2.8b,  v3.8b,  #4
        ext             v23.8b, v3.8b,  v2.8b,  #4
        fadd            v4.2s,  v16.2s, v17.2s           // r4+r5,i4+i5
        fadd            v5.2s,  v18.2s, v19.2s           // r6+r7,i6+i7
        fsub            v17.2s, v16.2s, v17.2s           // r4-r5,i4-i5
        fsub            v19.2s, v18.2s, v19.2s           // r6-r7,i6-i7
        rev64           v27.2s, v28.2s                   // v27 = {+w,-w}: low half of v28, lanes swapped
        fadd            v20.2s, v0.2s,  v1.2s            // r0+r1,i0+i1
        fadd            v21.2s, v2.2s,  v3.2s            // r2+r3,i2+i3
        fmul            v26.2s, v17.2s, v28.2s           // -a2r*w,a2i*w
        ext             v6.8b,  v4.8b,  v5.8b,  #4
        ext             v7.8b,  v5.8b,  v4.8b,  #4
        fmul            v27.2s, v19.2s, v27.2s           // a3r*w,-a3i*w
        fsub            v23.2s, v22.2s, v23.2s           // i2-i3,r3-r2
        fsub            v22.2s, v0.2s,  v1.2s            // r0-r1,i0-i1
        fmul            v24.2s, v17.2s, v28.s[1]         // a2r*w,a2i*w
        fmul            v25.2s, v19.2s, v28.s[1]         // a3r*w,a3i*w
        fadd            v0.2s,  v20.2s, v21.2s
        fsub            v2.2s,  v20.2s, v21.2s
        fadd            v1.2s,  v22.2s, v23.2s
        rev64           v26.2s, v26.2s
        rev64           v27.2s, v27.2s
        fsub            v3.2s,  v22.2s, v23.2s
        fsub            v6.2s,  v6.2s,  v7.2s
        fadd            v24.2s, v24.2s, v26.2s  // a2r+a2i,a2i-a2r   t1,t2
        fadd            v25.2s, v25.2s, v27.2s  // a3r-a3i,a3i+a3r   t5,t6
        fadd            v7.2s,  v4.2s,  v5.2s
        fsub            v18.2s, v2.2s,  v6.2s
        ld1             {v20.4s,v21.4s}, [x0], #32       // z[8..11]
        ld1             {v22.4s,v23.4s}, [x0], #32       // z[12..15]
        ext             v26.8b, v24.8b, v25.8b, #4
        ext             v27.8b, v25.8b, v24.8b, #4
        fadd            v2.2s,  v2.2s,  v6.2s
        fsub            v16.2s, v0.2s,  v7.2s
        fadd            v5.2s,  v25.2s, v24.2s
        fsub            v4.2s,  v26.2s, v27.2s
        transpose       v24.2d, v25.2d, v20.2d, v22.2d   // {z[8],z[12]}, {z[9],z[13]}
        transpose       v26.2d, v27.2d, v21.2d, v23.2d   // {z[10],z[14]}, {z[11],z[15]}
        fadd            v0.2s,  v0.2s,  v7.2s
        fsub            v17.2s, v1.2s,  v5.2s
        fsub            v19.2s, v3.2s,  v4.2s
        fadd            v3.2s,  v3.2s,  v4.2s
        fadd            v1.2s,  v1.2s,  v5.2s
        ext             v20.16b, v21.16b, v21.16b,  #4   // v21 rotated down by one float
        ext             v21.16b, v23.16b, v23.16b,  #4   // v23 rotated down by one float

        // pack the 8-point results two complex values per 128-bit register
        zip1            v0.2d,  v0.2d,  v1.2d   // {z[0],   z[1]}
        zip1            v1.2d,  v2.2d,  v3.2d   // {z[2],   z[3]}
        zip1            v2.2d,  v16.2d, v17.2d  // {z[o1],  z[o1+1]}
        zip1            v3.2d,  v18.2d, v19.2d  // {z[o1+2],z[o1+3]}

        // 2 x fft4
        transpose       v22.2d, v23.2d, v20.2d, v21.2d

        fadd            v4.4s,  v24.4s, v25.4s
        fadd            v5.4s,  v26.4s, v27.4s
        fsub            v6.4s,  v24.4s, v25.4s
        fsub            v7.4s,  v22.4s, v23.4s

        ld1             {v23.4s},  [x14]        // first four floats of ff_cos_16

        fadd            v24.4s, v4.4s,  v5.4s   // {z[o2+0],z[o2+1]}
        fsub            v26.4s, v4.4s,  v5.4s   // {z[o2+2],z[o2+3]}
        fadd            v25.4s, v6.4s,  v7.4s   // {z[o3+0],z[o3+1]}
        fsub            v27.4s, v6.4s,  v7.4s   // {z[o3+2],z[o3+3]}

        //fft_pass_neon_16
        rev64           v7.4s,  v25.4s
        fmul            v25.4s, v25.4s, v23.s[1]
        fmul            v7.4s,  v7.4s,  v29.4s  // apply the pmmp signs
        fmla            v25.4s, v7.4s,  v23.s[3] // {t1a,t2a,t5a,t6a}

        zip1            v20.4s, v24.4s, v25.4s
        zip2            v21.4s, v24.4s, v25.4s
        fneg            v22.4s, v20.4s
        fadd            v4.4s,  v21.4s, v20.4s
        fsub            v6.4s,  v20.4s, v21.4s  // just the second half
        fadd            v5.4s,  v21.4s, v22.4s  // just the first half

        tbl             v4.16b, {v4.16b},        v30.16b // trans4_float
        tbl             v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float

        fsub            v20.4s, v0.4s,  v4.4s   // {z[o2],z[o2+1]}
        fadd            v16.4s, v0.4s,  v4.4s   // {z[0], z[1]}
        fsub            v22.4s, v2.4s,  v5.4s   // {z[o3],z[o3+1]}
        fadd            v18.4s, v2.4s,  v5.4s   // {z[o1],z[o1+1]}

//second half
        rev64           v6.4s,  v26.4s
        fmul            v26.4s, v26.4s, v23.s[2]
        rev64           v7.4s,  v27.4s
        fmul            v27.4s, v27.4s, v23.s[3]
        fmul            v6.4s,  v6.4s,  v29.4s  // apply the pmmp signs
        fmul            v7.4s,  v7.4s,  v29.4s
        fmla            v26.4s, v6.4s,  v23.s[2] // {t1,t2,t5,t6}
        fmla            v27.4s, v7.4s,  v23.s[1] // {t1a,t2a,t5a,t6a}

        zip1            v24.4s, v26.4s, v27.4s
        zip2            v25.4s, v26.4s, v27.4s
        fneg            v26.4s, v24.4s
        fadd            v4.4s,  v25.4s, v24.4s
        fsub            v6.4s,  v24.4s, v25.4s  // just the second half
        fadd            v5.4s,  v25.4s, v26.4s  // just the first half

        tbl             v4.16b, {v4.16b},        v30.16b // trans4_float
        tbl             v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float

        fadd            v17.4s, v1.4s, v4.4s    // {z[2], z[3]}
        fsub            v21.4s, v1.4s, v4.4s    // {z[o2+2],z[o2+3]}
        fadd            v19.4s, v3.4s, v5.4s    // {z[o1+2],z[o1+3]}
        fsub            v23.4s, v3.4s, v5.4s    // {z[o3+2],z[o3+3]}

        st1             {v16.4s,v17.4s}, [x1], #32      // z[0..3]
        st1             {v18.4s,v19.4s}, [x1], #32      // z[4..7]
        st1             {v20.4s,v21.4s}, [x1], #32      // z[8..11]
        st1             {v22.4s,v23.4s}, [x1], #32      // z[12..15]

        ret
endfunc


/*
 * tbl byte indices for a single-register lookup that swaps the two middle
 * 32-bit lanes: out = {in[0], in[2], in[1], in[3]}.
 * ff_fft_calc_neon loads this table into v30.
 */
const  trans4_float, align=4
        .byte    0,  1,  2,  3
        .byte    8,  9, 10, 11
        .byte    4,  5,  6,  7
        .byte   12, 13, 14, 15
endconst

/*
 * tbl byte indices for a two-register lookup {a, b} (bytes 0-15 address a,
 * bytes 16-31 address b): out = {b[2], a[0], b[3], a[1]} in 32-bit lanes.
 * ff_fft_calc_neon loads this table into v31.
 */
const  trans8_float, align=4
        .byte   24, 25, 26, 27
        .byte    0,  1,  2,  3
        .byte   28, 29, 30, 31
        .byte    4,  5,  6,  7
endconst

/*
 * fft_pass_neon: combining pass over the four quarters of z, in place.
 *
 * In:     x0  = z
 *         x2  = n; every iteration handles two complex values from each
 *               quarter, so one quarter is 2*n complex values
 *         x4  = twiddle table; read forwards from x4 (wre) and backwards
 *               starting just below x4 + 8*n bytes (wim)
 *         x7  = post-increment applied to the wim pointer after each load
 *               (ff_fft_calc_neon sets it to -8)
 *         v29 = pmmp, v30 = trans4_float, v31 = trans8_float
 * Clobb:  x0-x6, v4-v7, v16, v17, v20-v27, flags
 *
 * The first iteration is peeled: only the second value of each pair is
 * multiplied by a twiddle there (wre[1] and wim[-1]); the first value of each
 * pair is used as loaded.
 *
 * n must be at least 2. The loop at 1: decrements its counter (n - 1) before
 * testing it, so n == 1 would wrap the counter instead of terminating. The
 * callers generated by def_fft pass at least 4.
 */
function fft_pass_neon
        sub             x6,  x2,  #1            // n - 1, loop counter
        lsl             x5,  x2,  #3            // 2 * n * sizeof FFTSample
        lsl             x1,  x2,  #4            // 2 * n * sizeof FFTComplex
        add             x5,  x4,  x5            // wim
        add             x3,  x1,  x2,  lsl #5   // (2 + 4) * n * sizeof FFTComplex: offset of o3
        add             x2,  x0,  x2,  lsl #5   // &z[o2]
        add             x3,  x0,  x3            // &z[o3]
        add             x1,  x0,  x1            // &z[o1]
        ld1             {v20.4s},[x2]           // {z[o2],z[o2+1]}
        ld1             {v22.4s},[x3]           // {z[o3],z[o3+1]}
        ld1             {v4.2s},  [x4], #8      // {wre[0],wre[1]}
        trn2            v25.2d, v20.2d, v22.2d  // {z[o2+1],z[o3+1]}
        sub             x5,  x5,  #4            // wim--
        trn1            v24.2d, v20.2d, v22.2d  // {z[o2],z[o3]}
        ld1             {v5.s}[0],  [x5], x7    // v5.s[0] = wim[-1]
        rev64           v7.4s,  v25.4s
        fmul            v25.4s, v25.4s, v4.s[1]
        ld1             {v16.4s}, [x0]          // {z[0],z[1]}
        fmul            v7.4s,  v7.4s,  v29.4s  // apply the pmmp signs
        ld1             {v17.4s}, [x1]          // {z[o1],z[o1+1]}
        prfm            pldl1keep, [x2, #16]
        prfm            pldl1keep, [x3, #16]
        fmla            v25.4s, v7.4s,  v5.s[0] // {t1a,t2a,t5a,t6a}
        prfm            pldl1keep, [x0, #16]
        prfm            pldl1keep, [x1, #16]

        zip1            v20.4s, v24.4s, v25.4s
        zip2            v21.4s, v24.4s, v25.4s
        fneg            v22.4s, v20.4s
        fadd            v4.4s,  v21.4s, v20.4s
        fsub            v6.4s,  v20.4s, v21.4s  // just the second half
        fadd            v5.4s,  v21.4s, v22.4s  // just the first half

        tbl             v4.16b, {v4.16b},        v30.16b // trans4_float
        tbl             v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float

        fadd            v20.4s, v16.4s, v4.4s
        fsub            v22.4s, v16.4s, v4.4s
        fadd            v21.4s, v17.4s, v5.4s
        st1             {v20.4s}, [x0], #16     // {z[0], z[1]}
        fsub            v23.4s, v17.4s, v5.4s

        st1             {v21.4s}, [x1], #16     // {z[o1],z[o1+1]}
        st1             {v22.4s}, [x2], #16     // {z[o2],z[o2+1]}
        st1             {v23.4s}, [x3], #16     // {z[o3],z[o3+1]}
1:      // remaining n - 1 iterations: both values of each pair get a twiddle
        ld1             {v20.4s},[x2]    // {z[o2],z[o2+1]}
        ld1             {v22.4s},[x3]    // {z[o3],z[o3+1]}
        ld1             {v4.2s}, [x4], #8       // {wre[0],wre[1]}
        transpose       v26.2d, v27.2d, v20.2d, v22.2d
        ld1             {v5.2s}, [x5], x7       // {wim[-1],wim[0]}
        rev64           v6.4s,  v26.4s
        fmul            v26.4s, v26.4s, v4.s[0]
        rev64           v7.4s,  v27.4s
        fmul            v27.4s, v27.4s, v4.s[1]
        fmul            v6.4s,  v6.4s,  v29.4s  // apply the pmmp signs
        fmul            v7.4s,  v7.4s,  v29.4s
        ld1             {v16.4s},[x0]           // {z[0],z[1]}
        fmla            v26.4s, v6.4s,  v5.s[1] // {t1,t2,t5,t6}
        fmla            v27.4s, v7.4s,  v5.s[0] // {t1a,t2a,t5a,t6a}
        ld1             {v17.4s},[x1]           // {z[o1],z[o1+1]}

        subs            x6,  x6,  #1            // n--; flags are consumed by b.ne below

        zip1            v20.4s, v26.4s, v27.4s
        zip2            v21.4s, v26.4s, v27.4s
        fneg            v22.4s, v20.4s
        fadd            v4.4s,  v21.4s, v20.4s
        fsub            v6.4s,  v20.4s, v21.4s  // just the second half
        fadd            v5.4s,  v21.4s, v22.4s  // just the first half

        tbl             v4.16b, {v4.16b},        v30.16b // trans4_float
        tbl             v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float

        fadd            v20.4s, v16.4s, v4.4s
        fsub            v22.4s, v16.4s, v4.4s
        fadd            v21.4s, v17.4s, v5.4s
        st1             {v20.4s}, [x0], #16     // {z[0], z[1]}
        fsub            v23.4s, v17.4s, v5.4s

        st1             {v21.4s}, [x1], #16     // {z[o1],z[o1+1]}
        st1             {v22.4s}, [x2], #16     // {z[o2],z[o2+1]}
        st1             {v23.4s}, [x3], #16     // {z[o3],z[o3+1]}
        b.ne            1b

        ret
endfunc

/*
 * def_fft n, n2, n4
 *
 * Emits fft<n>_neon, which transforms the n complex values at x0 in place:
 *   1. fft<n2>_neon on z[0 .. 2*n4)
 *   2. fft<n4>_neon on z[2*n4 .. 3*n4)
 *   3. fft<n4>_neon on z[3*n4 .. 4*n4)
 *   4. tail call to fft_pass_neon with x0 = z, x4 = ff_cos_<n>, x2 = n4 / 2
 *
 * Every instantiation in this file passes n2 = n / 2 and n4 = n / 4.
 * x28 holds &z[2*n4] across the three calls; x28 and x30 are saved on the
 * stack and restored before the tail call, so fft_pass_neon returns straight
 * to the caller of fft<n>_neon.
 *
 * The AARCH64_* macros, movrel and X() come from asm.S. The sign/validate
 * pair presumably implements return-address signing -- confirm against asm.S.
 */
.macro  def_fft n, n2, n4
function fft\n\()_neon, align=6
        AARCH64_VALID_JUMP_TARGET
        AARCH64_SIGN_LINK_REGISTER
        sub             sp,  sp,  #16
        stp             x28, x30, [sp]
        add             x28, x0,  #\n4*2*8
        bl              fft\n2\()_neon
        mov             x0,  x28
        bl              fft\n4\()_neon
        add             x0,  x28, #\n4*1*8
        bl              fft\n4\()_neon
        sub             x0,  x28, #\n4*2*8
        ldp             x28, x30, [sp], #16
        AARCH64_VALIDATE_LINK_REGISTER
        movrel          x4,  X(ff_cos_\n)
        mov             x2,  #\n4>>1
        b               fft_pass_neon
endfunc
.endm

/*
 * One function per power of two from 32 to 65536. Each one calls the two
 * next-smaller sizes, bottoming out in the hand-written fft16_neon and
 * fft8_neon.
 */
        def_fft    32,    16,     8
        def_fft    64,    32,    16
        def_fft   128,    64,    32
        def_fft   256,   128,    64
        def_fft   512,   256,   128
        def_fft  1024,   512,   256
        def_fft  2048,  1024,   512
        def_fft  4096,  2048,  1024
        def_fft  8192,  4096,  2048
        def_fft 16384,  8192,  4096
        def_fft 32768, 16384,  8192
        def_fft 65536, 32768, 16384

/*
 * ff_fft_calc_neon: exported entry point; transforms z in place.
 *
 * In:     x0 = pointer to a context whose first 32-bit word is nbits
 *              (presumably FFTContext -- the layout is not visible here)
 *         x1 = z, the complex data
 *
 * Loads the constants the size-specific functions expect and then jumps,
 * without linking, to fft_tab_neon[nbits - 2]; that function returns
 * directly to the caller of ff_fft_calc_neon.
 *
 *   v28 = mppm            v29 = pmmp
 *   v30 = trans4_float    v31 = trans8_float
 *   x7  = -8 (step of the wim pointer in fft_pass_neon)
 *   x14 = address of ff_cos_16 (used by fft16_neon)
 *
 * nbits is not range-checked; fft_tab_neon has 15 entries, covering nbits 2
 * to 16.
 */
function ff_fft_calc_neon, export=1
        prfm            pldl1keep, [x1]
        movrel          x10, trans4_float
        ldr             w2,  [x0]                // nbits
        movrel          x11, trans8_float
        sub             w2,  w2,  #2             // table index; fft_tab_neon starts at fft4
        movrel          x3,  fft_tab_neon
        ld1             {v30.16b}, [x10]         // trans4_float
        mov             x7,  #-8
        movrel          x12, pmmp
        ldr             x3,  [x3, x2, lsl #3]    // address of the fft<N>_neon to run
        movrel          x13, mppm
        movrel          x14, X(ff_cos_16)
        ld1             {v31.16b}, [x11]         // trans8_float
        mov             x0,  x1                  // the fft<N>_neon functions take z in x0
        ld1             {v29.4s},  [x12]         // pmmp
        ld1             {v28.4s},  [x13]         // mppm
        br              x3
endfunc

/*
 * ff_fft_permute_neon: exported; reorders z according to revtab.
 *
 * In:     x0 = pointer to a context read at three offsets:
 *                +0   32-bit nbits
 *                +8   pointer to revtab, read as 16-bit indices
 *                +16  pointer to tmp_buf
 *         x1 = z, holding 1 << nbits complex values of 8 bytes each
 * Out:    x1 ends one past the last value copied back; z is permuted
 * Clobb:  x0-x6, v0, v1, flags
 *
 * Pass 1 scatters: tmp_buf[revtab[i]] = z[i], two values per iteration.
 * Pass 2 copies tmp_buf back over z, four values (32 bytes) per iteration.
 */
function ff_fft_permute_neon, export=1
        mov             x6,  #1
        ldr             w2,  [x0]       // nbits
        ldr             x3,  [x0, #16]  // tmp_buf
        ldr             x0,  [x0, #8]   // revtab
        lsl             x6,  x6, x2     // n = 1 << nbits, counter for pass 1
        mov             x2,  x6         // second copy of n, counter for pass 2
1:
        ld1             {v0.2s,v1.2s}, [x1], #16        // z[i], z[i+1]
        ldr             w4,  [x0], #4                   // two 16-bit revtab entries at once
        uxth            w5,  w4                         // low half:  revtab[i]
        lsr             w4,  w4,  #16                   // high half: revtab[i+1]
        add             x5,  x3,  x5,  lsl #3           // &tmp_buf[revtab[i]]
        add             x4,  x3,  x4,  lsl #3           // &tmp_buf[revtab[i+1]]
        st1             {v0.2s}, [x5]
        st1             {v1.2s}, [x4]
        subs            x6,  x6, #2
        b.gt            1b

        sub             x1,  x1,  x2,  lsl #3           // rewind x1 to &z[0]
1:
        ld1             {v0.4s,v1.4s}, [x3], #32
        st1             {v0.4s,v1.4s}, [x1], #32
        subs            x2,  x2,  #4
        b.gt            1b

        ret
endfunc

/*
 * Dispatch table used by ff_fft_calc_neon, indexed by nbits - 2: entry k is
 * the address of the 2^(k+2)-point transform.
 */
const   fft_tab_neon, relocate=1
        .quad fft4_neon
        .quad fft8_neon
        .quad fft16_neon
        .quad fft32_neon
        .quad fft64_neon
        .quad fft128_neon
        .quad fft256_neon
        .quad fft512_neon
        .quad fft1024_neon
        .quad fft2048_neon
        .quad fft4096_neon
        .quad fft8192_neon
        .quad fft16384_neon
        .quad fft32768_neon
        .quad fft65536_neon
endconst

/*
 * Sign vector {+, -, -, +}. ff_fft_calc_neon loads it into v29; fft16_neon
 * and fft_pass_neon multiply rev64-swapped values by it.
 */
const   pmmp, align=4
        .float          +1.0, -1.0, -1.0, +1.0
endconst

/*
 * {-w, +w, +w, -w} with w = M_SQRT1_2. ff_fft_calc_neon loads it into v28;
 * fft8_neon and fft16_neon read the low half and lane 1.
 */
const   mppm, align=4
        .float          -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
endconst
449