1cabdff1aSopenharmony_ci/*
2cabdff1aSopenharmony_ci * Copyright (c) 2013 RISC OS Open Ltd
3cabdff1aSopenharmony_ci * Author: Ben Avison <bavison@riscosopen.org>
4cabdff1aSopenharmony_ci *
5cabdff1aSopenharmony_ci * This file is part of FFmpeg.
6cabdff1aSopenharmony_ci *
7cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or
8cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public
9cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either
10cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version.
11cabdff1aSopenharmony_ci *
12cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful,
13cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of
14cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15cabdff1aSopenharmony_ci * Lesser General Public License for more details.
16cabdff1aSopenharmony_ci *
17cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public
18cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software
19cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20cabdff1aSopenharmony_ci */
21cabdff1aSopenharmony_ci
22cabdff1aSopenharmony_ci#include "libavutil/arm/asm.S"
23cabdff1aSopenharmony_ci
24cabdff1aSopenharmony_ci@ The fftx_internal_vfp versions of the functions obey a modified AAPCS:
25cabdff1aSopenharmony_ci@ VFP is in RunFast mode, vector length 4, stride 1 throughout, and
26cabdff1aSopenharmony_ci@ all single-precision VFP registers may be corrupted on exit. The a2
27cabdff1aSopenharmony_ci@ register may not be clobbered in these functions, as it holds the
28cabdff1aSopenharmony_ci@ stored original FPSCR.
29cabdff1aSopenharmony_ci
@ Public entry point: tail-dispatches to the routine for the requested size.
@   a1 = pointer to a structure whose first word is nbits (log2 of the size)
@   a2 = data pointer, handed to the selected routine in a1
@ NOTE(review): presumably called with an FFTContext pointer and an
@ FFTComplex pointer - confirm against the C prototype.
@ nbits is not range-checked here; fft_tab_vfp only has entries for
@ nbits 2 to 16.
30cabdff1aSopenharmony_cifunction ff_fft_calc_vfp, export=1
31cabdff1aSopenharmony_ci        ldr     ip, [a1, #0]    @ nbits
32cabdff1aSopenharmony_ci        mov     a1, a2          @ the selected routine takes the data pointer in a1
33cabdff1aSopenharmony_ci        movrel  a2, (fft_tab_vfp - 8)   @ biased by two words so that nbits = 2 selects entry 0
34cabdff1aSopenharmony_ci        ldr     pc, [a2, ip, lsl #2]    @ jump via table; lr is untouched, so the routine returns to our caller
35cabdff1aSopenharmony_ciendfunc
@ Dispatch table read by ff_fft_calc_vfp: entry i serves nbits = i + 2,
@ i.e. transform sizes 4 (first entry) to 65536 (last entry), one word each.
36cabdff1aSopenharmony_ciconst   fft_tab_vfp, relocate=1
37cabdff1aSopenharmony_ci        .word   fft4_vfp            @ nbits = 2
38cabdff1aSopenharmony_ci        .word   fft8_vfp
39cabdff1aSopenharmony_ci        .word   X(ff_fft16_vfp)     @ this one alone is exported
40cabdff1aSopenharmony_ci        .word   fft32_vfp
41cabdff1aSopenharmony_ci        .word   fft64_vfp
42cabdff1aSopenharmony_ci        .word   fft128_vfp
43cabdff1aSopenharmony_ci        .word   fft256_vfp
44cabdff1aSopenharmony_ci        .word   fft512_vfp
45cabdff1aSopenharmony_ci        .word   fft1024_vfp
46cabdff1aSopenharmony_ci        .word   fft2048_vfp
47cabdff1aSopenharmony_ci        .word   fft4096_vfp
48cabdff1aSopenharmony_ci        .word   fft8192_vfp
49cabdff1aSopenharmony_ci        .word   fft16384_vfp
50cabdff1aSopenharmony_ci        .word   fft32768_vfp
51cabdff1aSopenharmony_ci        .word   fft65536_vfp        @ nbits = 16
52cabdff1aSopenharmony_ciendconst
53cabdff1aSopenharmony_ci
@ 4-point in-place FFT.
@   a1 = z, four complex values of two single-precision floats (8 bytes) each
@ Leaf routine: touches only s0-s15, uses no stack and returns with bx lr.
@ Unlike the larger sizes it does not reprogram FPSCR.
@ NOTE(review): assumes FPSCR vector length is 1 on entry, so every
@ operation below is scalar - confirm for all callers.
@ The stall remarks record expected pipeline bubbles in the schedule.
54cabdff1aSopenharmony_cifunction fft4_vfp
55cabdff1aSopenharmony_ci        vldr    d0, [a1, #0*2*4]   @ s0,s1   = z[0]
56cabdff1aSopenharmony_ci        vldr    d4, [a1, #1*2*4]   @ s8,s9   = z[1]
57cabdff1aSopenharmony_ci        vldr    d1, [a1, #2*2*4]   @ s2,s3   = z[2]
58cabdff1aSopenharmony_ci        vldr    d5, [a1, #3*2*4]   @ s10,s11 = z[3]
59cabdff1aSopenharmony_ci        @ stall
60cabdff1aSopenharmony_ci        vadd.f  s12, s0, s8        @ i0
61cabdff1aSopenharmony_ci        vadd.f  s13, s1, s9        @ i1
62cabdff1aSopenharmony_ci        vadd.f  s14, s2, s10       @ i2
63cabdff1aSopenharmony_ci        vadd.f  s15, s3, s11       @ i3
64cabdff1aSopenharmony_ci        vsub.f  s8, s0, s8         @ i4
65cabdff1aSopenharmony_ci        vsub.f  s9, s1, s9         @ i5
66cabdff1aSopenharmony_ci        vsub.f  s10, s2, s10       @ i6
67cabdff1aSopenharmony_ci        vsub.f  s11, s3, s11       @ i7
68cabdff1aSopenharmony_ci        @ stall
69cabdff1aSopenharmony_ci        @ stall
70cabdff1aSopenharmony_ci        vadd.f  s0, s12, s14       @ z[0].re
71cabdff1aSopenharmony_ci        vsub.f  s4, s12, s14       @ z[2].re
72cabdff1aSopenharmony_ci        vadd.f  s1, s13, s15       @ z[0].im
73cabdff1aSopenharmony_ci        vsub.f  s5, s13, s15       @ z[2].im
74cabdff1aSopenharmony_ci        vadd.f  s7, s9, s10        @ z[3].im
75cabdff1aSopenharmony_ci        vsub.f  s3, s9, s10        @ z[1].im
76cabdff1aSopenharmony_ci        vadd.f  s2, s8, s11        @ z[1].re
77cabdff1aSopenharmony_ci        vsub.f  s6, s8, s11        @ z[3].re
78cabdff1aSopenharmony_ci        @ stall
79cabdff1aSopenharmony_ci        @ stall
80cabdff1aSopenharmony_ci        vstr    d0, [a1, #0*2*4]   @ z[0] = s0,s1
81cabdff1aSopenharmony_ci        vstr    d2, [a1, #2*2*4]   @ z[2] = s4,s5
82cabdff1aSopenharmony_ci        @ stall
83cabdff1aSopenharmony_ci        @ stall
84cabdff1aSopenharmony_ci        vstr    d1, [a1, #1*2*4]   @ z[1] = s2,s3
85cabdff1aSopenharmony_ci        vstr    d3, [a1, #3*2*4]   @ z[3] = s6,s7
86cabdff1aSopenharmony_ci
87cabdff1aSopenharmony_ci        bx      lr
88cabdff1aSopenharmony_ciendfunc
89cabdff1aSopenharmony_ci
@ macro_fft8_head: an in-place 8-point FFT on z = a1, minus its final two
@ stores (z[1] and z[3]), which macro_fft8_tail performs. The split lets the
@ 16-point worker schedule its own loads between the two parts.
@ Requires FPSCR vector length 4: lines tagged as vector ops act on four
@ consecutive registers. Touches s0-s31 and a1 only, and loads cos1pi4
@ pc-relative.
@ Slots z[0..3] are also used as scratch to move values between register
@ banks (see the via-memory remarks below).
@ Indentation separates the interleaved computation strands; the order of
@ instructions is hand-scheduled and should not be rearranged casually.
90cabdff1aSopenharmony_ci.macro macro_fft8_head
91cabdff1aSopenharmony_ci        @ FFT4
92cabdff1aSopenharmony_ci        vldr    d4, [a1, #0 * 2*4]
93cabdff1aSopenharmony_ci        vldr    d6, [a1, #1 * 2*4]
94cabdff1aSopenharmony_ci        vldr    d5, [a1, #2 * 2*4]
95cabdff1aSopenharmony_ci        vldr    d7, [a1, #3 * 2*4]
96cabdff1aSopenharmony_ci            @ BF
97cabdff1aSopenharmony_ci            vldr    d12, [a1, #4 * 2*4]
98cabdff1aSopenharmony_ci        vadd.f  s16, s8, s12    @ vector op
99cabdff1aSopenharmony_ci            vldr    d14, [a1, #5 * 2*4]
100cabdff1aSopenharmony_ci            vldr    d13, [a1, #6 * 2*4]
101cabdff1aSopenharmony_ci            vldr    d15, [a1, #7 * 2*4]
102cabdff1aSopenharmony_ci        vsub.f  s20, s8, s12    @ vector op
103cabdff1aSopenharmony_ci        vadd.f  s0, s16, s18
104cabdff1aSopenharmony_ci        vsub.f  s2, s16, s18
105cabdff1aSopenharmony_ci        vadd.f  s1, s17, s19
106cabdff1aSopenharmony_ci        vsub.f  s3, s17, s19
107cabdff1aSopenharmony_ci        vadd.f  s7, s21, s22
108cabdff1aSopenharmony_ci        vsub.f  s5, s21, s22
109cabdff1aSopenharmony_ci        vadd.f  s4, s20, s23
110cabdff1aSopenharmony_ci        vsub.f  s6, s20, s23
111cabdff1aSopenharmony_ci            vsub.f  s20, s24, s28   @ vector op
112cabdff1aSopenharmony_ci        vstr    d0, [a1, #0 * 2*4]  @ transfer s0-s7 to s24-s31 via memory
113cabdff1aSopenharmony_ci        vstr    d1, [a1, #1 * 2*4]
114cabdff1aSopenharmony_ci        vldr    s0, cos1pi4
115cabdff1aSopenharmony_ci            vadd.f  s16, s24, s28   @ vector op
116cabdff1aSopenharmony_ci        vstr    d2, [a1, #2 * 2*4]
117cabdff1aSopenharmony_ci        vstr    d3, [a1, #3 * 2*4]
118cabdff1aSopenharmony_ci        vldr    d12, [a1, #0 * 2*4]
119cabdff1aSopenharmony_ci            @ TRANSFORM
120cabdff1aSopenharmony_ci            vmul.f  s20, s20, s0    @ vector x scalar op
121cabdff1aSopenharmony_ci        vldr    d13, [a1, #1 * 2*4]
122cabdff1aSopenharmony_ci        vldr    d14, [a1, #2 * 2*4]
123cabdff1aSopenharmony_ci        vldr    d15, [a1, #3 * 2*4]
124cabdff1aSopenharmony_ci        @ BUTTERFLIES
125cabdff1aSopenharmony_ci        vadd.f  s0, s18, s16
126cabdff1aSopenharmony_ci        vadd.f  s1, s17, s19
127cabdff1aSopenharmony_ci        vsub.f  s2, s17, s19
128cabdff1aSopenharmony_ci        vsub.f  s3, s18, s16
129cabdff1aSopenharmony_ci            vadd.f  s4, s21, s20
130cabdff1aSopenharmony_ci            vsub.f  s5, s21, s20
131cabdff1aSopenharmony_ci            vadd.f  s6, s22, s23
132cabdff1aSopenharmony_ci            vsub.f  s7, s22, s23
133cabdff1aSopenharmony_ci        vadd.f  s8, s0, s24         @ vector op
134cabdff1aSopenharmony_ci        vstr    d0, [a1, #0 * 2*4]  @ transfer s0-s3 to s12-s15 via memory
135cabdff1aSopenharmony_ci        vstr    d1, [a1, #1 * 2*4]
136cabdff1aSopenharmony_ci        vldr    d6, [a1, #0 * 2*4]
137cabdff1aSopenharmony_ci        vldr    d7, [a1, #1 * 2*4]
138cabdff1aSopenharmony_ci            vadd.f  s1, s5, s6
139cabdff1aSopenharmony_ci            vadd.f  s0, s7, s4
140cabdff1aSopenharmony_ci            vsub.f  s2, s5, s6
141cabdff1aSopenharmony_ci            vsub.f  s3, s7, s4
142cabdff1aSopenharmony_ci        vsub.f  s12, s24, s12       @ vector op
143cabdff1aSopenharmony_ci            vsub.f  s5, s29, s1
144cabdff1aSopenharmony_ci            vsub.f  s4, s28, s0
145cabdff1aSopenharmony_ci            vsub.f  s6, s30, s2
146cabdff1aSopenharmony_ci            vsub.f  s7, s31, s3
147cabdff1aSopenharmony_ci            vadd.f  s16, s0, s28    @ vector op
148cabdff1aSopenharmony_ci        vstr    d6, [a1, #4 * 2*4]
149cabdff1aSopenharmony_ci        vstr    d7, [a1, #6 * 2*4]
150cabdff1aSopenharmony_ci        vstr    d4, [a1, #0 * 2*4]
151cabdff1aSopenharmony_ci        vstr    d5, [a1, #2 * 2*4]
152cabdff1aSopenharmony_ci             vstr    d2, [a1, #5 * 2*4]
153cabdff1aSopenharmony_ci             vstr    d3, [a1, #7 * 2*4]
154cabdff1aSopenharmony_ci.endm
155cabdff1aSopenharmony_ci
@ macro_fft8_tail: the two stores that complete macro_fft8_head, writing
@ s16-s19 (the result of its last vector add) to z[1] and z[3].
@ s16-s19 must still be intact when this runs.
156cabdff1aSopenharmony_ci.macro macro_fft8_tail
157cabdff1aSopenharmony_ci             vstr    d8, [a1, #1 * 2*4]     @ z[1] = s16,s17
158cabdff1aSopenharmony_ci             vstr    d9, [a1, #3 * 2*4]     @ z[3] = s18,s19
159cabdff1aSopenharmony_ci.endm
160cabdff1aSopenharmony_ci
@ 8-point in-place FFT worker following the modified AAPCS described at the
@ top of the file: a1 = z, FPSCR already set for vector length 4.
@ Touches only a1 and s0-s31, uses no stack, returns with bx lr.
161cabdff1aSopenharmony_cifunction .Lfft8_internal_vfp
162cabdff1aSopenharmony_ci        macro_fft8_head
163cabdff1aSopenharmony_ci        macro_fft8_tail
164cabdff1aSopenharmony_ci        bx      lr
165cabdff1aSopenharmony_ciendfunc
166cabdff1aSopenharmony_ci
@ 8-point FFT wrapper, reached through fft_tab_vfp.
@   a1 = z
@ Puts the VFP into the mode the worker expects, preserves s16-s31 around
@ the call, and restores the original FPSCR before returning.
@ FPSCR value: bits 25 and 24 (DN, FZ) set, LEN field = 3 meaning length 4,
@ STRIDE field = 0 meaning stride 1.
@ Register roles: a2 = saved FPSCR, a3 = scratch, ip = saved return address,
@ so the worker must leave a2 and ip alone.
167cabdff1aSopenharmony_cifunction fft8_vfp
168cabdff1aSopenharmony_ci        ldr     a3, =0x03030000     @ RunFast mode, vector length 4, stride 1
169cabdff1aSopenharmony_ci        fmrx    a2, FPSCR           @ keep the original FPSCR in a2
170cabdff1aSopenharmony_ci        fmxr    FPSCR, a3
171cabdff1aSopenharmony_ci        vpush   {s16-s31}
172cabdff1aSopenharmony_ci        mov     ip, lr              @ bl below overwrites lr
173cabdff1aSopenharmony_ci        bl      .Lfft8_internal_vfp
174cabdff1aSopenharmony_ci        vpop    {s16-s31}
175cabdff1aSopenharmony_ci        fmxr    FPSCR, a2           @ restore the original FPSCR
176cabdff1aSopenharmony_ci        bx      ip
177cabdff1aSopenharmony_ciendfunc
178cabdff1aSopenharmony_ci
@ Single-precision twiddle constants, loaded by label with vldr.
@ cos1pi4 and cos1pi8 must stay adjacent and in this order: the 16-point
@ worker fetches both with one doubleword vldr from cos1pi4.
179cabdff1aSopenharmony_ci.align 3
180cabdff1aSopenharmony_cicos1pi4:    @ cos(1*pi/4) = sqrt(2)/2
181cabdff1aSopenharmony_ci        .float  0.707106769084930419921875
182cabdff1aSopenharmony_cicos1pi8:    @ cos(1*pi/8) = sqrt(2+sqrt(2))/2
183cabdff1aSopenharmony_ci        .float  0.92387950420379638671875
184cabdff1aSopenharmony_cicos3pi8:    @ cos(3*pi/8) = sqrt(2-sqrt(2))/2
185cabdff1aSopenharmony_ci        .float  0.3826834261417388916015625
186cabdff1aSopenharmony_ci
@ 16-point in-place FFT worker following the modified AAPCS described at the
@ top of the file: a1 = z, FPSCR already set for vector length 4.
@ Touches only a1 and s0-s31, uses no stack, returns with bx lr.
@ Work done, following the strand labels below:
@   - 8-point FFT of z[0..7] via macro_fft8_head and macro_fft8_tail
@   - FFT4 of z[8..11] and of z[12..15]
@   - TRANSFORM steps for rows 1, 2 and 3 and TRANSFORM_ZERO for row 0
@ Indentation distinguishes the interleaved strands. Several z slots are
@ stored and immediately reloaded to move values between register banks.
@ The instruction order is hand-scheduled and should not be rearranged
@ casually.
187cabdff1aSopenharmony_cifunction .Lfft16_internal_vfp
188cabdff1aSopenharmony_ci        macro_fft8_head
189cabdff1aSopenharmony_ci        @ FFT4(z+8)
190cabdff1aSopenharmony_ci        vldr    d10, [a1, #8 * 2*4]
191cabdff1aSopenharmony_ci        vldr    d12, [a1, #9 * 2*4]
192cabdff1aSopenharmony_ci        vldr    d11, [a1, #10 * 2*4]
193cabdff1aSopenharmony_ci        vldr    d13, [a1, #11 * 2*4]
194cabdff1aSopenharmony_ci        macro_fft8_tail
195cabdff1aSopenharmony_ci        vadd.f  s16, s20, s24   @ vector op
196cabdff1aSopenharmony_ci            @ FFT4(z+12)
197cabdff1aSopenharmony_ci            vldr    d4, [a1, #12 * 2*4]
198cabdff1aSopenharmony_ci            vldr    d6, [a1, #13 * 2*4]
199cabdff1aSopenharmony_ci            vldr    d5, [a1, #14 * 2*4]
200cabdff1aSopenharmony_ci        vsub.f  s20, s20, s24   @ vector op
201cabdff1aSopenharmony_ci            vldr    d7, [a1, #15 * 2*4]
202cabdff1aSopenharmony_ci        vadd.f  s0, s16, s18
203cabdff1aSopenharmony_ci        vsub.f  s4, s16, s18
204cabdff1aSopenharmony_ci        vadd.f  s1, s17, s19
205cabdff1aSopenharmony_ci        vsub.f  s5, s17, s19
206cabdff1aSopenharmony_ci        vadd.f  s7, s21, s22
207cabdff1aSopenharmony_ci        vsub.f  s3, s21, s22
208cabdff1aSopenharmony_ci        vadd.f  s2, s20, s23
209cabdff1aSopenharmony_ci        vsub.f  s6, s20, s23
210cabdff1aSopenharmony_ci            vadd.f  s16, s8, s12    @ vector op
211cabdff1aSopenharmony_ci        vstr    d0, [a1, #8 * 2*4]
212cabdff1aSopenharmony_ci        vstr    d2, [a1, #10 * 2*4]
213cabdff1aSopenharmony_ci        vstr    d1, [a1, #9 * 2*4]
214cabdff1aSopenharmony_ci            vsub.f  s20, s8, s12    @ vector op
215cabdff1aSopenharmony_ci        vstr    d3, [a1, #11 * 2*4]
216cabdff1aSopenharmony_ci        @ TRANSFORM(z[2],z[6],z[10],z[14],cos1pi4,cos1pi4)
217cabdff1aSopenharmony_ci        vldr    d12, [a1, #10 * 2*4]
218cabdff1aSopenharmony_ci            vadd.f  s0, s16, s18
219cabdff1aSopenharmony_ci            vadd.f  s1, s17, s19
220cabdff1aSopenharmony_ci            vsub.f  s6, s16, s18
221cabdff1aSopenharmony_ci            vsub.f  s7, s17, s19
222cabdff1aSopenharmony_ci            vsub.f  s3, s21, s22
223cabdff1aSopenharmony_ci            vadd.f  s2, s20, s23
224cabdff1aSopenharmony_ci            vadd.f  s5, s21, s22
225cabdff1aSopenharmony_ci            vsub.f  s4, s20, s23
226cabdff1aSopenharmony_ci            vstr    d0, [a1, #12 * 2*4]
227cabdff1aSopenharmony_ci        vmov    s0, s6
228cabdff1aSopenharmony_ci          @ TRANSFORM(z[1],z[5],z[9],z[13],cos1pi8,cos3pi8)
229cabdff1aSopenharmony_ci          vldr    d6, [a1, #9 * 2*4]
230cabdff1aSopenharmony_ci            vstr    d1, [a1, #13 * 2*4]
231cabdff1aSopenharmony_ci        vldr    d1, cos1pi4 @ s2 = cos1pi4, s3 = cos1pi8
232cabdff1aSopenharmony_ci            vstr    d2, [a1, #15 * 2*4]
233cabdff1aSopenharmony_ci          vldr    d7, [a1, #13 * 2*4]
234cabdff1aSopenharmony_ci        vadd.f  s4, s25, s24
235cabdff1aSopenharmony_ci        vsub.f  s5, s25, s24
236cabdff1aSopenharmony_ci        vsub.f  s6, s0, s7
237cabdff1aSopenharmony_ci        vadd.f  s7, s0, s7
238cabdff1aSopenharmony_ci          vmul.f  s20, s12, s3  @ vector * scalar op
239cabdff1aSopenharmony_ci            @ TRANSFORM(z[3],z[7],z[11],z[15],cos3pi8,cos1pi8)
240cabdff1aSopenharmony_ci            vldr    d4, [a1, #11 * 2*4]
241cabdff1aSopenharmony_ci            vldr    d5, [a1, #15 * 2*4]
242cabdff1aSopenharmony_ci            vldr    s1, cos3pi8
243cabdff1aSopenharmony_ci        vmul.f  s24, s4, s2     @ vector * scalar op
244cabdff1aSopenharmony_ci          vmul.f  s28, s12, s1  @ vector * scalar op
245cabdff1aSopenharmony_ci            vmul.f  s12, s8, s1 @ vector * scalar op
246cabdff1aSopenharmony_ci          vadd.f  s4, s20, s29
247cabdff1aSopenharmony_ci          vsub.f  s5, s21, s28
248cabdff1aSopenharmony_ci          vsub.f  s6, s22, s31
249cabdff1aSopenharmony_ci          vadd.f  s7, s23, s30
250cabdff1aSopenharmony_ci            vmul.f  s8, s8, s3  @ vector * scalar op
251cabdff1aSopenharmony_ci          vldr    d8, [a1, #1 * 2*4]
252cabdff1aSopenharmony_ci          vldr    d9, [a1, #5 * 2*4]
253cabdff1aSopenharmony_ci            vldr    d10, [a1, #3 * 2*4]
254cabdff1aSopenharmony_ci            vldr    d11, [a1, #7 * 2*4]
255cabdff1aSopenharmony_ci        vldr    d14, [a1, #2 * 2*4]
256cabdff1aSopenharmony_ci          vadd.f  s0, s6, s4
257cabdff1aSopenharmony_ci          vadd.f  s1, s5, s7
258cabdff1aSopenharmony_ci          vsub.f  s2, s5, s7
259cabdff1aSopenharmony_ci          vsub.f  s3, s6, s4
260cabdff1aSopenharmony_ci            vadd.f  s4, s12, s9
261cabdff1aSopenharmony_ci            vsub.f  s5, s13, s8
262cabdff1aSopenharmony_ci            vsub.f  s6, s14, s11
263cabdff1aSopenharmony_ci            vadd.f  s7, s15, s10
264cabdff1aSopenharmony_ci          vadd.f  s12, s0, s16  @ vector op
265cabdff1aSopenharmony_ci          vstr    d0, [a1, #1 * 2*4]
266cabdff1aSopenharmony_ci          vstr    d1, [a1, #5 * 2*4]
267cabdff1aSopenharmony_ci          vldr    d4, [a1, #1 * 2*4]
268cabdff1aSopenharmony_ci          vldr    d5, [a1, #5 * 2*4]
269cabdff1aSopenharmony_ci            vadd.f  s0, s6, s4
270cabdff1aSopenharmony_ci            vadd.f  s1, s5, s7
271cabdff1aSopenharmony_ci            vsub.f  s2, s5, s7
272cabdff1aSopenharmony_ci            vsub.f  s3, s6, s4
273cabdff1aSopenharmony_ci          vsub.f  s8, s16, s8   @ vector op
274cabdff1aSopenharmony_ci          vstr    d6, [a1, #1 * 2*4]
275cabdff1aSopenharmony_ci          vstr    d7, [a1, #5 * 2*4]
276cabdff1aSopenharmony_ci        vldr    d15, [a1, #6 * 2*4]
277cabdff1aSopenharmony_ci            vsub.f  s4, s20, s0
278cabdff1aSopenharmony_ci            vsub.f  s5, s21, s1
279cabdff1aSopenharmony_ci            vsub.f  s6, s22, s2
280cabdff1aSopenharmony_ci            vsub.f  s7, s23, s3
281cabdff1aSopenharmony_ci            vadd.f  s20, s0, s20    @ vector op
282cabdff1aSopenharmony_ci          vstr    d4, [a1, #9 * 2*4]
283cabdff1aSopenharmony_ci              @ TRANSFORM_ZERO(z[0],z[4],z[8],z[12])
284cabdff1aSopenharmony_ci              vldr    d6, [a1, #8 * 2*4]
285cabdff1aSopenharmony_ci          vstr    d5, [a1, #13 * 2*4]
286cabdff1aSopenharmony_ci              vldr    d7, [a1, #12 * 2*4]
287cabdff1aSopenharmony_ci          vstr    d2, [a1, #11 * 2*4]
288cabdff1aSopenharmony_ci              vldr    d8, [a1, #0 * 2*4]
289cabdff1aSopenharmony_ci          vstr    d3, [a1, #15 * 2*4]
290cabdff1aSopenharmony_ci              vldr    d9, [a1, #4 * 2*4]
291cabdff1aSopenharmony_ci        vadd.f  s0, s26, s24
292cabdff1aSopenharmony_ci        vadd.f  s1, s25, s27
293cabdff1aSopenharmony_ci        vsub.f  s2, s25, s27
294cabdff1aSopenharmony_ci        vsub.f  s3, s26, s24
295cabdff1aSopenharmony_ci              vadd.f  s4, s14, s12
296cabdff1aSopenharmony_ci              vadd.f  s5, s13, s15
297cabdff1aSopenharmony_ci              vsub.f  s6, s13, s15
298cabdff1aSopenharmony_ci              vsub.f  s7, s14, s12
299cabdff1aSopenharmony_ci        vadd.f  s8, s0, s28 @ vector op
300cabdff1aSopenharmony_ci        vstr    d0, [a1, #3 * 2*4]
301cabdff1aSopenharmony_ci        vstr    d1, [a1, #7 * 2*4]
302cabdff1aSopenharmony_ci        vldr    d6, [a1, #3 * 2*4]
303cabdff1aSopenharmony_ci        vldr    d7, [a1, #7 * 2*4]
304cabdff1aSopenharmony_ci              vsub.f  s0, s16, s4
305cabdff1aSopenharmony_ci              vsub.f  s1, s17, s5
306cabdff1aSopenharmony_ci              vsub.f  s2, s18, s6
307cabdff1aSopenharmony_ci              vsub.f  s3, s19, s7
308cabdff1aSopenharmony_ci        vsub.f  s12, s28, s12       @ vector op
309cabdff1aSopenharmony_ci              vadd.f  s16, s4, s16  @ vector op
310cabdff1aSopenharmony_ci            vstr    d10, [a1, #3 * 2*4]
311cabdff1aSopenharmony_ci            vstr    d11, [a1, #7 * 2*4]
312cabdff1aSopenharmony_ci        vstr    d4, [a1, #2 * 2*4]
313cabdff1aSopenharmony_ci        vstr    d5, [a1, #6 * 2*4]
314cabdff1aSopenharmony_ci              vstr    d0, [a1, #8 * 2*4]
315cabdff1aSopenharmony_ci              vstr    d1, [a1, #12 * 2*4]
316cabdff1aSopenharmony_ci        vstr    d6, [a1, #10 * 2*4]
317cabdff1aSopenharmony_ci        vstr    d7, [a1, #14 * 2*4]
318cabdff1aSopenharmony_ci              vstr    d8, [a1, #0 * 2*4]
319cabdff1aSopenharmony_ci              vstr    d9, [a1, #4 * 2*4]
320cabdff1aSopenharmony_ci
321cabdff1aSopenharmony_ci        bx      lr
322cabdff1aSopenharmony_ciendfunc
323cabdff1aSopenharmony_ci
@ 16-point FFT wrapper; exported, and also reached through fft_tab_vfp.
@   a1 = z
@ Puts the VFP into the mode the worker expects, preserves s16-s31 around
@ the call, and restores the original FPSCR before returning.
@ Register roles: a2 = saved FPSCR, a3 = scratch, ip = saved return address,
@ so the worker must leave a2 and ip alone.
324cabdff1aSopenharmony_cifunction ff_fft16_vfp, export=1
325cabdff1aSopenharmony_ci        ldr     a3, =0x03030000     @ RunFast mode, vector length 4, stride 1
326cabdff1aSopenharmony_ci        fmrx    a2, FPSCR           @ keep the original FPSCR in a2
327cabdff1aSopenharmony_ci        fmxr    FPSCR, a3
328cabdff1aSopenharmony_ci        vpush   {s16-s31}
329cabdff1aSopenharmony_ci        mov     ip, lr              @ bl below overwrites lr
330cabdff1aSopenharmony_ci        bl      .Lfft16_internal_vfp
331cabdff1aSopenharmony_ci        vpop    {s16-s31}
332cabdff1aSopenharmony_ci        fmxr    FPSCR, a2           @ restore the original FPSCR
333cabdff1aSopenharmony_ci        bx      ip
334cabdff1aSopenharmony_ciendfunc
335cabdff1aSopenharmony_ci
@ pass: combining pass run by def_fft after its sub-transforms. It walks the
@ four quarters of the array together, applying TRANSFORM_ZERO to the first
@ element and TRANSFORM to the rest, software-pipelined so that several
@ strands (shown by indentation) are in flight at once.
@ Macro arguments:
@   n        step count: the loop runs n-1 times, handling two complex
@            elements of each quarter per iteration; the code after the
@            loop finishes the last two without advancing the pointers
@   z0-z3    core registers holding the base pointers of the four quarters
@            (def_fft may pass the same register for more than one)
@ Implicit inputs set up by the invoking code:
@   o1,o2,o3 assembler symbols: element offsets applied to z1, z2, z3
@   v5       pointer to the start of the cosine table, read upwards
@ Clobbers: v5, v6 (set to v5 + 8*n bytes, read downwards), a4 (loop
@ counter), the z registers, condition flags, s0-s23.
@ A z pointer is advanced only when n*8 reaches 256 (z2) or 512 (z1, z3);
@ z0 always advances. This matches def_fft passing one, two or four
@ distinct base registers.
@ NOTE(review): presumably the split exists to keep every vldr and vstr
@ immediate offset within the encodable range - confirm.
336cabdff1aSopenharmony_ci.macro pass n, z0, z1, z2, z3
337cabdff1aSopenharmony_ci        add     v6, v5, #4*2*\n
338cabdff1aSopenharmony_ci        @ TRANSFORM_ZERO(z[0],z[o1],z[o2],z[o3])
339cabdff1aSopenharmony_ci            @ TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1])
340cabdff1aSopenharmony_ci                @ TRANSFORM(z[0],z[o1],z[o2],z[o3],wre[0],wim[0])
341cabdff1aSopenharmony_ci                    @ TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1])
342cabdff1aSopenharmony_ci            vldr    d8, [\z2, #8*(o2+1)]        @ s16,s17
343cabdff1aSopenharmony_ci            vldmdb  v6!, {s2}
344cabdff1aSopenharmony_ci            vldr    d9, [\z3, #8*(o3+1)]        @ s18,s19
345cabdff1aSopenharmony_ci            vldmia  v5!, {s0,s1}                @ s0 is unused
346cabdff1aSopenharmony_ci        vldr    s7, [\z2, #8*o2]            @ t1
347cabdff1aSopenharmony_ci            vmul.f  s20, s16, s2                @ vector * scalar
348cabdff1aSopenharmony_ci        vldr    s0, [\z3, #8*o3]            @ t5
349cabdff1aSopenharmony_ci        vldr    s6, [\z2, #8*o2+4]          @ t2
350cabdff1aSopenharmony_ci        vldr    s3, [\z3, #8*o3+4]          @ t6
351cabdff1aSopenharmony_ci            vmul.f  s16, s16, s1                @ vector * scalar
352cabdff1aSopenharmony_ci        ldr     a4, =\n-1
353cabdff1aSopenharmony_ci1:      add     \z0, \z0, #8*2
354cabdff1aSopenharmony_ci .if \n*4*2 >= 512
355cabdff1aSopenharmony_ci        add     \z1, \z1, #8*2
356cabdff1aSopenharmony_ci .endif
357cabdff1aSopenharmony_ci .if \n*4*2 >= 256
358cabdff1aSopenharmony_ci        add     \z2, \z2, #8*2
359cabdff1aSopenharmony_ci .endif
360cabdff1aSopenharmony_ci .if \n*4*2 >= 512
361cabdff1aSopenharmony_ci        add     \z3, \z3, #8*2
362cabdff1aSopenharmony_ci .endif
363cabdff1aSopenharmony_ci        @ up to 2 stalls (VFP vector issuing / waiting for s0)
364cabdff1aSopenharmony_ci        @ depending upon whether this is the first iteration and
365cabdff1aSopenharmony_ci        @ how many add instructions are inserted above
366cabdff1aSopenharmony_ci        vadd.f  s4, s0, s7                  @ t5
367cabdff1aSopenharmony_ci        vadd.f  s5, s6, s3                  @ t6
368cabdff1aSopenharmony_ci        vsub.f  s6, s6, s3                  @ t4
369cabdff1aSopenharmony_ci        vsub.f  s7, s0, s7                  @ t3
370cabdff1aSopenharmony_ci        vldr    d6, [\z0, #8*0-8*2]         @ s12,s13
371cabdff1aSopenharmony_ci            vadd.f  s0, s16, s21                @ t1
372cabdff1aSopenharmony_ci        vldr    d7, [\z1, #8*o1-8*2]        @ s14,s15
373cabdff1aSopenharmony_ci            vsub.f  s1, s18, s23                @ t5
374cabdff1aSopenharmony_ci        vadd.f  s8, s4, s12                 @ vector + vector
375cabdff1aSopenharmony_ci        @ stall (VFP vector issuing)
376cabdff1aSopenharmony_ci        @ stall (VFP vector issuing)
377cabdff1aSopenharmony_ci        @ stall (VFP vector issuing)
378cabdff1aSopenharmony_ci        vsub.f  s4, s12, s4
379cabdff1aSopenharmony_ci        vsub.f  s5, s13, s5
380cabdff1aSopenharmony_ci        vsub.f  s6, s14, s6
381cabdff1aSopenharmony_ci        vsub.f  s7, s15, s7
382cabdff1aSopenharmony_ci            vsub.f  s2, s17, s20                @ t2
383cabdff1aSopenharmony_ci            vadd.f  s3, s19, s22                @ t6
384cabdff1aSopenharmony_ci        vstr    d4, [\z0, #8*0-8*2]         @ s8,s9
385cabdff1aSopenharmony_ci        vstr    d5, [\z1, #8*o1-8*2]        @ s10,s11
386cabdff1aSopenharmony_ci        @ stall (waiting for s5)
387cabdff1aSopenharmony_ci        vstr    d2, [\z2, #8*o2-8*2]        @ s4,s5
388cabdff1aSopenharmony_ci            vadd.f  s4, s1, s0                  @ t5
389cabdff1aSopenharmony_ci        vstr    d3, [\z3, #8*o3-8*2]        @ s6,s7
390cabdff1aSopenharmony_ci            vsub.f  s7, s1, s0                  @ t3
391cabdff1aSopenharmony_ci            vadd.f  s5, s2, s3                  @ t6
392cabdff1aSopenharmony_ci            vsub.f  s6, s2, s3                  @ t4
393cabdff1aSopenharmony_ci            vldr    d6, [\z0, #8*1-8*2]         @ s12,s13
394cabdff1aSopenharmony_ci            vldr    d7, [\z1, #8*(o1+1)-8*2]    @ s14,s15
395cabdff1aSopenharmony_ci                vldr    d4, [\z2, #8*o2]            @ s8,s9
396cabdff1aSopenharmony_ci                vldmdb  v6!, {s2,s3}
397cabdff1aSopenharmony_ci                vldr    d5, [\z3, #8*o3]            @ s10,s11
398cabdff1aSopenharmony_ci            vadd.f  s20, s4, s12                @ vector + vector
399cabdff1aSopenharmony_ci                vldmia  v5!, {s0,s1}
400cabdff1aSopenharmony_ci                    vldr    d8, [\z2, #8*(o2+1)]        @ s16,s17
401cabdff1aSopenharmony_ci            @ stall (VFP vector issuing)
402cabdff1aSopenharmony_ci            vsub.f  s4, s12, s4
403cabdff1aSopenharmony_ci            vsub.f  s5, s13, s5
404cabdff1aSopenharmony_ci            vsub.f  s6, s14, s6
405cabdff1aSopenharmony_ci            vsub.f  s7, s15, s7
406cabdff1aSopenharmony_ci                vmul.f  s12, s8, s3                 @ vector * scalar
407cabdff1aSopenharmony_ci            vstr    d10, [\z0, #8*1-8*2]        @ s20,s21
408cabdff1aSopenharmony_ci                    vldr    d9, [\z3, #8*(o3+1)]        @ s18,s19
409cabdff1aSopenharmony_ci            vstr    d11, [\z1, #8*(o1+1)-8*2]   @ s22,s23
410cabdff1aSopenharmony_ci                vmul.f  s8, s8, s0                  @ vector * scalar
411cabdff1aSopenharmony_ci            vstr    d2, [\z2, #8*(o2+1)-8*2]    @ s4,s5
412cabdff1aSopenharmony_ci            @ stall (waiting for s7)
413cabdff1aSopenharmony_ci            vstr    d3, [\z3, #8*(o3+1)-8*2]    @ s6,s7
414cabdff1aSopenharmony_ci                    vmul.f  s20, s16, s2                @ vector * scalar
415cabdff1aSopenharmony_ci                @ stall (VFP vector issuing)
416cabdff1aSopenharmony_ci                @ stall (VFP vector issuing)
417cabdff1aSopenharmony_ci                @ stall (VFP vector issuing)
418cabdff1aSopenharmony_ci                vadd.f  s7, s8, s13                 @ t1
419cabdff1aSopenharmony_ci                vsub.f  s6, s9, s12                 @ t2
420cabdff1aSopenharmony_ci                vsub.f  s0, s10, s15                @ t5
421cabdff1aSopenharmony_ci                vadd.f  s3, s11, s14                @ t6
422cabdff1aSopenharmony_ci                    vmul.f  s16, s16, s1                @ vector * scalar
423cabdff1aSopenharmony_ci        subs    a4, a4, #1
424cabdff1aSopenharmony_ci        bne     1b
425cabdff1aSopenharmony_ci        @ What remains is identical to the first two indentations of
426cabdff1aSopenharmony_ci        @ the above, but without the increment of z
427cabdff1aSopenharmony_ci        vadd.f  s4, s0, s7                  @ t5
428cabdff1aSopenharmony_ci        vadd.f  s5, s6, s3                  @ t6
429cabdff1aSopenharmony_ci        vsub.f  s6, s6, s3                  @ t4
430cabdff1aSopenharmony_ci        vsub.f  s7, s0, s7                  @ t3
431cabdff1aSopenharmony_ci        vldr    d6, [\z0, #8*0]             @ s12,s13
432cabdff1aSopenharmony_ci            vadd.f  s0, s16, s21                @ t1
433cabdff1aSopenharmony_ci        vldr    d7, [\z1, #8*o1]            @ s14,s15
434cabdff1aSopenharmony_ci            vsub.f  s1, s18, s23                @ t5
435cabdff1aSopenharmony_ci        vadd.f  s8, s4, s12                 @ vector + vector
436cabdff1aSopenharmony_ci        vsub.f  s4, s12, s4
437cabdff1aSopenharmony_ci        vsub.f  s5, s13, s5
438cabdff1aSopenharmony_ci        vsub.f  s6, s14, s6
439cabdff1aSopenharmony_ci        vsub.f  s7, s15, s7
440cabdff1aSopenharmony_ci            vsub.f  s2, s17, s20                @ t2
441cabdff1aSopenharmony_ci            vadd.f  s3, s19, s22                @ t6
442cabdff1aSopenharmony_ci        vstr    d4, [\z0, #8*0]             @ s8,s9
443cabdff1aSopenharmony_ci        vstr    d5, [\z1, #8*o1]            @ s10,s11
444cabdff1aSopenharmony_ci        vstr    d2, [\z2, #8*o2]            @ s4,s5
445cabdff1aSopenharmony_ci            vadd.f  s4, s1, s0                  @ t5
446cabdff1aSopenharmony_ci        vstr    d3, [\z3, #8*o3]            @ s6,s7
447cabdff1aSopenharmony_ci            vsub.f  s7, s1, s0                  @ t3
448cabdff1aSopenharmony_ci            vadd.f  s5, s2, s3                  @ t6
449cabdff1aSopenharmony_ci            vsub.f  s6, s2, s3                  @ t4
450cabdff1aSopenharmony_ci            vldr    d6, [\z0, #8*1]             @ s12,s13
451cabdff1aSopenharmony_ci            vldr    d7, [\z1, #8*(o1+1)]        @ s14,s15
452cabdff1aSopenharmony_ci            vadd.f  s20, s4, s12                @ vector + vector
453cabdff1aSopenharmony_ci            vsub.f  s4, s12, s4
454cabdff1aSopenharmony_ci            vsub.f  s5, s13, s5
455cabdff1aSopenharmony_ci            vsub.f  s6, s14, s6
456cabdff1aSopenharmony_ci            vsub.f  s7, s15, s7
457cabdff1aSopenharmony_ci            vstr    d10, [\z0, #8*1]            @ s20,s21
458cabdff1aSopenharmony_ci            vstr    d11, [\z1, #8*(o1+1)]       @ s22,s23
459cabdff1aSopenharmony_ci            vstr    d2, [\z2, #8*(o2+1)]        @ s4,s5
460cabdff1aSopenharmony_ci            vstr    d3, [\z3, #8*(o3+1)]        @ s6,s7
461cabdff1aSopenharmony_ci.endm
462cabdff1aSopenharmony_ci
@ def_fft n, n2, n4: emit the size-n transform built from smaller ones.
@   n  = transform size; n2 and n4 name the half- and quarter-size workers
@        that are called (the invocations pass n/2 and n/4)
@ Defines two functions:
@   .Lfft<n>_internal_vfp  modified-AAPCS worker with a1 = z: runs the n2
@                          worker at z, the n4 worker at z + n/2 and at
@                          z + 3n/4 elements, then the pass macro with v5
@                          pointing at ff_cos_<n>
@   fft<n>_vfp             wrapper that switches FPSCR, preserves s16-s31
@                          and keeps the old FPSCR in a2 and lr in ip
@ The worker saves only the core registers its variant of pass uses:
@ v1 as the single base pointer below 256, v1-v2 for 256, v1-v4 from 512
@ up, plus v5-v6 for the cosine table pointers and lr.
@ o1, o2, o3 are set per variant so that base register plus offset always
@ addresses the quarters at 2, 4 and 6 times n/8 elements.
@ The closing .ltorg emits the literal pool for the ldr = loads.
463cabdff1aSopenharmony_ci.macro  def_fft n, n2, n4
464cabdff1aSopenharmony_cifunction .Lfft\n\()_internal_vfp
465cabdff1aSopenharmony_ci .if \n >= 512
466cabdff1aSopenharmony_ci        push    {v1-v6,lr}
467cabdff1aSopenharmony_ci .elseif \n >= 256
468cabdff1aSopenharmony_ci        push    {v1-v2,v5-v6,lr}
469cabdff1aSopenharmony_ci .else
470cabdff1aSopenharmony_ci        push    {v1,v5-v6,lr}
471cabdff1aSopenharmony_ci .endif
472cabdff1aSopenharmony_ci        mov     v1, a1              @ v1 = z, kept across the calls below
473cabdff1aSopenharmony_ci        bl      .Lfft\n2\()_internal_vfp
474cabdff1aSopenharmony_ci        add     a1, v1, #8*(\n/4)*2 @ a1 = z + n/2 elements
475cabdff1aSopenharmony_ci        bl      .Lfft\n4\()_internal_vfp
476cabdff1aSopenharmony_ci        movrelx v5, X(ff_cos_\n), a1    @ a1 is reloaded on the next line
477cabdff1aSopenharmony_ci        add     a1, v1, #8*(\n/4)*3 @ a1 = z + 3n/4 elements
478cabdff1aSopenharmony_ci        bl      .Lfft\n4\()_internal_vfp
479cabdff1aSopenharmony_ci .if \n >= 512
480cabdff1aSopenharmony_ci  .set o1, 0*(\n/4/2)
481cabdff1aSopenharmony_ci  .set o2, 0*(\n/4/2)
482cabdff1aSopenharmony_ci  .set o3, 0*(\n/4/2)
483cabdff1aSopenharmony_ci        add     v2, v1, #8*2*(\n/4/2)
484cabdff1aSopenharmony_ci        add     v3, v1, #8*4*(\n/4/2)
485cabdff1aSopenharmony_ci        add     v4, v1, #8*6*(\n/4/2)
486cabdff1aSopenharmony_ci        pass    (\n/4/2), v1, v2, v3, v4
487cabdff1aSopenharmony_ci        pop     {v1-v6,pc}
488cabdff1aSopenharmony_ci .elseif \n >= 256
489cabdff1aSopenharmony_ci  .set o1, 2*(\n/4/2)
490cabdff1aSopenharmony_ci  .set o2, 0*(\n/4/2)
491cabdff1aSopenharmony_ci  .set o3, 2*(\n/4/2)
492cabdff1aSopenharmony_ci        add     v2, v1, #8*4*(\n/4/2)
493cabdff1aSopenharmony_ci        pass    (\n/4/2), v1, v1, v2, v2
494cabdff1aSopenharmony_ci        pop     {v1-v2,v5-v6,pc}
495cabdff1aSopenharmony_ci .else
496cabdff1aSopenharmony_ci  .set o1, 2*(\n/4/2)
497cabdff1aSopenharmony_ci  .set o2, 4*(\n/4/2)
498cabdff1aSopenharmony_ci  .set o3, 6*(\n/4/2)
499cabdff1aSopenharmony_ci        pass    (\n/4/2), v1, v1, v1, v1
500cabdff1aSopenharmony_ci        pop     {v1,v5-v6,pc}
501cabdff1aSopenharmony_ci .endif
502cabdff1aSopenharmony_ciendfunc
503cabdff1aSopenharmony_ci
504cabdff1aSopenharmony_cifunction fft\n\()_vfp
505cabdff1aSopenharmony_ci        ldr     a3, =0x03030000 /* RunFast mode, vector length 4, stride 1 */
506cabdff1aSopenharmony_ci        fmrx    a2, FPSCR           @ keep the original FPSCR in a2
507cabdff1aSopenharmony_ci        fmxr    FPSCR, a3
508cabdff1aSopenharmony_ci        vpush   {s16-s31}
509cabdff1aSopenharmony_ci        mov     ip, lr              @ bl below overwrites lr
510cabdff1aSopenharmony_ci        bl      .Lfft\n\()_internal_vfp
511cabdff1aSopenharmony_ci        vpop    {s16-s31}
512cabdff1aSopenharmony_ci        fmxr    FPSCR, a2           @ restore the original FPSCR
513cabdff1aSopenharmony_ci        bx      ip
514cabdff1aSopenharmony_ciendfunc
515cabdff1aSopenharmony_ci
516cabdff1aSopenharmony_ci.ltorg
517cabdff1aSopenharmony_ci.endm
518cabdff1aSopenharmony_ci
@ Instantiate every size from 32 to 65536; the arguments are n, n/2, n/4.
@ Each size calls the internal workers of the two smaller sizes named on
@ its line (sizes 32 and 64 reach the hand-written 8 and 16 point workers).
519cabdff1aSopenharmony_ci        def_fft    32,    16,     8
520cabdff1aSopenharmony_ci        def_fft    64,    32,    16
521cabdff1aSopenharmony_ci        def_fft   128,    64,    32
522cabdff1aSopenharmony_ci        def_fft   256,   128,    64
523cabdff1aSopenharmony_ci        def_fft   512,   256,   128
524cabdff1aSopenharmony_ci        def_fft  1024,   512,   256
525cabdff1aSopenharmony_ci        def_fft  2048,  1024,   512
526cabdff1aSopenharmony_ci        def_fft  4096,  2048,  1024
527cabdff1aSopenharmony_ci        def_fft  8192,  4096,  2048
528cabdff1aSopenharmony_ci        def_fft 16384,  8192,  4096
529cabdff1aSopenharmony_ci        def_fft 32768, 16384,  8192
530cabdff1aSopenharmony_ci        def_fft 65536, 32768, 16384
531