/*
 * ARM NEON optimised FFT
 *
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2009 Naotoshi Nojiri
 * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
 *
 * This algorithm (though not any of the implementation details) is
 * based on libdjbfft by D. J. Bernstein.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

// 1/sqrt(2); the magnitude of every entry in the mppm table in this file.
#define M_SQRT1_2 0.70710678118654752440

// transpose d0, d1, s0, s1
// Emits a trn1/trn2 pair.  With the .2d arrangement (the only one used in
// this file) the result is
//      d0 = { s0[0], s1[0] }
//      d1 = { s0[1], s1[1] }
// d0 is written before trn2 reads the sources, so d0 must not be the same
// register as s0 or s1.
.macro transpose d0, d1, s0, s1
        trn1            \d0, \s0, \s1
        trn2            \d1, \s0, \s1
.endm


// fft4_neon: in-place 4-point FFT on the complex floats at [x0].
//
// In:       x0 = z, four {re,im} single-precision pairs (32 bytes)
// Out:      z[0..3] overwritten, x0 unchanged
// Clobbers: v0-v7, v16, v17
//
// Reached through fft_tab_neon via the br in ff_fft_calc_neon.
// AARCH64_VALID_JUMP_TARGET is a macro from asm.S (presumably the landing pad
// needed for that indirect branch - confirm in asm.S).
// NOTE(review): the butterfly looks like it expects input already reordered
// by ff_fft_permute_neon - confirm against the caller.
// Lane comments are written {lane0,lane1}; rN/iN are the real/imaginary
// parts of z[N].
function fft4_neon
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.2s,v1.2s,v2.2s,v3.2s}, [x0] // v0..v3 = z[0]..z[3]

        fadd            v4.2s,  v0.2s,  v1.2s   // r0+r1,i0+i1
        fsub            v6.2s,  v0.2s,  v1.2s   // r0-r1,i0-i1

        ext             v16.8b, v2.8b,  v3.8b,  #4      // i2,r3
        ext             v17.8b, v3.8b,  v2.8b,  #4      // i3,r2

        fadd            v5.2s,  v2.2s,  v3.2s   // r2+r3,i2+i3
        fsub            v7.2s,  v16.2s, v17.2s  // i2-i3,r3-r2

        fadd            v0.2s,  v4.2s,  v5.2s   // new z[0]
        fsub            v2.2s,  v4.2s,  v5.2s   // new z[2]
        fadd            v1.2s,  v6.2s,  v7.2s   // new z[1]
        fsub            v3.2s,  v6.2s,  v7.2s   // new z[3]

        st1             {v0.2s,v1.2s,v2.2s,v3.2s}, [x0]

        ret
endfunc

// fft8_neon: in-place 8-point FFT on the complex floats at [x0].
//
// In:       x0  = z, eight {re,im} single-precision pairs (64 bytes)
//           v28 = mppm = {-w, w, w, -w} with w = M_SQRT1_2
//                 (loaded by ff_fft_calc_neon; only v28.2s and v28.s[1] are read)
// Out:      z[0..7] overwritten
// Clobbers: x0 (left at z + 32 bytes), x1 (= z), v0-v7, v16-v27
//
// z[0..3] go through the same butterfly as fft4_neon.  For z[4..7] the
// differences a2 = z[4]-z[5] and a3 = z[6]-z[7] are multiplied by w and
// combined with their lane-swapped copies (t1,t2 / t5,t6 below) before the
// two halves are merged.  Lane comments are written {lane0,lane1}.
// Reached both through fft_tab_neon (br) and by bl from fft32_neon.
function fft8_neon
        AARCH64_VALID_JUMP_TARGET
        mov             x1,  x0                          // keep z for the final store
        ld1             {v0.2s, v1.2s, v2.2s, v3.2s},  [x0], #32   // z[0]..z[3]
        ld1             {v16.2s,v17.2s,v18.2s,v19.2s}, [x0]        // z[4]..z[7]
        ext             v22.8b, v2.8b,  v3.8b,  #4       // i2,r3
        ext             v23.8b, v3.8b,  v2.8b,  #4       // i3,r2
        fadd            v4.2s,  v16.2s, v17.2s           // r4+r5,i4+i5
        fadd            v5.2s,  v18.2s, v19.2s           // r6+r7,i6+i7
        fsub            v17.2s, v16.2s, v17.2s           // r4-r5,i4-i5
        fsub            v19.2s, v18.2s, v19.2s           // r6-r7,i6-i7
        rev64           v27.2s, v28.2s                   // v27 = w,-w (low half of v28 is -w,w)
        fadd            v20.2s, v0.2s,  v1.2s            // r0+r1,i0+i1
        fadd            v21.2s, v2.2s,  v3.2s            // r2+r3,i2+i3
        fmul            v26.2s, v17.2s, v28.2s           // -a2r*w,a2i*w
        ext             v6.8b,  v4.8b,  v5.8b,  #4
        ext             v7.8b,  v5.8b,  v4.8b,  #4
        fmul            v27.2s, v19.2s, v27.2s           // a3r*w,-a3i*w
        fsub            v23.2s, v22.2s, v23.2s           // i2-i3,r3-r2
        fsub            v22.2s, v0.2s,  v1.2s            // r0-r1,i0-i1
        fmul            v24.2s, v17.2s, v28.s[1]         // a2r*w,a2i*w
        fmul            v25.2s, v19.2s, v28.s[1]         // a3r*w,a3i*w
        fadd            v0.2s,  v20.2s, v21.2s
        fsub            v2.2s,  v20.2s, v21.2s
        fadd            v1.2s,  v22.2s, v23.2s
        rev64           v26.2s, v26.2s                   // a2i*w,-a2r*w
        rev64           v27.2s, v27.2s                   // -a3i*w,a3r*w
        fsub            v3.2s,  v22.2s, v23.2s
        fsub            v6.2s,  v6.2s,  v7.2s
        fadd            v24.2s, v24.2s, v26.2s  // a2r+a2i,a2i-a2r   t1,t2   (times w)
        fadd            v25.2s, v25.2s, v27.2s  // a3r-a3i,a3i+a3r   t5,t6   (times w)
        fadd            v7.2s,  v4.2s,  v5.2s
        fsub            v18.2s, v2.2s,  v6.2s
        ext             v26.8b, v24.8b, v25.8b, #4
        ext             v27.8b, v25.8b, v24.8b, #4
        fadd            v2.2s,  v2.2s,  v6.2s
        fsub            v16.2s, v0.2s,  v7.2s
        fadd            v5.2s,  v25.2s, v24.2s
        fsub            v4.2s,  v26.2s, v27.2s
        fadd            v0.2s,  v0.2s,  v7.2s
        fsub            v17.2s, v1.2s,  v5.2s
        fsub            v19.2s, v3.2s,  v4.2s
        fadd            v3.2s,  v3.2s,  v4.2s
        fadd            v1.2s,  v1.2s,  v5.2s

        st1             {v16.2s,v17.2s,v18.2s,v19.2s}, [x0]     // z[4]..z[7]
        st1             {v0.2s, v1.2s, v2.2s, v3.2s},  [x1]     // z[0]..z[3]

        ret
endfunc

// fft16_neon: in-place 16-point FFT on the complex floats at [x0].
//
// In:       x0  = z, sixteen {re,im} single-precision pairs (128 bytes)
//           x14 = address of ff_cos_16 (first four floats are read)
//           v28 = mppm  = {-w, w, w, -w} with w = M_SQRT1_2
//           v29 = pmmp  = {+1, -1, -1, +1}
//           v30 = trans4_float, v31 = trans8_float (tbl index vectors)
//           (all of the above are set up by ff_fft_calc_neon)
// Out:      z[0..15] overwritten
// Clobbers: x0, x1, v0-v7, v16-v27
//
// Structure: the fft8_neon sequence on z[0..7], interleaved with two
// fft4-style butterflies on z[8..11] and z[12..15] done side by side in
// 4-lane registers, followed by an inlined pass equivalent to one
// fft_pass_neon step pair with o1 = 4, o2 = 8, o3 = 12.
// Reached both through fft_tab_neon (br) and by bl from the def_fft functions.
function fft16_neon
        AARCH64_VALID_JUMP_TARGET
        mov             x1,  x0                          // keep z for the final stores
        ld1             {v0.2s, v1.2s, v2.2s, v3.2s},  [x0], #32   // z[0]..z[3]
        ld1             {v16.2s,v17.2s,v18.2s,v19.2s}, [x0], #32   // z[4]..z[7]
        ext             v22.8b, v2.8b,  v3.8b,  #4       // i2,r3
        ext             v23.8b, v3.8b,  v2.8b,  #4       // i3,r2
        fadd            v4.2s,  v16.2s, v17.2s           // r4+r5,i4+i5
        fadd            v5.2s,  v18.2s, v19.2s           // r6+r7,i6+i7
        fsub            v17.2s, v16.2s, v17.2s           // r4-r5,i4-i5
        fsub            v19.2s, v18.2s, v19.2s           // r6-r7,i6-i7
        rev64           v27.2s, v28.2s                   // v27 = w,-w (low half of v28 is -w,w)
        fadd            v20.2s, v0.2s,  v1.2s            // r0+r1,i0+i1
        fadd            v21.2s, v2.2s,  v3.2s            // r2+r3,i2+i3
        fmul            v26.2s, v17.2s, v28.2s           // -a2r*w,a2i*w
        ext             v6.8b,  v4.8b,  v5.8b,  #4
        ext             v7.8b,  v5.8b,  v4.8b,  #4
        fmul            v27.2s, v19.2s, v27.2s           // a3r*w,-a3i*w
        fsub            v23.2s, v22.2s, v23.2s           // i2-i3,r3-r2
        fsub            v22.2s, v0.2s,  v1.2s            // r0-r1,i0-i1
        fmul            v24.2s, v17.2s, v28.s[1]         // a2r*w,a2i*w
        fmul            v25.2s, v19.2s, v28.s[1]         // a3r*w,a3i*w
        fadd            v0.2s,  v20.2s, v21.2s
        fsub            v2.2s,  v20.2s, v21.2s
        fadd            v1.2s,  v22.2s, v23.2s
        rev64           v26.2s, v26.2s
        rev64           v27.2s, v27.2s
        fsub            v3.2s,  v22.2s, v23.2s
        fsub            v6.2s,  v6.2s,  v7.2s
        fadd            v24.2s, v24.2s, v26.2s  // a2r+a2i,a2i-a2r   t1,t2   (times w)
        fadd            v25.2s, v25.2s, v27.2s  // a3r-a3i,a3i+a3r   t5,t6   (times w)
        fadd            v7.2s,  v4.2s,  v5.2s
        fsub            v18.2s, v2.2s,  v6.2s
        ld1             {v20.4s,v21.4s}, [x0], #32      // {z[8],z[9]},   {z[10],z[11]}
        ld1             {v22.4s,v23.4s}, [x0], #32      // {z[12],z[13]}, {z[14],z[15]}
        ext             v26.8b, v24.8b, v25.8b, #4
        ext             v27.8b, v25.8b, v24.8b, #4
        fadd            v2.2s,  v2.2s,  v6.2s
        fsub            v16.2s, v0.2s,  v7.2s
        fadd            v5.2s,  v25.2s, v24.2s
        fsub            v4.2s,  v26.2s, v27.2s
        transpose       v24.2d, v25.2d, v20.2d, v22.2d  // v24 = {z[8],z[12]},  v25 = {z[9],z[13]}
        transpose       v26.2d, v27.2d, v21.2d, v23.2d  // v26 = {z[10],z[14]}, v27 = {z[11],z[15]}
        fadd            v0.2s,  v0.2s,  v7.2s
        fsub            v17.2s, v1.2s,  v5.2s
        fsub            v19.2s, v3.2s,  v4.2s
        fadd            v3.2s,  v3.2s,  v4.2s
        fadd            v1.2s,  v1.2s,  v5.2s
        ext             v20.16b, v21.16b, v21.16b,  #4  // i10,r11,i11,r10
        ext             v21.16b, v23.16b, v23.16b,  #4  // i14,r15,i15,r14

        // Pack the fft8 results two complex values per 128-bit register.
        zip1            v0.2d,  v0.2d,  v1.2d   // {z[0],   z[1]}
        zip1            v1.2d,  v2.2d,  v3.2d   // {z[2],   z[3]}
        zip1            v2.2d,  v16.2d, v17.2d  // {z[o1],  z[o1+1]}
        zip1            v3.2d,  v18.2d, v19.2d  // {z[o1+2],z[o1+3]}

        // 2 x fft4: low 64 bits work on z[8..11] (o2), high 64 bits on z[12..15] (o3)
        transpose       v22.2d, v23.2d, v20.2d, v21.2d  // v22 = i10,r11,i14,r15  v23 = i11,r10,i15,r14

        fadd            v4.4s,  v24.4s, v25.4s
        fadd            v5.4s,  v26.4s, v27.4s
        fsub            v6.4s,  v24.4s, v25.4s
        fsub            v7.4s,  v22.4s, v23.4s

        ld1             {v23.4s},  [x14]        // first four floats of ff_cos_16

        fadd            v24.4s, v4.4s,  v5.4s   // {z[o2+0],z[o3+0]}
        fsub            v26.4s, v4.4s,  v5.4s   // {z[o2+2],z[o3+2]}
        fadd            v25.4s, v6.4s,  v7.4s   // {z[o2+1],z[o3+1]}
        fsub            v27.4s, v6.4s,  v7.4s   // {z[o2+3],z[o3+3]}

        //fft_pass_neon_16
        // Same arithmetic as the first (peeled) step of fft_pass_neon, with the
        // table values taken from v23 instead of from memory.
        // NOTE(review): v23.s[1] / v23.s[3] presumably play the roles of
        // wre[1] / wim[-1] - confirm against the ff_cos_16 layout.
        rev64           v7.4s,  v25.4s          // swap re/im of each complex value
        fmul            v25.4s, v25.4s, v23.s[1]
        fmul            v7.4s,  v7.4s,  v29.4s  // apply the +,-,-,+ signs
        fmla            v25.4s, v7.4s,  v23.s[3] // {t1a,t2a,t5a,t6a}

        zip1            v20.4s, v24.4s, v25.4s
        zip2            v21.4s, v24.4s, v25.4s
        fneg            v22.4s, v20.4s
        fadd            v4.4s,  v21.4s, v20.4s
        fsub            v6.4s,  v20.4s, v21.4s  // just the second half
        fadd            v5.4s,  v21.4s, v22.4s  // just the first half

        tbl             v4.16b, {v4.16b},        v30.16b // trans4_float
        tbl             v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float

        fsub            v20.4s, v0.4s,  v4.4s   // {z[o2],z[o2+1]}
        fadd            v16.4s, v0.4s,  v4.4s   // {z[0], z[1]}
        fsub            v22.4s, v2.4s,  v5.4s   // {z[o3],z[o3+1]}
        fadd            v18.4s, v2.4s,  v5.4s   // {z[o1],z[o1+1]}

//second half
        // Same arithmetic as one fft_pass_neon loop iteration, on elements 2 and 3
        // of each quarter.
        rev64           v6.4s,  v26.4s
        fmul            v26.4s, v26.4s, v23.s[2]
        rev64           v7.4s,  v27.4s
        fmul            v27.4s, v27.4s, v23.s[3]
        fmul            v6.4s,  v6.4s,  v29.4s
        fmul            v7.4s,  v7.4s,  v29.4s
        fmla            v26.4s, v6.4s,  v23.s[2] // {t1,t2,t5,t6}
        fmla            v27.4s, v7.4s,  v23.s[1] // {t1a,t2a,t5a,t6a}

        zip1            v24.4s, v26.4s, v27.4s
        zip2            v25.4s, v26.4s, v27.4s
        fneg            v26.4s, v24.4s
        fadd            v4.4s,  v25.4s, v24.4s
        fsub            v6.4s,  v24.4s, v25.4s  // just the second half
        fadd            v5.4s,  v25.4s, v26.4s  // just the first half

        tbl             v4.16b, {v4.16b},        v30.16b // trans4_float
        tbl             v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float

        fadd            v17.4s, v1.4s, v4.4s    // {z[2], z[3]}
        fsub            v21.4s, v1.4s, v4.4s    // {z[o2+2],z[o2+3]}
        fadd            v19.4s, v3.4s, v5.4s    // {z[o1+2],z[o1+3]}
        fsub            v23.4s, v3.4s, v5.4s    // {z[o3+2],z[o3+3]}

        st1             {v16.4s,v17.4s}, [x1], #32      // z[0..3]
        st1             {v18.4s,v19.4s}, [x1], #32      // z[4..7]
        st1             {v20.4s,v21.4s}, [x1], #32      // z[8..11]
        st1             {v22.4s,v23.4s}, [x1], #32      // z[12..15]

        ret
endfunc


// tbl byte indices, loaded into v30 by ff_fft_calc_neon.
// Used with a one-register table: for input floats {a0,a1,a2,a3} the result
// is {a0,a2,a1,a3} (the two middle 32-bit lanes are swapped).
const  trans4_float, align=4
        .byte    0,  1,  2,  3
        .byte    8,  9, 10, 11
        .byte    4,  5,  6,  7
        .byte   12, 13, 14, 15
endconst

// tbl byte indices, loaded into v31 by ff_fft_calc_neon.
// Used with a two-register table {A,B} (A = bytes 0-15, B = bytes 16-31):
// the result is the floats {B[2], A[0], B[3], A[1]}.
const  trans8_float, align=4
        .byte   24, 25, 26, 27
        .byte    0,  1,  2,  3
        .byte   28, 29, 30, 31
        .byte    4,  5,  6,  7
endconst

// fft_pass_neon: in-place pass that combines the four quarters of z.
//
// In:   x0  = z
//       x2  = n; the quarters start at z, z+o1, z+o2, z+o3 with
//             o1 = 2n, o2 = 4n, o3 = 6n (in 8-byte FFTComplex units)
//       x4  = wre, float table read upwards two entries per step
//       x7  = -8  (post-index step used to walk the second table backwards)
//       v29 = pmmp, v30 = trans4_float, v31 = trans8_float
//             (x7 and v29-v31 are set up by ff_fft_calc_neon)
// The second table pointer (wim) is derived here as wre + 2n floats and is
// read downwards starting at wim[-1].
//
// Every step handles two complex values from each quarter (2n values per
// quarter in total).  The first step is peeled: there only the second value
// of the o2/o3 pair is multiplied by table entries, the first one is used
// as loaded.  The loop then runs n-1 times and is tested at the bottom with
// b.ne, so n must be at least 2.
//
// Clobbers: x0-x6 (x0-x3 end one past their quarter), v4-v7, v16, v17, v20-v27
// Entered by a direct branch (tail call) from the def_fft functions; the ret
// returns to their caller.
function fft_pass_neon
        sub             x6,  x2,  #1            // n - 1, loop counter
        lsl             x5,  x2,  #3            // 2 * n * sizeof FFTSample
        lsl             x1,  x2,  #4            // 2 * n * sizeof FFTComplex
        add             x5,  x4,  x5            // wim
        add             x3,  x1,  x2,  lsl #5   // 6 * n * sizeof FFTComplex (x1 + 4 * n * sizeof FFTComplex)
        add             x2,  x0,  x2,  lsl #5   // &z[o2]
        add             x3,  x0,  x3            // &z[o3]
        add             x1,  x0,  x1            // &z[o1]
        ld1             {v20.4s},[x2]           // {z[o2],z[o2+1]}
        ld1             {v22.4s},[x3]           // {z[o3],z[o3+1]}
        ld1             {v4.2s},  [x4], #8      // {wre[0],wre[1]}
        trn2            v25.2d, v20.2d, v22.2d  // {z[o2+1],z[o3+1]}
        sub             x5,  x5,  #4            // wim--
        trn1            v24.2d, v20.2d, v22.2d  // {z[o2],z[o3]}
        ld1             {v5.s}[0],  [x5], x7    // d5[0] = wim[-1]
        rev64           v7.4s,  v25.4s          // swap re/im of each complex value
        fmul            v25.4s, v25.4s, v4.s[1]
        ld1             {v16.4s}, [x0]          // {z[0],z[1]}
        fmul            v7.4s,  v7.4s,  v29.4s  // apply the +,-,-,+ signs (pmmp)
        ld1             {v17.4s}, [x1]          // {z[o1],z[o1+1]}
        prfm            pldl1keep, [x2, #16]    // prefetch the data of the next step
        prfm            pldl1keep, [x3, #16]
        fmla            v25.4s, v7.4s,  v5.s[0] // {t1a,t2a,t5a,t6a}
        prfm            pldl1keep, [x0, #16]
        prfm            pldl1keep, [x1, #16]

        zip1            v20.4s, v24.4s, v25.4s
        zip2            v21.4s, v24.4s, v25.4s
        fneg            v22.4s, v20.4s
        fadd            v4.4s,  v21.4s, v20.4s
        fsub            v6.4s,  v20.4s, v21.4s  // just the second half
        fadd            v5.4s,  v21.4s, v22.4s  // just the first half

        tbl             v4.16b, {v4.16b},        v30.16b // trans4_float
        tbl             v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float

        fadd            v20.4s, v16.4s, v4.4s
        fsub            v22.4s, v16.4s, v4.4s
        fadd            v21.4s, v17.4s, v5.4s
        st1             {v20.4s}, [x0], #16     // {z[0], z[1]}
        fsub            v23.4s, v17.4s, v5.4s

        st1             {v21.4s}, [x1], #16     // {z[o1],z[o1+1]}
        st1             {v22.4s}, [x2], #16     // {z[o2],z[o2+1]}
        st1             {v23.4s}, [x3], #16     // {z[o3],z[o3+1]}
1:
        // Steady state: both values of the o2/o3 pair are multiplied.
        ld1             {v20.4s},[x2]    // {z[o2],z[o2+1]}
        ld1             {v22.4s},[x3]    // {z[o3],z[o3+1]}
        ld1             {v4.2s}, [x4], #8       // {wre[0],wre[1]}
        transpose       v26.2d, v27.2d, v20.2d, v22.2d  // v26 = {z[o2],z[o3]}, v27 = {z[o2+1],z[o3+1]}
        ld1             {v5.2s}, [x5], x7       // {wim[-1],wim[0]}
        rev64           v6.4s,  v26.4s
        fmul            v26.4s, v26.4s, v4.s[0]
        rev64           v7.4s,  v27.4s
        fmul            v27.4s, v27.4s, v4.s[1]
        fmul            v6.4s,  v6.4s,  v29.4s
        fmul            v7.4s,  v7.4s,  v29.4s
        ld1             {v16.4s},[x0]           // {z[0],z[1]}
        fmla            v26.4s, v6.4s,  v5.s[1] // {t1,t2,t5,t6}
        fmla            v27.4s, v7.4s,  v5.s[0] // {t1a,t2a,t5a,t6a}
        ld1             {v17.4s},[x1]           // {z[o1],z[o1+1]}

        subs            x6,  x6,  #1            // n--  (flags stay live until the b.ne below)

        zip1            v20.4s, v26.4s, v27.4s
        zip2            v21.4s, v26.4s, v27.4s
        fneg            v22.4s, v20.4s
        fadd            v4.4s,  v21.4s, v20.4s
        fsub            v6.4s,  v20.4s, v21.4s  // just the second half
        fadd            v5.4s,  v21.4s, v22.4s  // just the first half

        tbl             v4.16b, {v4.16b},        v30.16b // trans4_float
        tbl             v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float

        fadd            v20.4s, v16.4s, v4.4s
        fsub            v22.4s, v16.4s, v4.4s
        fadd            v21.4s, v17.4s, v5.4s
        st1             {v20.4s}, [x0], #16     // {z[0], z[1]}
        fsub            v23.4s, v17.4s, v5.4s

        st1             {v21.4s}, [x1], #16     // {z[o1],z[o1+1]}
        st1             {v22.4s}, [x2], #16     // {z[o2],z[o2+1]}
        st1             {v23.4s}, [x3], #16     // {z[o3],z[o3+1]}
        b.ne            1b

        ret
endfunc

// def_fft n, n2, n4
// Defines fft<n>_neon, an in-place transform of the n complex floats at x0,
// built from smaller transforms (n2 = n/2 and n4 = n/4 at every use below):
//      fft<n2>_neon on z[0 .. n/2)
//      fft<n4>_neon on z[n/2 .. 3n/4)
//      fft<n4>_neon on z[3n/4 .. n)
//      fft_pass_neon(z, ff_cos_<n>, n4/2) as a tail call
// x28 (callee-saved) holds &z[n/2] across the three calls; it and the return
// address are kept on the stack and restored before the tail call, so
// fft_pass_neon returns straight to the caller of fft<n>_neon.
// The AARCH64_* macros come from asm.S (presumably BTI / pointer
// authentication support - confirm there).
.macro  def_fft n, n2, n4
function fft\n\()_neon, align=6
        AARCH64_VALID_JUMP_TARGET
        AARCH64_SIGN_LINK_REGISTER
        stp             x28, x30, [sp, #-16]!   // push x28 and the return address
        add             x28, x0,  #\n4*2*8      // x28 = &z[n/2]
        bl              fft\n2\()_neon          // first half
        mov             x0,  x28
        bl              fft\n4\()_neon          // third quarter
        add             x0,  x28, #\n4*1*8      // x0 = &z[3n/4]
        bl              fft\n4\()_neon          // fourth quarter
        sub             x0,  x28, #\n4*2*8      // x0 = z again
        ldp             x28, x30, [sp], #16
        AARCH64_VALIDATE_LINK_REGISTER
        movrel          x4,  X(ff_cos_\n)       // table pointer argument of fft_pass_neon
        mov             x2,  #\n4>>1            // n argument of fft_pass_neon
        b               fft_pass_neon
endfunc
.endm

362         def_fft    32,    16,     8
363         def_fft    64,    32,    16
364         def_fft   128,    64,    32
365         def_fft   256,   128,    64
366         def_fft   512,   256,   128
367         def_fft  1024,   512,   256
368         def_fft  2048,  1024,   512
369         def_fft  4096,  2048,  1024
370         def_fft  8192,  4096,  2048
371         def_fft 16384,  8192,  4096
372         def_fft 32768, 16384,  8192
373         def_fft 65536, 32768, 16384
374 
// ff_fft_calc_neon: exported entry point; dispatches to the transform for
// the configured size.
//
// In:   x0 = pointer to a context whose first 32-bit word is nbits
//            (transform size is 1 << nbits)
//       x1 = z, the complex float data, transformed in place
//       NOTE(review): presumably called from C as (FFTContext *s, FFTComplex *z)
//       - confirm against the C prototype.
//
// Loads the constants every kernel in this file relies on and then jumps
// (tail call) to fft_tab_neon[nbits - 2]:
//       x0  = z              x7  = -8            x14 = &ff_cos_16
//       v28 = mppm           v29 = pmmp
//       v30 = trans4_float   v31 = trans8_float
// nbits is not range-checked: the table has 15 entries, so 2 <= nbits <= 16
// is required.
// Clobbers x0, x2, x3, x7, x10-x14, v28-v31 here, plus whatever the selected
// kernel clobbers.
function ff_fft_calc_neon, export=1
        prfm            pldl1keep, [x1]         // start fetching z
        movrel          x10, trans4_float
        ldr             w2,  [x0]               // nbits
        movrel          x11, trans8_float
        sub             w2,  w2,  #2            // table index; fft_tab_neon starts at fft4
        movrel          x3,  fft_tab_neon
        ld1             {v30.16b}, [x10]
        mov             x7,  #-8                // post-index step used by fft_pass_neon
        movrel          x12, pmmp
        ldr             x3,  [x3, x2, lsl #3]   // address of the selected fftN_neon
        movrel          x13, mppm
        movrel          x14, X(ff_cos_16)       // read by fft16_neon
        ld1             {v31.16b}, [x11]
        mov             x0,  x1                 // kernels take z in x0
        ld1             {v29.4s},  [x12]         // pmmp
        ld1             {v28.4s},  [x13]         // mppm
        br              x3                      // tail call; the kernel returns to our caller
endfunc

// ff_fft_permute_neon: exported; reorders z according to the context's
// revtab, using the context's tmp_buf as scratch.
//
// In:   x0 = pointer to a context with
//              +0   32-bit nbits   (n = 1 << nbits elements)
//              +8   pointer to revtab, read as 16-bit indices (two per 32-bit load)
//              +16  pointer to tmp_buf, room for n 8-byte elements
//       x1 = z, n elements of 8 bytes each, permuted in place
//
// Pass 1 scatters z[i] to tmp_buf[revtab[i]], two elements per iteration.
// Pass 2 copies tmp_buf back over z, four elements (32 bytes) per iteration,
// so n is assumed to be a multiple of 4 (nbits >= 2).
// Clobbers x0-x6, v0, v1 and the flags; x1 ends at z + n elements.
function ff_fft_permute_neon, export=1
        mov             x6,  #1
        ldr             w2,  [x0]       // nbits
        ldr             x3,  [x0, #16]  // tmp_buf
        ldr             x0,  [x0, #8]   // revtab
        lsl             x6,  x6, x2     // n = 1 << nbits, counter of pass 1
        mov             x2,  x6         // second copy of n for pass 2
1:
        ld1             {v0.2s,v1.2s}, [x1], #16        // z[i], z[i+1]
        ldr             w4,  [x0], #4                   // two packed 16-bit indices
        uxth            w5,  w4                         // low halfword:  index for z[i]
        lsr             w4,  w4,  #16                   // high halfword: index for z[i+1]
        add             x5,  x3,  x5,  lsl #3           // &tmp_buf[index], 8 bytes per element
        add             x4,  x3,  x4,  lsl #3
        st1             {v0.2s}, [x5]
        st1             {v1.2s}, [x4]
        subs            x6,  x6, #2
        b.gt            1b

        sub             x1,  x1,  x2,  lsl #3   // rewind x1 to z
1:
        ld1             {v0.4s,v1.4s}, [x3], #32
        st1             {v0.4s,v1.4s}, [x1], #32
        subs            x2,  x2,  #4
        b.gt            1b

        ret
endfunc

// Dispatch table for ff_fft_calc_neon: entry k is the transform of size
// 1 << (k + 2), i.e. it is indexed with nbits - 2 and covers nbits 2..16.
// The entries are absolute addresses, hence relocate=1 on the const macro
// (NOTE(review): exact section placement is decided in asm.S - confirm there).
const   fft_tab_neon, relocate=1
        .quad fft4_neon
        .quad fft8_neon
        .quad fft16_neon
        .quad fft32_neon
        .quad fft64_neon
        .quad fft128_neon
        .quad fft256_neon
        .quad fft512_neon
        .quad fft1024_neon
        .quad fft2048_neon
        .quad fft4096_neon
        .quad fft8192_neon
        .quad fft16384_neon
        .quad fft32768_neon
        .quad fft65536_neon
endconst

// Sign pattern plus, minus, minus, plus.  Loaded into v29 by ff_fft_calc_neon
// and multiplied into the re/im-swapped (rev64) copies in fft16_neon and
// fft_pass_neon.
const   pmmp, align=4
        .float          +1.0, -1.0, -1.0, +1.0
endconst

// 1/sqrt(2) with the sign pattern minus, plus, plus, minus.  Loaded into v28
// by ff_fft_calc_neon; fft8_neon and fft16_neon read only the low two lanes
// (v28.2s) and lane 1 (v28.s[1]).
const   mppm, align=4
        .float          -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
endconst
449