1 /*
2  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
3  * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
4  *
5  * This file is part of FFmpeg.
6  *
7  * FFmpeg is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * FFmpeg is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with FFmpeg; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
22 #include "config.h"
23 #include "libavutil/aarch64/asm.S"
24 
// Convert packed 32-bit float samples to signed 16-bit samples.
//
// Register use, as read from the code below:
//   x0     destination pointer; 16 bytes (8 x s16) are written per store
//   x1     source pointer; 16 bytes (4 x float) are read per load
//   w2     number of samples to convert
//   w12    counter for the 16-samples-per-iteration main loop
//   v0-v7  scratch vectors
//
// Every float goes through fcvtzs with 31 fractional bits (float to Q31
// fixed point, saturating) and is then narrowed to 16 bits by a rounding,
// saturating right shift by 16 (sqrshrn/sqrshrn2).
//
// The control flow only accounts for counts that are a positive multiple
// of 8: the first 8 samples are always loaded, the main loop consumes 16
// per iteration, and the tail handles either 8 or 16 remaining samples.
//
// The count is handled as a 32-bit value (w2/w12), matching the way
// swri_oldapi_conv_fltp_to_s16_nch_neon treats it before branching here.
// AAPCS64 leaves the upper 32 bits of a register that carries a 32-bit
// argument unspecified, so they must not take part in the loop arithmetic.
25 function swri_oldapi_conv_flt_to_s16_neon, export=1
26 oldapi_conv_flt_to_s16_neon:                        // plain label: the nch routine in this file branches here
27         subs            w2,  w2,  #8                // w2 = samples left after the first 8; Z set if count == 8
28         ld1             {v0.4s}, [x1],  #16
29         fcvtzs          v4.4s,  v0.4s,  #31         // float -> Q31
30         ld1             {v1.4s}, [x1],  #16
31         fcvtzs          v5.4s,  v1.4s,  #31
32         b.eq            3f                          // exactly 8 samples (flags still from the subs above)
33         ands            w12, w2,  #~15              // w12 = remaining samples rounded down to a multiple of 16
34         b.eq            2f                          // fewer than 16 remaining: 16-sample tail
35 1:      subs            w12, w12, #16               // main loop; on entry v4/v5 hold 8 converted, not yet narrowed samples
36         sqrshrn         v4.4h,  v4.4s,  #16         // Q31 -> s16, rounding and saturating
37         ld1             {v2.4s}, [x1],  #16
38         fcvtzs          v6.4s,  v2.4s,  #31
39         sqrshrn2        v4.8h,  v5.4s,  #16
40         ld1             {v3.4s}, [x1],  #16
41         fcvtzs          v7.4s,  v3.4s,  #31
42         sqrshrn         v6.4h,  v6.4s,  #16
43         st1             {v4.8h}, [x0],  #16
44         sqrshrn2        v6.8h,  v7.4s,  #16
45         ld1             {v0.4s}, [x1],  #16         // preload the 8 samples for the next step
46         fcvtzs          v4.4s,  v0.4s,  #31
47         ld1             {v1.4s}, [x1],  #16
48         fcvtzs          v5.4s,  v1.4s,  #31
49         st1             {v6.8h}, [x0],  #16
50         b.ne            1b                          // flags from the subs at the top of the loop
51         ands            w2,  w2,  #15               // samples beyond the last multiple of 16
52         b.eq            3f                          // none: only the preloaded 8 remain
53 2:      ld1             {v2.4s}, [x1],  #16         // tail: the 8 pending samples plus 8 more
54         sqrshrn         v4.4h,  v4.4s,  #16
55         fcvtzs          v6.4s,  v2.4s,  #31
56         ld1             {v3.4s}, [x1],  #16
57         sqrshrn2        v4.8h,  v5.4s,  #16
58         fcvtzs          v7.4s,  v3.4s,  #31
59         sqrshrn         v6.4h,  v6.4s,  #16
60         st1             {v4.8h}, [x0],  #16
61         sqrshrn2        v6.8h,  v7.4s,  #16
62         st1             {v6.8h}, [x0]
63         ret
64 3:      sqrshrn         v4.4h,  v4.4s,  #16         // tail: only the 8 pending samples
65         sqrshrn2        v4.8h,  v5.4s,  #16
66         st1             {v4.8h}, [x0]
67         ret
68 endfunc
69 
// Convert two separate float channels into one buffer of paired 16-bit
// samples.
//
// Register use, as read from the code below:
//   x0       destination pointer; 16 or 32 bytes are written per store
//   x1       address of two consecutive channel pointers (loaded into x4, x5)
//   w2       number of samples per channel
//   x4, x5   read cursors for the first and second channel
//   w12      counter for the 16-samples-per-iteration main loop
//   v0-v7, v16-v23  scratch vectors
// w3 is not read by this routine.
//
// Each float is converted with fcvtzs using 31 fractional bits (float to
// Q31, saturating). "sri Vd.4s, Vn.4s, #16" then builds one 32-bit lane
// per sample pair: the low 16 bits receive the top half of the first
// channel's Q31 value and the high 16 bits keep the top half of the second
// channel's value. Unlike swri_oldapi_conv_flt_to_s16_neon, no rounding
// is applied when the low 16 bits are dropped.
//
// The control flow only accounts for counts that are a positive multiple
// of 8: 8 samples per channel are always loaded first, the main loop
// consumes 16 per channel per iteration, and the tail handles 8 or 16.
//
// The count is handled as a 32-bit value (w2/w12), matching the way
// swri_oldapi_conv_fltp_to_s16_nch_neon treats it before branching here.
// AAPCS64 leaves the upper 32 bits of a register that carries a 32-bit
// argument unspecified, so they must not take part in the loop arithmetic.
70 function swri_oldapi_conv_fltp_to_s16_2ch_neon, export=1
71 oldapi_conv_fltp_to_s16_2ch_neon:                   // plain label: the nch routine in this file branches here
72         ldp             x4,  x5,  [x1]              // x4 = first channel, x5 = second channel
73         subs            w2,  w2,  #8                // w2 = samples left after the first 8; Z set if count == 8
74         ld1             {v0.4s},  [x4], #16
75         fcvtzs          v4.4s,  v0.4s,  #31
76         ld1             {v1.4s},  [x4], #16
77         fcvtzs          v5.4s,  v1.4s,  #31
78         ld1             {v2.4s},  [x5], #16
79         fcvtzs          v6.4s,  v2.4s,  #31
80         ld1             {v3.4s},  [x5], #16
81         fcvtzs          v7.4s,  v3.4s,  #31
82         b.eq            3f                          // exactly 8 samples (flags still from the subs above)
83         ands            w12, w2,  #~15              // w12 = remaining samples rounded down to a multiple of 16
84         b.eq            2f                          // fewer than 16 remaining: 16-sample tail
85 1:      subs            w12, w12, #16               // main loop; on entry v4-v7 hold 8 converted samples per channel
86         ld1             {v16.4s}, [x4], #16
87         fcvtzs          v20.4s, v16.4s, #31
88         sri             v6.4s,  v4.4s,  #16         // pair samples 0-3 of both channels
89         ld1             {v17.4s}, [x4], #16
90         fcvtzs          v21.4s, v17.4s, #31
91         ld1             {v18.4s}, [x5], #16
92         fcvtzs          v22.4s, v18.4s, #31
93         ld1             {v19.4s}, [x5], #16
94         sri             v7.4s,  v5.4s,  #16         // pair samples 4-7 of both channels
95         st1             {v6.4s},  [x0], #16
96         fcvtzs          v23.4s, v19.4s, #31
97         st1             {v7.4s},  [x0], #16
98         sri             v22.4s, v20.4s, #16
99         ld1             {v0.4s},  [x4], #16         // preload the 8 samples per channel for the next step
100         sri             v23.4s, v21.4s, #16
101         st1             {v22.4s}, [x0], #16
102         fcvtzs          v4.4s,  v0.4s,  #31
103         ld1             {v1.4s},  [x4], #16
104         fcvtzs          v5.4s,  v1.4s,  #31
105         ld1             {v2.4s},  [x5], #16
106         fcvtzs          v6.4s,  v2.4s,  #31
107         ld1             {v3.4s},  [x5], #16
108         fcvtzs          v7.4s,  v3.4s,  #31
109         st1             {v23.4s}, [x0], #16
110         b.ne            1b                          // flags from the subs at the top of the loop
111         ands            w2,  w2,  #15               // samples beyond the last multiple of 16
112         b.eq            3f                          // none: only the preloaded 8 remain
113 2:      sri             v6.4s,  v4.4s,  #16         // tail: the 8 pending samples plus 8 more per channel
114         ld1             {v0.4s},  [x4], #16
115         fcvtzs          v0.4s,  v0.4s,  #31
116         ld1             {v1.4s},  [x4], #16
117         fcvtzs          v1.4s,  v1.4s,  #31
118         ld1             {v2.4s},  [x5], #16
119         fcvtzs          v2.4s,  v2.4s,  #31
120         sri             v7.4s,  v5.4s,  #16
121         ld1             {v3.4s},  [x5], #16
122         fcvtzs          v3.4s,  v3.4s,  #31
123         sri             v2.4s,  v0.4s,  #16
124         st1             {v6.4s,v7.4s},  [x0], #32
125         sri             v3.4s,  v1.4s,  #16
126         st1             {v2.4s,v3.4s},  [x0], #32
127         ret
128 3:      sri             v6.4s,  v4.4s,  #16         // tail: only the 8 pending samples per channel
129         sri             v7.4s,  v5.4s,  #16
130         st1             {v6.4s,v7.4s},  [x0]
131         ret
132 endfunc
133 
// Convert an arbitrary number of separate float channels into one buffer
// of 16-bit samples, writing each channel group at a fixed byte stride.
//
// Register use, as read from the code below:
//   x0   destination base; advanced by 8 after a 4-channel group and by 4
//        after a 2-channel group
//   x1   address of an array of channel pointers, consumed front to back
//   w2   number of samples per channel
//   w3   number of channels; decremented as groups are finished
//   x12  channels * 2, the byte step applied to the output cursor after
//        every per-frame store
//   x4-x7  read cursors of the channels in the current group
//   x8 / x5  output cursor for the current group
//   w9   samples left in the current group
//   v0-v7, v16-v19  scratch vectors
//
// Dispatch: exactly 2 channels branches to oldapi_conv_fltp_to_s16_2ch_neon,
// fewer than 2 loads the first channel pointer and branches to
// oldapi_conv_flt_to_s16_neon (both receive x0 and x2 unchanged). Otherwise
// channels are handled in groups of 4 while at least 4 remain, then one
// group of 2 if at least 2 remain, then a final single channel.
//
// Samples are converted with fcvtzs using 31 fractional bits (float to
// Q31, saturating); the upper 16 bits of each Q31 value are what gets
// stored, without rounding.
//
// Every group loop works in steps of 8 or 16 samples and terminates on an
// exact zero of its counter, so the code only accounts for sample counts
// that are a positive multiple of 8.
//
// The stride is computed with 32-bit registers so that only the low 32
// bits of the channel count are used, like every other read of w3 here;
// writing w12 zero-extends into x12.
134 function swri_oldapi_conv_fltp_to_s16_nch_neon, export=1
135         cmp             w3,  #2
136         b.eq            oldapi_conv_fltp_to_s16_2ch_neon    // exactly 2 channels
137         b.gt            1f
138         ldr             x1,  [x1]                   // fewer than 2 channels: x1 = first channel pointer
139         b               oldapi_conv_flt_to_s16_neon
140 1:
141         cmp             w3,  #4
142         lsl             w12, w3,  #1                // x12 = channels * 2 (flags untouched)
143         b.lt            4f                          // fewer than 4 channels: skip the 4-channel groups
144 
145 5:      // group of 4 channels, 8 samples per loop iteration
146         ldp             x4, x5, [x1], #16
147         ldp             x6, x7, [x1], #16
148         mov             w9,  w2
149         mov             x8,  x0
150         ld1             {v4.4s},        [x4], #16
151         fcvtzs          v4.4s,  v4.4s,  #31
152         ld1             {v5.4s},        [x5], #16
153         fcvtzs          v5.4s,  v5.4s,  #31
154         ld1             {v6.4s},        [x6], #16
155         fcvtzs          v6.4s,  v6.4s,  #31
156         ld1             {v7.4s},        [x7], #16
157         fcvtzs          v7.4s,  v7.4s,  #31
158 6:
159         subs            w9,  w9,  #8                // on entry v4-v7 hold 4 converted samples per channel
160         ld1             {v0.4s},        [x4], #16
161         fcvtzs          v0.4s,  v0.4s,  #31
162         sri             v5.4s,  v4.4s,  #16         // 32-bit lanes: low half from x4 channel, high half from x5 channel
163         ld1             {v1.4s},        [x5], #16
164         fcvtzs          v1.4s,  v1.4s,  #31
165         sri             v7.4s,  v6.4s,  #16         // same for the x6/x7 channel pair
166         ld1             {v2.4s},        [x6], #16
167         fcvtzs          v2.4s,  v2.4s,  #31
168         zip1            v16.4s, v5.4s,  v7.4s       // each 64-bit half = all 4 channels of one frame
169         ld1             {v3.4s},        [x7], #16
170         fcvtzs          v3.4s,  v3.4s,  #31
171         zip2            v17.4s, v5.4s,  v7.4s
172         st1             {v16.d}[0],     [x8], x12
173         sri             v1.4s,  v0.4s,  #16
174         st1             {v16.d}[1],     [x8], x12
175         sri             v3.4s,  v2.4s,  #16
176         st1             {v17.d}[0],     [x8], x12
177         zip1            v18.4s, v1.4s,  v3.4s
178         st1             {v17.d}[1],     [x8], x12
179         zip2            v19.4s, v1.4s,  v3.4s
180         b.eq            7f                          // flags from the subs at 6: no more input for this group
181         ld1             {v4.4s},        [x4], #16   // preload the next 4 samples per channel
182         fcvtzs          v4.4s,  v4.4s,  #31
183         st1             {v18.d}[0],     [x8], x12
184         ld1             {v5.4s},        [x5], #16
185         fcvtzs          v5.4s,  v5.4s,  #31
186         st1             {v18.d}[1],     [x8], x12
187         ld1             {v6.4s},        [x6], #16
188         fcvtzs          v6.4s,  v6.4s,  #31
189         st1             {v19.d}[0],     [x8], x12
190         ld1             {v7.4s},        [x7], #16
191         fcvtzs          v7.4s,  v7.4s,  #31
192         st1             {v19.d}[1],     [x8], x12
193         b               6b
194 7:
195         st1             {v18.d}[0],     [x8], x12   // flush the last 4 frames of this group
196         st1             {v18.d}[1],     [x8], x12
197         st1             {v19.d}[0],     [x8], x12
198         st1             {v19.d}[1],     [x8], x12
199         subs            w3,  w3,  #4
200         b.eq            9f                          // all channels done
201         cmp             w3,  #4
202         add             x0,  x0,  #8                // step the output base past the 4 channels just written
203         b.ge            5b
204 
205 4:      // group of 2 channels
206         cmp             w3,  #2
207         b.lt            4f                          // only one channel left
208         ldp             x4,  x5,  [x1], #16
209         mov             w9,  w2
210         mov             x8,  x0
211         tst             w9,  #8                     // is there an odd block of 8 samples?
212         ld1             {v4.4s},        [x4], #16
213         fcvtzs          v4.4s,  v4.4s,  #31
214         ld1             {v5.4s},        [x5], #16
215         fcvtzs          v5.4s,  v5.4s,  #31
216         ld1             {v6.4s},        [x4], #16
217         fcvtzs          v6.4s,  v6.4s,  #31
218         ld1             {v7.4s},        [x5], #16
219         fcvtzs          v7.4s,  v7.4s,  #31
220         b.eq            6f                          // flags from the tst: bit 3 clear, go to the 16-sample loop
221         subs            w9,  w9,  #8
222         b.eq            7f                          // the count was exactly 8
223         sri             v5.4s,  v4.4s,  #16         // store the odd 8 frames while preloading the next 8
224         ld1             {v4.4s},        [x4], #16
225         fcvtzs          v4.4s,  v4.4s,  #31
226         st1             {v5.s}[0],      [x8], x12
227         sri             v7.4s,  v6.4s,  #16
228         st1             {v5.s}[1],      [x8], x12
229         ld1             {v6.4s},        [x4], #16
230         fcvtzs          v6.4s,  v6.4s,  #31
231         st1             {v5.s}[2],      [x8], x12
232         st1             {v5.s}[3],      [x8], x12
233         st1             {v7.s}[0],      [x8], x12
234         st1             {v7.s}[1],      [x8], x12
235         ld1             {v5.4s},        [x5], #16
236         fcvtzs          v5.4s,  v5.4s,  #31
237         st1             {v7.s}[2],      [x8], x12
238         st1             {v7.s}[3],      [x8], x12
239         ld1             {v7.4s},        [x5], #16
240         fcvtzs          v7.4s,  v7.4s,  #31
241 6:
242         subs            w9,  w9,  #16               // 16 samples per iteration; v4-v7 hold 8 converted samples per channel
243         ld1             {v0.4s},        [x4], #16
244         sri             v5.4s,  v4.4s,  #16
245         fcvtzs          v0.4s,  v0.4s,  #31
246         ld1             {v1.4s},        [x5], #16
247         sri             v7.4s,  v6.4s,  #16
248         st1             {v5.s}[0],      [x8], x12
249         st1             {v5.s}[1],      [x8], x12
250         fcvtzs          v1.4s,  v1.4s,  #31
251         st1             {v5.s}[2],      [x8], x12
252         st1             {v5.s}[3],      [x8], x12
253         ld1             {v2.4s},        [x4], #16
254         st1             {v7.s}[0],      [x8], x12
255         fcvtzs          v2.4s,  v2.4s,  #31
256         st1             {v7.s}[1],      [x8], x12
257         ld1             {v3.4s},        [x5], #16
258         st1             {v7.s}[2],      [x8], x12
259         fcvtzs          v3.4s,  v3.4s,  #31
260         st1             {v7.s}[3],      [x8], x12
261         sri             v1.4s,  v0.4s,  #16
262         sri             v3.4s,  v2.4s,  #16
263         b.eq            6f                          // flags from the subs: input exhausted, flush v1/v3 below
264         ld1             {v4.4s},        [x4], #16   // preload the next 8 samples per channel
265         st1             {v1.s}[0],      [x8], x12
266         fcvtzs          v4.4s,  v4.4s,  #31
267         st1             {v1.s}[1],      [x8], x12
268         ld1             {v5.4s},        [x5], #16
269         st1             {v1.s}[2],      [x8], x12
270         fcvtzs          v5.4s,  v5.4s,  #31
271         st1             {v1.s}[3],      [x8], x12
272         ld1             {v6.4s},        [x4], #16
273         st1             {v3.s}[0],      [x8], x12
274         fcvtzs          v6.4s,  v6.4s,  #31
275         st1             {v3.s}[1],      [x8], x12
276         ld1             {v7.4s},        [x5], #16
277         st1             {v3.s}[2],      [x8], x12
278         fcvtzs          v7.4s,  v7.4s,  #31
279         st1             {v3.s}[3],      [x8], x12
280         b.gt            6b                          // still the flags from the subs at the loop top
281 6:
282         st1             {v1.s}[0],      [x8], x12   // flush the last 8 frames of the 16-sample loop
283         st1             {v1.s}[1],      [x8], x12
284         st1             {v1.s}[2],      [x8], x12
285         st1             {v1.s}[3],      [x8], x12
286         st1             {v3.s}[0],      [x8], x12
287         st1             {v3.s}[1],      [x8], x12
288         st1             {v3.s}[2],      [x8], x12
289         st1             {v3.s}[3],      [x8], x12
290         b               8f
291 7:
292         sri             v5.4s,  v4.4s,  #16         // count == 8: pair and store the only 8 frames
293         sri             v7.4s,  v6.4s,  #16
294         st1             {v5.s}[0],      [x8], x12
295         st1             {v5.s}[1],      [x8], x12
296         st1             {v5.s}[2],      [x8], x12
297         st1             {v5.s}[3],      [x8], x12
298         st1             {v7.s}[0],      [x8], x12
299         st1             {v7.s}[1],      [x8], x12
300         st1             {v7.s}[2],      [x8], x12
301         st1             {v7.s}[3],      [x8], x12
302 8:
303         subs            w3,  w3,  #2
304         add             x0,  x0,  #4                // step the output base past the 2 channels just written
305         b.eq            9f                          // flags from the subs: all channels done
306 
307 4:      // last single channel
308         ldr             x4,  [x1]
309         tst             w2,  #8                     // is there an odd block of 8 samples?
310         mov             w9,  w2
311         mov             x5,  x0
312         ld1             {v0.4s},        [x4], #16
313         fcvtzs          v0.4s,  v0.4s,  #31
314         ld1             {v1.4s},        [x4], #16
315         fcvtzs          v1.4s,  v1.4s,  #31
316         b.ne            8f                          // flags from the tst: handle the odd 8 first
317 6:
318         subs            w9,  w9,  #16               // 16 samples per iteration; v0/v1 hold 8 converted samples
319         ld1             {v2.4s},        [x4], #16
320         fcvtzs          v2.4s,  v2.4s,  #31
321         ld1             {v3.4s},        [x4], #16
322         fcvtzs          v3.4s,  v3.4s,  #31
323         st1             {v0.h}[1],      [x5], x12   // odd half-word lanes = upper 16 bits of each Q31 value
324         st1             {v0.h}[3],      [x5], x12
325         st1             {v0.h}[5],      [x5], x12
326         st1             {v0.h}[7],      [x5], x12
327         st1             {v1.h}[1],      [x5], x12
328         st1             {v1.h}[3],      [x5], x12
329         st1             {v1.h}[5],      [x5], x12
330         st1             {v1.h}[7],      [x5], x12
331         b.eq            7f                          // input exhausted: skip the preload
332         ld1             {v0.4s},        [x4], #16
333         fcvtzs          v0.4s,  v0.4s,  #31
334         ld1             {v1.4s},        [x4], #16
335         fcvtzs          v1.4s,  v1.4s,  #31
336 7:
337         st1             {v2.h}[1],      [x5], x12
338         st1             {v2.h}[3],      [x5], x12
339         st1             {v2.h}[5],      [x5], x12
340         st1             {v2.h}[7],      [x5], x12
341         st1             {v3.h}[1],      [x5], x12
342         st1             {v3.h}[3],      [x5], x12
343         st1             {v3.h}[5],      [x5], x12
344         st1             {v3.h}[7],      [x5], x12
345         b.gt            6b                          // flags from the subs at the loop top
346         ret
347 8:
348         subs            w9,  w9,  #8                // odd block of 8 samples
349         st1             {v0.h}[1],      [x5], x12
350         st1             {v0.h}[3],      [x5], x12
351         st1             {v0.h}[5],      [x5], x12
352         st1             {v0.h}[7],      [x5], x12
353         st1             {v1.h}[1],      [x5], x12
354         st1             {v1.h}[3],      [x5], x12
355         st1             {v1.h}[5],      [x5], x12
356         st1             {v1.h}[7],      [x5], x12
357         b.eq            9f                          // the count was exactly 8
358         ld1             {v0.4s},        [x4], #16   // preload 8 samples and join the 16-sample loop
359         fcvtzs          v0.4s,  v0.4s,  #31
360         ld1             {v1.4s},        [x4], #16
361         fcvtzs          v1.4s,  v1.4s,  #31
362         b               6b
363 9:
364         ret
365 endfunc
366