/*
 * Copyright (c) 2013 RISC OS Open Ltd <bavison@riscosopen.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
20cabdff1aSopenharmony_ci
21cabdff1aSopenharmony_ci#include "config.h"
22cabdff1aSopenharmony_ci#include "libavutil/arm/asm.S"
23cabdff1aSopenharmony_ci
/**
 * ARM VFP optimised int32 to float conversion with one scale factor per
 * block of 8 values.
 *
 * Every group of 8 consecutive int32 values read from src is converted to
 * float, multiplied by the next float read from mul, and stored to dst.
 *
 * Assume len is a multiple of 8, destination buffer is at least 4 bytes aligned
 * (16 bytes alignment is best for BCM2835), little-endian.
 * A len of 0 returns immediately without reading or writing any buffer.
 *
 * Register usage:
 *   a1  incoming context pointer (never read); reloaded with len from the
 *       stack, then holds the caller's FPSCR (pipelined path) or the number
 *       of elements still to convert (short path)
 *   a2  dst, post-incremented by every store
 *   a3  src, post-incremented by every load
 *   a4  mul, post-incremented by every load
 *   lr  len - 24, counter of the software-pipelined loop (pipelined path);
 *       scratch for the new FPSCR value (short path)
 *   ip  new FPSCR value (pipelined path) or caller's FPSCR (short path)
 * s16-s31 are pushed/popped on the pipelined path; the short path only uses
 * s0 and s8-s15.
 *
 * While FPSCR selects short vectors of length 8, each vcvt below still
 * handles a single register, whereas one vmul scales a whole 8-register
 * block by the scalar held in s0-s3.
 */
@ void ff_int32_to_float_fmul_array8_vfp(FmtConvertContext *c, float *dst, const int32_t *src, const float *mul, int len)
function ff_int32_to_float_fmul_array8_vfp, export=1
        push    {lr}
        ldr     a1, [sp, #4]               @ a1 = len (stack argument, just above the saved lr)
        subs    lr, a1, #3*8
        bcc     50f                        @ too short to pipeline
        @ Now need to find (len / 8) % 3. The approximation
        @ x / 24 = (x * 0xAB) >> 12
        @ is good for x < 4096, which is true for both AC3 and DCA.
        mov     a1, #0xAB
        ldr     ip, =0x03070000            @ RunFast mode, short vectors of length 8, stride 1
        mul     a1, lr, a1
        vpush   {s16-s31}
        mov     a1, a1, lsr #12            @ a1 = (len - 24) / 24
        add     a1, a1, a1, lsl #1         @ a1 *= 3
        rsb     a1, a1, lr, lsr #3         @ a1 = ((len - 24) / 8) % 3
        cmp     a1, #1
        fmrx    a1, FPSCR                  @ a1 = caller's FPSCR; the cmp flags stay live
        fmxr    FPSCR, ip
        beq     11f
        blo     10f
        @ Array is (2 + multiple of 3) x 8 floats long
        @ drop through...
        vldmia          a3!, {s16-s23}
        vldmia          a4!, {s2,s3}
        vldmia          a3!, {s24-s31}
        vcvt.f32.s32    s16, s16
        vcvt.f32.s32    s17, s17
        vcvt.f32.s32    s18, s18
        vcvt.f32.s32    s19, s19
        vcvt.f32.s32    s20, s20
        vcvt.f32.s32    s21, s21
        vcvt.f32.s32    s22, s22
        vcvt.f32.s32    s23, s23
        vmul.f32        s16, s16, s2
        @ drop through...
3:      @ Stage A: load into s8-s15, convert/scale s24-s31, store s16-s23
        vldmia          a3!, {s8-s15}
        vldmia          a4!, {s1}
        vcvt.f32.s32    s24, s24
        vcvt.f32.s32    s25, s25
        vcvt.f32.s32    s26, s26
        vcvt.f32.s32    s27, s27
        vcvt.f32.s32    s28, s28
        vcvt.f32.s32    s29, s29
        vcvt.f32.s32    s30, s30
        vcvt.f32.s32    s31, s31
        vmul.f32        s24, s24, s3
        vstmia          a2!, {s16-s19}
        vstmia          a2!, {s20-s23}
2:      @ Stage B: load into s16-s23, convert/scale s8-s15, store s24-s31
        vldmia          a3!, {s16-s23}
        vldmia          a4!, {s2}
        vcvt.f32.s32    s8, s8
        vcvt.f32.s32    s9, s9
        vcvt.f32.s32    s10, s10
        vcvt.f32.s32    s11, s11
        vcvt.f32.s32    s12, s12
        vcvt.f32.s32    s13, s13
        vcvt.f32.s32    s14, s14
        vcvt.f32.s32    s15, s15
        vmul.f32        s8, s8, s1
        vstmia          a2!, {s24-s27}
        vstmia          a2!, {s28-s31}
1:      @ Stage C: load into s24-s31, convert/scale s16-s23, store s8-s15
        vldmia          a3!, {s24-s31}
        vldmia          a4!, {s3}
        vcvt.f32.s32    s16, s16
        vcvt.f32.s32    s17, s17
        vcvt.f32.s32    s18, s18
        vcvt.f32.s32    s19, s19
        vcvt.f32.s32    s20, s20
        vcvt.f32.s32    s21, s21
        vcvt.f32.s32    s22, s22
        vcvt.f32.s32    s23, s23
        vmul.f32        s16, s16, s2
        vstmia          a2!, {s8-s11}
        vstmia          a2!, {s12-s15}

        subs            lr, lr, #8*3
        bpl             3b

        @ Drain the pipeline: s16-s23 is finished, s24-s31 is loaded but
        @ not yet converted.
        vcvt.f32.s32    s24, s24
        vcvt.f32.s32    s25, s25
        vcvt.f32.s32    s26, s26
        vcvt.f32.s32    s27, s27
        vcvt.f32.s32    s28, s28
        vcvt.f32.s32    s29, s29
        vcvt.f32.s32    s30, s30
        vcvt.f32.s32    s31, s31
        vmul.f32        s24, s24, s3
        vstmia          a2!, {s16-s19}
        vstmia          a2!, {s20-s23}
        vstmia          a2!, {s24-s27}
        vstmia          a2!, {s28-s31}

        fmxr    FPSCR, a1                  @ restore the caller's FPSCR
        vpop    {s16-s31}
        pop     {pc}

10:     @ Array is (multiple of 3) x 8 floats long
        @ Prime the pipeline so that it can be entered at stage C.
        vldmia          a3!, {s8-s15}
        vldmia          a4!, {s1,s2}
        vldmia          a3!, {s16-s23}
        vcvt.f32.s32    s8, s8
        vcvt.f32.s32    s9, s9
        vcvt.f32.s32    s10, s10
        vcvt.f32.s32    s11, s11
        vcvt.f32.s32    s12, s12
        vcvt.f32.s32    s13, s13
        vcvt.f32.s32    s14, s14
        vcvt.f32.s32    s15, s15
        vmul.f32        s8, s8, s1
        b               1b

11:     @ Array is (1 + multiple of 3) x 8 floats long
        @ Prime the pipeline so that it can be entered at stage B.
        vldmia          a3!, {s24-s31}
        vldmia          a4!, {s3}
        vldmia          a3!, {s8-s15}
        vldmia          a4!, {s1}
        vcvt.f32.s32    s24, s24
        vcvt.f32.s32    s25, s25
        vcvt.f32.s32    s26, s26
        vcvt.f32.s32    s27, s27
        vcvt.f32.s32    s28, s28
        vcvt.f32.s32    s29, s29
        vcvt.f32.s32    s30, s30
        vcvt.f32.s32    s31, s31
        vmul.f32        s24, s24, s3
        b               2b

50:     @ Fewer than 3 blocks: plain loop, one block of 8 per iteration.
        @ The loop below tests its counter only after a block has been
        @ processed, so an empty array has to be rejected up front.
        cmp     a1, #0
        beq     52f                     @ len == 0: nothing to convert
        ldr     lr, =0x03070000         @ RunFast mode, short vectors of length 8, stride 1
        fmrx    ip, FPSCR               @ ip = caller's FPSCR
        fmxr    FPSCR, lr
51:
        vldmia          a3!, {s8-s15}
        vldmia          a4!, {s0}
        vcvt.f32.s32    s8, s8
        vcvt.f32.s32    s9, s9
        vcvt.f32.s32    s10, s10
        vcvt.f32.s32    s11, s11
        vcvt.f32.s32    s12, s12
        vcvt.f32.s32    s13, s13
        vcvt.f32.s32    s14, s14
        vcvt.f32.s32    s15, s15
        vmul.f32        s8, s8, s0
        subs            a1, a1, #8
        vstmia          a2!, {s8-s11}
        vstmia          a2!, {s12-s15}
        bne             51b

        fmxr    FPSCR, ip               @ restore the caller's FPSCR
52:
        pop     {pc}
endfunc
184cabdff1aSopenharmony_ci
/**
 * ARM VFP optimised int32 to float conversion.
 *
 * Converts len int32 values from src to float, multiplies each by the single
 * factor mul and stores the results to dst, 8 values per loop iteration.
 *
 * Assume len is a multiple of 8, destination buffer is at least 4 bytes aligned
 * (16 bytes alignment is best for BCM2835), little-endian.
 * A len of 0 returns immediately without reading or writing any buffer.
 * TODO: could be further optimised by unrolling and interleaving, as above
 *
 * Register usage:
 *   a1   dst, post-incremented by every store
 *   a2   src, post-incremented by every load
 *   s0   mul
 *   len  elements still to convert
 *   tmp  scratch for the new FPSCR value
 *   ip   caller's FPSCR
 * The VFP/NOVFP prefixed lines are alternatives chosen by macros defined
 * outside this file; the NOVFP variant takes mul from a3 and len from a4,
 * the VFP variant expects mul already in s0 and len in a3.
 */
@ void ff_int32_to_float_fmul_scalar_vfp(float *dst, const int32_t *src, float mul, int len)
function ff_int32_to_float_fmul_scalar_vfp, export=1
VFP     tmp     .req    a4
VFP     len     .req    a3
NOVFP   tmp     .req    a3
NOVFP   len     .req    a4
NOVFP   vmov    s0, a3
        @ The loop below tests its counter only after a block has been
        @ processed, so an empty array has to be rejected up front.
        cmp     len, #0
        beq     2f                         @ len == 0: nothing to convert
        ldr     tmp, =0x03070000           @ RunFast mode, short vectors of length 8, stride 1
        fmrx    ip, FPSCR                  @ ip = caller's FPSCR
        fmxr    FPSCR, tmp
1:
        vldmia          a2!, {s8-s15}
        vcvt.f32.s32    s8, s8
        vcvt.f32.s32    s9, s9
        vcvt.f32.s32    s10, s10
        vcvt.f32.s32    s11, s11
        vcvt.f32.s32    s12, s12
        vcvt.f32.s32    s13, s13
        vcvt.f32.s32    s14, s14
        vcvt.f32.s32    s15, s15
        vmul.f32        s8, s8, s0         @ short-vector multiply: scales s8-s15 by s0
        subs            len, len, #8
        vstmia          a1!, {s8-s11}
        vstmia          a1!, {s12-s15}
        bne             1b

        fmxr    FPSCR, ip                  @ restore the caller's FPSCR
2:
        bx      lr
endfunc
        .unreq  tmp
        .unreq  len
222