/*
 * Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net>
 *
 * This file is part of FFmpeg
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/* NOTE(review): asm.S presumably supplies the function/endfunc macros used
 * by every routine in this file - confirm against that header. */
#include "config.h"
#include "asm.S"

/**
 * ARM VFP implementation of an element-wise float multiplication:
 *     dst[i] = src0[i] * src1[i]        for 0 <= i < len
 *
 * Assumes that len is a positive number and a multiple of 8.
 *
 * Register usage (arguments in the order of the prototype below):
 *   r0       dst,  advanced by every store
 *   r1       src0, advanced by every load
 *   r2       src1, advanced by every load
 *   r3       len,  used as the loop counter (reduced by 16 per pass)
 *   r12      working copy of FPSCR
 *   s0-s15   operands/products of the first group of 8 elements
 *   s16-s31  operands/products of the second group of 8 elements
 *            (preserved for the caller through the d8-d15 push/pop)
 *
 * The FPSCR vector length field is set to 4 for the duration of the routine,
 * so each vmul below multiplies four consecutive registers, and it is
 * cleared again before returning.
 */
@ void ff_vector_fmul_vfp(float *dst, const float *src0, const float *src1, int len)
function ff_vector_fmul_vfp, export=1
        vpush           {d8-d15}             /* s16-s31 are overwritten below */
        fmrx            r12, fpscr
        orr             r12, r12, #(3 << 16) /* set vector size to 4 */
        fmxr            fpscr, r12

        /* Prime the loop: fetch the first 8 elements of both sources and
         * start on the first four products. */
        vldmia          r1!, {s0-s3}
        vldmia          r2!, {s8-s11}
        vldmia          r1!, {s4-s7}
        vldmia          r2!, {s12-s15}
        vmul.f32        s8,  s0,  s8         /* s8-s11 = s0-s3 * s8-s11 */
1:
        /* Flags set here steer the rest of the pass (nothing below alters them):
         *   ge - a second group of 8 elements exists and is handled in this pass
         *   gt - further elements follow that second group, so loop again */
        subs            r3,  r3,  #16
        vmul.f32        s12, s4,  s12        /* s12-s15 = s4-s7 * s12-s15 */
        itttt           ge
        vldmiage        r1!, {s16-s19}
        vldmiage        r2!, {s24-s27}
        vldmiage        r1!, {s20-s23}
        vldmiage        r2!, {s28-s31}
        it              ge
        vmulge.f32      s24, s16, s24
        vstmia          r0!, {s8-s11}        /* first group of 8 products is done */
        vstmia          r0!, {s12-s15}
        it              ge
        vmulge.f32      s28, s20, s28
        itttt           gt
        vldmiagt        r1!, {s0-s3}         /* refill the first group for the next pass */
        vldmiagt        r2!, {s8-s11}
        vldmiagt        r1!, {s4-s7}
        vldmiagt        r2!, {s12-s15}
        ittt            ge
        vmulge.f32      s8,  s0,  s8         /* only stored if the gt loads above ran */
        vstmiage        r0!, {s24-s27}
        vstmiage        r0!, {s28-s31}
        bgt             1b

        bic             r12, r12, #(7 << 16) /* set vector size back to 1 */
        fmxr            fpscr, r12
        vpop            {d8-d15}
        bx              lr
endfunc

/**
 * ARM VFP implementation of 'vector_fmul_window_c' function
 * Assumes that len is a positive non-zero number.
 *
 * As implemented below, for 0 <= i < len and k = 2*len - 1 - i:
 *     dst[i] = src0[i] * win[k] - src1[len - 1 - i] * win[i]
 *     dst[k] = src0[i] * win[i] + src1[len - 1 - i] * win[k]
 * so dst and win are walked over 2*len floats, src0 and src1 over len floats.
 *
 * Register usage:
 *   DST0 (a1)  ascending output pointer, starts at dst
 *   SRC0 (a2)  ascending input pointer, starts at src0
 *   SRC1 (a3)  descending input pointer, starts at src1 + len
 *   WIN0 (a4)  ascending window pointer, starts at win
 *   LEN  (v1)  element count, fetched from the stack (fifth argument)
 *   DST1 (v2)  descending output pointer, starts at dst + 2*len
 *   WIN1 (v3)  descending window pointer, starts at win + 2*len
 *   OLDFPSCR (ip)  FPSCR value on entry, written back before returning
 *   lr         scratch for FPSCR constants (the return address is on the stack)
 *
 * Structure: when len is not a multiple of 8, blocks of 1, 2 and 4 elements
 * are processed first with scalar arithmetic; the remaining multiple of 8 is
 * processed with short vectors of length 4 in a software-pipelined loop that
 * handles 8 elements per pass.
 * NOTE(review): the stepped indentation in that loop appears to mark which
 * in-flight batch of four elements an instruction belongs to.
 *
 * Data read or written through a descending pointer is transferred one
 * register at a time, which places the highest address in the lowest
 * register and so reverses the element order.
 */
@ void ff_vector_fmul_window_vfp(float *dst, const float *src0,
@                                const float *src1, const float *win, int len)
function ff_vector_fmul_window_vfp, export=1
DST0    .req    a1
SRC0    .req    a2
SRC1    .req    a3
WIN0    .req    a4
LEN     .req    v1
DST1    .req    v2
WIN1    .req    v3
OLDFPSCR .req   ip

        push    {v1-v3,lr}
        ldr     LEN, [sp, #4*4+0]           @ stack argument, just above the 4 words pushed
        vpush   {s16-s31}
        fmrx    OLDFPSCR, FPSCR
        add     DST1, DST0, LEN, lsl #3     @ dst + 2*len floats
        add     SRC1, SRC1, LEN, lsl #2     @ src1 + len floats
        add     WIN1, WIN0, LEN, lsl #3     @ win + 2*len floats

        tst     LEN, #7
        beq     4f                          @ common case: len is a multiple of 8

        ldr     lr, =0x03000000             @ RunFast mode, scalar mode
        fmxr    FPSCR, lr

        @ one element, if bit 0 of len is set
        tst     LEN, #1
        beq     1f
        vldmdb  WIN1!, {s0}
        vldmia  SRC0!, {s8}
        vldmia  WIN0!, {s16}
        vmul.f  s24, s0, s8
        vldmdb  SRC1!, {s20}
        vmul.f  s8, s16, s8
        vmls.f  s24, s16, s20
        vmla.f  s8, s0, s20
        vstmia  DST0!, {s24}
        vstmdb  DST1!, {s8}
1:
        @ two elements, if bit 1 of len is set
        tst     LEN, #2
        beq     2f
        vldmdb  WIN1!, {s0}
        vldmdb  WIN1!, {s1}
        vldmia  SRC0!, {s8-s9}
        vldmia  WIN0!, {s16-s17}
        vmul.f  s24, s0, s8
        vmul.f  s25, s1, s9
        vldmdb  SRC1!, {s20}
        vldmdb  SRC1!, {s21}
        vmul.f  s8, s16, s8
        vmul.f  s9, s17, s9
        vmls.f  s24, s16, s20
        vmls.f  s25, s17, s21
        vmla.f  s8, s0, s20
        vmla.f  s9, s1, s21
        vstmia  DST0!, {s24-s25}
        vstmdb  DST1!, {s8}
        vstmdb  DST1!, {s9}
2:
        @ four elements, if bit 2 of len is set
        tst     LEN, #4
        beq     3f
        vldmdb  WIN1!, {s0}
        vldmdb  WIN1!, {s1}
        vldmdb  WIN1!, {s2}
        vldmdb  WIN1!, {s3}
        vldmia  SRC0!, {s8-s11}
        vldmia  WIN0!, {s16-s19}
        vmul.f  s24, s0, s8
        vmul.f  s25, s1, s9
        vmul.f  s26, s2, s10
        vmul.f  s27, s3, s11
        vldmdb  SRC1!, {s20}
        vldmdb  SRC1!, {s21}
        vldmdb  SRC1!, {s22}
        vldmdb  SRC1!, {s23}
        vmul.f  s8, s16, s8
        vmul.f  s9, s17, s9
        vmul.f  s10, s18, s10
        vmul.f  s11, s19, s11
        vmls.f  s24, s16, s20
        vmls.f  s25, s17, s21
        vmls.f  s26, s18, s22
        vmls.f  s27, s19, s23
        vmla.f  s8, s0, s20
        vmla.f  s9, s1, s21
        vmla.f  s10, s2, s22
        vmla.f  s11, s3, s23
        vstmia  DST0!, {s24-s27}
        vstmdb  DST1!, {s8}
        vstmdb  DST1!, {s9}
        vstmdb  DST1!, {s10}
        vstmdb  DST1!, {s11}
3:
        bics    LEN, LEN, #7                @ keep only the multiple-of-8 part
        beq     7f                          @ nothing left: len was below 8
4:
        ldr     lr, =0x03030000             @ RunFast mode, short vectors of length 4, stride 1
        fmxr    FPSCR, lr

        @ Loop prologue: start the first batch of four and fetch the inputs
        @ of the second batch.
        vldmdb  WIN1!, {s0}
        vldmdb  WIN1!, {s1}
        vldmdb  WIN1!, {s2}
        vldmdb  WIN1!, {s3}
        vldmia  SRC0!, {s8-s11}
        vldmia  WIN0!, {s16-s19}
        vmul.f  s24, s0, s8                     @ vector * vector
        vldmdb  SRC1!, {s20}
        vldmdb  SRC1!, {s21}
        vldmdb  SRC1!, {s22}
        vldmdb  SRC1!, {s23}
        vmul.f  s8, s16, s8                     @ vector * vector
        vmls.f  s24, s16, s20                   @ vector * vector
            vldmdb  WIN1!, {s4}
            vldmdb  WIN1!, {s5}
            vldmdb  WIN1!, {s6}
            vldmdb  WIN1!, {s7}
            vldmia  SRC0!, {s12-s13}
        vmla.f  s8, s0, s20                     @ vector * vector
            vldmia  SRC0!, {s14-s15}
        subs    LEN, LEN, #8
        beq     6f                              @ exactly 8 elements: go straight to the epilogue
        @ Main loop: 8 elements per pass. Each pass stores the results of the
        @ batches already in flight and starts the next two batches.
5:          vldmia  WIN0!, {s20-s23}
            vmul.f  s28, s4, s12                @ vector * vector
        vstmia  DST0!, {s24-s25}
            vldmdb  SRC1!, {s16}
            vldmdb  SRC1!, {s17}
            vldmdb  SRC1!, {s18}
            vldmdb  SRC1!, {s19}
            vmul.f  s12, s20, s12               @ vector * vector
        vstmia  DST0!, {s26-s27}
        vstmdb  DST1!, {s8}
        vstmdb  DST1!, {s9}
        vstmdb  DST1!, {s10}
        vstmdb  DST1!, {s11}
            vmls.f  s28, s20, s16               @ vector * vector
                vldmdb  WIN1!, {s0}
                vldmdb  WIN1!, {s1}
                vldmdb  WIN1!, {s2}
                vldmdb  WIN1!, {s3}
                vldmia  SRC0!, {s8-s9}
            vmla.f  s12, s4, s16                @ vector * vector
                vldmia  SRC0!, {s10-s11}
        subs    LEN, LEN, #8                    @ flags stay untouched until the bne below
                vldmia  WIN0!, {s16-s19}
                vmul.f  s24, s0, s8             @ vector * vector
            vstmia  DST0!, {s28-s29}
                vldmdb  SRC1!, {s20}
                vldmdb  SRC1!, {s21}
                vldmdb  SRC1!, {s22}
                vldmdb  SRC1!, {s23}
                vmul.f  s8, s16, s8             @ vector * vector
            vstmia  DST0!, {s30-s31}
            vstmdb  DST1!, {s12}
            vstmdb  DST1!, {s13}
            vstmdb  DST1!, {s14}
            vstmdb  DST1!, {s15}
                vmls.f  s24, s16, s20           @ vector * vector
                    vldmdb  WIN1!, {s4}
                    vldmdb  WIN1!, {s5}
                    vldmdb  WIN1!, {s6}
                    vldmdb  WIN1!, {s7}
                    vldmia  SRC0!, {s12-s13}
                vmla.f  s8, s0, s20             @ vector * vector
                    vldmia  SRC0!, {s14-s15}
        bne     5b
        @ Loop epilogue: finish and store the last two batches without
        @ fetching any further input.
6:                  vldmia  WIN0!, {s20-s23}
                    vmul.f  s28, s4, s12        @ vector * vector
                vstmia  DST0!, {s24-s25}
                    vldmdb  SRC1!, {s16}
                    vldmdb  SRC1!, {s17}
                    vldmdb  SRC1!, {s18}
                    vldmdb  SRC1!, {s19}
                    vmul.f  s12, s20, s12       @ vector * vector
                vstmia  DST0!, {s26-s27}
                vstmdb  DST1!, {s8}
                vstmdb  DST1!, {s9}
                vstmdb  DST1!, {s10}
                vstmdb  DST1!, {s11}
                    vmls.f  s28, s20, s16       @ vector * vector
                    vmla.f  s12, s4, s16        @ vector * vector
                    vstmia  DST0!, {s28-s31}
                    vstmdb  DST1!, {s12}
                    vstmdb  DST1!, {s13}
                    vstmdb  DST1!, {s14}
                    vstmdb  DST1!, {s15}
7:
        fmxr    FPSCR, OLDFPSCR             @ back to the FPSCR value seen on entry
        vpop    {s16-s31}
        pop     {v1-v3,pc}

        .unreq  DST0
        .unreq  SRC0
        .unreq  SRC1
        .unreq  WIN0
        .unreq  LEN
        .unreq  OLDFPSCR
        .unreq  DST1
        .unreq  WIN1
endfunc

/**
 * ARM VFP optimized implementation of 'vector_fmul_reverse_c' function.
 * Assumes that len is a positive number and a multiple of 8.
 *
 * As implemented below:
 *     dst[i] = src0[i] * src1[len - 1 - i]        for 0 <= i < len
 *
 * Register usage (arguments in the order of the prototype below):
 *   r0       dst,  advanced by every store
 *   r1       src0, read upwards
 *   r2       src1, moved to src1 + len first and then read downwards
 *   r3       len,  used as the loop counter (reduced by 16 per pass)
 *   s0-s15   operands/products of the first group of 8 elements
 *   s16-s31  operands/products of the second group of 8 elements
 *            (preserved for the caller through the d8-d15 push/pop)
 *
 * FPSCR is not modified by this routine; every product is issued as an
 * individual multiply, pairing ascending src0 registers with src1
 * registers in descending order to obtain the reversal.
 */
@ void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
@                                 const float *src1, int len)
function ff_vector_fmul_reverse_vfp, export=1
        vpush           {d8-d15}
        add             r2,  r2,  r3, lsl #2    @ r2 = src1 + len floats
        vldmdb          r2!, {s0-s3}            @ s3 holds the last element of src1
        vldmia          r1!, {s8-s11}
        vldmdb          r2!, {s4-s7}
        vldmia          r1!, {s12-s15}
        vmul.f32        s8,  s3,  s8
        vmul.f32        s9,  s2,  s9
        vmul.f32        s10, s1,  s10
        vmul.f32        s11, s0,  s11
1:
        @ Flags set here steer the rest of the pass (nothing below alters them):
        @   ge - a second group of 8 elements exists and is handled in this pass
        @   gt - further elements follow that second group, so loop again
        subs            r3,  r3,  #16
        it              ge
        vldmdbge        r2!, {s16-s19}
        vmul.f32        s12, s7,  s12
        it              ge
        vldmiage        r1!, {s24-s27}
        vmul.f32        s13, s6,  s13
        it              ge
        vldmdbge        r2!, {s20-s23}
        vmul.f32        s14, s5,  s14
        it              ge
        vldmiage        r1!, {s28-s31}
        vmul.f32        s15, s4,  s15
        it              ge
        vmulge.f32      s24, s19, s24
        it              gt
        vldmdbgt        r2!, {s0-s3}            @ refill the first group for the next pass
        it              ge
        vmulge.f32      s25, s18, s25
        vstmia          r0!, {s8-s13}           @ first 6 products of the first group
        it              ge
        vmulge.f32      s26, s17, s26
        it              gt
        vldmiagt        r1!, {s8-s11}
        itt             ge
        vmulge.f32      s27, s16, s27
        vmulge.f32      s28, s23, s28
        it              gt
        vldmdbgt        r2!, {s4-s7}
        it              ge
        vmulge.f32      s29, s22, s29
        vstmia          r0!, {s14-s15}          @ remaining 2 products of the first group
        ittt            ge
        vmulge.f32      s30, s21, s30
        vmulge.f32      s31, s20, s31
        vmulge.f32      s8,  s3,  s8            @ s8-s11 products are only stored if the gt loads ran
        it              gt
        vldmiagt        r1!, {s12-s15}
        itttt           ge
        vmulge.f32      s9,  s2,  s9
        vmulge.f32      s10, s1,  s10
        vstmiage        r0!, {s24-s27}
        vmulge.f32      s11, s0,  s11
        it              ge
        vstmiage        r0!, {s28-s31}
        bgt             1b

        vpop            {d8-d15}
        bx              lr
endfunc

/**
 * ARM VFP implementation of 'butterflies_float_c' function
 * Assumes that len is a positive non-zero number.
 *
 * As implemented below, for 0 <= i < len (both results use the old values):
 *     v1[i] = v1[i] + v2[i]
 *     v2[i] = v1[i] - v2[i]
 *
 * Register usage:
 *   BASE1 (a1)     pointer into v1, advanced by every load
 *   BASE2 (a2)     pointer into v2, advanced by every load
 *   LEN   (a3)     element count
 *   OLDFPSCR (a4)  FPSCR value on entry, written back before returning
 *   ip             scratch for FPSCR constants
 *
 * Both pointers are advanced when loading, so results are written back
 * with negative offsets relative to the already advanced pointers.
 *
 * Structure: when len is not a multiple of 8, blocks of 1, 2 and 4 elements
 * are processed first with scalar arithmetic; the remaining multiple of 8 is
 * processed with short vectors of length 4 in a software-pipelined loop that
 * handles 8 elements per pass.
 * NOTE(review): the stepped indentation in that loop appears to mark which
 * in-flight batch of four elements an instruction belongs to.
 */
@ void ff_butterflies_float_vfp(float *restrict v1, float *restrict v2, int len)
function ff_butterflies_float_vfp, export=1
BASE1   .req    a1
BASE2   .req    a2
LEN     .req    a3
OLDFPSCR .req   a4

        vpush   {s16-s31}
        fmrx    OLDFPSCR, FPSCR

        tst     LEN, #7
        beq     4f                          @ common case: len is a multiple of 8

        ldr     ip, =0x03000000             @ RunFast mode, scalar mode
        fmxr    FPSCR, ip

        @ one element, if bit 0 of len is set
        tst     LEN, #1
        beq     1f
        vldmia  BASE1!, {s0}
        vldmia  BASE2!, {s8}
        vadd.f  s16, s0, s8
        vsub.f  s24, s0, s8
        vstr    s16, [BASE1, #0-4*1]
        vstr    s24, [BASE2, #0-4*1]
1:
        @ two elements, if bit 1 of len is set
        tst     LEN, #2
        beq     2f
        vldmia  BASE1!, {s0-s1}
        vldmia  BASE2!, {s8-s9}
        vadd.f  s16, s0, s8
        vadd.f  s17, s1, s9
        vsub.f  s24, s0, s8
        vsub.f  s25, s1, s9
        vstr    d8, [BASE1, #0-8*1]    @ s16,s17
        vstr    d12, [BASE2, #0-8*1]   @ s24,s25
2:
        @ four elements, if bit 2 of len is set
        tst     LEN, #4
        beq     3f
        vldmia  BASE1!, {s0-s1}
        vldmia  BASE2!, {s8-s9}
        vldmia  BASE1!, {s2-s3}
        vldmia  BASE2!, {s10-s11}
        vadd.f  s16, s0, s8
        vadd.f  s17, s1, s9
        vsub.f  s24, s0, s8
        vsub.f  s25, s1, s9
        vadd.f  s18, s2, s10
        vadd.f  s19, s3, s11
        vsub.f  s26, s2, s10
        vsub.f  s27, s3, s11
        vstr    d8, [BASE1, #0-16*1]    @ s16,s17
        vstr    d12, [BASE2, #0-16*1]   @ s24,s25
        vstr    d9, [BASE1, #8-16*1]    @ s18,s19
        vstr    d13, [BASE2, #8-16*1]   @ s26,s27
3:
        bics    LEN, LEN, #7                @ keep only the multiple-of-8 part
        beq     7f                          @ nothing left: len was below 8
4:
        ldr     ip, =0x03030000             @ RunFast mode, short vectors of length 4, stride 1
        fmxr    FPSCR, ip

        @ Loop prologue: fetch two batches of four and start computing them.
        vldmia  BASE1!, {s0-s1}
        vldmia  BASE2!, {s8-s9}
        vldmia  BASE1!, {s2-s3}
        vldmia  BASE2!, {s10-s11}
        vadd.f  s16, s0, s8
            vldmia  BASE1!, {s4-s5}
            vldmia  BASE2!, {s12-s13}
            vldmia  BASE1!, {s6-s7}
            vldmia  BASE2!, {s14-s15}
        vsub.f  s24, s0, s8
            vadd.f  s20, s4, s12
        subs    LEN, LEN, #8
        beq     6f                          @ exactly 8 elements: go straight to the epilogue
        @ Main loop: 8 elements per pass. At each store the pointers are three
        @ batches (16 bytes each) beyond the batch being written, hence the
        @ 16*3 offsets.
5:              vldmia  BASE1!, {s0-s3}
                vldmia  BASE2!, {s8-s11}
            vsub.f  s28, s4, s12
        vstr    d8, [BASE1, #0-16*3]    @ s16,s17
        vstr    d9, [BASE1, #8-16*3]    @ s18,s19
        vstr    d12, [BASE2, #0-16*3]   @ s24,s25
        vstr    d13, [BASE2, #8-16*3]   @ s26,s27
                vadd.f  s16, s0, s8
                    vldmia  BASE1!, {s4-s7}
                    vldmia  BASE2!, {s12-s15}
                vsub.f  s24, s0, s8
            vstr    d10, [BASE1, #0-16*3]   @ s20,s21
            vstr    d11, [BASE1, #8-16*3]   @ s22,s23
            vstr    d14, [BASE2, #0-16*3]   @ s28,s29
            vstr    d15, [BASE2, #8-16*3]   @ s30,s31
                    vadd.f  s20, s4, s12
        subs    LEN, LEN, #8
        bne     5b
        @ Loop epilogue: finish and store the last two batches; no further
        @ loads, so the pointers sit directly after the final batch.
6:                   vsub.f  s28, s4, s12
                vstr    d8, [BASE1, #0-16*2]    @ s16,s17
                vstr    d9, [BASE1, #8-16*2]    @ s18,s19
                vstr    d12, [BASE2, #0-16*2]   @ s24,s25
                vstr    d13, [BASE2, #8-16*2]   @ s26,s27
                    vstr    d10, [BASE1, #0-16*1]   @ s20,s21
                    vstr    d11, [BASE1, #8-16*1]   @ s22,s23
                    vstr    d14, [BASE2, #0-16*1]   @ s28,s29
                    vstr    d15, [BASE2, #8-16*1]   @ s30,s31
7:
        fmxr    FPSCR, OLDFPSCR             @ back to the FPSCR value seen on entry
        vpop    {s16-s31}
        bx      lr

        .unreq  BASE1
        .unreq  BASE2
        .unreq  LEN
        .unreq  OLDFPSCR
endfunc