/*
 * ARM NEON optimised Float DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
21cabdff1aSopenharmony_ci
22cabdff1aSopenharmony_ci#include "config.h"
23cabdff1aSopenharmony_ci#include "asm.S"
24cabdff1aSopenharmony_ci
/*
 * ff_vector_fmul_neon
 *
 * Element-wise product of two float vectors, written to a third buffer.
 *
 * Registers on entry, as used by the code:
 *   r0  output pointer      (stored through, post-incremented)
 *   r1  first input pointer (loaded through, post-incremented)
 *   r2  second input pointer(loaded through, post-incremented)
 *   r3  number of floats
 * Every load and store carries a :128 alignment qualifier, so all three
 * pointers have to be 16-byte aligned.  The count is consumed in groups of
 * 8 floats and there is no path for a count of zero.
 * NOTE(review): the C prototype is not visible in this file -- confirm the
 * argument order against the C-side function pointer.
 *
 * Clobbers r0-r3, r12, q0-q3, q8-q11 and the condition flags.
 *
 * The routine is software pipelined: the prologue multiplies the first
 * 8 floats into q8/q9 and those products stay pending.  Each pass of loop 1
 * retires 16 floats (the pending 8 plus 8 fresh ones held in q10/q11) and
 * leaves 8 new products pending.  Label 2 handles one leftover group of 8
 * and label 3 flushes whatever is still pending.
 */
function ff_vector_fmul_neon, export=1
        dst     .req r0
        src0    .req r1
        src1    .req r2
        len     .req r3
        blk     .req r12

        subs            len,  len,  #8            @ the prologue accounts for 8 floats
        vld1.32         {d0-d3},  [src0,:128]!
        vld1.32         {d4-d7},  [src1,:128]!
        vmul.f32        q8,  q0,  q2
        vmul.f32        q9,  q1,  q3
        beq             3f                        @ count was 8: only the flush is left
        bics            blk,  len,  #15           @ floats covered by full 16-float passes
        beq             2f
1:      subs            blk,  blk,  #16           @ flags survive until the bne below
        vld1.32         {d0-d1},  [src0,:128]!
        vld1.32         {d4-d5},  [src1,:128]!
        vmul.f32        q10, q0,  q2
        vld1.32         {d2-d3},  [src0,:128]!
        vld1.32         {d6-d7},  [src1,:128]!
        vmul.f32        q11, q1,  q3
        vst1.32         {d16-d19},[dst,:128]!     @ retire the pending products
        vld1.32         {d0-d1},  [src0,:128]!
        vld1.32         {d4-d5},  [src1,:128]!
        vmul.f32        q8,  q0,  q2
        vld1.32         {d2-d3},  [src0,:128]!
        vld1.32         {d6-d7},  [src1,:128]!
        vmul.f32        q9,  q1,  q3
        vst1.32         {d20-d23},[dst,:128]!
        bne             1b
        ands            len,  len,  #15           @ anything beyond the full passes
        beq             3f
2:      vld1.32         {d0-d1},  [src0,:128]!
        vld1.32         {d4-d5},  [src1,:128]!
        vst1.32         {d16-d17},[dst,:128]!     @ store q8 before it is overwritten
        vmul.f32        q8,  q0,  q2
        vld1.32         {d2-d3},  [src0,:128]!
        vld1.32         {d6-d7},  [src1,:128]!
        vst1.32         {d18-d19},[dst,:128]!     @ store q9 before it is overwritten
        vmul.f32        q9,  q1,  q3
3:      vst1.32         {d16-d19},[dst,:128]!     @ flush the last 8 products
        bx              lr

        .unreq          dst
        .unreq          src0
        .unreq          src1
        .unreq          len
        .unreq          blk
endfunc
63cabdff1aSopenharmony_ci
/*
 * ff_vector_fmac_scalar_neon
 *
 * Multiply-accumulate with a scalar: every float of the buffer at r0 is
 * replaced by itself plus the matching float of the buffer at r1 times
 * one scalar factor.
 *
 * Two register layouts are selected by the VFP / NOVFP line prefixes
 * (macros defined outside this file; presumably hard-float versus
 * soft-float argument passing -- confirm in asm.S):
 *   VFP:    scalar in d0[0], count in r2, r3 free for the read pointer
 *   NOVFP:  scalar in r2,    count in r3, r2 reused for the read pointer
 * In both layouts r0 is the output pointer and r1 the input pointer.
 * All accesses carry a :128 qualifier, so both buffers have to be 16-byte
 * aligned.
 *
 * The buffer at r0 is walked by two pointers: acc reads it ahead of the
 * stores made through r0.
 *
 * Counts are handled as full 16-float passes (loop 1) followed by a
 * 4-float tail loop (loop 3).  The tail loop tests its counter after the
 * work, so a count of zero still processes one group of 4 floats.
 *
 * Clobbers r0-r3, r12, q0-q3, q8-q11, q15 and the condition flags.
 */
function ff_vector_fmac_scalar_neon, export=1
VFP     len .req r2
VFP     acc .req r3
NOVFP   len .req r3
NOVFP   acc .req r2
VFP     vdup.32         q15, d0[0]
NOVFP   vdup.32         q15, r2                   @ must read r2 before acc (also r2) is written
        bics            r12, len, #15             @ floats covered by full 16-float passes
        mov             acc, r0                   @ separate read pointer over the output buffer
        beq             3f                        @ fewer than 16 floats: tail loop only
        vld1.32         {q0},     [r1,:128]!
        vld1.32         {q8},     [acc,:128]!
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q9},     [acc,:128]!
1:      vmla.f32        q8,  q0,  q15
        vld1.32         {q2},     [r1,:128]!
        vld1.32         {q10},    [acc,:128]!
        vmla.f32        q9,  q1,  q15
        vld1.32         {q3},     [r1,:128]!
        vld1.32         {q11},    [acc,:128]!
        vmla.f32        q10, q2,  q15
        vst1.32         {q8},     [r0,:128]!
        vmla.f32        q11, q3,  q15
        vst1.32         {q9},     [r0,:128]!
        subs            r12, r12, #16
        beq             2f
        vld1.32         {q0},     [r1,:128]!      @ preload the first half of the next pass
        vld1.32         {q8},     [acc,:128]!
        vst1.32         {q10},    [r0,:128]!
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q9},     [acc,:128]!
        vst1.32         {q11},    [r0,:128]!
        b               1b
2:      vst1.32         {q10},    [r0,:128]!      @ last pass: store without preloading
        vst1.32         {q11},    [r0,:128]!
        ands            len, len, #15             @ floats left for the tail loop
        it              eq
        bxeq            lr
3:      vld1.32         {q0},     [r1,:128]!
        vld1.32         {q8},     [acc,:128]!
        vmla.f32        q8,  q0,  q15
        vst1.32         {q8},     [r0,:128]!
        subs            len, len, #4
        bgt             3b
        bx              lr
        .unreq          len
        .unreq          acc                       @ release the second alias too, so it cannot leak
endfunc
111cabdff1aSopenharmony_ci
/*
 * ff_vector_fmul_scalar_neon
 *
 * Scales a float vector: every float read through r1 is multiplied by one
 * scalar factor and written through r0.
 *
 * The VFP / NOVFP line prefixes (macros defined outside this file;
 * presumably hard-float versus soft-float argument passing -- confirm in
 * asm.S) select where the remaining arguments live:
 *   VFP:    scalar in d0[0], count in r2
 *   NOVFP:  scalar in r2,    count in r3
 * All accesses carry a :128 qualifier, so both pointers have to be 16-byte
 * aligned.
 *
 * Full 16-float passes run in loop 1, the remainder in the 4-float loop 3.
 * Loop 3 tests its counter after the work, so a count of zero still
 * processes one group of 4 floats.
 *
 * Clobbers r0, r1, r12, the count register, q0-q3, q8 and the flags.
 */
function ff_vector_fmul_scalar_neon, export=1
        dst     .req r0
        src     .req r1
        blk     .req r12
VFP     len .req r2
NOVFP   len .req r3
VFP     vdup.32         q8,  d0[0]                @ broadcast the factor to all four lanes
NOVFP   vdup.32         q8,  r2
        bics            blk, len, #15             @ floats covered by full 16-float passes
        beq             3f
        vld1.32         {q0},[src,:128]!
        vld1.32         {q1},[src,:128]!
1:      vmul.f32        q0,  q0,  q8
        vld1.32         {q2},[src,:128]!
        vmul.f32        q1,  q1,  q8
        vld1.32         {q3},[src,:128]!
        vmul.f32        q2,  q2,  q8
        vst1.32         {q0},[dst,:128]!
        vmul.f32        q3,  q3,  q8
        vst1.32         {q1},[dst,:128]!
        subs            blk, blk, #16
        beq             2f
        vld1.32         {q0},[src,:128]!          @ preload the first half of the next pass
        vst1.32         {q2},[dst,:128]!
        vld1.32         {q1},[src,:128]!
        vst1.32         {q3},[dst,:128]!
        b               1b
2:      vst1.32         {q2},[dst,:128]!          @ last pass: store without preloading
        vst1.32         {q3},[dst,:128]!
        ands            len, len, #15             @ floats left for the tail loop
        it              eq
        bxeq            lr
3:      vld1.32         {q0},[src,:128]!
        vmul.f32        q0,  q0,  q8
        vst1.32         {q0},[dst,:128]!
        subs            len, len, #4
        bgt             3b
        bx              lr

        .unreq          len
        .unreq          dst
        .unreq          src
        .unreq          blk
endfunc
149cabdff1aSopenharmony_ci
/*
 * ff_vector_fmul_window_neon
 *
 * Windowed overlap of two float blocks.  With n taken from the first stack
 * argument, the code reads n floats forwards through r1, n floats
 * backwards starting at the end of the buffer at r2, and 2*n window
 * coefficients through r3 (from both ends towards the middle); it writes
 * 2*n floats through r0 (also from both ends towards the middle).
 *
 * For one group of 4 floats, with a = forward input, b = backward input,
 * wl = coefficients from the low end, wh = coefficients from the high end:
 *   low output  [k]   = a[k] * wh[3-k] - b[3-k] * wl[k]
 *   high output [3-k] = a[k] * wl[k]   + b[3-k] * wh[3-k]
 *
 * All accesses carry a :128 qualifier, so every pointer and the derived
 * end addresses have to be 16-byte aligned.  The counter is tested for
 * equality with zero after subtracting 4, so n has to be a positive
 * multiple of 4.
 * NOTE(review): the C prototype is not visible in this file -- confirm the
 * argument order against the C-side function pointer.
 *
 * Saves and restores r4 and r5; clobbers r0-r3, r12, q0-q3, q9-q12 and
 * the condition flags.
 */
function ff_vector_fmul_window_neon, export=1
        dst_lo  .req r0
        src0    .req r1
        src1    .req r2
        win_lo  .req r3
        win_hi  .req r4
        tmp     .req r5
        step    .req r5
        dst_hi  .req r12
        cnt     .req lr

        push            {r4,r5,lr}
        ldr             cnt,    [sp, #12]         @ first stack argument, above the 3 pushed words
        sub             src1,   src1,   #8
        sub             tmp,    cnt,    #2
        add             src1,   src1,   tmp, lsl #2   @ last 16 bytes of the n-float buffer
        add             win_hi, win_lo, tmp, lsl #3   @ last 16 bytes of the 2n-float window
        add             dst_hi, dst_lo, tmp, lsl #3   @ last 16 bytes of the 2n-float output
        mov             step,   #-16              @ backward stride for the high-end pointers
        vld1.32         {d0,d1},  [src0,:128]!
        vld1.32         {d2,d3},  [src1,:128], step
        vld1.32         {d4,d5},  [win_lo,:128]!
        vld1.32         {d6,d7},  [win_hi,:128], step
1:      subs            cnt,    cnt,    #4        @ flags survive until the beq below
        vmul.f32        d22, d0,  d4
        vrev64.32       q3,  q3
        vmul.f32        d23, d1,  d5
        vrev64.32       q1,  q1
        vmul.f32        d20, d0,  d7
        vmul.f32        d21, d1,  d6
        beq             2f                        @ last group: finish without loading more
        vmla.f32        d22, d3,  d7
        vld1.32         {d0,d1},  [src0,:128]!
        vmla.f32        d23, d2,  d6
        vld1.32         {d18,d19},[src1,:128], step   @ staged in q9, q1 is still in use
        vmls.f32        d20, d3,  d4
        vld1.32         {d24,d25},[win_lo,:128]!      @ staged in q12, q2 is still in use
        vmls.f32        d21, d2,  d5
        vld1.32         {d6,d7},  [win_hi,:128], step
        vmov            q1,  q9
        vrev64.32       q11, q11                  @ vrev64 plus vswp reverse all four lanes
        vmov            q2,  q12
        vswp            d22, d23
        vst1.32         {d20,d21},[dst_lo,:128]!
        vst1.32         {d22,d23},[dst_hi,:128], step
        b               1b
2:      vmla.f32        d22, d3,  d7
        vmla.f32        d23, d2,  d6
        vmls.f32        d20, d3,  d4
        vmls.f32        d21, d2,  d5
        vrev64.32       q11, q11
        vswp            d22, d23
        vst1.32         {d20,d21},[dst_lo,:128]!
        vst1.32         {d22,d23},[dst_hi,:128], step
        pop             {r4,r5,pc}

        .unreq          dst_lo
        .unreq          src0
        .unreq          src1
        .unreq          win_lo
        .unreq          win_hi
        .unreq          tmp
        .unreq          step
        .unreq          dst_hi
        .unreq          cnt
endfunc
196cabdff1aSopenharmony_ci
/*
 * ff_vector_fmul_add_neon
 *
 * Multiply-add of three float vectors: for every index the float read
 * through r1 is multiplied by the float read through r2, the float read
 * through r3 is added, and the result is written through r0.
 *
 * The element count is taken from the first stack argument.  It is
 * decremented by 8 per pass and tested for equality with zero, so it has
 * to be a positive multiple of 8.  All accesses carry a :128 qualifier,
 * so all four pointers have to be 16-byte aligned.
 * NOTE(review): the C prototype is not visible in this file -- confirm the
 * argument order against the C-side function pointer.
 *
 * Clobbers r0-r3, r12, q0-q3, q8-q13 and the condition flags.
 *
 * The products for the first pass are formed before the loop; each pass
 * then adds, and unless it is the last one it loads and multiplies the
 * next 8 floats before storing the current sums.
 */
function ff_vector_fmul_add_neon, export=1
        dst     .req r0
        src0    .req r1
        src1    .req r2
        src2    .req r3
        cnt     .req r12

        ldr             cnt, [sp]                 @ element count, first stack argument
        vld1.32         {q0-q1},  [src0,:128]!
        vld1.32         {q8-q9},  [src1,:128]!
        vld1.32         {q2-q3},  [src2,:128]!
        vmul.f32        q10, q0,  q8
        vmul.f32        q11, q1,  q9
1:      vadd.f32        q12, q2,  q10
        vadd.f32        q13, q3,  q11
        pld             [src0, #16]
        pld             [src1, #16]
        pld             [src2, #16]
        subs            cnt, cnt, #8
        beq             2f                        @ last pass: only the store is left
        vld1.32         {q0},     [src0,:128]!
        vld1.32         {q8},     [src1,:128]!
        vmul.f32        q10, q0,  q8
        vld1.32         {q1},     [src0,:128]!
        vld1.32         {q9},     [src1,:128]!
        vmul.f32        q11, q1,  q9
        vld1.32         {q2-q3},  [src2,:128]!
        vst1.32         {q12-q13},[dst,:128]!
        b               1b
2:      vst1.32         {q12-q13},[dst,:128]!
        bx              lr

        .unreq          dst
        .unreq          src0
        .unreq          src1
        .unreq          src2
        .unreq          cnt
endfunc
223cabdff1aSopenharmony_ci
/*
 * ff_vector_fmul_reverse_neon
 *
 * Multiplies one float vector by a second one taken in reverse order:
 * output float i is the product of float i read through r1 and float
 * (count - 1 - i) of the buffer at r2.  Results are written through r0.
 *
 * r3 holds the element count.  It is decremented by 8 per pass and tested
 * for equality with zero, so it has to be a positive multiple of 8.  All
 * accesses carry a :128 qualifier, so the pointers and the derived end
 * address have to be 16-byte aligned.
 * NOTE(review): the C prototype is not visible in this file -- confirm the
 * argument order against the C-side function pointer.
 *
 * Clobbers r0-r3, r12, q0-q3, q8-q9 and the condition flags.
 */
function ff_vector_fmul_reverse_neon, export=1
        dst     .req r0
        src0    .req r1
        src1    .req r2
        len     .req r3
        step    .req r12

        add             src1, src1, len, lsl #2   @ one past the last float
        sub             src1, src1, #32           @ back up to the last group of 8
        mov             step, #-32                @ walk the second input backwards
        vld1.32         {q0-q1},  [src0,:128]!
        vld1.32         {q2-q3},  [src1,:128], step
1:      pld             [src0, #32]
        vrev64.32       q3,  q3                   @ swap within each d register ...
        vmul.f32        d16, d0,  d7              @ ... and pick the d registers in reverse
        vmul.f32        d17, d1,  d6
        pld             [src1, #-32]
        vrev64.32       q2,  q2
        vmul.f32        d18, d2,  d5
        vmul.f32        d19, d3,  d4
        subs            len, len, #8
        beq             2f                        @ last pass: only the store is left
        vld1.32         {q0-q1},  [src0,:128]!
        vld1.32         {q2-q3},  [src1,:128], step
        vst1.32         {q8-q9},  [dst,:128]!
        b               1b
2:      vst1.32         {q8-q9},  [dst,:128]!
        bx              lr

        .unreq          dst
        .unreq          src0
        .unreq          src1
        .unreq          len
        .unreq          step
endfunc
247cabdff1aSopenharmony_ci
/*
 * ff_butterflies_float_neon
 *
 * In-place butterfly on two float buffers: for every index the float at
 * r0 becomes the sum of the two inputs and the float at r1 becomes the
 * difference (value from r0 minus value from r1).
 *
 * r2 holds the element count; 4 floats are handled per pass and the loop
 * repeats while the remaining count is greater than zero, so at least one
 * pass always runs.  Both pointers are accessed with a :128 qualifier and
 * have to be 16-byte aligned.
 *
 * Clobbers r0-r2, q0-q2 and the condition flags.
 */
function ff_butterflies_float_neon, export=1
        sum_p   .req r0
        dif_p   .req r1
        cnt     .req r2

1:      vld1.32         {q0},[sum_p,:128]         @ no writeback: the stores advance the pointers
        vld1.32         {q1},[dif_p,:128]
        vsub.f32        q2,  q0,  q1
        vadd.f32        q1,  q0,  q1
        vst1.32         {q2},[dif_p,:128]!
        vst1.32         {q1},[sum_p,:128]!
        subs            cnt, cnt, #4
        bgt             1b
        bx              lr

        .unreq          sum_p
        .unreq          dif_p
        .unreq          cnt
endfunc
259cabdff1aSopenharmony_ci
/*
 * ff_scalarproduct_float_neon
 *
 * Dot product of two float vectors read through r0 and r1, with the
 * element count in r2.
 *
 * Four partial sums are kept in q2 and combined after the loop.  The total
 * ends up in both lanes of d0; on lines prefixed NOVFP it is additionally
 * copied to r0 (the NOVFP macro is defined outside this file; presumably
 * it marks the soft-float return convention -- confirm in asm.S).
 *
 * 4 floats are handled per pass and the loop repeats while the remaining
 * count is greater than zero, so at least one pass always runs.  Both
 * pointers are accessed with a :128 qualifier and have to be 16-byte
 * aligned.
 *
 * Clobbers r0-r2, q0-q2 and the condition flags.
 */
function ff_scalarproduct_float_neon, export=1
        vec_a   .req r0
        vec_b   .req r1
        cnt     .req r2

        vmov.f32        q2,  #0.0                 @ four running partial sums
1:      vld1.32         {q0},[vec_a,:128]!
        vld1.32         {q1},[vec_b,:128]!
        vmla.f32        q2,  q0,  q1
        subs            cnt, cnt, #4
        bgt             1b
        vadd.f32        d0,  d4,  d5              @ fold four partial sums into two
        vpadd.f32       d0,  d0,  d0              @ fold two into one, in both lanes
NOVFP   vmov.32         r0,  d0[0]
        bx              lr

        .unreq          vec_a
        .unreq          vec_b
        .unreq          cnt
endfunc
272