/*
 * ARM NEON optimised Float DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "asm.S"

/*
 * Notes that apply to every routine in this file:
 *  - All vector loads and stores carry a :128 alignment qualifier, so every
 *    pointer that is dereferenced must be 16-byte aligned.
 *  - Only r0-r3, r12 (ip), q0-q3 and q8-q15 are used as scratch, plus
 *    r4/r5/lr in ff_vector_fmul_window_neon where they are pushed and popped.
 *  - The function/endfunc macros and the VFP/NOVFP line prefixes come from
 *    asm.S.  NOTE(review): VFP/NOVFP presumably select between hard-float and
 *    soft-float argument passing; confirm against asm.S.
 */

/*
 * Element-wise product of two float vectors.
 *
 * In:   r0 = output pointer
 *       r1 = first input pointer
 *       r2 = second input pointer
 *       r3 = element count
 * Out:  out[i] = in1[i] * in2[i]
 *
 * The first 8 elements are always processed.  The rest is consumed in
 * chunks of 16 and then, if (count - 8) & 15 is non-zero, one final chunk
 * of 8.  NOTE(review): this implies the count is expected to be a non-zero
 * multiple of 8; confirm against the callers.
 */
function ff_vector_fmul_neon, export=1
        subs            r3,  r3,  #8            @ r3 = elements left after the first 8
        vld1.32         {d0-d3},  [r1,:128]!
        vld1.32         {d4-d7},  [r2,:128]!
        vmul.f32        q8,  q0,  q2            @ q8/q9 = first 8 products (stored later)
        vmul.f32        q9,  q1,  q3
        beq             3f                      @ flags still from the subs: only 8 elements
        bics            ip,  r3,  #15           @ ip = remaining count rounded down to 16
        beq             2f                      @ fewer than 16 left: go to the 8-element tail
1:      subs            ip,  ip,  #16           @ flags are consumed by the bne at the loop end
        vld1.32         {d0-d1},  [r1,:128]!
        vld1.32         {d4-d5},  [r2,:128]!
        vmul.f32        q10, q0,  q2
        vld1.32         {d2-d3},  [r1,:128]!
        vld1.32         {d6-d7},  [r2,:128]!
        vmul.f32        q11, q1,  q3
        vst1.32         {d16-d19},[r0,:128]!    @ store the 8 products computed previously
        vld1.32         {d0-d1},  [r1,:128]!
        vld1.32         {d4-d5},  [r2,:128]!
        vmul.f32        q8,  q0,  q2
        vld1.32         {d2-d3},  [r1,:128]!
        vld1.32         {d6-d7},  [r2,:128]!
        vmul.f32        q9,  q1,  q3
        vst1.32         {d20-d23},[r0,:128]!
        bne             1b
        ands            r3,  r3,  #15
        beq             3f                      @ nothing left but the pending q8/q9
2:      vld1.32         {d0-d1},  [r1,:128]!    @ tail: 8 more elements
        vld1.32         {d4-d5},  [r2,:128]!
        vst1.32         {d16-d17},[r0,:128]!    @ flush pending q8 before it is overwritten
        vmul.f32        q8,  q0,  q2
        vld1.32         {d2-d3},  [r1,:128]!
        vld1.32         {d6-d7},  [r2,:128]!
        vst1.32         {d18-d19},[r0,:128]!    @ flush pending q9 before it is overwritten
        vmul.f32        q9,  q1,  q3
3:      vst1.32         {d16-d19},[r0,:128]!    @ store the last 8 products
        bx              lr
endfunc

/*
 * Multiply a float vector by a scalar and accumulate into the output vector.
 *
 * In:   r0  = output pointer (also read: it is the accumulator input)
 *       r1  = input pointer
 *       VFP build:   scalar in d0[0] (s0), element count in r2
 *       NOVFP build: scalar bits in r2,    element count in r3
 * Out:  out[i] += in[i] * scalar
 *
 * "acc" is a second copy of r0 used as the read pointer so that loads can
 * run ahead of the stores that go through r0.
 * Chunks of 16 are handled by loop 1; the remainder ((count & 15), or the
 * whole count when it is below 16) is handled 4 elements at a time by loop 3,
 * which always executes at least once when entered.
 */
function ff_vector_fmac_scalar_neon, export=1
VFP     len .req r2
VFP     acc .req r3
NOVFP   len .req r3
NOVFP   acc .req r2
VFP     vdup.32         q15, d0[0]              @ q15 = scalar broadcast to all 4 lanes
NOVFP   vdup.32         q15, r2                 @ must read r2 before acc (= r2) is written
        bics            r12, len, #15           @ r12 = count rounded down to 16
        mov             acc, r0
        beq             3f                      @ flags from bics: no full 16-element chunk
        vld1.32         {q0},     [r1,:128]!
        vld1.32         {q8},     [acc,:128]!
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q9},     [acc,:128]!
1:      vmla.f32        q8,  q0,  q15
        vld1.32         {q2},     [r1,:128]!
        vld1.32         {q10},    [acc,:128]!
        vmla.f32        q9,  q1,  q15
        vld1.32         {q3},     [r1,:128]!
        vld1.32         {q11},    [acc,:128]!
        vmla.f32        q10, q2,  q15
        vst1.32         {q8},     [r0,:128]!
        vmla.f32        q11, q3,  q15
        vst1.32         {q9},     [r0,:128]!
        subs            r12, r12, #16
        beq             2f                      @ last chunk: do not preload past the end
        vld1.32         {q0},     [r1,:128]!
        vld1.32         {q8},     [acc,:128]!
        vst1.32         {q10},    [r0,:128]!
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q9},     [acc,:128]!
        vst1.32         {q11},    [r0,:128]!
        b               1b
2:      vst1.32         {q10},    [r0,:128]!
        vst1.32         {q11},    [r0,:128]!
        ands            len, len, #15           @ elements left for the 4-wide tail
        it              eq
        bxeq            lr
3:      vld1.32         {q0},     [r1,:128]!
        vld1.32         {q8},     [acc,:128]!
        vmla.f32        q8,  q0,  q15
        vst1.32         {q8},     [r0,:128]!
        subs            len, len, #4
        bgt             3b
        bx              lr
        .unreq          len
        .unreq          acc                     @ release both aliases created above
endfunc

/*
 * Multiply a float vector by a scalar.
 *
 * In:   r0  = output pointer
 *       r1  = input pointer
 *       VFP build:   scalar in d0[0] (s0), element count in r2
 *       NOVFP build: scalar bits in r2,    element count in r3
 * Out:  out[i] = in[i] * scalar
 *
 * Chunks of 16 are handled by loop 1; the remainder ((count & 15), or the
 * whole count when it is below 16) is handled 4 elements at a time by loop 3,
 * which always executes at least once when entered.
 */
function ff_vector_fmul_scalar_neon, export=1
VFP     len .req r2
NOVFP   len .req r3
VFP     vdup.32         q8,  d0[0]              @ q8 = scalar broadcast to all 4 lanes
NOVFP   vdup.32         q8,  r2
        bics            r12, len, #15           @ r12 = count rounded down to 16
        beq             3f
        vld1.32         {q0},[r1,:128]!
        vld1.32         {q1},[r1,:128]!
1:      vmul.f32        q0,  q0,  q8
        vld1.32         {q2},[r1,:128]!
        vmul.f32        q1,  q1,  q8
        vld1.32         {q3},[r1,:128]!
        vmul.f32        q2,  q2,  q8
        vst1.32         {q0},[r0,:128]!
        vmul.f32        q3,  q3,  q8
        vst1.32         {q1},[r0,:128]!
        subs            r12, r12, #16
        beq             2f                      @ last chunk: do not preload past the end
        vld1.32         {q0},[r1,:128]!
        vst1.32         {q2},[r0,:128]!
        vld1.32         {q1},[r1,:128]!
        vst1.32         {q3},[r0,:128]!
        b               1b
2:      vst1.32         {q2},[r0,:128]!
        vst1.32         {q3},[r0,:128]!
        ands            len, len, #15           @ elements left for the 4-wide tail
        it              eq
        bxeq            lr
3:      vld1.32         {q0},[r1,:128]!
        vmul.f32        q0,  q0,  q8
        vst1.32         {q0},[r0,:128]!
        subs            len, len, #4
        bgt             3b
        bx              lr
        .unreq          len
endfunc

/*
 * Windowed overlap of two float vectors, walking inwards from both ends.
 *
 * In:   r0   = output pointer          (2 * n floats are written)
 *       r1   = first input pointer     (read forwards)
 *       r2   = second input pointer    (read backwards from its last 4 floats)
 *       r3   = window pointer          (2 * n floats, read from both ends)
 *       [sp] = n (fifth argument; found at [sp, #12] after the push)
 *
 * Each iteration consumes 4 floats from the front (in1, window front) and 4
 * from the back (in2, window back) and produces:
 *   front output = in1 * win_back  - in2 * win_front   (stored ascending via r0)
 *   back  output = in1 * win_front + in2 * win_back    (stored descending via ip)
 * where the "back" operands are taken in reversed element order.
 * The loop exits only when n reaches exactly zero in steps of 4.
 */
function ff_vector_fmul_window_neon, export=1
        push            {r4,r5,lr}
        ldr             lr,  [sp, #12]          @ lr = n (stack argument above the 3 pushed regs)
        sub             r2,  r2,  #8
        sub             r5,  lr,  #2
        add             r2,  r2,  r5, lsl #2    @ r2 = in2 + 4*n - 16 : last 4 floats of in2
        add             r4,  r3,  r5, lsl #3    @ r4 = win + 8*n - 16 : last 4 floats of window
        add             ip,  r0,  r5, lsl #3    @ ip = out + 8*n - 16 : last 4 floats of output
        mov             r5,  #-16               @ backward stride for r2, r4 and ip
        vld1.32         {d0,d1},  [r1,:128]!
        vld1.32         {d2,d3},  [r2,:128], r5
        vld1.32         {d4,d5},  [r3,:128]!
        vld1.32         {d6,d7},  [r4,:128], r5
1:      subs            lr,  lr,  #4            @ flags are consumed by the beq below
        vmul.f32        d22, d0,  d4
        vrev64.32       q3,  q3                 @ with the d6/d7 cross use: reversed window back
        vmul.f32        d23, d1,  d5
        vrev64.32       q1,  q1                 @ with the d2/d3 cross use: reversed in2
        vmul.f32        d20, d0,  d7
        vmul.f32        d21, d1,  d6
        beq             2f                      @ final group: finish without preloading
        vmla.f32        d22, d3,  d7
        vld1.32         {d0,d1},  [r1,:128]!
        vmla.f32        d23, d2,  d6
        vld1.32         {d18,d19},[r2,:128], r5 @ next in2 goes to q9; q1 is still in use
        vmls.f32        d20, d3,  d4
        vld1.32         {d24,d25},[r3,:128]!    @ next window front goes to q12; q2 is still in use
        vmls.f32        d21, d2,  d5
        vld1.32         {d6,d7},  [r4,:128], r5
        vmov            q1,  q9
        vrev64.32       q11, q11                @ vrev64 + vswp = full 4-element reversal
        vmov            q2,  q12
        vswp            d22, d23
        vst1.32         {d20,d21},[r0,:128]!
        vst1.32         {d22,d23},[ip,:128], r5
        b               1b
2:      vmla.f32        d22, d3,  d7
        vmla.f32        d23, d2,  d6
        vmls.f32        d20, d3,  d4
        vmls.f32        d21, d2,  d5
        vrev64.32       q11, q11
        vswp            d22, d23
        vst1.32         {d20,d21},[r0,:128]!
        vst1.32         {d22,d23},[ip,:128], r5
        pop             {r4,r5,pc}
endfunc

/*
 * Element-wise multiply-add of three float vectors.
 *
 * In:   r0   = output pointer
 *       r1   = first factor pointer
 *       r2   = second factor pointer
 *       r3   = addend pointer
 *       [sp] = element count (fifth argument; nothing is pushed here)
 * Out:  out[i] = in1[i] * in2[i] + add[i]
 *
 * 8 elements per iteration; the loop exits only when the count reaches
 * exactly zero in steps of 8.
 */
function ff_vector_fmul_add_neon, export=1
        ldr             r12, [sp]
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q8-q9},  [r2,:128]!
        vld1.32         {q2-q3},  [r3,:128]!
        vmul.f32        q10, q0,  q8
        vmul.f32        q11, q1,  q9
1:      vadd.f32        q12, q2,  q10
        vadd.f32        q13, q3,  q11
        pld             [r1, #16]
        pld             [r2, #16]
        pld             [r3, #16]
        subs            r12, r12, #8
        beq             2f                      @ last group: store without preloading
        vld1.32         {q0},     [r1,:128]!
        vld1.32         {q8},     [r2,:128]!
        vmul.f32        q10, q0,  q8
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q9},     [r2,:128]!
        vmul.f32        q11, q1,  q9
        vld1.32         {q2-q3},  [r3,:128]!
        vst1.32         {q12-q13},[r0,:128]!
        b               1b
2:      vst1.32         {q12-q13},[r0,:128]!
        bx              lr
endfunc

/*
 * Element-wise product of one float vector with another taken in reverse.
 *
 * In:   r0 = output pointer
 *       r1 = first input pointer  (read forwards)
 *       r2 = second input pointer (read backwards, starting at its last 8 floats)
 *       r3 = element count
 * Out:  out[i] = in1[i] * in2[count - 1 - i]
 *
 * 8 elements per iteration; the loop exits only when the count reaches
 * exactly zero in steps of 8.
 */
function ff_vector_fmul_reverse_neon, export=1
        add             r2,  r2,  r3,  lsl #2
        sub             r2,  r2,  #32           @ r2 = in2 + 4*count - 32 : last 8 floats
        mov             r12, #-32               @ backward stride for r2
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q2-q3},  [r2,:128], r12
1:      pld             [r1, #32]
        vrev64.32       q3,  q3                 @ pair swap; d7/d6 cross use completes the reversal
        vmul.f32        d16, d0,  d7
        vmul.f32        d17, d1,  d6
        pld             [r2, #-32]
        vrev64.32       q2,  q2
        vmul.f32        d18, d2,  d5
        vmul.f32        d19, d3,  d4
        subs            r3,  r3,  #8
        beq             2f                      @ last group: store without preloading
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q2-q3},  [r2,:128], r12
        vst1.32         {q8-q9},  [r0,:128]!
        b               1b
2:      vst1.32         {q8-q9},  [r0,:128]!
        bx              lr
endfunc

/*
 * In-place butterfly (sum and difference) of two float vectors.
 *
 * In:   r0 = pointer to vector a (overwritten with a + b)
 *       r1 = pointer to vector b (overwritten with a - b)
 *       r2 = element count
 *
 * 4 elements per iteration; the body runs at least once and repeats while
 * the remaining count is greater than zero.
 */
function ff_butterflies_float_neon, export=1
1:      vld1.32         {q0},[r0,:128]
        vld1.32         {q1},[r1,:128]
        vsub.f32        q2,  q0,  q1
        vadd.f32        q1,  q0,  q1
        vst1.32         {q2},[r1,:128]!
        vst1.32         {q1},[r0,:128]!
        subs            r2,  r2,  #4
        bgt             1b
        bx              lr
endfunc

/*
 * Dot product of two float vectors.
 *
 * In:   r0 = first input pointer
 *       r1 = second input pointer
 *       r2 = element count
 * Out:  sum of in1[i] * in2[i] in d0[0] (s0); NOVFP builds also copy it to r0.
 *
 * Four partial sums are kept in q2 and reduced after the loop, so the
 * summation order differs from a sequential scalar loop.
 * 4 elements per iteration; the body runs at least once and repeats while
 * the remaining count is greater than zero.
 */
function ff_scalarproduct_float_neon, export=1
        vmov.f32        q2,  #0.0
1:      vld1.32         {q0},[r0,:128]!
        vld1.32         {q1},[r1,:128]!
        vmla.f32        q2,  q0,  q1
        subs            r2,  r2,  #4
        bgt             1b
        vadd.f32        d0,  d4,  d5            @ fold 4 partial sums to 2
        vpadd.f32       d0,  d0,  d0            @ fold 2 partial sums to 1 (both lanes)
NOVFP   vmov.32         r0,  d0[0]
        bx              lr
endfunc