/*
 * Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net>
 *
 * This file is part of FFmpeg
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "asm.S"

/**
 * ARM VFP element-wise multiply: dst[i] = src0[i] * src1[i].
 * Assume that len is a positive number and is multiple of 8
 *
 * Register use: r0 = dst, r1 = src0, r2 = src1, r3 = remaining count,
 * r12 = working copy of FPSCR.
 *
 * The loop is software pipelined. Each pass finishes a group of 8 floats
 * held in s0-s15 and, when at least 16 elements were still pending on entry
 * to the pass ("ge" after the subs), also processes a second group of 8 in
 * s16-s31. The preload of the following group is guarded by "gt" so that
 * nothing is read past the end of the inputs.
 */
@ void ff_vector_fmul_vfp(float *dst, const float *src0, const float *src1, int len)
function ff_vector_fmul_vfp, export=1
        vpush           {d8-d15}             @ s16-s31 are used as scratch below
        fmrx            r12, fpscr
        orr             r12, r12, #(3 << 16) /* set vector size to 4 */
        fmxr            fpscr, r12

        @ Prologue: load the first 8 elements of both inputs and start
        @ multiplying the first four of them.
        vldmia          r1!, {s0-s3}
        vldmia          r2!, {s8-s11}
        vldmia          r1!, {s4-s7}
        vldmia          r2!, {s12-s15}
        vmul.f32        s8,  s0,  s8
1:
        subs            r3,  r3,  #16        @ flags stay valid until the bgt below
        vmul.f32        s12, s4,  s12
        itttt           ge
        vldmiage        r1!, {s16-s19}
        vldmiage        r2!, {s24-s27}
        vldmiage        r1!, {s20-s23}
        vldmiage        r2!, {s28-s31}
        it              ge
        vmulge.f32      s24, s16, s24
        vstmia          r0!, {s8-s11}
        vstmia          r0!, {s12-s15}
        it              ge
        vmulge.f32      s28, s20, s28
        itttt           gt
        vldmiagt        r1!, {s0-s3}
        vldmiagt        r2!, {s8-s11}
        vldmiagt        r1!, {s4-s7}
        vldmiagt        r2!, {s12-s15}
        ittt            ge
        vmulge.f32      s8,  s0,  s8
        vstmiage        r0!, {s24-s27}
        vstmiage        r0!, {s28-s31}
        bgt             1b

        bic             r12, r12, #(7 << 16) /* set vector size back to 1 */
        fmxr            fpscr, r12
        vpop            {d8-d15}
        bx              lr
endfunc

/**
 * ARM VFP implementation of 'vector_fmul_window_c' function
 * Assume that len is a positive non-zero number
 *
 * src0 and the first half of win are walked upwards, src1 and the second
 * half of win are walked downwards. With s0 = src0[i], s1 = src1[j],
 * wi = win[i] and wj = win[j] (i ascending, j descending) the code computes
 *     dst[i] = s0 * wj - s1 * wi      (stored upwards through DST0)
 *     dst[j] = s0 * wi + s1 * wj      (stored downwards through DST1)
 *
 * The low three bits of len are handled first with scalar arithmetic
 * (1, 2 and 4 elements); the remaining multiple of 8 is handled by a
 * software-pipelined loop using short vectors of length 4.
 */
@ void ff_vector_fmul_window_vfp(float *dst, const float *src0,
@                                const float *src1, const float *win, int len)
function ff_vector_fmul_window_vfp, export=1
DST0    .req    a1
SRC0    .req    a2
SRC1    .req    a3
WIN0    .req    a4
LEN     .req    v1
DST1    .req    v2
WIN1    .req    v3
OLDFPSCR .req   ip

        push    {v1-v3,lr}
        ldr     LEN, [sp, #4*4+0]           @ fifth argument, located above the 4 words just pushed
        vpush   {s16-s31}
        fmrx    OLDFPSCR, FPSCR             @ restored at label 7
        add     DST1, DST0, LEN, lsl #3     @ DST1 = dst + 2 * len floats
        add     SRC1, SRC1, LEN, lsl #2     @ SRC1 = src1 + len floats
        add     WIN1, WIN0, LEN, lsl #3     @ WIN1 = win + 2 * len floats

        tst     LEN, #7
        beq     4f                          @ common case: len is a multiple of 8

        ldr     lr, =0x03000000             @ RunFast mode, scalar mode
        fmxr    FPSCR, lr

        @ Tail, 1 element (bit 0 of len).
        tst     LEN, #1
        beq     1f
        vldmdb  WIN1!, {s0}
        vldmia  SRC0!, {s8}
        vldmia  WIN0!, {s16}
        vmul.f  s24, s0, s8
        vldmdb  SRC1!, {s20}
        vmul.f  s8, s16, s8
        vmls.f  s24, s16, s20
        vmla.f  s8, s0, s20
        vstmia  DST0!, {s24}
        vstmdb  DST1!, {s8}
1:
        @ Tail, 2 elements (bit 1 of len). Values walked downwards are moved
        @ one register at a time so that register order matches the upward
        @ walk of the other operands.
        tst     LEN, #2
        beq     2f
        vldmdb  WIN1!, {s0}
        vldmdb  WIN1!, {s1}
        vldmia  SRC0!, {s8-s9}
        vldmia  WIN0!, {s16-s17}
        vmul.f  s24, s0, s8
        vmul.f  s25, s1, s9
        vldmdb  SRC1!, {s20}
        vldmdb  SRC1!, {s21}
        vmul.f  s8, s16, s8
        vmul.f  s9, s17, s9
        vmls.f  s24, s16, s20
        vmls.f  s25, s17, s21
        vmla.f  s8, s0, s20
        vmla.f  s9, s1, s21
        vstmia  DST0!, {s24-s25}
        vstmdb  DST1!, {s8}
        vstmdb  DST1!, {s9}
2:
        @ Tail, 4 elements (bit 2 of len).
        tst     LEN, #4
        beq     3f
        vldmdb  WIN1!, {s0}
        vldmdb  WIN1!, {s1}
        vldmdb  WIN1!, {s2}
        vldmdb  WIN1!, {s3}
        vldmia  SRC0!, {s8-s11}
        vldmia  WIN0!, {s16-s19}
        vmul.f  s24, s0, s8
        vmul.f  s25, s1, s9
        vmul.f  s26, s2, s10
        vmul.f  s27, s3, s11
        vldmdb  SRC1!, {s20}
        vldmdb  SRC1!, {s21}
        vldmdb  SRC1!, {s22}
        vldmdb  SRC1!, {s23}
        vmul.f  s8, s16, s8
        vmul.f  s9, s17, s9
        vmul.f  s10, s18, s10
        vmul.f  s11, s19, s11
        vmls.f  s24, s16, s20
        vmls.f  s25, s17, s21
        vmls.f  s26, s18, s22
        vmls.f  s27, s19, s23
        vmla.f  s8, s0, s20
        vmla.f  s9, s1, s21
        vmla.f  s10, s2, s22
        vmla.f  s11, s3, s23
        vstmia  DST0!, {s24-s27}
        vstmdb  DST1!, {s8}
        vstmdb  DST1!, {s9}
        vstmdb  DST1!, {s10}
        vstmdb  DST1!, {s11}
3:
        bics    LEN, LEN, #7                @ drop the tail bits already handled
        beq     7f                          @ nothing left for the vector loop
4:
        ldr     lr, =0x03030000             @ RunFast mode, short vectors of length 4, stride 1
        fmxr    FPSCR, lr

        @ Prologue: compute the first group of 4 and preload the second.
        vldmdb  WIN1!, {s0}
        vldmdb  WIN1!, {s1}
        vldmdb  WIN1!, {s2}
        vldmdb  WIN1!, {s3}
        vldmia  SRC0!, {s8-s11}
        vldmia  WIN0!, {s16-s19}
        vmul.f  s24, s0, s8                     @ vector * vector
        vldmdb  SRC1!, {s20}
        vldmdb  SRC1!, {s21}
        vldmdb  SRC1!, {s22}
        vldmdb  SRC1!, {s23}
        vmul.f  s8, s16, s8                     @ vector * vector
        vmls.f  s24, s16, s20                   @ vector * vector
        vldmdb  WIN1!, {s4}
        vldmdb  WIN1!, {s5}
        vldmdb  WIN1!, {s6}
        vldmdb  WIN1!, {s7}
        vldmia  SRC0!, {s12-s13}
        vmla.f  s8, s0, s20                     @ vector * vector
        vldmia  SRC0!, {s14-s15}
        subs    LEN, LEN, #8
        beq     6f
        @ Main loop: two groups of 4 per pass, stores of one group are
        @ interleaved with loads and arithmetic of the next.
5:      vldmia  WIN0!, {s20-s23}
        vmul.f  s28, s4, s12                    @ vector * vector
        vstmia  DST0!, {s24-s25}
        vldmdb  SRC1!, {s16}
        vldmdb  SRC1!, {s17}
        vldmdb  SRC1!, {s18}
        vldmdb  SRC1!, {s19}
        vmul.f  s12, s20, s12                   @ vector * vector
        vstmia  DST0!, {s26-s27}
        vstmdb  DST1!, {s8}
        vstmdb  DST1!, {s9}
        vstmdb  DST1!, {s10}
        vstmdb  DST1!, {s11}
        vmls.f  s28, s20, s16                   @ vector * vector
        vldmdb  WIN1!, {s0}
        vldmdb  WIN1!, {s1}
        vldmdb  WIN1!, {s2}
        vldmdb  WIN1!, {s3}
        vldmia  SRC0!, {s8-s9}
        vmla.f  s12, s4, s16                    @ vector * vector
        vldmia  SRC0!, {s10-s11}
        subs    LEN, LEN, #8                    @ flags consumed by the bne at the end of the pass
        vldmia  WIN0!, {s16-s19}
        vmul.f  s24, s0, s8                     @ vector * vector
        vstmia  DST0!, {s28-s29}
        vldmdb  SRC1!, {s20}
        vldmdb  SRC1!, {s21}
        vldmdb  SRC1!, {s22}
        vldmdb  SRC1!, {s23}
        vmul.f  s8, s16, s8                     @ vector * vector
        vstmia  DST0!, {s30-s31}
        vstmdb  DST1!, {s12}
        vstmdb  DST1!, {s13}
        vstmdb  DST1!, {s14}
        vstmdb  DST1!, {s15}
        vmls.f  s24, s16, s20                   @ vector * vector
        vldmdb  WIN1!, {s4}
        vldmdb  WIN1!, {s5}
        vldmdb  WIN1!, {s6}
        vldmdb  WIN1!, {s7}
        vldmia  SRC0!, {s12-s13}
        vmla.f  s8, s0, s20                     @ vector * vector
        vldmia  SRC0!, {s14-s15}
        bne     5b
        @ Epilogue: finish and store the last two groups, no further preload.
6:      vldmia  WIN0!, {s20-s23}
        vmul.f  s28, s4, s12                    @ vector * vector
        vstmia  DST0!, {s24-s25}
        vldmdb  SRC1!, {s16}
        vldmdb  SRC1!, {s17}
        vldmdb  SRC1!, {s18}
        vldmdb  SRC1!, {s19}
        vmul.f  s12, s20, s12                   @ vector * vector
        vstmia  DST0!, {s26-s27}
        vstmdb  DST1!, {s8}
        vstmdb  DST1!, {s9}
        vstmdb  DST1!, {s10}
        vstmdb  DST1!, {s11}
        vmls.f  s28, s20, s16                   @ vector * vector
        vmla.f  s12, s4, s16                    @ vector * vector
        vstmia  DST0!, {s28-s31}
        vstmdb  DST1!, {s12}
        vstmdb  DST1!, {s13}
        vstmdb  DST1!, {s14}
        vstmdb  DST1!, {s15}
7:
        fmxr    FPSCR, OLDFPSCR
        vpop    {s16-s31}
        pop     {v1-v3,pc}

        .unreq  DST0
        .unreq  SRC0
        .unreq  SRC1
        .unreq  WIN0
        .unreq  LEN
        .unreq  OLDFPSCR
        .unreq  DST1
        .unreq  WIN1
endfunc

/**
 * ARM VFP optimized implementation of 'vector_fmul_reverse_c' function.
 * Assume that len is a positive number and is multiple of 8
 *
 * Computes dst[i] = src0[i] * src1[len - 1 - i]: src0 (r1) is read upwards,
 * src1 (r2) is first advanced to its end and then read downwards, and each
 * product pairs the registers in opposite order (s8 with s3, s9 with s2, ...).
 * FPSCR is not modified here; every multiply is written out per element.
 *
 * The loop has the same pipelined shape as ff_vector_fmul_vfp: a group of 8
 * in s0-s15 is always completed, a second group of 8 in s16-s31 is handled
 * under "ge", and the preload of the following group is guarded by "gt".
 */
@ void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
@                                 const float *src1, int len)
function ff_vector_fmul_reverse_vfp, export=1
        vpush           {d8-d15}             @ s16-s31 are used as scratch below
        add             r2,  r2,  r3, lsl #2 @ r2 = src1 + len floats
        vldmdb          r2!, {s0-s3}
        vldmia          r1!, {s8-s11}
        vldmdb          r2!, {s4-s7}
        vldmia          r1!, {s12-s15}
        vmul.f32        s8,  s3,  s8
        vmul.f32        s9,  s2,  s9
        vmul.f32        s10, s1,  s10
        vmul.f32        s11, s0,  s11
1:
        subs            r3,  r3,  #16        @ flags stay valid until the bgt below
        it              ge
        vldmdbge        r2!, {s16-s19}
        vmul.f32        s12, s7,  s12
        it              ge
        vldmiage        r1!, {s24-s27}
        vmul.f32        s13, s6,  s13
        it              ge
        vldmdbge        r2!, {s20-s23}
        vmul.f32        s14, s5,  s14
        it              ge
        vldmiage        r1!, {s28-s31}
        vmul.f32        s15, s4,  s15
        it              ge
        vmulge.f32      s24, s19, s24
        it              gt
        vldmdbgt        r2!, {s0-s3}
        it              ge
        vmulge.f32      s25, s18, s25
        vstmia          r0!, {s8-s13}
        it              ge
        vmulge.f32      s26, s17, s26
        it              gt
        vldmiagt        r1!, {s8-s11}
        itt             ge
        vmulge.f32      s27, s16, s27
        vmulge.f32      s28, s23, s28
        it              gt
        vldmdbgt        r2!, {s4-s7}
        it              ge
        vmulge.f32      s29, s22, s29
        vstmia          r0!, {s14-s15}
        ittt            ge
        vmulge.f32      s30, s21, s30
        vmulge.f32      s31, s20, s31
        vmulge.f32      s8,  s3,  s8
        it              gt
        vldmiagt        r1!, {s12-s15}
        itttt           ge
        vmulge.f32      s9,  s2,  s9
        vmulge.f32      s10, s1,  s10
        vstmiage        r0!, {s24-s27}
        vmulge.f32      s11, s0,  s11
        it              ge
        vstmiage        r0!, {s28-s31}
        bgt             1b

        vpop            {d8-d15}
        bx              lr
endfunc

/**
 * ARM VFP implementation of 'butterflies_float_c' function
 * Assume that len is a positive non-zero number
 *
 * For every index i the pair (v1[i], v2[i]) is replaced in place by
 * (v1[i] + v2[i], v1[i] - v2[i]), both results being computed from the
 * values loaded before either store.
 *
 * The low three bits of len are handled first with scalar arithmetic
 * (1, 2 and 4 elements); the remaining multiple of 8 is handled by a
 * software-pipelined loop using short vectors of length 4. Loads
 * post-increment BASE1/BASE2, so results are written back with negative
 * offsets relative to the already advanced pointers.
 */
@ void ff_butterflies_float_vfp(float *restrict v1, float *restrict v2, int len)
function ff_butterflies_float_vfp, export=1
BASE1   .req    a1
BASE2   .req    a2
LEN     .req    a3
OLDFPSCR .req   a4

        vpush   {s16-s31}
        fmrx    OLDFPSCR, FPSCR             @ restored at label 7

        tst     LEN, #7
        beq     4f                          @ common case: len is a multiple of 8

        ldr     ip, =0x03000000             @ RunFast mode, scalar mode
        fmxr    FPSCR, ip

        @ Tail, 1 element (bit 0 of len).
        tst     LEN, #1
        beq     1f
        vldmia  BASE1!, {s0}
        vldmia  BASE2!, {s8}
        vadd.f  s16, s0, s8
        vsub.f  s24, s0, s8
        vstr    s16, [BASE1, #0-4*1]
        vstr    s24, [BASE2, #0-4*1]
1:
        @ Tail, 2 elements (bit 1 of len).
        tst     LEN, #2
        beq     2f
        vldmia  BASE1!, {s0-s1}
        vldmia  BASE2!, {s8-s9}
        vadd.f  s16, s0, s8
        vadd.f  s17, s1, s9
        vsub.f  s24, s0, s8
        vsub.f  s25, s1, s9
        vstr    d8, [BASE1, #0-8*1]    @ s16,s17
        vstr    d12, [BASE2, #0-8*1]   @ s24,s25
2:
        @ Tail, 4 elements (bit 2 of len).
        tst     LEN, #4
        beq     3f
        vldmia  BASE1!, {s0-s1}
        vldmia  BASE2!, {s8-s9}
        vldmia  BASE1!, {s2-s3}
        vldmia  BASE2!, {s10-s11}
        vadd.f  s16, s0, s8
        vadd.f  s17, s1, s9
        vsub.f  s24, s0, s8
        vsub.f  s25, s1, s9
        vadd.f  s18, s2, s10
        vadd.f  s19, s3, s11
        vsub.f  s26, s2, s10
        vsub.f  s27, s3, s11
        vstr    d8, [BASE1, #0-16*1]    @ s16,s17
        vstr    d12, [BASE2, #0-16*1]   @ s24,s25
        vstr    d9, [BASE1, #8-16*1]    @ s18,s19
        vstr    d13, [BASE2, #8-16*1]   @ s26,s27
3:
        bics    LEN, LEN, #7                @ drop the tail bits already handled
        beq     7f                          @ nothing left for the vector loop
4:
        ldr     ip, =0x03030000             @ RunFast mode, short vectors of length 4, stride 1
        fmxr    FPSCR, ip

        @ Prologue: load the first 8 elements of both arrays and start the
        @ arithmetic for them.
        vldmia  BASE1!, {s0-s1}
        vldmia  BASE2!, {s8-s9}
        vldmia  BASE1!, {s2-s3}
        vldmia  BASE2!, {s10-s11}
        vadd.f  s16, s0, s8
        vldmia  BASE1!, {s4-s5}
        vldmia  BASE2!, {s12-s13}
        vldmia  BASE1!, {s6-s7}
        vldmia  BASE2!, {s14-s15}
        vsub.f  s24, s0, s8
        vadd.f  s20, s4, s12
        subs    LEN, LEN, #8
        beq     6f
        @ Main loop: 8 elements per pass; results of the previous group are
        @ stored while the next group is loaded and computed.
5:      vldmia  BASE1!, {s0-s3}
        vldmia  BASE2!, {s8-s11}
        vsub.f  s28, s4, s12
        vstr    d8, [BASE1, #0-16*3]    @ s16,s17
        vstr    d9, [BASE1, #8-16*3]    @ s18,s19
        vstr    d12, [BASE2, #0-16*3]   @ s24,s25
        vstr    d13, [BASE2, #8-16*3]   @ s26,s27
        vadd.f  s16, s0, s8
        vldmia  BASE1!, {s4-s7}
        vldmia  BASE2!, {s12-s15}
        vsub.f  s24, s0, s8
        vstr    d10, [BASE1, #0-16*3]   @ s20,s21
        vstr    d11, [BASE1, #8-16*3]   @ s22,s23
        vstr    d14, [BASE2, #0-16*3]   @ s28,s29
        vstr    d15, [BASE2, #8-16*3]   @ s30,s31
        vadd.f  s20, s4, s12
        subs    LEN, LEN, #8
        bne     5b
        @ Epilogue: finish and store the last 8 elements, no further loads.
6:      vsub.f  s28, s4, s12
        vstr    d8, [BASE1, #0-16*2]    @ s16,s17
        vstr    d9, [BASE1, #8-16*2]    @ s18,s19
        vstr    d12, [BASE2, #0-16*2]   @ s24,s25
        vstr    d13, [BASE2, #8-16*2]   @ s26,s27
        vstr    d10, [BASE1, #0-16*1]   @ s20,s21
        vstr    d11, [BASE1, #8-16*1]   @ s22,s23
        vstr    d14, [BASE2, #0-16*1]   @ s28,s29
        vstr    d15, [BASE2, #8-16*1]   @ s30,s31
7:
        fmxr    FPSCR, OLDFPSCR
        vpop    {s16-s31}
        bx      lr

        .unreq  BASE1
        .unreq  BASE2
        .unreq  LEN
        .unreq  OLDFPSCR
endfunc