/*
 * Copyright (c) 2013 RISC OS Open Ltd <bavison@riscosopen.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "libavutil/arm/asm.S"

/**
 * ARM VFP optimised int32 to float conversion.
 * Assume len is a multiple of 8, destination buffer is at least 4 bytes aligned
 * (16 bytes alignment is best for BCM2835), little-endian.
 *
 * Each group of 8 source integers is converted to float and scaled by one
 * multiplier, i.e. dst[i] = (float)src[i] * mul[i / 8].
 *
 * Register usage on entry: a1 = c (never read, reused as scratch), a2 = dst,
 * a3 = src, a4 = mul, len is the first stack argument.
 *
 * For len >= 24 the work is software-pipelined over three register banks
 * (s8-s15, s16-s23, s24-s31, with their multipliers in s1, s2, s3): while one
 * group is being loaded, the previous one is converted and scaled and the one
 * before that is stored. The loop body handles three groups per pass, so the
 * entry point is chosen from (len / 8) % 3.
 *
 * Both loops are bottom-tested, so len must be non-zero; len == 0 is not
 * handled.
 *
 * The caller's FPSCR is saved and restored around the short-vector code.
 * With the short-vector length set to 8, each vmul.f32 below scales a whole
 * 8-register bank by a scalar held in s0-s7, while the conversions are issued
 * one register at a time.
 */
@ void ff_int32_to_float_fmul_array8_vfp(FmtConvertContext *c, float *dst, const int32_t *src, const float *mul, int len)
function ff_int32_to_float_fmul_array8_vfp, export=1
        push    {lr}
        ldr     a1, [sp, #4]               @ a1 = len (stack argument, just above the saved lr)
        subs    lr, a1, #3*8               @ lr = len - 24
        bcc     50f                        @ too short to pipeline
        @ Now need to find (len / 8) % 3. The approximation
        @ x / 24 = (x * 0xAB) >> 12
        @ is good for x < 4096, which is true for both AC3 and DCA.
        @ It is applied to lr = len - 24, which has the same (x / 8) % 3 as len.
        mov     a1, #0xAB
        ldr     ip, =0x03070000            @ RunFast mode, short vectors of length 8, stride 1
        mul     a1, lr, a1                 @ a1 = (len - 24) * 0xAB
        vpush   {s16-s31}                  @ the pipelined path uses s16-s31; preserve them
        mov     a1, a1, lsr #12            @ a1 ~= (len - 24) / 24
        add     a1, a1, a1, lsl #1         @ a1 *= 3
        rsb     a1, a1, lr, lsr #3         @ a1 = (len - 24) / 8 - a1 = (len / 8) % 3
        cmp     a1, #1                     @ flags pick the entry point; consumed by beq/blo below
        fmrx    a1, FPSCR                  @ a1 = caller's FPSCR, restored before returning
        fmxr    FPSCR, ip
        beq     11f                        @ remainder 1
        blo     10f                        @ remainder 0
        @ Array is (2 + multiple of 3) x 8 floats long
        @ drop through...
        @ Prologue: load two groups, convert and scale the first.
        vldmia  a3!, {s16-s23}
        vldmia  a4!, {s2,s3}
        vldmia  a3!, {s24-s31}
        vcvt.f32.s32 s16, s16
        vcvt.f32.s32 s17, s17
        vcvt.f32.s32 s18, s18
        vcvt.f32.s32 s19, s19
        vcvt.f32.s32 s20, s20
        vcvt.f32.s32 s21, s21
        vcvt.f32.s32 s22, s22
        vcvt.f32.s32 s23, s23
        vmul.f32 s16, s16, s2
        @ drop through...
3:
        @ Load s8-s15, convert and scale s24-s31, store s16-s23.
        vldmia  a3!, {s8-s15}
        vldmia  a4!, {s1}
        vcvt.f32.s32 s24, s24
        vcvt.f32.s32 s25, s25
        vcvt.f32.s32 s26, s26
        vcvt.f32.s32 s27, s27
        vcvt.f32.s32 s28, s28
        vcvt.f32.s32 s29, s29
        vcvt.f32.s32 s30, s30
        vcvt.f32.s32 s31, s31
        vmul.f32 s24, s24, s3
        vstmia  a2!, {s16-s19}
        vstmia  a2!, {s20-s23}
2:
        @ Load s16-s23, convert and scale s8-s15, store s24-s31.
        vldmia  a3!, {s16-s23}
        vldmia  a4!, {s2}
        vcvt.f32.s32 s8, s8
        vcvt.f32.s32 s9, s9
        vcvt.f32.s32 s10, s10
        vcvt.f32.s32 s11, s11
        vcvt.f32.s32 s12, s12
        vcvt.f32.s32 s13, s13
        vcvt.f32.s32 s14, s14
        vcvt.f32.s32 s15, s15
        vmul.f32 s8, s8, s1
        vstmia  a2!, {s24-s27}
        vstmia  a2!, {s28-s31}
1:
        @ Load s24-s31, convert and scale s16-s23, store s8-s15.
        vldmia  a3!, {s24-s31}
        vldmia  a4!, {s3}
        vcvt.f32.s32 s16, s16
        vcvt.f32.s32 s17, s17
        vcvt.f32.s32 s18, s18
        vcvt.f32.s32 s19, s19
        vcvt.f32.s32 s20, s20
        vcvt.f32.s32 s21, s21
        vcvt.f32.s32 s22, s22
        vcvt.f32.s32 s23, s23
        vmul.f32 s16, s16, s2
        vstmia  a2!, {s8-s11}
        vstmia  a2!, {s12-s15}

        subs    lr, lr, #8*3               @ three more groups consumed
        bpl     3b

        @ Epilogue: s16-s23 is scaled but not yet stored, s24-s31 is loaded
        @ but not yet converted. Finish both and write them out.
        vcvt.f32.s32 s24, s24
        vcvt.f32.s32 s25, s25
        vcvt.f32.s32 s26, s26
        vcvt.f32.s32 s27, s27
        vcvt.f32.s32 s28, s28
        vcvt.f32.s32 s29, s29
        vcvt.f32.s32 s30, s30
        vcvt.f32.s32 s31, s31
        vmul.f32 s24, s24, s3
        vstmia  a2!, {s16-s19}
        vstmia  a2!, {s20-s23}
        vstmia  a2!, {s24-s27}
        vstmia  a2!, {s28-s31}

        fmxr    FPSCR, a1                  @ restore the caller's FPSCR
        vpop    {s16-s31}
        pop     {pc}

10:     @ Array is (multiple of 3) x 8 floats long
        @ Prologue for entering the loop at 1: two groups loaded,
        @ the first one converted and scaled.
        vldmia  a3!, {s8-s15}
        vldmia  a4!, {s1,s2}
        vldmia  a3!, {s16-s23}
        vcvt.f32.s32 s8, s8
        vcvt.f32.s32 s9, s9
        vcvt.f32.s32 s10, s10
        vcvt.f32.s32 s11, s11
        vcvt.f32.s32 s12, s12
        vcvt.f32.s32 s13, s13
        vcvt.f32.s32 s14, s14
        vcvt.f32.s32 s15, s15
        vmul.f32 s8, s8, s1
        b       1b

11:     @ Array is (1 + multiple of 3) x 8 floats long
        @ Prologue for entering the loop at 2: two groups loaded,
        @ the first one converted and scaled.
        vldmia  a3!, {s24-s31}
        vldmia  a4!, {s3}
        vldmia  a3!, {s8-s15}
        vldmia  a4!, {s1}
        vcvt.f32.s32 s24, s24
        vcvt.f32.s32 s25, s25
        vcvt.f32.s32 s26, s26
        vcvt.f32.s32 s27, s27
        vcvt.f32.s32 s28, s28
        vcvt.f32.s32 s29, s29
        vcvt.f32.s32 s30, s30
        vcvt.f32.s32 s31, s31
        vmul.f32 s24, s24, s3
        b       2b

50:
        @ Short path (len < 24): a1 still holds len. Only s0 and s8-s15 are
        @ touched here, so s16-s31 are not saved on this path.
        ldr     lr, =0x03070000            @ RunFast mode, short vectors of length 8, stride 1
        fmrx    ip, FPSCR                  @ ip = caller's FPSCR, restored before returning
        fmxr    FPSCR, lr
51:
        @ One group of 8 per pass: load, convert, scale, store.
        vldmia  a3!, {s8-s15}
        vldmia  a4!, {s0}
        vcvt.f32.s32 s8, s8
        vcvt.f32.s32 s9, s9
        vcvt.f32.s32 s10, s10
        vcvt.f32.s32 s11, s11
        vcvt.f32.s32 s12, s12
        vcvt.f32.s32 s13, s13
        vcvt.f32.s32 s14, s14
        vcvt.f32.s32 s15, s15
        vmul.f32 s8, s8, s0
        subs    a1, a1, #8                 @ sets the flags tested by bne below
        vstmia  a2!, {s8-s11}
        vstmia  a2!, {s12-s15}
        bne     51b

        fmxr    FPSCR, ip                  @ restore the caller's FPSCR
        pop     {pc}
endfunc

/**
 * ARM VFP optimised int32 to float conversion.
 * Assume len is a multiple of 8, destination buffer is at least 4 bytes aligned
 * (16 bytes alignment is best for BCM2835), little-endian.
 * TODO: could be further optimised by unrolling and interleaving, as above
 *
 * Every element is converted to float and scaled by the same multiplier,
 * i.e. dst[i] = (float)src[i] * mul, 8 elements per loop pass.
 *
 * Register usage on entry: a1 = dst, a2 = src. The location of mul and len
 * depends on which of the VFP/NOVFP-prefixed lines is assembled (the prefixes
 * come from asm.S; presumably they select hard-float versus soft-float
 * argument passing - confirm there): on VFP lines mul is already in s0 and
 * len is in a3, on NOVFP lines mul arrives in a3 and len in a4.
 *
 * The loop is bottom-tested, so len must be non-zero; len == 0 is not handled.
 * The caller's FPSCR is saved and restored around the short-vector code.
 */
@ void ff_int32_to_float_fmul_scalar_vfp(float *dst, const int32_t *src, float mul, int len)
function ff_int32_to_float_fmul_scalar_vfp, export=1
VFP     tmp     .req    a4
VFP     len     .req    a3
NOVFP   tmp     .req    a3
NOVFP   len     .req    a4
NOVFP   vmov    s0, a3                     @ move mul into s0; a3 is then free to serve as tmp
        ldr     tmp, =0x03070000           @ RunFast mode, short vectors of length 8, stride 1
        fmrx    ip, FPSCR                  @ ip = caller's FPSCR, restored before returning
        fmxr    FPSCR, tmp
1:
        @ One group of 8 per pass: load, convert, scale, store.
        vldmia  a2!, {s8-s15}
        vcvt.f32.s32 s8, s8
        vcvt.f32.s32 s9, s9
        vcvt.f32.s32 s10, s10
        vcvt.f32.s32 s11, s11
        vcvt.f32.s32 s12, s12
        vcvt.f32.s32 s13, s13
        vcvt.f32.s32 s14, s14
        vcvt.f32.s32 s15, s15
        vmul.f32 s8, s8, s0
        subs    len, len, #8               @ sets the flags tested by bne below
        vstmia  a1!, {s8-s11}
        vstmia  a1!, {s12-s15}
        bne     1b

        fmxr    FPSCR, ip                  @ restore the caller's FPSCR
        bx      lr
endfunc
        .unreq  tmp
        .unreq  len