/*
 * AArch64 NEON optimised MDCT
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

/*
 * Conventions shared by the three functions in this file
 *
 *   x0 = context pointer. The code reads a 32-bit "mdct_bits" at offset 28,
 *        a "tcos" pointer at offset 32 and a "revtab" pointer at offset 8.
 *        NOTE(review): these offsets are hard-coded and must match the C
 *        struct layout -- confirm against the struct definition.
 *   x1 = output buffer
 *   x2 = input buffer
 *
 *   n  = 1 << mdct_bits. All lanes are 32-bit (.2s/.4s) floats.
 *   tcos is read with ld2, i.e. as interleaved (cos, sin) pairs.
 *   revtab is read 32 bits at a time and split into two 16-bit indices,
 *   each scaled by 8 bytes (one pair of floats) and added to x1.
 *
 * Every function builds the same 32-byte frame:
 *   [sp, #0]  x19, x20   (callee-saved, used to keep values live across bl)
 *   [sp, #16] x30        (signed/validated by the AARCH64_*_LINK_REGISTER
 *                         macros while sp holds the same value)
 */

/*
 * ff_imdct_half_neon(x0 = context, x1 = output, x2 = input)
 *
 * Phase 1: rotate input pairs by tcos and scatter them into the output
 *          buffer at revtab-selected positions (n/4 pairs).
 * Phase 2: bl ff_fft_calc_neon with x0/x1 unchanged.
 * Phase 3: rotate the FFT result in place, walking outwards from the
 *          middle of the buffer in both directions.
 *
 * Both loops exit only when their counter reaches exactly zero after a
 * decrement by 2, so n/4 and n/8 must be positive and even.
 */
function ff_imdct_half_neon, export=1
        sub             sp,  sp,  #32
        stp             x19, x20, [sp]
        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #16]
        mov             x12, #1
        ldr             w14, [x0, #28]              // mdct_bits
        ldr             x4,  [x0, #32]              // tcos
        ldr             x3,  [x0, #8]               // revtab
        lsl             x12, x12, x14               // n  = 1 << nbits
        lsr             x14, x12, #2                // n4 = n >> 2 (phase 1 counter)
        add             x7,  x2,  x12, lsl #1       // x7 = input + 2*n bytes
        mov             x12, #-16                   // x12 = descending stride
        sub             x7,  x7,  #16               // x7 = last 4 floats of that range

        // Prime the pipeline: first loads and first two products.
        ld2             {v16.2s,v17.2s}, [x7], x12  // tail, descending: v17 = elements 1,3 (v16 unused in phase 1)
        ld2             {v0.2s,v1.2s},   [x2], #16  // head, ascending:  v0  = elements 0,2 (v1 unused in phase 1)
        rev64           v17.2s, v17.2s              // swap lanes so the tail values pair with v0
        ld2             {v2.2s,v3.2s},   [x4], #16  // v2=c0,c1 v3=s0,s1
        fmul            v6.2s,  v17.2s, v2.2s       // tail*c
        fmul            v7.2s,  v0.2s,  v2.2s       // head*c
1:
        subs            x14, x14, #2                // two pairs per iteration; flags consumed by b.eq below
        ldr             w6,  [x3], #4               // two packed 16-bit revtab entries
        fmul            v4.2s,  v0.2s,  v3.2s       // head*s
        fmul            v5.2s,  v17.2s, v3.2s       // tail*s
        fsub            v4.2s,  v6.2s,  v4.2s       // tail*c - head*s
        fadd            v5.2s,  v5.2s,  v7.2s       // tail*s + head*c
        ubfm            x8,  x6,  #16, #31          // upper revtab entry
        ubfm            x6,  x6,  #0,  #15          // lower revtab entry
        add             x8,  x1,  x8,  lsl #3       // output + 8*upper
        add             x6,  x1,  x6,  lsl #3       // output + 8*lower
        b.eq            2f
        // Not the last iteration: fetch the next inputs before storing.
        ld2             {v16.2s,v17.2s}, [x7], x12
        ld2             {v0.2s,v1.2s},   [x2], #16
        rev64           v17.2s, v17.2s
        ld2             {v2.2s,v3.2s},   [x4], #16  // v2=c0,c1 v3=s0,s1
        fmul            v6.2s,  v17.2s, v2.2s
        fmul            v7.2s,  v0.2s,  v2.2s
        st2             {v4.s,v5.s}[0], [x6]        // lane 0 pair -> lower-entry slot
        st2             {v4.s,v5.s}[1], [x8]        // lane 1 pair -> upper-entry slot
        b               1b
2:
        st2             {v4.s,v5.s}[0], [x6]
        st2             {v4.s,v5.s}[1], [x8]

        mov             x19, x0                     // keep context across the call
        mov             x20, x1                     // keep output across the call
        bl              X(ff_fft_calc_neon)

        mov             x12, #1
        ldr             w14, [x19, #28]             // mdct_bits
        ldr             x4,  [x19, #32]             // tcos
        lsl             x12, x12, x14               // n  = 1 << nbits
        lsr             x14, x12, #3                // n8 = n >> 3 (phase 3 counter)

        add             x4,  x4,  x14, lsl #3       // tcos   + 8*n8, read ascending
        add             x6,  x20, x14, lsl #3       // output + 8*n8, read ascending
        sub             x1,  x4,  #16               // tcos read pointer, descending
        sub             x3,  x6,  #16               // data read pointer, descending

        mov             x7,  #-16                   // descending stride
        mov             x8,  x6                     // ascending store pointer
        mov             x0,  x3                     // descending store pointer

        // NOTE(review): v0/v20 vs v1/v21 are presumably the real and
        // imaginary parts of the FFT output -- confirm against the C code.
        ld2             {v0.2s,v1.2s},  [x3], x7    // lower half: v0 = elements 0,2  v1 = elements 1,3
        ld2             {v20.2s,v21.2s},[x6], #16   // upper half: v20 = elements 0,2 v21 = elements 1,3
        ld2             {v16.2s,v17.2s},[x1], x7    // v16=c1,c0 v17=s1,s0
3:
        subs            x14, x14, #2
        fmul            v7.2s,  v0.2s,  v17.2s
        ld2             {v18.2s,v19.2s},[x4], #16   // v18=c2,c3 v19=s2,s3
        fmul            v4.2s,  v1.2s,  v17.2s
        fmul            v6.2s,  v21.2s, v19.2s
        fmul            v5.2s,  v20.2s, v19.2s
        fmul            v22.2s, v1.2s,  v16.2s
        fmul            v23.2s, v21.2s, v18.2s
        fmul            v24.2s, v0.2s,  v16.2s
        fmul            v25.2s, v20.2s, v18.2s
        fadd            v7.2s,  v7.2s,  v22.2s      // v0*s  + v1*c   (lower half)
        fadd            v5.2s,  v5.2s,  v23.2s      // v20*s + v21*c  (upper half)
        fsub            v4.2s,  v4.2s,  v24.2s      // v1*s  - v0*c   (lower half)
        fsub            v6.2s,  v6.2s,  v25.2s      // v21*s - v20*c  (upper half)
        b.eq            4f
        ld2             {v0.2s,v1.2s},  [x3], x7
        ld2             {v20.2s,v21.2s},[x6], #16
        ld2             {v16.2s,v17.2s},[x1], x7    // v16=c1,c0 v17=s1,s0
        rev64           v5.2s,  v5.2s               // v5 and v7 are stored to the
        rev64           v7.2s,  v7.2s               // opposite half, hence lane-swapped
        st2             {v4.2s,v5.2s},  [x0], x7
        st2             {v6.2s,v7.2s},  [x8], #16
        b               3b
4:
        rev64           v5.2s,  v5.2s
        rev64           v7.2s,  v7.2s
        st2             {v4.2s,v5.2s},  [x0]
        st2             {v6.2s,v7.2s},  [x8]

        ldp             x19, x20, [sp]
        ldr             x30, [sp, #16]
        AARCH64_VALIDATE_LINK_REGISTER
        add             sp,  sp,  #32

        ret
endfunc

/*
 * ff_imdct_calc_neon(x0 = context, x1 = output, x2 = input)
 *
 * Calls ff_imdct_half_neon with the output pointer advanced by n bytes
 * (x0 and x2 are passed through untouched), then expands that result to
 * fill the 4*n byte output buffer:
 *   - the 16-byte groups just below output + 2*n are reversed, negated and
 *     written ascending from the start of the buffer;
 *   - the 16-byte groups from output + 2*n upwards are reversed and written
 *     descending from the end of the buffer.
 *
 * x19 holds n and doubles as the byte counter (16 bytes per iteration,
 * loop continues while it is > 0); x20 holds the original output pointer.
 */
function ff_imdct_calc_neon, export=1
        sub             sp,  sp,  #32
        stp             x19, x20, [sp]
        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #16]
        ldr             w3,  [x0, #28]              // mdct_bits
        mov             x19, #1
        mov             x20, x1                     // remember start of output
        lsl             x19, x19, x3                // n = 1 << nbits
        add             x1,  x1,  x19               // half transform writes at output + n bytes

        bl              X(ff_imdct_half_neon)

        add             x0,  x20, x19, lsl #2       // output + 4*n
        add             x1,  x20, x19, lsl #1       // output + 2*n: ascending read pointer
        sub             x0,  x0,  #8                // last 8 bytes: descending store pointer
        sub             x2,  x1,  #16               // descending read pointer
        mov             x3,  #-16
        mov             x6,  #-8
1:
        ld1             {v0.4s}, [x2], x3           // 4 floats below the midpoint
        prfum           pldl1keep, [x0, #-16]
        rev64           v0.4s,  v0.4s               // swap floats within each 64-bit half
        ld1             {v2.2s,v3.2s}, [x1], #16    // 4 floats above the midpoint
        fneg            v4.4s,  v0.4s
        prfum           pldl1keep, [x2, #-16]
        rev64           v2.2s,  v2.2s
        rev64           v3.2s,  v3.2s
        ext             v4.16b, v4.16b, v4.16b, #8  // swap the 64-bit halves: full 4-float reversal
        st1             {v2.2s}, [x0], x6           // mirrored copy, written downwards
        st1             {v3.2s}, [x0], x6
        st1             {v4.4s}, [x20], #16         // negated mirror, written upwards from output
        subs            x19, x19, #16
        b.gt            1b

        ldp             x19, x20, [sp]
        ldr             x30, [sp, #16]
        AARCH64_VALIDATE_LINK_REGISTER
        add             sp,  sp,  #32

        ret
endfunc


/*
 * ff_mdct_calc_neon(x0 = context, x1 = output, x2 = input)
 *
 * Phase 1: fold four input regions (two read ascending, two descending)
 *          into R/I values, rotate them by tcos read from both ends, and
 *          scatter the results to revtab-selected output slots. revtab is
 *          consumed from two positions per iteration: [x3, #4]! walking up
 *          and [x3, x13] whose effective address walks down.
 * Phase 2: bl ff_fft_calc_neon with x0/x1 unchanged.
 * Phase 3: rotate the FFT result in place, outwards from the middle.
 *
 * Loop exits test for a counter of exactly zero (x13 in phase 1, stepping
 * by 8 from n/2; x14 in phase 3, stepping by 2 from n/8), so n must be a
 * positive multiple of 16.
 */
function ff_mdct_calc_neon, export=1
        sub             sp,  sp,  #32
        stp             x19, x20, [sp]
        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #16]

        mov             x12, #1
        ldr             w14, [x0, #28]              // mdct_bits
        ldr             x4,  [x0, #32]              // tcos
        ldr             x3,  [x0, #8]               // revtab
        lsl             x14, x12, x14               // n  = 1 << nbits
        add             x7,  x2,  x14               // in4u
        sub             x9,  x7,  #16               // in4d
        add             x2,  x7,  x14, lsl #1       // in3u
        add             x8,  x9,  x14, lsl #1       // in3d
        add             x5,  x4,  x14, lsl #1       // tcos + 2*n bytes ...
        sub             x5,  x5,  #16               // ... minus 16: descending tcos pointer
        sub             x3,  x3,  #4                // bias for the pre-indexed load in the loop
        mov             x12, #-16                   // descending stride
        lsr             x13, x14, #1                // n/2: byte offset of the second revtab stream

        ld2             {v16.2s,v17.2s}, [x9], x12  // in0u0,in0u1 in4d1,in4d0
        ld2             {v18.2s,v19.2s}, [x8], x12  // in2u0,in2u1 in3d1,in3d0
        ld2             {v0.2s, v1.2s},  [x7], #16  // in4u0,in4u1 in2d1,in2d0
        rev64           v17.2s, v17.2s              // in4d0,in4d1 in3d0,in3d1
        rev64           v19.2s, v19.2s              // in4d0,in4d1 in3d0,in3d1
        ld2             {v2.2s, v3.2s},  [x2], #16  // in3u0,in3u1 in1d1,in1d0
        fsub            v0.2s,  v17.2s, v0.2s       // in4d-in4u      I
        ld2             {v20.2s,v21.2s}, [x4], #16  // c0,c1 s0,s1
        rev64           v1.2s,  v1.2s               // in2d0,in2d1 in1d0,in1d1
        rev64           v3.2s,  v3.2s               // in2d0,in2d1 in1d0,in1d1
        ld2             {v30.2s,v31.2s}, [x5], x12  // c2,c3 s2,s3
        fadd            v2.2s,  v2.2s,  v19.2s      // in3u+in3d     -R
        fsub            v16.2s, v16.2s, v1.2s       // in0u-in2d      R
        fadd            v18.2s, v18.2s, v3.2s       // in2u+in1d     -I
1:
        fmul            v7.2s,  v0.2s,  v21.2s      //  I*s
        ldr             w10, [x3, x13]              // revtab pair for v24/v25
        fmul            v6.2s,  v2.2s,  v20.2s      // -R*c
        ldr             w6,  [x3, #4]!              // revtab pair for v6/v7
        fmul            v4.2s,  v2.2s,  v21.2s      // -R*s
        fmul            v5.2s,  v0.2s,  v20.2s      //  I*c
        fmul            v24.2s, v16.2s, v30.2s      //  R*c
        fmul            v25.2s, v18.2s, v31.2s      // -I*s
        fmul            v22.2s, v16.2s, v31.2s      //  R*s
        fmul            v23.2s, v18.2s, v30.2s      //  I*c
        // NOTE(review): the flags of this first subs are overwritten by the
        // next one, and x14 is not read again before it is reloaded after
        // the FFT call; only the x13 result decides the b.eq below.
        subs            x14, x14, #16
        subs            x13, x13, #8
        fsub            v6.2s,  v6.2s,  v7.2s       // -R*c-I*s
        fadd            v7.2s,  v4.2s,  v5.2s       // -R*s+I*c
        fsub            v24.2s, v25.2s, v24.2s      // I*s-R*c
        fadd            v25.2s, v22.2s, v23.2s      // R*s-I*c
        b.eq            2f
        mov             x12, #-16                   // x12 doubles as an address temp below; restore the stride
        ld2             {v16.2s,v17.2s}, [x9], x12  // in0u0,in0u1 in4d1,in4d0
        ld2             {v18.2s,v19.2s}, [x8], x12  // in2u0,in2u1 in3d1,in3d0
        fneg            v7.2s,  v7.2s               //  R*s-I*c
        ld2             {v0.2s, v1.2s},  [x7], #16  // in4u0,in4u1 in2d1,in2d0
        rev64           v17.2s, v17.2s              // in4d0,in4d1 in3d0,in3d1
        rev64           v19.2s, v19.2s              // in4d0,in4d1 in3d0,in3d1
        ld2             {v2.2s, v3.2s},  [x2], #16  // in3u0,in3u1 in1d1,in1d0
        fsub            v0.2s,  v17.2s, v0.2s       // in4d-in4u      I
        ld2             {v20.2s,v21.2s}, [x4], #16  // c0,c1 s0,s1
        rev64           v1.2s,  v1.2s               // in2d0,in2d1 in1d0,in1d1
        rev64           v3.2s,  v3.2s               // in2d0,in2d1 in1d0,in1d1
        ld2             {v30.2s,v31.2s}, [x5], x12  // c2,c3 s2,s3
        fadd            v2.2s,  v2.2s,  v19.2s      // in3u+in3d     -R
        fsub            v16.2s, v16.2s, v1.2s       // in0u-in2d      R
        fadd            v18.2s, v18.2s, v3.2s       // in2u+in1d     -I
        ubfm            x12, x6,  #16, #31          // upper 16-bit revtab entry
        ubfm            x6,  x6,  #0,  #15          // lower 16-bit revtab entry
        add             x12, x1,  x12, lsl #3
        add             x6,  x1,  x6,  lsl #3
        st2             {v6.s,v7.s}[0],   [x6]
        st2             {v6.s,v7.s}[1],   [x12]
        ubfm            x6,  x10, #16, #31
        ubfm            x10, x10, #0,  #15
        add             x6,  x1,  x6,  lsl #3
        add             x10, x1,  x10, lsl #3
        st2             {v24.s,v25.s}[0], [x10]
        st2             {v24.s,v25.s}[1], [x6]
        b               1b
2:
        fneg            v7.2s,  v7.2s               //  R*s-I*c
        ubfm            x12, x6,  #16, #31
        ubfm            x6,  x6,  #0,  #15
        add             x12, x1,  x12, lsl #3
        add             x6,  x1,  x6,  lsl #3
        st2             {v6.s,v7.s}[0],   [x6]
        st2             {v6.s,v7.s}[1],   [x12]
        ubfm            x6,  x10, #16, #31
        ubfm            x10, x10, #0,  #15
        add             x6,  x1,  x6,  lsl #3
        add             x10, x1,  x10, lsl #3
        st2             {v24.s,v25.s}[0], [x10]
        st2             {v24.s,v25.s}[1], [x6]

        mov             x19, x0                     // keep context across the call
        mov             x20, x1                     // keep output across the call
        bl              X(ff_fft_calc_neon)

        mov             x12, #1
        ldr             w14, [x19, #28]             // mdct_bits
        ldr             x4,  [x19, #32]             // tcos
        lsl             x12, x12, x14               // n  = 1 << nbits
        lsr             x14, x12, #3                // n8 = n >> 3

        add             x4,  x4,  x14, lsl #3       // tcos   + 8*n8, read ascending
        add             x6,  x20, x14, lsl #3       // output + 8*n8, read ascending
        sub             x1,  x4,  #16               // tcos read pointer, descending
        sub             x3,  x6,  #16               // data read pointer, descending

        mov             x7,  #-16                   // descending stride
        mov             x8,  x6                     // ascending store pointer
        mov             x0,  x3                     // descending store pointer

        ld2             {v0.2s,v1.2s},   [x3], x7   // v0 =r1,r0 v1 =i1,i0
        ld2             {v20.2s,v21.2s}, [x6], #16  // v20=r2,r3 v21=i2,i3
        ld2             {v16.2s,v17.2s}, [x1], x7   // c1,c0 s1,s0
3:
        subs            x14, x14, #2
        fmul            v7.2s,  v0.2s,  v17.2s      // r1*s1,r0*s0
        ld2             {v18.2s,v19.2s}, [x4], #16  // c2,c3 s2,s3
        fmul            v4.2s,  v1.2s,  v17.2s      // i1*s1,i0*s0
        fmul            v6.2s,  v21.2s, v19.2s      // i2*s2,i3*s3
        fmul            v5.2s,  v20.2s, v19.2s      // r2*s2,r3*s3
        fmul            v24.2s, v0.2s,  v16.2s      // r1*c1,r0*c0
        fmul            v25.2s, v20.2s, v18.2s      // r2*c2,r3*c3
        fmul            v22.2s, v21.2s, v18.2s      // i2*c2,i3*c3
        fmul            v23.2s, v1.2s,  v16.2s      // i1*c1,i0*c0
        fadd            v4.2s,  v4.2s,  v24.2s      // i1*s1+r1*c1,i0*s0+r0*c0
        fadd            v6.2s,  v6.2s,  v25.2s      // i2*s2+r2*c2,i3*s3+r3*c3
        fsub            v5.2s,  v22.2s, v5.2s       // i2*c2-r2*s2,i3*c3-r3*s3
        fsub            v7.2s,  v23.2s, v7.2s       // i1*c1-r1*s1,i0*c0-r0*s0
        fneg            v4.2s,  v4.2s
        fneg            v6.2s,  v6.2s
        b.eq            4f
        ld2             {v0.2s, v1.2s},  [x3], x7
        ld2             {v20.2s,v21.2s}, [x6], #16
        ld2             {v16.2s,v17.2s}, [x1], x7   // c1,c0 s1,s0
        rev64           v5.2s,  v5.2s               // v5 and v7 are stored to the
        rev64           v7.2s,  v7.2s               // opposite half, hence lane-swapped
        st2             {v4.2s,v5.2s},  [x0], x7
        st2             {v6.2s,v7.2s},  [x8], #16
        b               3b
4:
        rev64           v5.2s,  v5.2s
        rev64           v7.2s,  v7.2s
        st2             {v4.2s,v5.2s},  [x0]
        st2             {v6.2s,v7.2s},  [x8]

        ldp             x19, x20, [sp]
        ldr             x30, [sp, #16]
        AARCH64_VALIDATE_LINK_REGISTER
        add             sp,  sp,  #32

        ret
endfunc