/*
 * AArch64 NEON optimised MDCT
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

/*
 * ff_imdct_half_neon
 *
 * Registers on entry, as used by the code below:
 *   x0  context pointer; fields read: revtab (+8), mdct_bits (+28), tcos (+32)
 *   x1  output buffer, addressed as an array of 8-byte float pairs
 *   x2  input buffer of 32-bit floats
 *
 * Three stages:
 *   1. Pre-rotation: input floats read forward from x2 and backward from
 *      x2 + 2*n bytes are combined with cos/sin pairs from tcos and scattered
 *      into the output at the pair indices taken from revtab.
 *   2. ff_fft_calc_neon is called with the original x0 and x1.
 *   3. Post-rotation, in place, walking outward from the middle of the
 *      output (x1 + n bytes) in both directions.
 *
 * Stack: 32-byte frame holding x19, x20 and x30.
 * Scratch registers written here: x0, x1, x3, x4, x6-x8, x12, x14, v0-v7,
 * v16-v25 (v8-v15 are not touched).
 */
function ff_imdct_half_neon, export=1
        sub             sp,  sp,  #32
        stp             x19, x20, [sp]
        AARCH64_SIGN_LINK_REGISTER              // macro from asm.S; sp here equals sp at the VALIDATE below
        str             x30, [sp, #16]
        mov             x12, #1
        ldr             w14, [x0, #28]          // mdct_bits
        ldr             x4,  [x0, #32]          // tcos
        ldr             x3,  [x0, #8]           // revtab
        lsl             x12, x12, x14           // n  = 1 << nbits
        lsr             x14, x12, #2            // n4 = n >> 2 (loop counter, 2 per iteration)
        add             x7,  x2,  x12,  lsl #1  // x7 = input + 2*n bytes
        mov             x12, #-16               // stride for the backward-walking pointer
        sub             x7,  x7,  #16           // start at the last group of four floats

        // Prime the software pipeline: the loads and the two cos products for
        // the first iteration are issued before entering the loop.
        ld2             {v16.2s,v17.2s}, [x7], x12 // v16=x,n1 v17=x,n0 (only v17 is used)
        ld2             {v0.2s,v1.2s},   [x2], #16 // v0 =m0,x v1 =m1,x (only v0 is used)
        rev64           v17.2s, v17.2s             // swap the two lanes of the backward read
        ld2             {v2.2s,v3.2s},   [x4], #16 // v2=c0,c1 v3=s0,s1
        fmul            v6.2s,  v17.2s, v2.2s
        fmul            v7.2s,  v0.2s,  v2.2s
1:
        subs            x14, x14, #2            // sets the flags tested by b.eq below
        ldr             w6,  [x3], #4           // two 16-bit revtab entries in one word
        fmul            v4.2s,  v0.2s,  v3.2s
        fmul            v5.2s,  v17.2s, v3.2s
        fsub            v4.2s,  v6.2s,  v4.2s   // v17*c - v0*s
        fadd            v5.2s,  v5.2s,  v7.2s   // v17*s + v0*c
        ubfm            x8,  x6,  #16, #31      // upper halfword: index for lane 1
        ubfm            x6,  x6,  #0,  #15      // lower halfword: index for lane 0
        add             x8,  x1,  x8,  lsl #3   // 8 bytes per output pair
        add             x6,  x1,  x6,  lsl #3
        b.eq            2f                      // counter reached zero: store and leave
        // Load and start the next iteration before storing this one's results.
        ld2             {v16.2s,v17.2s}, [x7], x12
        ld2             {v0.2s,v1.2s},   [x2], #16
        rev64           v17.2s, v17.2s
        ld2             {v2.2s,v3.2s},   [x4], #16    // v2=c0,c1 v3=s0,s1
        fmul            v6.2s,  v17.2s, v2.2s
        fmul            v7.2s,  v0.2s,  v2.2s
        st2             {v4.s,v5.s}[0], [x6]    // pair (v4[0], v5[0])
        st2             {v4.s,v5.s}[1], [x8]    // pair (v4[1], v5[1])
        b               1b
2:
        st2             {v4.s,v5.s}[0], [x6]
        st2             {v4.s,v5.s}[1], [x8]

        // Keep context and output pointers in callee-saved registers across
        // the call; x0 and x1 are passed on unchanged.
        mov             x19, x0
        mov             x20, x1
        bl              X(ff_fft_calc_neon)

        mov             x12, #1
        ldr             w14, [x19, #28]          // mdct_bits
        ldr             x4,  [x19, #32]          // tcos
        lsl             x12, x12, x14            // n  = 1 << nbits
        lsr             x14, x12, #3             // n8 = n >> 3 (loop counter, 2 per iteration)

        add             x4,  x4,  x14, lsl #3    // tcos   + n bytes: upward read pointer
        add             x6,  x20, x14, lsl #3    // output + n bytes: upward read pointer
        sub             x1,  x4,  #16            // tcos downward read pointer
        sub             x3,  x6,  #16            // output downward read pointer

        mov             x7,  #-16                // stride for the downward pointers
        mov             x8,  x6                  // upward store pointer
        mov             x0,  x3                  // downward store pointer

        ld2             {v0.2s,v1.2s},  [x3], x7 // v0 =i1,r1 v1 =i0,r0
        ld2             {v20.2s,v21.2s},[x6], #16 // v20=i2,r2 v21=i3,r3
        ld2             {v16.2s,v17.2s},[x1], x7 // v16=c1,c0 v17=s1,s0
3:
        subs            x14, x14, #2             // sets the flags tested by b.eq below
        fmul            v7.2s,  v0.2s,  v17.2s
        ld2             {v18.2s,v19.2s},[x4], #16    // v18=c2,c3 v19=s2,s3
        fmul            v4.2s,  v1.2s,  v17.2s
        fmul            v6.2s,  v21.2s, v19.2s
        fmul            v5.2s,  v20.2s, v19.2s
        fmul            v22.2s, v1.2s,  v16.2s
        fmul            v23.2s, v21.2s, v18.2s
        fmul            v24.2s, v0.2s,  v16.2s
        fmul            v25.2s, v20.2s, v18.2s
        fadd            v7.2s,  v7.2s,  v22.2s   // v0*v17  + v1*v16
        fadd            v5.2s,  v5.2s,  v23.2s   // v20*v19 + v21*v18
        fsub            v4.2s,  v4.2s,  v24.2s   // v1*v17  - v0*v16
        fsub            v6.2s,  v6.2s,  v25.2s   // v21*v19 - v20*v18
        b.eq            4f
        ld2             {v0.2s,v1.2s},  [x3], x7
        ld2             {v20.2s,v21.2s},[x6], #16
        ld2             {v16.2s,v17.2s},[x1], x7 // v16=c1,c0 v17=s1,s0
        // v5 (from the upper half) goes to the lower half and v7 (from the
        // lower half) to the upper half, each with its two lanes swapped.
        rev64           v5.2s,  v5.2s
        rev64           v7.2s,  v7.2s
        st2             {v4.2s,v5.2s},  [x0], x7
        st2             {v6.2s,v7.2s},  [x8], #16
        b               3b
4:
        rev64           v5.2s,  v5.2s
        rev64           v7.2s,  v7.2s
        st2             {v4.2s,v5.2s},  [x0]
        st2             {v6.2s,v7.2s},  [x8]

        ldp             x19, x20, [sp]
        ldr             x30, [sp, #16]
        AARCH64_VALIDATE_LINK_REGISTER          // counterpart of the SIGN macro in the prologue
        add             sp,  sp,  #32

        ret
endfunc

/*
 * ff_imdct_calc_neon
 *
 * Registers on entry, as used by the code below:
 *   x0  context pointer; mdct_bits is read from +28
 *   x1  output buffer of 32-bit floats
 *   x2  input buffer, passed through untouched to ff_imdct_half_neon
 *
 * With n = 1 << mdct_bits, ff_imdct_half_neon is called with its output
 * pointer set to output + n bytes (n/4 floats in). The loop then fills the
 * rest of the buffer from that middle section:
 *   - floats read downward from output + 2*n bytes are reversed, negated and
 *     written upward from the start of the output;
 *   - floats read upward from output + 2*n bytes are reversed and written
 *     downward from output + 4*n bytes.
 *
 * Stack: 32-byte frame holding x19, x20 and x30.
 * Scratch registers written here: x0-x3, x6, v0, v2-v4, plus whatever the
 * callee clobbers.
 */
function ff_imdct_calc_neon, export=1
        sub             sp,  sp,  #32
        stp             x19, x20, [sp]
        AARCH64_SIGN_LINK_REGISTER              // macro from asm.S; sp here equals sp at the VALIDATE below
        str             x30, [sp, #16]
        ldr             w3,  [x0, #28]          // mdct_bits
        mov             x19, #1
        mov             x20, x1                 // keep the output base across the call
        lsl             x19, x19, x3            // n = 1 << mdct_bits
        add             x1,  x1,  x19           // half-transform output starts n bytes in

        bl              X(ff_imdct_half_neon)

        add             x0,  x20, x19,  lsl #2  // output + 4*n bytes
        add             x1,  x20, x19,  lsl #1  // output + 2*n bytes: upward read pointer
        sub             x0,  x0,  #8            // downward store pointer (two floats per store)
        sub             x2,  x1,  #16           // downward read pointer (four floats per load)
        mov             x3,  #-16
        mov             x6,  #-8
1:
        ld1             {v0.4s}, [x2], x3       // four floats just below the midpoint
        prfum           pldl1keep, [x0, #-16]   // prefetch hint ahead of the downward store pointer
        rev64           v0.4s, v0.4s            // swap floats within each 64-bit half
        ld1             {v2.2s,v3.2s}, [x1], #16 // four floats just above the midpoint
        fneg            v4.4s,  v0.4s
        prfum           pldl1keep, [x2, #-16]   // prefetch hint ahead of the downward read pointer
        rev64           v2.2s, v2.2s
        rev64           v3.2s, v3.2s
        ext             v4.16b, v4.16b, v4.16b, #8 // swap the 64-bit halves: the four floats are now fully reversed
        st1             {v2.2s}, [x0], x6
        st1             {v3.2s}, [x0], x6
        st1             {v4.4s}, [x20], #16     // x20 walks upward from the start of the output
        subs            x19, x19,  #16          // 16 bytes of each quarter handled per iteration
        b.gt            1b

        ldp             x19, x20, [sp]
        ldr             x30, [sp, #16]
        AARCH64_VALIDATE_LINK_REGISTER          // counterpart of the SIGN macro in the prologue
        add             sp,  sp,  #32

        ret
endfunc


/*
 * ff_mdct_calc_neon
 *
 * Registers on entry, as used by the code below:
 *   x0  context pointer; fields read: revtab (+8), mdct_bits (+28), tcos (+32)
 *   x1  output buffer, addressed as an array of 8-byte float pairs
 *   x2  input buffer of 32-bit floats
 *
 * Three stages:
 *   1. Pre-rotation: four input streams (two walking up, two walking down)
 *      are folded into R/I terms, rotated by cos/sin pairs read from both
 *      ends of tcos, and scattered into the output at the pair indices taken
 *      from both ends of revtab.
 *   2. ff_fft_calc_neon is called with the original x0 and x1.
 *   3. Post-rotation, in place, walking outward from the middle of the
 *      output (x1 + n bytes) in both directions.
 *
 * Stack: 32-byte frame holding x19, x20 and x30.
 * Scratch registers written here: x0-x10, x12-x14, v0-v7, v16-v25, v30, v31
 * (v8-v15 are not touched).
 */
function ff_mdct_calc_neon, export=1
        sub             sp,  sp,  #32
        stp             x19, x20, [sp]
        AARCH64_SIGN_LINK_REGISTER              // macro from asm.S; sp here equals sp at the VALIDATE below
        str             x30, [sp, #16]

        mov             x12, #1
        ldr             w14, [x0, #28]          // mdct_bits
        ldr             x4,  [x0, #32]          // tcos
        ldr             x3,  [x0, #8]           // revtab
        lsl             x14, x12, x14           // n  = 1 << nbits
        add             x7,  x2,  x14           // in4u
        sub             x9,  x7,  #16           // in4d
        add             x2,  x7,  x14, lsl #1   // in3u
        add             x8,  x9,  x14, lsl #1   // in3d
        add             x5,  x4,  x14, lsl #1   // tcos + 2*n bytes
        sub             x5,  x5,  #16           // downward tcos read pointer
        sub             x3,  x3,  #4            // compensates for the pre-indexed load in the loop
        mov             x12, #-16               // stride for the downward pointers
        lsr             x13, x14, #1            // n/2: byte offset of the second revtab read, and loop counter

        // Prime the software pipeline with the first iteration's loads and folds.
        ld2             {v16.2s,v17.2s}, [x9], x12  // in0u0,in0u1 in4d1,in4d0
        ld2             {v18.2s,v19.2s}, [x8], x12  // in2u0,in2u1 in3d1,in3d0
        ld2             {v0.2s, v1.2s},  [x7], #16  // in4u0,in4u1 in2d1,in2d0
        rev64           v17.2s, v17.2s              // in4d0,in4d1
        rev64           v19.2s, v19.2s              // in3d0,in3d1
        ld2             {v2.2s, v3.2s},  [x2], #16  // in3u0,in3u1 in1d1,in1d0
        fsub            v0.2s,  v17.2s, v0.2s       // in4d-in4u      I
        ld2             {v20.2s,v21.2s}, [x4], #16  // c0,c1 s0,s1
        rev64           v1.2s,  v1.2s               // in2d0,in2d1
        rev64           v3.2s,  v3.2s               // in1d0,in1d1
        ld2             {v30.2s,v31.2s}, [x5], x12  // c2,c3 s2,s3
        fadd            v2.2s,  v2.2s,  v19.2s      // in3u+in3d     -R
        fsub            v16.2s, v16.2s, v1.2s       // in0u-in2d      R
        fadd            v18.2s, v18.2s, v3.2s       // in2u+in1d     -I
1:
        fmul            v7.2s,  v0.2s,  v21.2s      //  I*s
        ldr             w10, [x3, x13]              // revtab word for the v24/v25 results (moves down 4 bytes per iteration)
        fmul            v6.2s,  v2.2s,  v20.2s      // -R*c
        ldr             w6,  [x3, #4]!              // revtab word for the v6/v7 results (moves up 4 bytes per iteration)
        fmul            v4.2s,  v2.2s,  v21.2s      // -R*s
        fmul            v5.2s,  v0.2s,  v20.2s      //  I*c
        fmul            v24.2s, v16.2s, v30.2s      //  R*c
        fmul            v25.2s, v18.2s, v31.2s      // -I*s
        fmul            v22.2s, v16.2s, v31.2s      //  R*s
        fmul            v23.2s, v18.2s, v30.2s      // -I*c
        subs            x14, x14, #16               // flags from this subs are overwritten by the next one
        subs            x13, x13, #8                // sets the flags tested by b.eq below
        fsub            v6.2s,  v6.2s,  v7.2s       // -R*c-I*s
        fadd            v7.2s,  v4.2s,  v5.2s       // -R*s+I*c
        fsub            v24.2s, v25.2s, v24.2s      // -I*s-R*c
        fadd            v25.2s, v22.2s, v23.2s      // R*s-I*c
        b.eq            2f
        mov             x12, #-16                   // x12 is reused as an address temporary further down
        ld2             {v16.2s,v17.2s}, [x9], x12  // in0u0,in0u1 in4d1,in4d0
        ld2             {v18.2s,v19.2s}, [x8], x12  // in2u0,in2u1 in3d1,in3d0
        fneg            v7.2s,  v7.2s               //  R*s-I*c
        ld2             {v0.2s, v1.2s},  [x7], #16  // in4u0,in4u1 in2d1,in2d0
        rev64           v17.2s, v17.2s              // in4d0,in4d1
        rev64           v19.2s, v19.2s              // in3d0,in3d1
        ld2             {v2.2s, v3.2s},  [x2], #16  // in3u0,in3u1 in1d1,in1d0
        fsub            v0.2s,  v17.2s, v0.2s       // in4d-in4u      I
        ld2             {v20.2s,v21.2s}, [x4], #16  // c0,c1 s0,s1
        rev64           v1.2s,  v1.2s               // in2d0,in2d1
        rev64           v3.2s,  v3.2s               // in1d0,in1d1
        ld2             {v30.2s,v31.2s}, [x5], x12  // c2,c3 s2,s3
        fadd            v2.2s,  v2.2s,  v19.2s      // in3u+in3d     -R
        fsub            v16.2s, v16.2s, v1.2s       // in0u-in2d      R
        fadd            v18.2s, v18.2s, v3.2s       // in2u+in1d     -I
        ubfm            x12, x6,  #16, #31          // upper halfword: index for lane 1
        ubfm            x6,  x6,  #0,  #15          // lower halfword: index for lane 0
        add             x12, x1,  x12, lsl #3       // 8 bytes per output pair
        add             x6,  x1,  x6,  lsl #3
        st2             {v6.s,v7.s}[0],   [x6]
        st2             {v6.s,v7.s}[1],   [x12]
        ubfm            x6,  x10, #16, #31          // upper halfword: index for lane 1
        ubfm            x10, x10, #0,  #15          // lower halfword: index for lane 0
        add             x6 , x1,  x6,  lsl #3
        add             x10, x1,  x10, lsl #3
        st2             {v24.s,v25.s}[0], [x10]
        st2             {v24.s,v25.s}[1], [x6]
        b               1b
2:
        // Final iteration: same stores as in the loop body, without loading more input.
        fneg            v7.2s,  v7.2s           //  R*s-I*c
        ubfm            x12, x6,  #16, #31
        ubfm            x6,  x6,  #0,  #15
        add             x12, x1,  x12, lsl #3
        add             x6,  x1,  x6,  lsl #3
        st2             {v6.s,v7.s}[0],   [x6]
        st2             {v6.s,v7.s}[1],   [x12]
        ubfm            x6,  x10, #16, #31
        ubfm            x10, x10, #0,  #15
        add             x6 , x1,  x6,  lsl #3
        add             x10, x1,  x10, lsl #3
        st2             {v24.s,v25.s}[0], [x10]
        st2             {v24.s,v25.s}[1], [x6]

        // Keep context and output pointers in callee-saved registers across
        // the call; x0 and x1 are passed on unchanged.
        mov             x19, x0
        mov             x20, x1
        bl              X(ff_fft_calc_neon)

        mov             x12, #1
        ldr             w14, [x19, #28]         // mdct_bits
        ldr             x4,  [x19, #32]         // tcos
        lsl             x12, x12, x14           // n  = 1 << nbits
        lsr             x14, x12, #3            // n8 = n >> 3 (loop counter, 2 per iteration)

        add             x4,  x4,  x14, lsl #3   // tcos   + n bytes: upward read pointer
        add             x6,  x20, x14, lsl #3   // output + n bytes: upward read pointer
        sub             x1,  x4,  #16           // tcos downward read pointer
        sub             x3,  x6,  #16           // output downward read pointer

        mov             x7,  #-16               // stride for the downward pointers
        mov             x8,  x6                 // upward store pointer
        mov             x0,  x3                 // downward store pointer

        ld2             {v0.2s,v1.2s},   [x3], x7   // v0 =r1,i1 v1 =r0,i0
        ld2             {v20.2s,v21.2s}, [x6], #16  // v20=r2,i2 v21=r3,i3
        ld2             {v16.2s,v17.2s}, [x1], x7   // c1,c0 s1,s0
3:
        subs            x14, x14, #2                // sets the flags tested by b.eq below
        fmul            v7.2s,  v0.2s,  v17.2s      // r1*s1,r0*s0
        ld2             {v18.2s,v19.2s}, [x4], #16  // c2,c3 s2,s3
        fmul            v4.2s,  v1.2s,  v17.2s      // i1*s1,i0*s0
        fmul            v6.2s,  v21.2s, v19.2s      // i2*s2,i3*s3
        fmul            v5.2s,  v20.2s, v19.2s      // r2*s2,r3*s3
        fmul            v24.2s, v0.2s,  v16.2s      // r1*c1,r0*c0
        fmul            v25.2s, v20.2s, v18.2s      // r2*c2,r3*c3
        fmul            v22.2s, v21.2s, v18.2s      // i2*c2,i3*c3
        fmul            v23.2s, v1.2s,  v16.2s      // i1*c1,i0*c0
        fadd            v4.2s,  v4.2s,  v24.2s      // i1*s1+r1*c1,i0*s0+r0*c0
        fadd            v6.2s,  v6.2s,  v25.2s      // i2*s2+r2*c2,i3*s3+r3*c3
        fsub            v5.2s,  v22.2s, v5.2s       // i2*c2-r2*s2,i3*c3-r3*s3
        fsub            v7.2s,  v23.2s, v7.2s       // i1*c1-r1*s1,i0*c0-r0*s0
        fneg            v4.2s,  v4.2s
        fneg            v6.2s,  v6.2s
        b.eq            4f
        ld2             {v0.2s, v1.2s},  [x3], x7
        ld2             {v20.2s,v21.2s}, [x6], #16
        ld2             {v16.2s,v17.2s}, [x1], x7   // c1,c0 s1,s0
        // v5 (from the upper half) goes to the lower half and v7 (from the
        // lower half) to the upper half, each with its two lanes swapped.
        rev64           v5.2s,  v5.2s
        rev64           v7.2s,  v7.2s
        st2             {v4.2s,v5.2s},  [x0], x7
        st2             {v6.2s,v7.2s},  [x8], #16
        b               3b
4:
        rev64           v5.2s,  v5.2s
        rev64           v7.2s,  v7.2s
        st2             {v4.2s,v5.2s},  [x0]
        st2             {v6.2s,v7.2s},  [x8]

        ldp             x19, x20, [sp]
        ldr             x30, [sp, #16]
        AARCH64_VALIDATE_LINK_REGISTER          // counterpart of the SIGN macro in the prologue
        add             sp,  sp,  #32

        ret
endfunc