1From c580b97cbfea388ac393f617c4d960021bf11322 Mon Sep 17 00:00:00 2001
2From: chengfeng27 <chengfeng27@huawei.com>
3Date: Mon, 12 Aug 2024 11:42:12 +0800
4Subject: [PATCH] fix arm64/fp16 assembly not protecting the stack in the
5 multi-thread switch case
6
7---
8 .../kernel/nnacl/assembly/arm64/AdderFp32.S   | 10 ++---
9 .../nnacl/assembly/arm64/BigMatmulFp32Opt.S   | 22 +++++-----
10 .../assembly/arm64/ConvDw3x3Fp32Stride1.S     | 12 ++---
11 .../assembly/arm64/ConvDw3x3Fp32Stride2.S     | 12 ++---
12 .../nnacl/assembly/arm64/ConvDw3x3Int8.S      | 34 +++++++-------
13 .../assembly/arm64/ConvDw3x3Int8Corner.S      | 19 ++++----
14 .../assembly/arm64/ConvDw3x3Int8Horizontal.S  | 25 +++++------
15 .../assembly/arm64/ConvDw3x3Int8Stride2.S     | 34 +++++++-------
16 .../assembly/arm64/ConvDw3x3Int8Vertical.S    | 19 ++++----
17 .../nnacl/assembly/arm64/ConvDw3x3Line.S      |  6 +--
18 .../nnacl/assembly/arm64/ConvDwFp32Center.S   | 30 ++++++-------
19 .../assembly/arm64/ConvDwFp32Indirect3x3.S    |  7 ++-
20 .../nnacl/assembly/arm64/ConvDwInt8Center.S   | 44 +++++++++----------
21 .../nnacl/assembly/arm64/ConvFp32Center.S     | 42 +++++++++---------
22 .../nnacl/assembly/arm64/ConvSW1x16Kernel.S   | 23 +++++-----
23 .../nnacl/assembly/arm64/ConvSW1x8Kernel.S    | 21 +++++----
24 .../nnacl/assembly/arm64/ConvSW2x16Kernel.S   | 21 +++++----
25 .../nnacl/assembly/arm64/ConvSW2x8Kernel.S    | 21 +++++----
26 .../nnacl/assembly/arm64/ConvSW3x16Kernel.S   | 23 +++++-----
27 .../nnacl/assembly/arm64/ConvSW3x8Kernel.S    | 21 +++++----
28 .../nnacl/assembly/arm64/ConvSW4x16Kernel.S   | 28 ++++++------
29 .../nnacl/assembly/arm64/ConvSW4x8Kernel.S    | 28 ++++++------
30 .../nnacl/assembly/arm64/ConvSW5x16Kernel.S   | 28 ++++++------
31 .../nnacl/assembly/arm64/ConvSW5x8Kernel.S    | 28 ++++++------
32 .../nnacl/assembly/arm64/DeconvDwFp32Center.S | 15 +++----
33 .../nnacl/assembly/arm64/DeconvDwInt8Center.S | 15 +++----
34 .../nnacl/assembly/arm64/MatVecMulFp32.S      | 24 +++++-----
35 .../nnacl/assembly/arm64/MatVecMulPackFp32.S  | 15 +++----
36 .../kernel/nnacl/assembly/arm64/MatmulFp32.S  | 14 +++---
37 .../nnacl/assembly/arm64/MatmulFp32Opt.S      | 16 +++----
38 .../nnacl/assembly/arm64/MatmulFp32OptRow12.S | 14 +++---
39 .../nnacl/assembly/arm64/MatmulFp32OptRow4.S  | 16 +++----
40 .../nnacl/assembly/arm64/MatmulFp32OptRow8.S  | 14 +++---
41 .../kernel/nnacl/assembly/arm64/MatmulInt8.S  | 38 ++++++++--------
42 .../nnacl/assembly/arm64/MatmulInt8Opt.S      | 44 +++++++++----------
43 .../nnacl/assembly/arm64/MatmulR4Int8.S       |  8 ++--
44 .../nnacl/assembly/arm64/MatmulWinogradFp32.S |  7 ++-
45 .../nnacl/assembly/arm64/PostFuncBiasReluC8.S |  6 +--
46 .../assembly/arm64/PostFuncInt8C4Neon64.S     | 15 +++----
47 .../kernel/nnacl/assembly/arm64/SPMM8x8Fp32.S |  8 ++--
48 .../nnacl/assembly/arm64/TiledC4MatmulFp32.S  |  6 +--
49 .../nnacl/assembly/arm64/WinogradTransLeft.S  |  3 +-
50 .../nnacl/assembly/arm64/WinogradTransRight.S |  3 +-
51 .../nnacl/assembly/fp16/ConvDwFp16Center.S    | 30 ++++++-------
52 .../nnacl/assembly/fp16/DeconvDwFp16Center.S  | 15 +++----
53 .../nnacl/assembly/fp16/MatVecMulFp16.S       |  6 +--
54 .../nnacl/assembly/fp16/Matmul12X16Fp16.S     | 14 +++---
55 .../nnacl/assembly/fp16/MatmulBaseFp16Neon.S  | 14 +++---
56 .../kernel/nnacl/assembly/fp16/MatmulFp16.S   | 14 +++---
57 .../nnacl/assembly/fp16/MatmulFp16Opt.S       | 11 +++--
58 .../nnacl/assembly/fp16/MatmulFp16OptV2.S     | 20 ++++-----
59 .../nnacl/assembly/fp16/MatmulWinogradFp16.S  |  7 ++-
60 .../nnacl/assembly/fp16/TiledC4MatmulFp16.S   |  6 +--
61 .../nnacl/assembly/fp16/VecMatmulFp16.S       |  6 +--
62 .../assembly/fp16/WinogradTransLeftFp16.S     |  3 +-
63 .../assembly/fp16/WinogradTransRightFp16.S    |  3 +-
64 56 files changed, 483 insertions(+), 505 deletions(-)
65
66diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/AdderFp32.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/AdderFp32.S
67index 66136f42..9123d88c 100644
68--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/AdderFp32.S
69+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/AdderFp32.S
70@@ -34,11 +34,12 @@
71 
72 asm_function AdderFloatNeon64
73     sub sp, sp, #144
74-    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
75-    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
76-    stp x19, x20, [sp], #16
77+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
78+    add x9, sp, #64
79+    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
80+    stp x19, x20, [sp, #128]
81 
82-    ldr x8, [sp]
83+    ldr x8, [sp, #144]
84 
85     mov x20, #48 // sizeof(float) * 12
86     mul x17, x5, x20 // block stride of lhs/rhs: sizeof(float) * 12 * depth
87@@ -614,7 +615,6 @@ LoopColEnd:
88         subs x6, x6, #12
89         bgt LoopRowStart
90 
91-  sub sp, sp, #144
92   ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
93   ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
94   ldp x19, x20, [sp], #16
95diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/BigMatmulFp32Opt.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/BigMatmulFp32Opt.S
96index 498038ff..03898585 100644
97--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/BigMatmulFp32Opt.S
98+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/BigMatmulFp32Opt.S
99@@ -33,16 +33,17 @@
100 
101 asm_function BigMatmulFloatNeon64Opt
102     sub sp, sp, #224
103-    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
104-    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
105-    stp x19, x20, [sp], #16
106-    stp x21, x22, [sp], #16
107-    stp x23, x24, [sp], #16
108-    stp x25, x26, [sp], #16
109-    stp x27, x28, [sp], #16
110-    stp x29, x30, [sp], #16
111-
112-    ldr x8, [sp]
113+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
114+    add x9, sp, #64
115+    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
116+    stp x19, x20, [sp, #128]
117+    stp x21, x22, [sp, #144]
118+    stp x23, x24, [sp, #160]
119+    stp x25, x26, [sp, #176]
120+    stp x27, x28, [sp, #192]
121+    stp x29, x30, [sp, #208]
122+
123+    ldr x8, [sp, #224]
124     mov x20, #1
125     mov x22, #32
126     mov x23, #48
127@@ -2515,7 +2516,6 @@ Compute4x4Unit:
128         ret
129 
130 End:
131-  sub sp, sp, #224
132   ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
133   ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
134   ldp x19, x20, [sp], #16
135diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Fp32Stride1.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Fp32Stride1.S
136index f04d9082..b96efd64 100644
137--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Fp32Stride1.S
138+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Fp32Stride1.S
139@@ -36,12 +36,13 @@
140 
141 asm_function ConvDw3x3Stride1
142     sub sp, sp, #128
143-    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
144-    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
145+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
146+    add x9, sp, #64
147+    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
148 
149-    ldr w8, [sp]
150-    ldr w9, [sp, #8]
151-    ldr w10, [sp, #16]
152+    ldr w8, [sp, #128]
153+    ldr w9, [sp, #136]
154+    ldr w10, [sp, #144]
155 
156     mov w11, #4
157     mul w15, w4, w11   // col_size * 4
158@@ -203,7 +204,6 @@ WIDTH1_LEFT:
159         st1 {v21.4s}, [x0]
160 
161 End:
162-    sub sp, sp, #128
163     ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
164     ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
165     ret
166diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Fp32Stride2.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Fp32Stride2.S
167index 0dd075dd..7632d48e 100644
168--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Fp32Stride2.S
169+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Fp32Stride2.S
170@@ -36,12 +36,13 @@
171 
172 asm_function ConvDw3x3Stride2
173     sub sp, sp, #128
174-    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
175-    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
176+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
177+    add x9, sp, #64
178+    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
179 
180-    ldr w8, [sp]
181-    ldr w9, [sp, #8]
182-    ldr w10, [sp, #16]
183+    ldr w8, [sp, #128]
184+    ldr w9, [sp, #136]
185+    ldr w10, [sp, #144]
186 
187     mov w11, #4
188     mul w15, w4, w11   // col_size * 4
189@@ -205,7 +206,6 @@ WIDTH1_LEFT:
190         st1 {v24.4s}, [x0]
191 
192 End:
193-    sub sp, sp, #128
194     ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
195     ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
196     ret
197diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8.S
198index bfb9b8f6..5187d368 100644
199--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8.S
200+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8.S
201@@ -44,22 +44,23 @@
202 
203 asm_function ConvDw3x3Int8Neon64
204   sub sp, sp, #192
205-  st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
206-  st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
207-  stp x19, x20, [sp], #16
208-  stp x21, x22, [sp], #16
209-  stp x23, x24, [sp], #16
210-  stp x25, x26, [sp], #16
211-
212-  ldr x8, [sp]
213-  ldr x9, [sp, #8]
214-  ldr x10, [sp, #16]
215-  ldr x11, [sp, #24]
216-  ldr x12, [sp, #32]
217-  ldr x13, [sp, #40]
218-  ldr x14, [sp, #48]
219-  ldr x15, [sp, #56]
220-  ldr x23, [sp, #64]  // per_channel
221+  st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
222+  add x9, sp, #64
223+  st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
224+  stp x19, x20, [sp, #128]
225+  stp x21, x22, [sp, #144]
226+  stp x23, x24, [sp, #160]
227+  stp x25, x26, [sp, #176]
228+
229+  ldr x8, [sp, #192]
230+  ldr x9, [sp, #200]
231+  ldr x10, [sp, #208]
232+  ldr x11, [sp, #216]
233+  ldr x12, [sp, #224]
234+  ldr x13, [sp, #232]
235+  ldr x14, [sp, #240]
236+  ldr x15, [sp, #248]
237+  ldr x23, [sp, #256]  // per_channel
238 
239   add x19, x3, #16
240   add w20, w6, w6   // channel * 2
241@@ -488,7 +489,6 @@ OUTZP3:
242   st1 {v21.8b}, [x0], x6
243 
244 End:
245-  sub sp, sp, #192
246   ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
247   ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
248   ldp x19, x20, [sp], #16
249diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Corner.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Corner.S
250index b07ac01b..416e1a3a 100644
251--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Corner.S
252+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Corner.S
253@@ -32,20 +32,20 @@ asm_function ConvDw3x3Int8Corner
254     // x19 ~ x29 should be also preserved
255     // whereas our coding style do not permit such amount of parameters
256     sub sp, sp, #32
257-    stp x19, x20, [sp], #16
258-    stp x21, x22, [sp], #16
259+    stp x19, x20, [sp]
260+    stp x21, x22, [sp, #16]
261 
262     dup v25.8b, w7                      // in_zp
263-    ldr x8, [sp]
264+    ldr x8, [sp, #32]
265     dup v26.4s, w8                      // out_zp
266-    ldr x9, [sp, #8]                    // out_multiplier
267-    ldr x10, [sp, #16]                  // left_shift
268-    ldr x11, [sp, #24]                  // right_shift
269-    ldr x12, [sp, #32]
270+    ldr x9, [sp, #40]                    // out_multiplier
271+    ldr x10, [sp, #48]                  // left_shift
272+    ldr x11, [sp, #56]                  // right_shift
273+    ldr x12, [sp, #64]
274     dup v30.4s, w12                     // acc_min
275-    ldr x13, [sp, #40]
276+    ldr x13, [sp, #72]
277     dup v31.4s, w13                     // acc_max
278-    ldr x14, [sp, #48]                  // per_channel
279+    ldr x14, [sp, #80]                  // per_channel
280     cbnz x14, PerChannelDump
281     PerLayerDump:
282         ld1r {v27.4s}, [x9]
283@@ -216,7 +216,6 @@ asm_function ConvDw3x3Int8Corner
284         st1 {v23.s}[0], [x0], #4
285         st1 {v24.s}[0], [x0], #4
286 
287-    sub sp, sp, #32
288     ldp x19, x20, [sp], #16
289     ldp x21, x22, [sp], #16
290     ret
291diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Horizontal.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Horizontal.S
292index 92eeffea..379154e6 100644
293--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Horizontal.S
294+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Horizontal.S
295@@ -32,21 +32,21 @@ asm_function ConvDw3x3Int8Horizontal
296     // x19 ~ x29 should be also preserved
297     // whereas our coding style do not permit such amount of parameters
298     sub sp, sp, #48
299-    stp x19, x20, [sp], #16
300-    stp x21, x22, [sp], #16
301-    stp x23, x24, [sp], #16
302+    stp x19, x20, [sp]
303+    stp x21, x22, [sp, #16]
304+    stp x23, x24, [sp, #32]
305 
306     dup v25.8b, w7                      // in_zp
307-    ldr x8, [sp]
308+    ldr x8, [sp, #48]
309     dup v26.4s, w8                      // out_zp
310-    ldr x9, [sp, #8]                    // out_multiplier
311-    ldr x10, [sp, #16]                  // left_shift
312-    ldr x11, [sp, #24]                  // right_shift
313-    ldr x12, [sp, #32]
314+    ldr x9, [sp, #56]                    // out_multiplier
315+    ldr x10, [sp, #64]                  // left_shift
316+    ldr x11, [sp, #72]                  // right_shift
317+    ldr x12, [sp, #80]
318     dup v30.4s, w12                     // acc_min
319-    ldr x13, [sp, #40]
320+    ldr x13, [sp, #88]
321     dup v31.4s, w13                     // acc_max
322-    ldr x14, [sp, #48]                  // per_channel
323+    ldr x14, [sp, #96]                  // per_channel
324     cbnz x14, PerChannelDump
325     PerLayerDump:
326         ld1r {v27.4s}, [x9]
327@@ -58,9 +58,9 @@ asm_function ConvDw3x3Int8Horizontal
328         ld1 {v28.4s}, [x10], #16
329         ld1 {v29.4s}, [x11], #16
330     ContinueFunc:
331-    ldr x12, [sp, #32]
332+    ldr x12, [sp, #80]
333     dup v30.4s, w12                     // acc_min
334-    ldr x13, [sp, #40]
335+    ldr x13, [sp, #88]
336     dup v31.4s, w13                     // acc_max
337 
338     mov x12, #2
339@@ -248,7 +248,6 @@ asm_function ConvDw3x3Int8Horizontal
340 
341         st1 {v23.s}[0], [x0], #4
342         st1 {v24.s}[0], [x0], #4
343-    sub sp, sp, #48
344     ldp x19, x20, [sp], #16
345     ldp x21, x22, [sp], #16
346     ldp x23, x24, [sp], #16
347diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Stride2.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Stride2.S
348index cc1b3e9b..8643a536 100644
349--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Stride2.S
350+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Stride2.S
351@@ -44,22 +44,23 @@
352 
353 asm_function ConvDw3x3Int8Stride2
354     sub sp, sp, #192
355-    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
356-    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
357-    stp x19, x20, [sp], #16
358-    stp x21, x22, [sp], #16
359-    stp x23, x24, [sp], #16
360-    stp x25, x26, [sp], #16
361-
362-    ldr x8, [sp]
363-    ldr x9, [sp, #8]
364-    ldr x10, [sp, #16]
365-    ldr x11, [sp, #24]
366-    ldr x12, [sp, #32]
367-    ldr x13, [sp, #40]
368-    ldr x14, [sp, #48]
369-    ldr x15, [sp, #56]
370-    ldr x23, [sp, #64]  // per_channel
371+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
372+    add x9, sp, #64
373+    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
374+    stp x19, x20, [sp, #128]
375+    stp x21, x22, [sp, #144]
376+    stp x23, x24, [sp, #160]
377+    stp x25, x26, [sp, #176]
378+
379+    ldr x8, [sp, #192]
380+    ldr x9, [sp, #200]
381+    ldr x10, [sp, #208]
382+    ldr x11, [sp, #216]
383+    ldr x12, [sp, #224]
384+    ldr x13, [sp, #232]
385+    ldr x14, [sp, #240]
386+    ldr x15, [sp, #248]
387+    ldr x23, [sp, #256]  // per_channel
388 
389     add x19, x3, #16
390     add w20, w6, w6   // channel * 2
391@@ -463,7 +464,6 @@ OUTZP3:
392     st1 {v24.8b}, [x0], x6
393 
394 End:
395-    sub sp, sp, #192
396     ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
397     ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
398     ldp x19, x20, [sp], #16
399diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Vertical.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Vertical.S
400index 67151534..706bc9fe 100644
401--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Vertical.S
402+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Vertical.S
403@@ -32,20 +32,20 @@ asm_function ConvDw3x3Int8Vertical
404     // x19 ~ x29 should be also preserved
405     // whereas our coding style do not permit such amount of parameters
406     sub sp, sp, #32
407-    stp x19, x20, [sp], #16
408-    stp x21, x22, [sp], #16
409+    stp x19, x20, [sp]
410+    stp x21, x22, [sp, #16]
411 
412     dup v25.8b, w7                      // in_zp
413-    ldr x8, [sp]
414+    ldr x8, [sp, #32]
415     dup v26.4s, w8                      // out_zp
416-    ldr x9, [sp, #8]                    // out_multiplier
417-    ldr x10, [sp, #16]                  // left_shift
418-    ldr x11, [sp, #24]                  // right_shift
419-    ldr x12, [sp, #32]
420+    ldr x9, [sp, #40]                    // out_multiplier
421+    ldr x10, [sp, #48]                  // left_shift
422+    ldr x11, [sp, #56]                  // right_shift
423+    ldr x12, [sp, #64]
424     dup v30.4s, w12                     // acc_min
425-    ldr x13, [sp, #40]
426+    ldr x13, [sp, #72]
427     dup v31.4s, w13                     // acc_max
428-    ldr x14, [sp, #48]                  // per_channel
429+    ldr x14, [sp, #80]                  // per_channel
430     cbnz x14, PerChannelDump
431     PerLayerDump:
432         ld1r {v27.4s}, [x9]
433@@ -239,7 +239,6 @@ asm_function ConvDw3x3Int8Vertical
434 
435         st1 {v23.s}[0], [x0], #4
436         st1 {v24.s}[0], [x0], #4
437-    sub sp, sp, #32
438     ldp x19, x20, [sp], #16
439     ldp x21, x22, [sp], #16
440     ret
441diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Line.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Line.S
442index 6157848e..f939ec62 100644
443--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Line.S
444+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Line.S
445@@ -29,8 +29,9 @@ asm_function ConvDw3x3Line
446     // x19 ~ x29 should be also preserved
447     // whereas our coding style do not permit such amount of parameters
448     sub sp, sp, #128
449-    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
450-    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
451+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
452+    add x9, sp, #64
453+    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
454 
455     ldr x8, [x1]
456     ldr x9, [x1, #8]
457@@ -196,7 +197,6 @@ asm_function ConvDw3x3Line
458         add x0, x0, #16
459         bgt LoopC4
460 
461-    sub sp, sp, #128
462     ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
463     ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
464     ret
465diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDwFp32Center.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDwFp32Center.S
466index e9ddd65a..6f30c3ac 100644
467--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDwFp32Center.S
468+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDwFp32Center.S
469@@ -31,21 +31,22 @@ asm_function ConvDwFp32Center
470     // x19 ~ x29 should be also preserved
471     // whereas our coding style do not permit such amount of parameters
472     sub sp, sp, #192
473-    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
474-    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
475-    stp x19, x20, [sp], #16
476-    stp x21, x22, [sp], #16
477-    stp x23, x24, [sp], #16
478-    stp x25, x26, [sp], #16
479+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
480+    add x9, sp, #64
481+    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
482+    stp x19, x20, [sp, #128]
483+    stp x21, x22, [sp, #144]
484+    stp x23, x24, [sp, #160]
485+    stp x25, x26, [sp, #176]
486 
487-    ldr x8, [sp]
488-    ldr x9, [sp, #8]
489-    ldr x10, [sp, #16]
490-    ldr x11, [sp, #24]
491-    ldr x12, [sp, #32]
492-    ldr x13, [sp, #40]
493-    ldr x14, [sp, #48]
494-    ldr x15, [sp, #56]
495+    ldr x8, [sp, #192]
496+    ldr x9, [sp, #200]
497+    ldr x10, [sp, #208]
498+    ldr x11, [sp, #216]
499+    ldr x12, [sp, #224]
500+    ldr x13, [sp, #232]
501+    ldr x14, [sp, #240]
502+    ldr x15, [sp, #248]
503 
504     ld1 {v24.4s}, [x3]
505     movi v26.4s, #6
506@@ -302,7 +303,6 @@ asm_function ConvDwFp32Center
507         subs x4, x4, #1
508         bne LoopH
509 
510-    sub sp, sp, #192
511     ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
512     ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
513     ldp x19, x20, [sp], #16
514diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDwFp32Indirect3x3.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDwFp32Indirect3x3.S
515index 34cc9037..ca93dc7d 100644
516--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDwFp32Indirect3x3.S
517+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDwFp32Indirect3x3.S
518@@ -25,14 +25,14 @@
519 
520 asm_function ConvDwFp32Indirect3x3
521     sub sp, sp, #32
522-    stp x19, x20, [sp], #16
523-    stp x21, x22, [sp], #16
524+    stp x19, x20, [sp]
525+    stp x21, x22, [sp, #16]
526 
527     movi v31.4s, #6
528     scvtf v31.4s, v31.4s
529     dup v30.4s, wzr
530 
531-    ldr x8, [sp]
532+    ldr x8, [sp, #32]
533     cmp x5, #0
534     beq End
535 
536@@ -153,7 +153,6 @@ asm_function ConvDwFp32Indirect3x3
537         cmp x5, #0
538         bgt LoopPixel
539 End:
540-    sub sp, sp, #32
541     ldp x19, x20, [sp], #16
542     ldp x21, x22, [sp], #16
543 ret
544diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDwInt8Center.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDwInt8Center.S
545index 7ed94e6b..328250f3 100644
546--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDwInt8Center.S
547+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDwInt8Center.S
548@@ -34,44 +34,45 @@ asm_function ConvDwInt8Center
549     // x19 ~ x29 should be also preserved
550     // whereas our coding style do not permit such amount of parameters
551     sub sp, sp, #192
552-    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
553-    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
554-    stp x19, x20, [sp], #16
555-    stp x21, x22, [sp], #16
556-    stp x23, x24, [sp], #16
557-    stp x25, x26, [sp], #16
558-
559-    ldr x8, [sp]
560-    ldr x9, [sp, #8]
561-    ldr x10, [sp, #16]
562-    ldr x11, [sp, #24]
563-    ldr x12, [sp, #32]
564-    ldr x13, [sp, #40]
565-
566-    ldr x14, [sp, #48] // input_zp
567+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
568+    add x9, sp, #64
569+    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
570+    stp x19, x20, [sp, #128]
571+    stp x21, x22, [sp, #144]
572+    stp x23, x24, [sp, #160]
573+    stp x25, x26, [sp, #176]
574+
575+    ldr x8, [sp, #192]
576+    ldr x9, [sp, #200]
577+    ldr x10, [sp, #208]
578+    ldr x11, [sp, #216]
579+    ldr x12, [sp, #224]
580+    ldr x13, [sp, #232]
581+
582+    ldr x14, [sp, #240] // input_zp
583     ld1 {v19.8b}, [x14], #8
584 
585-    ldr x15, [sp, #56] // output_zp
586+    ldr x15, [sp, #248] // output_zp
587     ld1 {v20.4s}, [x15], #16
588     ld1 {v21.4s}, [x15], #16
589 
590-    ldr x16, [sp, #64] // out_multiplier
591+    ldr x16, [sp, #256] // out_multiplier
592     ld1 {v22.4s}, [x16], #16
593     ld1 {v23.4s}, [x16], #16
594 
595-    ldr x17, [sp, #72] // left_shift
596+    ldr x17, [sp, #264] // left_shift
597     ld1 {v24.4s}, [x17], #16
598     ld1 {v25.4s}, [x17], #16
599 
600-    ldr x25, [sp, #80] // right shift
601+    ldr x25, [sp, #272] // right shift
602     ld1 {v26.4s}, [x25], #16
603     ld1 {v27.4s}, [x25], #16
604 
605-    ldr x19, [sp, #88] // acc_min
606+    ldr x19, [sp, #280] // acc_min
607     ld1 {v28.4s}, [x19], #16
608     ld1 {v29.4s}, [x19], #16
609 
610-    ldr x20, [sp, #96] // acc_max
611+    ldr x20, [sp, #288] // acc_max
612     ld1 {v30.4s}, [x20], #16
613     ld1 {v31.4s}, [x20], #16
614 
615@@ -283,7 +284,6 @@ asm_function ConvDwInt8Center
616         subs x4, x4, #1
617         bne LoopH
618 
619-    sub sp, sp, #192
620     ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
621     ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
622     ldp x19, x20, [sp], #16
623diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvFp32Center.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvFp32Center.S
624index 2cc456f6..0a9d3265 100644
625--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvFp32Center.S
626+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvFp32Center.S
627@@ -31,21 +31,22 @@ asm_function ConvSwFp32Center
628     // x19 ~ x29 should be also preserved
629     // whereas our coding style do not permit such amount of parameters
630     sub sp, sp, #208
631-    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
632-    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
633-    stp x19, x20, [sp], #16
634-    stp x21, x22, [sp], #16
635-    stp x23, x24, [sp], #16
636-    stp x25, x26, [sp], #16
637-    stp x27, x28, [sp], #16
638+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
639+    add x9, sp, #64
640+    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
641+    stp x19, x20, [sp, #128]
642+    stp x21, x22, [sp, #144]
643+    stp x23, x24, [sp, #160]
644+    stp x25, x26, [sp, #176]
645+    stp x27, x28, [sp, #192]
646 
647-    ldr x8, [sp]
648-    ldr x9, [sp, #8]
649-    ldr x10, [sp, #16]
650-    ldr x11, [sp, #24]
651-    ldr x12, [sp, #32]
652-    ldr x13, [sp, #40]
653-    ldr x14, [sp, #48]
654+    ldr x8, [sp, #208]
655+    ldr x9, [sp, #216]
656+    ldr x10, [sp, #224]
657+    ldr x11, [sp, #232]
658+    ldr x12, [sp, #240]
659+    ldr x13, [sp, #248]
660+    ldr x14, [sp, #256]
661     mul x15, x6, x7
662     mul x15, x10, x15
663     mov x16, #16
664@@ -198,9 +199,9 @@ asm_function ConvSwFp32Center
665                 add x20, x20, x13
666                 subs x22, x22, #1
667                 bne LoopKh16
668-            ldr x16, [sp, #64]
669+            ldr x16, [sp, #272]
670             cbnz x16, Relu616
671-            ldr x26, [sp, #56]
672+            ldr x26, [sp, #264]
673             cbnz x26, Relu16
674             b Write16
675         Relu616:
676@@ -347,9 +348,9 @@ asm_function ConvSwFp32Center
677                 add x20, x20, x13
678                 subs x22, x22, #1
679                 bne LoopKh8
680-            ldr x16, [sp, #64]
681+            ldr x16, [sp, #272]
682             cbnz x16, Relu68
683-            ldr x26, [sp, #56]
684+            ldr x26, [sp, #264]
685             cbnz x26, Relu8
686             b Write8
687         Relu68:
688@@ -426,9 +427,9 @@ asm_function ConvSwFp32Center
689                 add x20, x20, x13
690                 subs x22, x22, #1
691                 bne LoopKh
692-            ldr x16, [sp, #64]
693+            ldr x16, [sp, #272]
694             cbnz x16, Relu6
695-            ldr x26, [sp, #56]
696+            ldr x26, [sp, #264]
697             cbnz x26, Relu
698             b Write
699         Relu6:
700@@ -446,7 +447,6 @@ asm_function ConvSwFp32Center
701         subs x4, x4, #1
702         bne LoopH
703 
704-    sub sp, sp, #208
705     ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
706     ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
707     ldp x19, x20, [sp], #16
708diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW1x16Kernel.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW1x16Kernel.S
709index 2267e776..3b436c17 100644
710--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW1x16Kernel.S
711+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW1x16Kernel.S
712@@ -30,17 +30,17 @@ asm_function SWConv1x16Kernel
713     // x19 ~ x29 should be also preserved
714     // whereas our coding style do not permit such amount of parameters
715     sub sp, sp, #64
716-    stp x19, x20, [sp], #16
717-    stp x21, x22, [sp], #16
718-    stp x23, x24, [sp], #16
719-    stp x25, x26, [sp], #16
720-
721-    ldr x10, [sp]
722-    ldr x11, [sp, #8]
723-    ldr x12, [sp, #16]
724-    ldr x13, [sp, #24]
725-    ldr x14, [sp, #32]
726-    ldr x15, [sp, #40]
727+    stp x19, x20, [sp]
728+    stp x21, x22, [sp, #16]
729+    stp x23, x24, [sp, #32]
730+    stp x25, x26, [sp, #48]
731+
732+    ldr x10, [sp, #64]
733+    ldr x11, [sp, #72]
734+    ldr x12, [sp, #80]
735+    ldr x13, [sp, #88]
736+    ldr x14, [sp, #96]
737+    ldr x15, [sp, #104]
738     lsl x7, x7, #2
739     lsl x11, x11, #2
740     lsl x12, x12, #2
741@@ -413,7 +413,6 @@ asm_function SWConv1x16Kernel
742             st1 {v2.4s}, [x21]
743             st1 {v3.4s}, [x22]
744     End:
745-    sub sp, sp, #64
746     ldp x19, x20, [sp], #16
747     ldp x21, x22, [sp], #16
748     ldp x23, x24, [sp], #16
749diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW1x8Kernel.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW1x8Kernel.S
750index fa8bb63d..6a29e95e 100644
751--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW1x8Kernel.S
752+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW1x8Kernel.S
753@@ -30,17 +30,17 @@ asm_function SWConv1x8Kernel
754     // x19 ~ x29 should be also preserved
755     // whereas our coding style do not permit such amount of parameters
756     sub sp, sp, #64
757-    stp x19, x20, [sp], #16
758-    stp x21, x22, [sp], #16
759-    stp x23, x24, [sp], #16
760-    stp x25, x26, [sp], #16
761+    stp x19, x20, [sp]
762+    stp x21, x22, [sp, #16]
763+    stp x23, x24, [sp, #32]
764+    stp x25, x26, [sp, #48]
765 
766-    ldr x10, [sp]
767-    ldr x11, [sp, #8]
768-    ldr x12, [sp, #16]
769-    ldr x13, [sp, #24]
770-    ldr x14, [sp, #32]
771-    ldr x15, [sp, #40]
772+    ldr x10, [sp, #64]
773+    ldr x11, [sp, #72]
774+    ldr x12, [sp, #80]
775+    ldr x13, [sp, #88]
776+    ldr x14, [sp, #96]
777+    ldr x15, [sp, #104]
778     lsl x7, x7, #2
779     lsl x11, x11, #2
780     lsl x12, x12, #2
781@@ -270,7 +270,6 @@ asm_function SWConv1x8Kernel
782             st1 {v0.4s}, [x0]
783             st1 {v1.4s}, [x20]
784     End:
785-    sub sp, sp, #64
786     ldp x19, x20, [sp], #16
787     ldp x21, x22, [sp], #16
788     ldp x23, x24, [sp], #16
789diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW2x16Kernel.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW2x16Kernel.S
790index 69624af6..8a5dd83a 100644
791--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW2x16Kernel.S
792+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW2x16Kernel.S
793@@ -30,17 +30,17 @@ asm_function SWConv2x16Kernel
794     // x19 ~ x29 should be also preserved
795     // whereas our coding style do not permit such amount of parameters
796     sub sp, sp, #64
797-    stp x19, x20, [sp], #16
798-    stp x21, x22, [sp], #16
799-    stp x23, x24, [sp], #16
800-    stp x25, x26, [sp], #16
801+    stp x19, x20, [sp]
802+    stp x21, x22, [sp, #16]
803+    stp x23, x24, [sp, #32]
804+    stp x25, x26, [sp, #48]
805 
806-    ldr x10, [sp]
807-    ldr x11, [sp, #8]
808-    ldr x12, [sp, #16]
809-    ldr x13, [sp, #24]
810-    ldr x14, [sp, #32]
811-    ldr x15, [sp, #40]
812+    ldr x10, [sp, #64]
813+    ldr x11, [sp, #72]
814+    ldr x12, [sp, #80]
815+    ldr x13, [sp, #88]
816+    ldr x14, [sp, #96]
817+    ldr x15, [sp, #104]
818     lsl x7, x7, #2
819     lsl x11, x11, #2
820     lsl x12, x12, #2
821@@ -399,7 +399,6 @@ asm_function SWConv2x16Kernel
822             st1 {v3.4s}, [x22], #16
823             st1 {v7.4s}, [x22]
824     End:
825-    sub sp, sp, #64
826     ldp x19, x20, [sp], #16
827     ldp x21, x22, [sp], #16
828     ldp x23, x24, [sp], #16
829diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW2x8Kernel.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW2x8Kernel.S
830index 8fefa4be..6efd21d0 100644
831--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW2x8Kernel.S
832+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW2x8Kernel.S
833@@ -30,17 +30,17 @@ asm_function SWConv2x8Kernel
834     // x19 ~ x29 should be also preserved
835     // whereas our coding style do not permit such amount of parameters
836     sub sp, sp, #64
837-    stp x19, x20, [sp], #16
838-    stp x21, x22, [sp], #16
839-    stp x23, x24, [sp], #16
840-    stp x25, x26, [sp], #16
841+    stp x19, x20, [sp]
842+    stp x21, x22, [sp, #16]
843+    stp x23, x24, [sp, #32]
844+    stp x25, x26, [sp, #48]
845 
846-    ldr x10, [sp]
847-    ldr x11, [sp, #8]
848-    ldr x12, [sp, #16]
849-    ldr x13, [sp, #24]
850-    ldr x14, [sp, #32]
851-    ldr x15, [sp, #40]
852+    ldr x10, [sp, #64]
853+    ldr x11, [sp, #72]
854+    ldr x12, [sp, #80]
855+    ldr x13, [sp, #88]
856+    ldr x14, [sp, #96]
857+    ldr x15, [sp, #104]
858     lsl x7, x7, #2
859     lsl x11, x11, #2
860     lsl x12, x12, #2
861@@ -257,7 +257,6 @@ asm_function SWConv2x8Kernel
862             st1 {v1.4s}, [x20], #16
863             st1 {v3.4s}, [x20]
864     End:
865-    sub sp, sp, #64
866     ldp x19, x20, [sp], #16
867     ldp x21, x22, [sp], #16
868     ldp x23, x24, [sp], #16
869diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW3x16Kernel.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW3x16Kernel.S
870index 61efd444..428dea69 100644
871--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW3x16Kernel.S
872+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW3x16Kernel.S
873@@ -30,18 +30,18 @@ asm_function SWConv3x16Kernel
874     // x19 ~ x29 should be also preserved
875     // whereas our coding style do not permit such amount of parameters
876     sub sp, sp, #128
877-    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
878-    stp x19, x20, [sp], #16
879-    stp x21, x22, [sp], #16
880-    stp x23, x24, [sp], #16
881-    stp x25, x26, [sp], #16
882+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
883+    stp x19, x20, [sp, #64]
884+    stp x21, x22, [sp, #80]
885+    stp x23, x24, [sp, #96]
886+    stp x25, x26, [sp, #112]
887 
888-    ldr x10, [sp]
889-    ldr x11, [sp, #8]
890-    ldr x12, [sp, #16]
891-    ldr x13, [sp, #24]
892-    ldr x14, [sp, #32]
893-    ldr x15, [sp, #40]
894+    ldr x10, [sp, #128]
895+    ldr x11, [sp, #136]
896+    ldr x12, [sp, #144]
897+    ldr x13, [sp, #152]
898+    ldr x14, [sp, #160]
899+    ldr x15, [sp, #168]
900     lsl x7, x7, #2
901     lsl x11, x11, #2
902     lsl x12, x12, #2
903@@ -524,7 +524,6 @@ asm_function SWConv3x16Kernel
904             st1 {v7.4s}, [x22], #16
905             st1 {v11.4s}, [x22]
906     End:
907-    sub sp, sp, #128
908     ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
909     ldp x19, x20, [sp], #16
910     ldp x21, x22, [sp], #16
911diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW3x8Kernel.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW3x8Kernel.S
912index 1e958572..472e50b9 100644
913--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW3x8Kernel.S
914+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW3x8Kernel.S
915@@ -30,17 +30,17 @@ asm_function SWConv3x8Kernel
916     // x19 ~ x29 should be also preserved
917     // whereas our coding style do not permit such amount of parameters
918     sub sp, sp, #64
919-    stp x19, x20, [sp], #16
920-    stp x21, x22, [sp], #16
921-    stp x23, x24, [sp], #16
922-    stp x25, x26, [sp], #16
923+    stp x19, x20, [sp]
924+    stp x21, x22, [sp, #16]
925+    stp x23, x24, [sp, #32]
926+    stp x25, x26, [sp, #48]
927 
928-    ldr x10, [sp]
929-    ldr x11, [sp, #8]
930-    ldr x12, [sp, #16]
931-    ldr x13, [sp, #24]
932-    ldr x14, [sp, #32]
933-    ldr x15, [sp, #40]
934+    ldr x10, [sp, #64]
935+    ldr x11, [sp, #72]
936+    ldr x12, [sp, #80]
937+    ldr x13, [sp, #88]
938+    ldr x14, [sp, #96]
939+    ldr x15, [sp, #104]
940     lsl x7, x7, #2
941     lsl x11, x11, #2
942     lsl x12, x12, #2
943@@ -324,7 +324,6 @@ asm_function SWConv3x8Kernel
944             st1 {v3.4s}, [x20], #16
945             st1 {v5.4s}, [x20]
946     End:
947-    sub sp, sp, #64
948     ldp x19, x20, [sp], #16
949     ldp x21, x22, [sp], #16
950     ldp x23, x24, [sp], #16
951diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW4x16Kernel.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW4x16Kernel.S
952index 1cd5e124..076724a7 100644
953--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW4x16Kernel.S
954+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW4x16Kernel.S
955@@ -30,20 +30,21 @@ asm_function SWConv4x16Kernel
956     // x19 ~ x29 should be also preserved
957     // whereas our coding style do not permit such amount of parameters
958     sub sp, sp, #208
959-    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
960-    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
961-    stp x19, x20, [sp], #16
962-    stp x21, x22, [sp], #16
963-    stp x23, x24, [sp], #16
964-    stp x25, x26, [sp], #16
965-    stp x27, x28, [sp], #16
966+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
967+    add x9, sp, #64
968+    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
969+    stp x19, x20, [sp, #128]
970+    stp x21, x22, [sp, #144]
971+    stp x23, x24, [sp, #160]
972+    stp x25, x26, [sp, #176]
973+    stp x27, x28, [sp, #192]
974 
975-    ldr x10, [sp]
976-    ldr x11, [sp, #8]
977-    ldr x12, [sp, #16]
978-    ldr x13, [sp, #24]
979-    ldr x14, [sp, #32]
980-    ldr x15, [sp, #40]
981+    ldr x10, [sp, #208]
982+    ldr x11, [sp, #216]
983+    ldr x12, [sp, #224]
984+    ldr x13, [sp, #232]
985+    ldr x14, [sp, #240]
986+    ldr x15, [sp, #248]
987     lsl x7, x7, #2
988     lsl x11, x11, #2
989     lsl x12, x12, #2
990@@ -650,7 +651,6 @@ asm_function SWConv4x16Kernel
991             st1 {v11.4s}, [x22], #16
992             st1 {v15.4s}, [x22]
993     End:
994-    sub sp, sp, #208
995     ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
996     ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
997     ldp x19, x20, [sp], #16
998diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW4x8Kernel.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW4x8Kernel.S
999index 28109031..6b24de97 100644
1000--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW4x8Kernel.S
1001+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW4x8Kernel.S
1002@@ -30,20 +30,21 @@ asm_function SWConv4x8Kernel
1003     // x19 ~ x29 should be also preserved
1004     // whereas our coding style do not permit such amount of parameters
1005     sub sp, sp, #208
1006-    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1007-    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1008-    stp x19, x20, [sp], #16
1009-    stp x21, x22, [sp], #16
1010-    stp x23, x24, [sp], #16
1011-    stp x25, x26, [sp], #16
1012-    stp x27, x28, [sp], #16
1013+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
1014+    add x9, sp, #64
1015+    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
1016+    stp x19, x20, [sp, #128]
1017+    stp x21, x22, [sp, #144]
1018+    stp x23, x24, [sp, #160]
1019+    stp x25, x26, [sp, #176]
1020+    stp x27, x28, [sp, #192]
1021 
1022-    ldr x10, [sp]
1023-    ldr x11, [sp, #8]
1024-    ldr x12, [sp, #16]
1025-    ldr x13, [sp, #24]
1026-    ldr x14, [sp, #32]
1027-    ldr x15, [sp, #40]
1028+    ldr x10, [sp, #208]
1029+    ldr x11, [sp, #216]
1030+    ldr x12, [sp, #224]
1031+    ldr x13, [sp, #232]
1032+    ldr x14, [sp, #240]
1033+    ldr x15, [sp, #248]
1034     lsl x7, x7, #2
1035     lsl x11, x11, #2
1036     lsl x12, x12, #2
1037@@ -394,7 +395,6 @@ asm_function SWConv4x8Kernel
1038             st1 {v5.4s}, [x20], #16
1039             st1 {v7.4s}, [x20]
1040     End:
1041-    sub sp, sp, #208
1042     ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1043     ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1044     ldp x19, x20, [sp], #16
1045diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW5x16Kernel.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW5x16Kernel.S
1046index 302e5a3d..a2b7ea2c 100644
1047--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW5x16Kernel.S
1048+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW5x16Kernel.S
1049@@ -30,20 +30,21 @@ asm_function SWConv5x16Kernel
1050     // x19 ~ x29 should be also preserved
1051     // whereas our coding style do not permit such amount of parameters
1052     sub sp, sp, #208
1053-    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1054-    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1055-    stp x19, x20, [sp], #16
1056-    stp x21, x22, [sp], #16
1057-    stp x23, x24, [sp], #16
1058-    stp x25, x26, [sp], #16
1059-    stp x27, x28, [sp], #16
1060+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
1061+    add x9, sp, #64
1062+    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
1063+    stp x19, x20, [sp, #128]
1064+    stp x21, x22, [sp, #144]
1065+    stp x23, x24, [sp, #160]
1066+    stp x25, x26, [sp, #176]
1067+    stp x27, x28, [sp, #192]
1068 
1069-    ldr x10, [sp]
1070-    ldr x11, [sp, #8]
1071-    ldr x12, [sp, #16]
1072-    ldr x13, [sp, #24]
1073-    ldr x14, [sp, #32]
1074-    ldr x15, [sp, #40]
1075+    ldr x10, [sp, #208]
1076+    ldr x11, [sp, #216]
1077+    ldr x12, [sp, #224]
1078+    ldr x13, [sp, #232]
1079+    ldr x14, [sp, #240]
1080+    ldr x15, [sp, #248]
1081     lsl x7, x7, #2
1082     lsl x11, x11, #2
1083     lsl x12, x12, #2
1084@@ -445,7 +446,6 @@ asm_function SWConv5x16Kernel
1085             st1 {v15.4s}, [x22], #16
1086             st1 {v19.4s}, [x22]
1087     End:
1088-    sub sp, sp, #208
1089     ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1090     ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1091     ldp x19, x20, [sp], #16
1092diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW5x8Kernel.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW5x8Kernel.S
1093index 059cc7fc..b7e48480 100644
1094--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW5x8Kernel.S
1095+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW5x8Kernel.S
1096@@ -30,20 +30,21 @@ asm_function SWConv5x8Kernel
1097     // x19 ~ x29 should be also preserved
1098     // whereas our coding style do not permit such amount of parameters
1099     sub sp, sp, #208
1100-    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1101-    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1102-    stp x19, x20, [sp], #16
1103-    stp x21, x22, [sp], #16
1104-    stp x23, x24, [sp], #16
1105-    stp x25, x26, [sp], #16
1106-    stp x27, x28, [sp], #16
1107+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
1108+    add x9, sp, #64
1109+    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
1110+    stp x19, x20, [sp, #128]
1111+    stp x21, x22, [sp, #144]
1112+    stp x23, x24, [sp, #160]
1113+    stp x25, x26, [sp, #176]
1114+    stp x27, x28, [sp, #192]
1115 
1116-    ldr x10, [sp]
1117-    ldr x11, [sp, #8]
1118-    ldr x12, [sp, #16]
1119-    ldr x13, [sp, #24]
1120-    ldr x14, [sp, #32]
1121-    ldr x15, [sp, #40]
1122+    ldr x10, [sp, #208]
1123+    ldr x11, [sp, #216]
1124+    ldr x12, [sp, #224]
1125+    ldr x13, [sp, #232]
1126+    ldr x14, [sp, #240]
1127+    ldr x15, [sp, #248]
1128     lsl x7, x7, #2
1129     lsl x11, x11, #2
1130     lsl x12, x12, #2
1131@@ -296,7 +297,6 @@ asm_function SWConv5x8Kernel
1132             st1 {v7.4s}, [x20], #16
1133             st1 {v9.4s}, [x20]
1134     End:
1135-    sub sp, sp, #208
1136     ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1137     ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1138     ldp x19, x20, [sp], #16
1139diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/DeconvDwFp32Center.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/DeconvDwFp32Center.S
1140index e6875bb1..11722e71 100644
1141--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/DeconvDwFp32Center.S
1142+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/DeconvDwFp32Center.S
1143@@ -30,14 +30,14 @@ asm_function DeconvDwFp32Center
1144     // x19 ~ x29 should be also preserved
1145     // whereas our coding style do not permit such amount of parameters
1146     sub sp, sp, #32
1147-    stp x19, x20, [sp], #16
1148-    stp x21, x22, [sp], #16
1149+    stp x19, x20, [sp]
1150+    stp x21, x22, [sp, #16]
1151 
1152-    ldr x8, [sp]
1153-    ldr x9, [sp, #8]
1154-    ldr x10, [sp, #16]
1155-    ldr x11, [sp, #24]
1156-    ldr x12, [sp, #32]
1157+    ldr x8, [sp, #32]
1158+    ldr x9, [sp, #40]
1159+    ldr x10, [sp, #48]
1160+    ldr x11, [sp, #56]
1161+    ldr x12, [sp, #64]
1162 
1163     LoopH:
1164         mov x15, x0
1165@@ -69,7 +69,6 @@ asm_function DeconvDwFp32Center
1166         subs x3, x3, #1
1167         bne LoopH
1168 
1169-    sub sp, sp, #32
1170     ldp x19, x20, [sp], #16
1171     ldp x21, x22, [sp], #16
1172     ret
1173diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/DeconvDwInt8Center.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/DeconvDwInt8Center.S
1174index aaf210f0..1c3723fa 100644
1175--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/DeconvDwInt8Center.S
1176+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/DeconvDwInt8Center.S
1177@@ -30,14 +30,14 @@ asm_function DeconvDwInt8Center
1178     // x19 ~ x29 should be also preserved
1179     // whereas our coding style do not permit such amount of parameters
1180     sub sp, sp, #32
1181-    stp x19, x20, [sp], #16
1182-    stp x21, x22, [sp], #16
1183+    stp x19, x20, [sp]
1184+    stp x21, x22, [sp, #16]
1185 
1186-    ldr x8, [sp]
1187-    ldr x9, [sp, #8]
1188-    ldr x10, [sp, #16]
1189-    ldr x11, [sp, #24]
1190-    ldr x12, [sp, #32]
1191+    ldr x8, [sp, #32]
1192+    ldr x9, [sp, #40]
1193+    ldr x10, [sp, #48]
1194+    ldr x11, [sp, #56]
1195+    ldr x12, [sp, #64]
1196 
1197     LoopH:
1198         mov x15, x0
1199@@ -69,7 +69,6 @@ asm_function DeconvDwInt8Center
1200         subs x3, x3, #1
1201         bne LoopH
1202 
1203-    sub sp, sp, #32
1204     ldp x19, x20, [sp], #16
1205     ldp x21, x22, [sp], #16
1206     ret
1207diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatVecMulFp32.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatVecMulFp32.S
1208index 71a7f0f1..36c8d8ec 100644
1209--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatVecMulFp32.S
1210+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatVecMulFp32.S
1211@@ -15,7 +15,7 @@
1212  */
1213 #ifdef ENABLE_ARM64
1214 #include "nnacl/assembly_global.h"
1215-    
1216+
1217 .text
1218 .align 5
1219 
1220@@ -30,24 +30,25 @@
1221 
1222 asm_default_function MatVecMulFp32
1223   sub sp, sp, #128
1224-  st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
1225-  st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64
1226+  st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp]
1227+  add x9, sp, #64
1228+  st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x9]
1229 
1230   mov w14, #4      // sizeof(float)
1231   mul w8, w14, w5  // rhs depthx1 block stride
1232   mov w14, #4
1233-  mul w13, w8, w14 // rhs depthx4 block stride 
1234+  mul w13, w8, w14 // rhs depthx4 block stride
1235 
1236 Loop:
1237   mov x15, x0     // reload a ptr
1238   mov x7, x1      // reload b ptr
1239   mov w9, w5      // reload depth
1240   cmp w6, #4
1241-  blt Loop1x1  
1242+  blt Loop1x1
1243 
1244-Loop1x4: 
1245-  dup v10.8h, wzr  
1246-  dup v11.8h, wzr  
1247+Loop1x4:
1248+  dup v10.8h, wzr
1249+  dup v11.8h, wzr
1250   dup v12.8h, wzr
1251   dup v13.8h, wzr
1252   dup v14.8h, wzr
1253@@ -150,7 +151,7 @@ End1x4:
1254 
1255   cbz x3, Act1x4
1256   ld1 {v15.4s}, [x3], #16
1257-  fadd v14.4s, v14.4s, v15.4s   // add bias 
1258+  fadd v14.4s, v14.4s, v15.4s   // add bias
1259 
1260 Act1x4:
1261   cmp w4, #3
1262@@ -214,8 +215,8 @@ Depth1_1x1:
1263   b Depth1_1x1
1264 
1265 End1x1:
1266-  faddp v6.4s, v4.4s, v4.4s  
1267-  faddp v7.4s, v6.4s, v6.4s  
1268+  faddp v6.4s, v4.4s, v4.4s
1269+  faddp v7.4s, v6.4s, v6.4s
1270   fadd v7.4s, v7.4s, v5.4s
1271 
1272   cbz x3, Act1x1
1273@@ -245,7 +246,6 @@ Write1x1:
1274   b Loop
1275 
1276 End:
1277-  sub sp, sp, #128
1278   ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
1279   ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64
1280   ret
1281diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatVecMulPackFp32.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatVecMulPackFp32.S
1282index d485b012..b013f48a 100644
1283--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatVecMulPackFp32.S
1284+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatVecMulPackFp32.S
1285@@ -30,8 +30,8 @@
1286 
1287 asm_default_function MatVecMulPackFp32
1288     sub sp, sp, #16
1289-    stp x29, x30, [sp], #16
1290-  
1291+    stp x29, x30, [sp]
1292+
1293     dup v1.2d, xzr
1294     mov w7, #6
1295     dup v2.4s, w7
1296@@ -43,7 +43,7 @@ asm_default_function MatVecMulPackFp32
1297         st1 {v24.4s, v25.4s}, [x2], #32
1298         subs w6, w6, #8
1299         bge Loop1x8Start
1300-                
1301+
1302     Loop1xNStart:
1303         add w6, w6, #8
1304         cbz w6, End
1305@@ -59,7 +59,7 @@ asm_default_function MatVecMulPackFp32
1306         beq End
1307         st1 {v25.s}[2], [x2]
1308         b End
1309-            
1310+
1311     Loop1x4Start:
1312         add w6, w6, #4
1313         cbz w6, End
1314@@ -75,7 +75,7 @@ asm_default_function MatVecMulPackFp32
1315         beq End
1316         st1 {v24.s}[3], [x2], #4
1317         b End
1318-        
1319+
1320     Compute1x8Unit:
1321         mov x7, x0     // reload a-ptr
1322         mov w8, w5     // reset depth
1323@@ -140,7 +140,7 @@ asm_default_function MatVecMulPackFp32
1324                     fmax v25.4s, v25.4s, v1.4s
1325                 Return1x8:
1326                     ret
1327-  
1328+
1329     Compute1x4Unit:
1330         mov x7, x0     // reload a-ptr
1331         mov w8, w5     // reset depth
1332@@ -191,9 +191,8 @@ asm_default_function MatVecMulPackFp32
1333                     fmax v24.4s, v24.4s, v1.4s
1334                 Return1x4:
1335                     ret
1336-  
1337+
1338     End:
1339-        sub sp, sp, #16
1340         ldp x29, x30, [sp], #16
1341         ret
1342 #endif
1343diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32.S
1344index 67d20dcc..2dedccd0 100644
1345--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32.S
1346+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32.S
1347@@ -34,17 +34,18 @@
1348 
1349 asm_function MatmulFloatNeon64
1350   sub sp, sp, #144
1351-  st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1352-  st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1353-  stp x19, x20, [sp], #16
1354+  st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
1355+  add x9, sp, #64
1356+  st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
1357+  stp x19, x20, [sp, #128]
1358 
1359-  ldr x9, [sp, #8]
1360-  ldr x14, [sp, #16]
1361+  ldr x9, [sp, #152]
1362+  ldr x14, [sp, #160]
1363 
1364   mov w19, #32 // sizeof(float) * 8
1365   mul w15, w5, w19 // block stride of lhs/rhs: sizeof(float) * 8 * depth
1366   mov x19, #4
1367-  ldr x17, [sp]
1368+  ldr x17, [sp, #144]
1369   cbz x14, NoWinoSteps
1370   mul x8, x7, x17
1371   mov x11, #8
1372@@ -779,7 +780,6 @@ NoDstStep:
1373   bgt L1
1374 
1375 End1:
1376-  sub sp, sp, #144
1377   ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1378   ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1379   ldp x19, x20, [sp], #16
1380diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32Opt.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32Opt.S
1381index 6937f4ba..51d107c8 100644
1382--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32Opt.S
1383+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32Opt.S
1384@@ -19,7 +19,7 @@
1385 .text
1386 .align 5
1387 
1388-// void MatmulFloatNeon64(const float *a, const float *b, float *c, const float *bias, int act_type, int depth
1389+// void MatmulFloatNeon64Opt(const float *a, const float *b, float *c, const float *bias, int act_type, int depth,
1390 //                        int row, int col, size_t stride, size_t writeMode)
1391 // x0: a
1392 // x1: b
1393@@ -34,13 +34,14 @@
1394 
1395 asm_function MatmulFloatNeon64Opt
1396     sub sp, sp, #160
1397-    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1398-    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1399-    stp x19, x20, [sp], #16
1400-    stp x21, x22, [sp], #16
1401+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
1402+    add x9, sp, #64
1403+    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
1404+    stp x19, x20, [sp, #128]
1405+    stp x21, x22, [sp, #144]
1406 
1407-    ldr x8, [sp]
1408-    ldr x9, [sp, #8]
1409+    ldr x8, [sp, #160]
1410+    ldr x9, [sp, #168]
1411 
1412     mov x21, #48 // sizeof(float) * 12
1413     mul x17, x5, x21 // block stride of lhs/rhs: sizeof(float) * 12 * depth
1414@@ -1659,7 +1660,6 @@ LoopColEnd:
1415         subs x6, x6, #12
1416         bgt LoopRowStart
1417 
1418-  sub sp, sp, #160
1419   ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1420   ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1421   ldp x19, x20, [sp], #16
1422diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow12.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow12.S
1423index c9151a99..05465bd1 100644
1424--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow12.S
1425+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow12.S
1426@@ -34,13 +34,14 @@
1427 
1428 asm_function MatmulFloatNeon64OptRow12
1429     sub sp, sp, #160
1430-    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1431-    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1432-    stp x19, x20, [sp], #16
1433-    stp x21, x22, [sp], #16
1434+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
1435+    add x9, sp, #64
1436+    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
1437+    stp x19, x20, [sp, #128]
1438+    stp x21, x22, [sp, #144]
1439 
1440-    ldr x8, [sp]
1441-    ldr x9, [sp, #8]
1442+    ldr x8, [sp, #160]
1443+    ldr x9, [sp, #168]
1444 
1445     mov x21, #48 // sizeof(float) * 12
1446     mul x17, x5, x21 // block stride of lhs/rhs: sizeof(float) * 12 * depth
1447@@ -1220,7 +1221,6 @@ LoopColEnd:
1448         subs x6, x6, #12
1449         bgt LoopRow
1450 
1451-  sub sp, sp, #160
1452   ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1453   ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1454   ldp x19, x20, [sp], #16
1455diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow4.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow4.S
1456index 0cc49fb9..b984c494 100644
1457--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow4.S
1458+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow4.S
1459@@ -19,7 +19,7 @@
1460 .text
1461 .align 5
1462 
1463-// void MatmulFloatNeon64(const float *a, const float *b, float *c, const float *bias, int act_type, int depth
1464+// void MatmulFloatNeon64OptRow4(const float *a, const float *b, float *c, const float *bias, int act_type, int depth,
1465 //                        int row, int col, size_t stride, size_t writeMode)
1466 // x0: a
1467 // x1: b
1468@@ -34,13 +34,14 @@
1469 
1470 asm_function MatmulFloatNeon64OptRow4
1471     sub sp, sp, #160
1472-    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1473-    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1474-    stp x19, x20, [sp], #16
1475-    stp x21, x22, [sp], #16
1476+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
1477+    add x9, sp, #64
1478+    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
1479+    stp x19, x20, [sp, #128]
1480+    stp x21, x22, [sp, #144]
1481 
1482-    ldr x8, [sp]
1483-    ldr x9, [sp, #8]
1484+    ldr x8, [sp, #160]
1485+    ldr x9, [sp, #168]
1486 
1487     mov x21, #48 // sizeof(float) * 12
1488 
1489@@ -588,7 +589,6 @@ LoopColEnd:
1490         subs x6, x6, #12
1491         bgt LoopRow4
1492 
1493-  sub sp, sp, #160
1494   ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1495   ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1496   ldp x19, x20, [sp], #16
1497diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow8.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow8.S
1498index a9e42a54..c5b260c0 100644
1499--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow8.S
1500+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow8.S
1501@@ -34,13 +34,14 @@
1502 
1503 asm_function MatmulFloatNeon64OptRow8
1504     sub sp, sp, #160
1505-    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1506-    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1507-    stp x19, x20, [sp], #16
1508-    stp x21, x22, [sp], #16
1509+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
1510+    add x9, sp, #64
1511+    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
1512+    stp x19, x20, [sp, #128]
1513+    stp x21, x22, [sp, #144]
1514 
1515-    ldr x8, [sp]
1516-    ldr x9, [sp, #8]
1517+    ldr x8, [sp, #160]
1518+    ldr x9, [sp, #168]
1519 
1520     mov x21, #48 // sizeof(float) * 12
1521     mul x17, x5, x21 // block stride of lhs/rhs: sizeof(float) * 12 * depth
1522@@ -902,7 +903,6 @@ LoopColEnd:
1523         subs x6, x6, #12
1524         bgt LoopCol8
1525 
1526-  sub sp, sp, #160
1527   ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1528   ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1529   ldp x19, x20, [sp], #16
1530diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulInt8.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulInt8.S
1531index a0e94c5f..731bac4b 100644
1532--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulInt8.S
1533+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulInt8.S
1534@@ -44,24 +44,25 @@
1535 
1536 asm_function MatmulInt8Neon64
1537   sub sp, sp, #208
1538-  st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1539-  st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1540-  stp x19, x20, [sp], #16
1541-  stp x21, x22, [sp], #16
1542-  stp x23, x24, [sp], #16
1543-  stp x25, x26, [sp], #16
1544-  stp x27, x28, [sp], #16
1545-
1546-  ldr w8, [sp]
1547-  ldr w9, [sp, #8]
1548-  ldr w10, [sp, #16]
1549-  ldr x11, [sp, #24]
1550-  ldr x12, [sp, #32]
1551-  ldr x13, [sp, #40]
1552-  ldr w14, [sp, #48]
1553-  ldr w15, [sp, #56]
1554-  ldr w24, [sp, #64]
1555-  ldr w27, [sp, #72]
1556+  st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
1557+  add x9, sp, #64
1558+  st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
1559+  stp x19, x20, [sp, #128]
1560+  stp x21, x22, [sp, #144]
1561+  stp x23, x24, [sp, #160]
1562+  stp x25, x26, [sp, #176]
1563+  stp x27, x28, [sp, #192]
1564+
1565+  ldr w8, [sp, #208]
1566+  ldr w9, [sp, #216]
1567+  ldr w10, [sp, #224]
1568+  ldr x11, [sp, #232]
1569+  ldr x12, [sp, #240]
1570+  ldr x13, [sp, #248]
1571+  ldr w14, [sp, #256]
1572+  ldr w15, [sp, #264]
1573+  ldr w24, [sp, #272]
1574+  ldr w27, [sp, #280]
1575 
1576   mov w17, #4       // sizeof(int8)*4
1577   mul w21, w5, w17  // the stride of a/b: sizeof(int8)*4*deep16
1578@@ -408,7 +409,6 @@ PerTEnd2:
1579   b L1
1580 
1581 End1:
1582-  sub sp, sp, #208
1583   ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1584   ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1585   ldp x19, x20, [sp], #16
1586diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulInt8Opt.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulInt8Opt.S
1587index 64be8a14..a54ee5b8 100644
1588--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulInt8Opt.S
1589+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulInt8Opt.S
1590@@ -43,23 +43,24 @@
1591 
1592 asm_function MatmulInt8Opt
1593     sub sp, sp, #224
1594-    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1595-    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1596-    stp x19, x20, [sp], #16
1597-    stp x21, x22, [sp], #16
1598-    stp x23, x24, [sp], #16
1599-    stp x25, x26, [sp], #16
1600-    stp x27, x28, [sp], #16
1601-    stp x29, x30, [sp], #16
1602-
1603-    ldr w8, [sp]
1604-    ldr w9, [sp, #8]
1605-    ldr w10, [sp, #16]
1606-    ldr x11, [sp, #24]
1607-    ldr x12, [sp, #32]
1608-    ldr x13, [sp, #40]
1609-    ldr x14, [sp, #48]
1610-    ldr x15, [sp, #56]
1611+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
1612+    add x9, sp, #64
1613+    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
1614+    stp x19, x20, [sp, #128]
1615+    stp x21, x22, [sp, #144]
1616+    stp x23, x24, [sp, #160]
1617+    stp x25, x26, [sp, #176]
1618+    stp x27, x28, [sp, #192]
1619+    stp x29, x30, [sp, #208]
1620+
1621+    ldr w8, [sp, #224]
1622+    ldr w9, [sp, #232]
1623+    ldr w10, [sp, #240]
1624+    ldr x11, [sp, #248]
1625+    ldr x12, [sp, #256]
1626+    ldr x13, [sp, #264]
1627+    ldr x14, [sp, #272]
1628+    ldr x15, [sp, #280]
1629 
1630     mov x23, #4
1631     mul x23, x23, x5  // lhs step
1632@@ -70,7 +71,7 @@ LoopRow:
1633     mov x17, x4 // reload rhs col
1634     mov x29, x7 // reload bias ptr
1635     mov x27, x2 // reload dst ptr
1636-    ldr x28, [sp, #64] // reload filter_zp
1637+    ldr x28, [sp, #288] // reload filter_zp
1638 
1639     LoopCol:
1640         mov x25, x6 // reload a_sums ptr
1641@@ -334,16 +335,15 @@ LoopRow:
1642 LoopColEnd:
1643     subs x3, x3, #4
1644     ble LoopRowEnd
1645-    ldr x11, [sp, #24]
1646-    ldr x12, [sp, #32]
1647-    ldr x13, [sp, #40]
1648+    ldr x11, [sp, #248]
1649+    ldr x12, [sp, #256]
1650+    ldr x13, [sp, #264]
1651     add x6, x6, #16
1652     add x0, x0, x23
1653     add x2, x2, x24
1654     b LoopRow
1655 
1656 LoopRowEnd:
1657-    sub sp, sp, #224
1658     ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1659     ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1660     ldp x19, x20, [sp], #16
1661diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulR4Int8.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulR4Int8.S
1662index fe5207ad..adb0a42c 100644
1663--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulR4Int8.S
1664+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulR4Int8.S
1665@@ -33,9 +33,10 @@
1666 
1667 asm_function MatMulR4Int8Neon64
1668   sub sp, sp, #144
1669-  st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1670-  st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1671-  stp x19, x20, [sp], #16
1672+  st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
1673+  add x9, sp, #64
1674+  st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
1675+  stp x19, x20, [sp, #128]
1676 
1677   mov w15, #0       // b col index
1678   mov w16, #0       // a row index
1679@@ -185,7 +186,6 @@ End2:
1680   b L1
1681 
1682 End1:
1683-  sub sp, sp, #144
1684   ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1685   ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1686   ldp x19, x20, [sp], #16
1687diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulWinogradFp32.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulWinogradFp32.S
1688index 0b814ce4..23032ab9 100644
1689--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulWinogradFp32.S
1690+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulWinogradFp32.S
1691@@ -27,9 +27,9 @@ asm_function MatrixMultiplyWinograd
1692     // x19 ~ x29 should be also preserved
1693     // whereas our coding style do not permit such amount of parameters
1694     sub sp, sp, #48
1695-    st1 {v8.4s}, [sp], #16
1696-    stp x19, x20, [sp], #16
1697-    stp x21, x22, [sp], #16
1698+    st1 {v8.4s}, [sp]
1699+    stp x19, x20, [sp, #16]
1700+    stp x21, x22, [sp, #32]
1701     mov x8, #4
1702     mul x10, x5, x8
1703     mov x17, x3  // m
1704@@ -176,7 +176,6 @@ asm_function MatrixMultiplyWinograd
1705             add x0, x0, x21
1706             b LoopM
1707     EndLoopM:
1708-        sub sp, sp, #48
1709         ld1 {v8.4s}, [sp], #16
1710         ldp x19, x20, [sp], #16
1711         ldp x21, x22, [sp], #16
1712diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/PostFuncBiasReluC8.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/PostFuncBiasReluC8.S
1713index 5355d302..1392ab4a 100644
1714--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/PostFuncBiasReluC8.S
1715+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/PostFuncBiasReluC8.S
1716@@ -34,8 +34,9 @@
1717 
1718 asm_function PostFuncBiasReluC8
1719   sub sp, sp, #128
1720-  st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1721-  st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1722+  st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
1723+  add x9, sp, #64
1724+  st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
1725 
1726   movi v26.4s, #6
1727   scvtf v26.4s, v26.4s
1728@@ -546,7 +547,6 @@ Loop_C1_7_Write:
1729   b Loop_C1_7_Write
1730 
1731 End:
1732-  sub sp, sp, #128
1733   ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1734   ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1735   ret
1736diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/PostFuncInt8C4Neon64.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/PostFuncInt8C4Neon64.S
1737index 0818d74e..a240b64d 100644
1738--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/PostFuncInt8C4Neon64.S
1739+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/PostFuncInt8C4Neon64.S
1740@@ -54,14 +54,14 @@
1741 
1742 asm_function PostFuncInt8C4Neon64
1743   sub sp, sp, #16
1744-  stp x24, x25, [sp], #16
1745+  stp x24, x25, [sp]
1746 
1747-  ldr w8, [sp]
1748-  ldr w9, [sp, #8]
1749-  ldr w10, [sp, #16]
1750-  ldr w11, [sp, #24]
1751-  ldr w12, [sp, #32]
1752-  ldr w13, [sp, #40]
1753+  ldr w8, [sp, #16]
1754+  ldr w9, [sp, #24]
1755+  ldr w10, [sp, #32]
1756+  ldr w11, [sp, #40]
1757+  ldr w12, [sp, #48]
1758+  ldr w13, [sp, #56]
1759 
1760   dup v26.4s, w7
1761   dup v27.4s, w8
1762@@ -254,7 +254,6 @@ Loop_C1_3:
1763 
1764 
1765 End:
1766-  sub sp, sp, #16
1767   ldp x24, x25, [sp], #16
1768   ret
1769 #endif
1770diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/SPMM8x8Fp32.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/SPMM8x8Fp32.S
1771index cfa9bdf8..614d83f8 100644
1772--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/SPMM8x8Fp32.S
1773+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/SPMM8x8Fp32.S
1774@@ -55,9 +55,10 @@
1775 
1776 asm_function SPMM8x8Fp32
1777     sub sp, sp, #144
1778-    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1779-    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1780-    stp x19, x20, [sp], #16
1781+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
1782+    add x9, sp, #64
1783+    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
1784+    stp x19, x20, [sp, #128]
1785 
1786     // init output with bias
1787     ldr w8, [x5], #4
1788@@ -286,7 +287,6 @@ WRITE_OUT:
1789     st1 {v14.4s, v15.4s}, [x4]
1790 
1791 End:
1792-  sub sp, sp, #144
1793   ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1794   ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1795   ldp x19, x20, [sp], #16
1796diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/TiledC4MatmulFp32.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/TiledC4MatmulFp32.S
1797index 5987e68a..e0efc7b2 100644
1798--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/TiledC4MatmulFp32.S
1799+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/TiledC4MatmulFp32.S
1800@@ -29,8 +29,9 @@ asm_function TiledC4MatmulFp32
1801 //x5: oc4
1802 
1803 sub sp, sp, #128
1804-st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1805-st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1806+st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
1807+add x9, sp, #64
1808+st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
1809 
1810 mov x7, #4 //sizeof(float)
1811 mul x3, x3, x7
1812@@ -272,7 +273,6 @@ LoopOcHalf:
1813     st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x0], #64
1814 
1815 LoopOcEnd:
1816-    sub sp, sp, #128
1817     ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1818     ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1819     ret
1820diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/WinogradTransLeft.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/WinogradTransLeft.S
1821index 4a26b251..243b19de 100644
1822--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/WinogradTransLeft.S
1823+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/WinogradTransLeft.S
1824@@ -30,7 +30,7 @@ asm_function WinogradTransLeft
1825 //x6:length
1826 
1827 sub sp, sp, #32
1828-stp x19, x20, [sp], #32
1829+stp x19, x20, [sp]
1830 
1831 mov x8, #16 // 4 * sizeof(float)
1832 mul x8, x6, x8
1833@@ -152,7 +152,6 @@ LoopH:
1834     subs x4, x4, #1
1835     bne LoopH
1836 
1837-    sub sp, sp, #32
1838     ldp x19, x20, [sp], #32
1839     ret
1840 
1841diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/WinogradTransRight.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/WinogradTransRight.S
1842index 931fa016..95ee50a5 100644
1843--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/WinogradTransRight.S
1844+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/WinogradTransRight.S
1845@@ -30,7 +30,7 @@ asm_function WinogradTransRight
1846 //x6: length
1847 
1848 sub sp, sp, #16
1849-stp x19, x20, [sp], #16
1850+stp x19, x20, [sp]
1851 
1852 mov x8, #16 // 4 * sizeof(float)
1853 mul x8, x6, x8
1854@@ -155,7 +155,6 @@ LoopH:
1855     subs x4, x4, #1
1856     bne LoopH
1857 
1858-    sub sp, sp, #16
1859     ldp x19, x20, [sp], #16
1860     ret
1861 #endif
1862diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/ConvDwFp16Center.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/ConvDwFp16Center.S
1863index 221a1609..56f03dbd 100644
1864--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/ConvDwFp16Center.S
1865+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/ConvDwFp16Center.S
1866@@ -31,21 +31,22 @@ asm_function ConvDwFp16Center
1867     // x19 ~ x29 should be also preserved
1868     // whereas our coding style do not permit such amount of parameters
1869     sub sp, sp, #192
1870-    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1871-    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1872-    stp x19, x20, [sp], #16
1873-    stp x21, x22, [sp], #16
1874-    stp x23, x24, [sp], #16
1875-    stp x25, x26, [sp], #16
1876+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
1877+    add x9, sp, #64
1878+    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
1879+    stp x19, x20, [sp, #128]
1880+    stp x21, x22, [sp, #144]
1881+    stp x23, x24, [sp, #160]
1882+    stp x25, x26, [sp, #176]
1883 
1884-    ldr x8, [sp]
1885-    ldr x9, [sp, #8]
1886-    ldr x10, [sp, #16]
1887-    ldr x11, [sp, #24]
1888-    ldr x12, [sp, #32]
1889-    ldr x13, [sp, #40]
1890-    ldr x14, [sp, #48]
1891-    ldr x15, [sp, #56]
1892+    ldr x8, [sp, #192]
1893+    ldr x9, [sp, #200]
1894+    ldr x10, [sp, #208]
1895+    ldr x11, [sp, #216]
1896+    ldr x12, [sp, #224]
1897+    ldr x13, [sp, #232]
1898+    ldr x14, [sp, #240]
1899+    ldr x15, [sp, #248]
1900 
1901     ld1 {v24.8h}, [x3]
1902     movi v26.8h, #0x46, lsl #8
1903@@ -301,7 +302,6 @@ asm_function ConvDwFp16Center
1904         subs x4, x4, #1
1905         bne LoopH
1906 
1907-    sub sp, sp, #192
1908     ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1909     ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1910     ldp x19, x20, [sp], #16
1911diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/DeconvDwFp16Center.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/DeconvDwFp16Center.S
1912index 1266b160..bb37a913 100644
1913--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/DeconvDwFp16Center.S
1914+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/DeconvDwFp16Center.S
1915@@ -30,14 +30,14 @@ asm_function DeconvDwFp16Center
1916     // x19 ~ x29 should be also preserved
1917     // whereas our coding style do not permit such amount of parameters
1918     sub sp, sp, #32
1919-    stp x19, x20, [sp], #16
1920-    stp x21, x22, [sp], #16
1921+    stp x19, x20, [sp]
1922+    stp x21, x22, [sp, #16]
1923 
1924-    ldr x8, [sp]
1925-    ldr x9, [sp, #8]
1926-    ldr x10, [sp, #16]
1927-    ldr x11, [sp, #24]
1928-    ldr x12, [sp, #32]
1929+    ldr x8, [sp, #32]
1930+    ldr x9, [sp, #40]
1931+    ldr x10, [sp, #48]
1932+    ldr x11, [sp, #56]
1933+    ldr x12, [sp, #64]
1934 
1935     LoopH:
1936         mov x15, x0
1937@@ -69,7 +69,6 @@ asm_function DeconvDwFp16Center
1938         subs x3, x3, #1
1939         bne LoopH
1940 
1941-    sub sp, sp, #32
1942     ldp x19, x20, [sp], #16
1943     ldp x21, x22, [sp], #16
1944     ret
1945diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatVecMulFp16.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatVecMulFp16.S
1946index 80a55b75..4f5441a3 100644
1947--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatVecMulFp16.S
1948+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatVecMulFp16.S
1949@@ -30,8 +30,9 @@
1950 
1951 asm_function MatVecMulFp16Neon64
1952   sub sp, sp, #128
1953-  st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
1954-  st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64
1955+  st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp]
1956+  add x9, sp, #64
1957+  st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x9]
1958 
1959   mov w14, #2      // sizeof(float16)
1960   mul w8, w14, w5  // rhs depthx1 block stride
1961@@ -184,7 +185,6 @@ Write1x1:
1962   b Loop
1963 
1964 End:
1965-  sub sp, sp, #128
1966   ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
1967   ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64
1968   ret
1969diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/Matmul12X16Fp16.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/Matmul12X16Fp16.S
1970index a0e28b74..9f804fd3 100644
1971--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/Matmul12X16Fp16.S
1972+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/Matmul12X16Fp16.S
1973@@ -36,13 +36,14 @@
1974 
1975 asm_function MatMul12x16Fp16Opt
1976     sub sp, sp, #160
1977-    st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
1978-    st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64
1979-    stp x19, x20, [sp], #16
1980-    stp x21, x22, [sp], #16
1981+    st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp]
1982+    add x9, sp, #64
1983+    st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x9]
1984+    stp x19, x20, [sp, #128]
1985+    stp x21, x22, [sp, #144]
1986 
1987-    ldr x8, [sp]
1988-    ldr x9, [sp, #8]
1989+    ldr x8, [sp, #160]
1990+    ldr x9, [sp, #168]
1991 
1992 .macro CLEAR_OUTPUT_V8_V9
1993     dup v8.4s, wzr
1994@@ -1694,7 +1695,6 @@ LoopColEnd:
1995         subs x6, x6, #12
1996         bgt LoopRowStart
1997 
1998-    sub sp, sp, #160
1999     ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
2000     ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64
2001     ldp x19, x20, [sp], #16
2002diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulBaseFp16Neon.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulBaseFp16Neon.S
2003index 79fa12bc..31f1adbd 100644
2004--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulBaseFp16Neon.S
2005+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulBaseFp16Neon.S
2006@@ -34,13 +34,14 @@
2007 
2008 asm_function MatmulBaseFp16Neon
2009     sub sp, sp, #160
2010-    st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
2011-    st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64
2012-    stp x19, x20, [sp], #16
2013-    stp x21, x22, [sp], #16
2014+    st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp]
2015+    add x9, sp, #64
2016+    st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x9]
2017+    stp x19, x20, [sp, #128]
2018+    stp x21, x22, [sp, #144]
2019 
2020-    ldr x8, [sp]
2021-    ldr x9, [sp, #8]  // act
2022+    ldr x8, [sp, #160]
2023+    ldr x9, [sp, #168]  // act
2024     add x8, x8, x8  // stride * sizeof(float16_t)
2025 
2026     add x16, x7, x7 // col * sizeof(float16_t)
2027@@ -951,7 +952,6 @@ LoopColEnd:
2028     add x0, x0, x15
2029     bgt LoopRowStart
2030 
2031-    sub sp, sp, #160
2032     ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
2033     ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64
2034     ldp x19, x20, [sp], #16
2035diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulFp16.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulFp16.S
2036index 6bb93f99..1d6b69a6 100644
2037--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulFp16.S
2038+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulFp16.S
2039@@ -34,15 +34,16 @@
2040 
2041 asm_function MatmulFp16Neon64
2042   sub sp, sp, #144
2043-  st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
2044-  st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64
2045-  stp x19, x20, [sp], #16
2046+  st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp]
2047+  add x9, sp, #64
2048+  st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x9]
2049+  stp x19, x20, [sp, #128]
2050 
2051   mov w18, #16 // sizeof(float16) * 8
2052   mul w15, w5, w18 // block stride of lhs/rhs: sizeof(float16) * 8 * depth
2053   mov x11, x3 // bias flag
2054   mov x19, #2
2055-  ldr x17, [sp]
2056+  ldr x17, [sp, #144]
2057   mul x17, x17, x19
2058 
2059 L1:
2060@@ -308,7 +309,7 @@ Relu:
2061   fmax v31.8h, v31.8h, v14.8h
2062 
2063 Write:
2064-  ldrb w13, [sp, #8]
2065+  ldrb w13, [sp, #152]
2066   cbz w13, WriteC8
2067   cmp w7, #1
2068   beq Write1
2069@@ -877,14 +878,13 @@ End2:
2070   subs w7, w7, #8 // rhs col - 8
2071   add x1, x1, x15 // rhs ptr + stride
2072   add x3, x3, #16 // bias ptr + stride
2073-  ldrb w13, [sp, #8]
2074+  ldrb w13, [sp, #152]
2075   cbz w13, NoDstStep
2076   add x2, x2, #16 // dst ptr + stride
2077 NoDstStep:
2078   bgt L1
2079 
2080 End1:
2081-  sub sp, sp, #144
2082   ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
2083   ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64
2084   ldp x19, x20, [sp], #16
2085diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulFp16Opt.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulFp16Opt.S
2086index 4a111066..21348f80 100644
2087--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulFp16Opt.S
2088+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulFp16Opt.S
2089@@ -34,12 +34,12 @@
2090 
2091 asm_function MatmulFp16Neon64Opt
2092     sub sp, sp, #96
2093-    st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
2094-    stp x19, x20, [sp], #16
2095-    stp x21, x22, [sp], #16
2096+    st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp]
2097+    stp x19, x20, [sp, #64]
2098+    stp x21, x22, [sp, #80]
2099 
2100-    ldr x8, [sp]
2101-    ldr x9, [sp, #8]
2102+    ldr x8, [sp, #96]
2103+    ldr x9, [sp, #104]
2104 
2105     mov x21, #32 // sizeof(float16_t) * 16
2106     mul x17, x5, x21 // block stride of lhs/rhs: sizeof(float16_t) * 16 * depth
2107@@ -1178,7 +1178,6 @@ LoopColEnd:
2108         subs x6, x6, #16
2109         bgt LoopRowStart
2110 
2111-    sub sp, sp, #96
2112     ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
2113     ldp x19, x20, [sp], #16
2114     ldp x21, x22, [sp], #16
2115diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulFp16OptV2.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulFp16OptV2.S
2116index 2d901a3d..40b788c9 100644
2117--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulFp16OptV2.S
2118+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulFp16OptV2.S
2119@@ -34,15 +34,16 @@
2120 
2121 asm_function MatmulFp16OptV2
2122     sub sp, sp, #192
2123-    st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
2124-    st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64
2125-    stp x19, x20, [sp], #16
2126-    stp x21, x22, [sp], #16
2127-    stp x23, x24, [sp], #16
2128-    stp x29, x30, [sp], #16
2129-
2130-    ldr x8, [sp]
2131-    ldr x9, [sp, #8]  // writeMode
2132+    st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp]
2133+    add x9, sp, #64
2134+    st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x9]
2135+    stp x19, x20, [sp, #128]
2136+    stp x21, x22, [sp, #144]
2137+    stp x23, x24, [sp, #160]
2138+    stp x29, x30, [sp, #176]
2139+
2140+    ldr x8, [sp, #192]
2141+    ldr x9, [sp, #200]  // writeMode
2142     lsl x8, x8, #1  // stride * sizeof(float16_t)
2143 
2144     lsl x15, x7, #1 // col * sizeof(float16_t)
2145@@ -2955,7 +2956,6 @@ Compute1x4Unit:
2146         ret
2147 
2148 End:
2149-  sub sp, sp, #192
2150   ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
2151   ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
2152   ldp x19, x20, [sp], #16
2153diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulWinogradFp16.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulWinogradFp16.S
2154index 9ee3c4d5..ca0542da 100644
2155--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulWinogradFp16.S
2156+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulWinogradFp16.S
2157@@ -27,9 +27,9 @@ asm_function MatrixMultiplyWinogradFp16
2158     // x19 ~ x29 should be also preserved
2159     // whereas our coding style do not permit such amount of parameters
2160     sub sp, sp, #48
2161-    st1 {v8.8h}, [sp], #16
2162-    stp x19, x20, [sp], #16
2163-    stp x21, x22, [sp], #16
2164+    st1 {v8.8h}, [sp]
2165+    stp x19, x20, [sp, #16]
2166+    stp x21, x22, [sp, #32]
2167 
2168     mov x8, #2
2169     mul x10, x5, x8    // n * 2
2170@@ -210,7 +210,6 @@ asm_function MatrixMultiplyWinogradFp16
2171             b LoopM
2172 
2173     EndLoopM:
2174-        sub sp, sp, #48
2175         ld1 {v8.8h}, [sp], #16
2176         ldp x19, x20, [sp], #16
2177         ldp x21, x22, [sp], #16
2178diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/TiledC4MatmulFp16.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/TiledC4MatmulFp16.S
2179index d7570d18..5b616ae7 100644
2180--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/TiledC4MatmulFp16.S
2181+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/TiledC4MatmulFp16.S
2182@@ -22,8 +22,9 @@
2183 asm_function TiledC4MatmulFp16
2184 
2185 sub sp, sp, #128
2186-st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
2187-st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
2188+st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
2189+add x9, sp, #64
2190+st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
2191 
2192 mov x7, #2 //sizeof(float)
2193 mul x3, x3, x7
2194@@ -265,7 +266,6 @@ LoopOcHalf:
2195     st1 {v20.4h, v21.4h, v22.4h, v23.4h}, [x0], #32
2196 
2197 LoopOcEnd:
2198-    sub sp, sp, #128
2199     ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
2200     ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
2201     ret
2202diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/VecMatmulFp16.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/VecMatmulFp16.S
2203index d11dd472..0df891d3 100644
2204--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/VecMatmulFp16.S
2205+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/VecMatmulFp16.S
2206@@ -31,8 +31,9 @@
2207 
2208 asm_function VecMatmulFp16Neon64_2
2209   sub sp, sp, #128
2210-  st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
2211-  st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64
2212+  st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp]
2213+  add x9, sp, #64
2214+  st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x9]
2215 
2216 LoopCol:
2217   mov x15, x0   // reload a ptr
2218@@ -174,7 +175,6 @@ Write7:
2219   b End
2220 
2221 End:
2222-  sub sp, sp, #128
2223   ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
2224   ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64
2225   ret
2226diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/WinogradTransLeftFp16.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/WinogradTransLeftFp16.S
2227index 1970c16a..c9b4104e 100644
2228--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/WinogradTransLeftFp16.S
2229+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/WinogradTransLeftFp16.S
2230@@ -22,7 +22,7 @@
2231 asm_function WinogradTransLeftFp16
2232 
2233 sub sp, sp, #16
2234-stp x19, x20, [sp], #16
2235+stp x19, x20, [sp]
2236 
2237 mov x8, #8 // 4 * sizeof(float16)
2238 mul x8, x6, x8
2239@@ -144,7 +144,6 @@ LoopH:
2240     subs x4, x4, #1
2241     bne LoopH
2242 
2243-    sub sp, sp, #16
2244     ldp x19, x20, [sp], #16
2245     ret
2246 
2247diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/WinogradTransRightFp16.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/WinogradTransRightFp16.S
2248index c575f504..46c3cd84 100644
2249--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/WinogradTransRightFp16.S
2250+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/WinogradTransRightFp16.S
2251@@ -22,7 +22,7 @@
2252 asm_function WinogradTransRightFp16
2253 
2254 sub sp, sp, #16
2255-stp x19, x20, [sp], #16
2256+stp x19, x20, [sp]
2257 
2258 mov x8, #8 // 4 * sizeof(float16)
2259 mul x8, x6, x8
2260@@ -147,7 +147,6 @@ LoopH:
2261     subs x4, x4, #1
2262     bne LoopH
2263 
2264-    sub sp, sp, #16
2265     ldp x19, x20, [sp], #16
2266 
2267     ret
2268-- 
22692.17.1
2270
2271