1be168c0dSopenharmony_ciFrom c580b97cbfea388ac393f617c4d960021bf11322 Mon Sep 17 00:00:00 2001
2be168c0dSopenharmony_ciFrom: chengfeng27 <chengfeng27@huawei.com>
3be168c0dSopenharmony_ciDate: Mon, 12 Aug 2024 11:42:12 +0800
4be168c0dSopenharmony_ciSubject: [PATCH] fix arm64/fp16 assembly not protecting the stack in the
5be168c0dSopenharmony_ci multi-thread switch case
6be168c0dSopenharmony_ci
7be168c0dSopenharmony_ci---
8be168c0dSopenharmony_ci .../kernel/nnacl/assembly/arm64/AdderFp32.S   | 10 ++---
9be168c0dSopenharmony_ci .../nnacl/assembly/arm64/BigMatmulFp32Opt.S   | 22 +++++-----
10be168c0dSopenharmony_ci .../assembly/arm64/ConvDw3x3Fp32Stride1.S     | 12 ++---
11be168c0dSopenharmony_ci .../assembly/arm64/ConvDw3x3Fp32Stride2.S     | 12 ++---
12be168c0dSopenharmony_ci .../nnacl/assembly/arm64/ConvDw3x3Int8.S      | 34 +++++++-------
13be168c0dSopenharmony_ci .../assembly/arm64/ConvDw3x3Int8Corner.S      | 19 ++++----
14be168c0dSopenharmony_ci .../assembly/arm64/ConvDw3x3Int8Horizontal.S  | 25 +++++------
15be168c0dSopenharmony_ci .../assembly/arm64/ConvDw3x3Int8Stride2.S     | 34 +++++++-------
16be168c0dSopenharmony_ci .../assembly/arm64/ConvDw3x3Int8Vertical.S    | 19 ++++----
17be168c0dSopenharmony_ci .../nnacl/assembly/arm64/ConvDw3x3Line.S      |  6 +--
18be168c0dSopenharmony_ci .../nnacl/assembly/arm64/ConvDwFp32Center.S   | 30 ++++++-------
19be168c0dSopenharmony_ci .../assembly/arm64/ConvDwFp32Indirect3x3.S    |  7 ++-
20be168c0dSopenharmony_ci .../nnacl/assembly/arm64/ConvDwInt8Center.S   | 44 +++++++++----------
21be168c0dSopenharmony_ci .../nnacl/assembly/arm64/ConvFp32Center.S     | 42 +++++++++---------
22be168c0dSopenharmony_ci .../nnacl/assembly/arm64/ConvSW1x16Kernel.S   | 23 +++++-----
23be168c0dSopenharmony_ci .../nnacl/assembly/arm64/ConvSW1x8Kernel.S    | 21 +++++----
24be168c0dSopenharmony_ci .../nnacl/assembly/arm64/ConvSW2x16Kernel.S   | 21 +++++----
25be168c0dSopenharmony_ci .../nnacl/assembly/arm64/ConvSW2x8Kernel.S    | 21 +++++----
26be168c0dSopenharmony_ci .../nnacl/assembly/arm64/ConvSW3x16Kernel.S   | 23 +++++-----
27be168c0dSopenharmony_ci .../nnacl/assembly/arm64/ConvSW3x8Kernel.S    | 21 +++++----
28be168c0dSopenharmony_ci .../nnacl/assembly/arm64/ConvSW4x16Kernel.S   | 28 ++++++------
29be168c0dSopenharmony_ci .../nnacl/assembly/arm64/ConvSW4x8Kernel.S    | 28 ++++++------
30be168c0dSopenharmony_ci .../nnacl/assembly/arm64/ConvSW5x16Kernel.S   | 28 ++++++------
31be168c0dSopenharmony_ci .../nnacl/assembly/arm64/ConvSW5x8Kernel.S    | 28 ++++++------
32be168c0dSopenharmony_ci .../nnacl/assembly/arm64/DeconvDwFp32Center.S | 15 +++----
33be168c0dSopenharmony_ci .../nnacl/assembly/arm64/DeconvDwInt8Center.S | 15 +++----
34be168c0dSopenharmony_ci .../nnacl/assembly/arm64/MatVecMulFp32.S      | 24 +++++-----
35be168c0dSopenharmony_ci .../nnacl/assembly/arm64/MatVecMulPackFp32.S  | 15 +++----
36be168c0dSopenharmony_ci .../kernel/nnacl/assembly/arm64/MatmulFp32.S  | 14 +++---
37be168c0dSopenharmony_ci .../nnacl/assembly/arm64/MatmulFp32Opt.S      | 16 +++----
38be168c0dSopenharmony_ci .../nnacl/assembly/arm64/MatmulFp32OptRow12.S | 14 +++---
39be168c0dSopenharmony_ci .../nnacl/assembly/arm64/MatmulFp32OptRow4.S  | 16 +++----
40be168c0dSopenharmony_ci .../nnacl/assembly/arm64/MatmulFp32OptRow8.S  | 14 +++---
41be168c0dSopenharmony_ci .../kernel/nnacl/assembly/arm64/MatmulInt8.S  | 38 ++++++++--------
42be168c0dSopenharmony_ci .../nnacl/assembly/arm64/MatmulInt8Opt.S      | 44 +++++++++----------
43be168c0dSopenharmony_ci .../nnacl/assembly/arm64/MatmulR4Int8.S       |  8 ++--
44be168c0dSopenharmony_ci .../nnacl/assembly/arm64/MatmulWinogradFp32.S |  7 ++-
45be168c0dSopenharmony_ci .../nnacl/assembly/arm64/PostFuncBiasReluC8.S |  6 +--
46be168c0dSopenharmony_ci .../assembly/arm64/PostFuncInt8C4Neon64.S     | 15 +++----
47be168c0dSopenharmony_ci .../kernel/nnacl/assembly/arm64/SPMM8x8Fp32.S |  8 ++--
48be168c0dSopenharmony_ci .../nnacl/assembly/arm64/TiledC4MatmulFp32.S  |  6 +--
49be168c0dSopenharmony_ci .../nnacl/assembly/arm64/WinogradTransLeft.S  |  3 +-
50be168c0dSopenharmony_ci .../nnacl/assembly/arm64/WinogradTransRight.S |  3 +-
51be168c0dSopenharmony_ci .../nnacl/assembly/fp16/ConvDwFp16Center.S    | 30 ++++++-------
52be168c0dSopenharmony_ci .../nnacl/assembly/fp16/DeconvDwFp16Center.S  | 15 +++----
53be168c0dSopenharmony_ci .../nnacl/assembly/fp16/MatVecMulFp16.S       |  6 +--
54be168c0dSopenharmony_ci .../nnacl/assembly/fp16/Matmul12X16Fp16.S     | 14 +++---
55be168c0dSopenharmony_ci .../nnacl/assembly/fp16/MatmulBaseFp16Neon.S  | 14 +++---
56be168c0dSopenharmony_ci .../kernel/nnacl/assembly/fp16/MatmulFp16.S   | 14 +++---
57be168c0dSopenharmony_ci .../nnacl/assembly/fp16/MatmulFp16Opt.S       | 11 +++--
58be168c0dSopenharmony_ci .../nnacl/assembly/fp16/MatmulFp16OptV2.S     | 20 ++++-----
59be168c0dSopenharmony_ci .../nnacl/assembly/fp16/MatmulWinogradFp16.S  |  7 ++-
60be168c0dSopenharmony_ci .../nnacl/assembly/fp16/TiledC4MatmulFp16.S   |  6 +--
61be168c0dSopenharmony_ci .../nnacl/assembly/fp16/VecMatmulFp16.S       |  6 +--
62be168c0dSopenharmony_ci .../assembly/fp16/WinogradTransLeftFp16.S     |  3 +-
63be168c0dSopenharmony_ci .../assembly/fp16/WinogradTransRightFp16.S    |  3 +-
64be168c0dSopenharmony_ci 56 files changed, 483 insertions(+), 505 deletions(-)
65be168c0dSopenharmony_ci
66be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/AdderFp32.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/AdderFp32.S
67be168c0dSopenharmony_ciindex 66136f42..9123d88c 100644
68be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/AdderFp32.S
69be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/AdderFp32.S
70be168c0dSopenharmony_ci@@ -34,11 +34,12 @@
71be168c0dSopenharmony_ci 
72be168c0dSopenharmony_ci asm_function AdderFloatNeon64
73be168c0dSopenharmony_ci     sub sp, sp, #144
74be168c0dSopenharmony_ci-    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
75be168c0dSopenharmony_ci-    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
76be168c0dSopenharmony_ci-    stp x19, x20, [sp], #16
77be168c0dSopenharmony_ci+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
78be168c0dSopenharmony_ci+    add x9, sp, #64
79be168c0dSopenharmony_ci+    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
80be168c0dSopenharmony_ci+    stp x19, x20, [sp, #128]
81be168c0dSopenharmony_ci 
82be168c0dSopenharmony_ci-    ldr x8, [sp]
83be168c0dSopenharmony_ci+    ldr x8, [sp, #144]
84be168c0dSopenharmony_ci 
85be168c0dSopenharmony_ci     mov x20, #48 // sizeof(float) * 12
86be168c0dSopenharmony_ci     mul x17, x5, x20 // block stride of lhs/rhs: sizeof(float) * 12 * depth
87be168c0dSopenharmony_ci@@ -614,7 +615,6 @@ LoopColEnd:
88be168c0dSopenharmony_ci         subs x6, x6, #12
89be168c0dSopenharmony_ci         bgt LoopRowStart
90be168c0dSopenharmony_ci 
91be168c0dSopenharmony_ci-  sub sp, sp, #144
92be168c0dSopenharmony_ci   ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
93be168c0dSopenharmony_ci   ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
94be168c0dSopenharmony_ci   ldp x19, x20, [sp], #16
95be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/BigMatmulFp32Opt.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/BigMatmulFp32Opt.S
96be168c0dSopenharmony_ciindex 498038ff..03898585 100644
97be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/BigMatmulFp32Opt.S
98be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/BigMatmulFp32Opt.S
99be168c0dSopenharmony_ci@@ -33,16 +33,17 @@
100be168c0dSopenharmony_ci 
101be168c0dSopenharmony_ci asm_function BigMatmulFloatNeon64Opt
102be168c0dSopenharmony_ci     sub sp, sp, #224
103be168c0dSopenharmony_ci-    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
104be168c0dSopenharmony_ci-    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
105be168c0dSopenharmony_ci-    stp x19, x20, [sp], #16
106be168c0dSopenharmony_ci-    stp x21, x22, [sp], #16
107be168c0dSopenharmony_ci-    stp x23, x24, [sp], #16
108be168c0dSopenharmony_ci-    stp x25, x26, [sp], #16
109be168c0dSopenharmony_ci-    stp x27, x28, [sp], #16
110be168c0dSopenharmony_ci-    stp x29, x30, [sp], #16
111be168c0dSopenharmony_ci-
112be168c0dSopenharmony_ci-    ldr x8, [sp]
113be168c0dSopenharmony_ci+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
114be168c0dSopenharmony_ci+    add x9, sp, #64
115be168c0dSopenharmony_ci+    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
116be168c0dSopenharmony_ci+    stp x19, x20, [sp, #128]
117be168c0dSopenharmony_ci+    stp x21, x22, [sp, #144]
118be168c0dSopenharmony_ci+    stp x23, x24, [sp, #160]
119be168c0dSopenharmony_ci+    stp x25, x26, [sp, #176]
120be168c0dSopenharmony_ci+    stp x27, x28, [sp, #192]
121be168c0dSopenharmony_ci+    stp x29, x30, [sp, #208]
122be168c0dSopenharmony_ci+
123be168c0dSopenharmony_ci+    ldr x8, [sp, #224]
124be168c0dSopenharmony_ci     mov x20, #1
125be168c0dSopenharmony_ci     mov x22, #32
126be168c0dSopenharmony_ci     mov x23, #48
127be168c0dSopenharmony_ci@@ -2515,7 +2516,6 @@ Compute4x4Unit:
128be168c0dSopenharmony_ci         ret
129be168c0dSopenharmony_ci 
130be168c0dSopenharmony_ci End:
131be168c0dSopenharmony_ci-  sub sp, sp, #224
132be168c0dSopenharmony_ci   ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
133be168c0dSopenharmony_ci   ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
134be168c0dSopenharmony_ci   ldp x19, x20, [sp], #16
135be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Fp32Stride1.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Fp32Stride1.S
136be168c0dSopenharmony_ciindex f04d9082..b96efd64 100644
137be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Fp32Stride1.S
138be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Fp32Stride1.S
139be168c0dSopenharmony_ci@@ -36,12 +36,13 @@
140be168c0dSopenharmony_ci 
141be168c0dSopenharmony_ci asm_function ConvDw3x3Stride1
142be168c0dSopenharmony_ci     sub sp, sp, #128
143be168c0dSopenharmony_ci-    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
144be168c0dSopenharmony_ci-    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
145be168c0dSopenharmony_ci+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
146be168c0dSopenharmony_ci+    add x9, sp, #64
147be168c0dSopenharmony_ci+    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
148be168c0dSopenharmony_ci 
149be168c0dSopenharmony_ci-    ldr w8, [sp]
150be168c0dSopenharmony_ci-    ldr w9, [sp, #8]
151be168c0dSopenharmony_ci-    ldr w10, [sp, #16]
152be168c0dSopenharmony_ci+    ldr w8, [sp, #128]
153be168c0dSopenharmony_ci+    ldr w9, [sp, #136]
154be168c0dSopenharmony_ci+    ldr w10, [sp, #144]
155be168c0dSopenharmony_ci 
156be168c0dSopenharmony_ci     mov w11, #4
157be168c0dSopenharmony_ci     mul w15, w4, w11   // col_size * 4
158be168c0dSopenharmony_ci@@ -203,7 +204,6 @@ WIDTH1_LEFT:
159be168c0dSopenharmony_ci         st1 {v21.4s}, [x0]
160be168c0dSopenharmony_ci 
161be168c0dSopenharmony_ci End:
162be168c0dSopenharmony_ci-    sub sp, sp, #128
163be168c0dSopenharmony_ci     ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
164be168c0dSopenharmony_ci     ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
165be168c0dSopenharmony_ci     ret
166be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Fp32Stride2.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Fp32Stride2.S
167be168c0dSopenharmony_ciindex 0dd075dd..7632d48e 100644
168be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Fp32Stride2.S
169be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Fp32Stride2.S
170be168c0dSopenharmony_ci@@ -36,12 +36,13 @@
171be168c0dSopenharmony_ci 
172be168c0dSopenharmony_ci asm_function ConvDw3x3Stride2
173be168c0dSopenharmony_ci     sub sp, sp, #128
174be168c0dSopenharmony_ci-    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
175be168c0dSopenharmony_ci-    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
176be168c0dSopenharmony_ci+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
177be168c0dSopenharmony_ci+    add x9, sp, #64
178be168c0dSopenharmony_ci+    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
179be168c0dSopenharmony_ci 
180be168c0dSopenharmony_ci-    ldr w8, [sp]
181be168c0dSopenharmony_ci-    ldr w9, [sp, #8]
182be168c0dSopenharmony_ci-    ldr w10, [sp, #16]
183be168c0dSopenharmony_ci+    ldr w8, [sp, #128]
184be168c0dSopenharmony_ci+    ldr w9, [sp, #136]
185be168c0dSopenharmony_ci+    ldr w10, [sp, #144]
186be168c0dSopenharmony_ci 
187be168c0dSopenharmony_ci     mov w11, #4
188be168c0dSopenharmony_ci     mul w15, w4, w11   // col_size * 4
189be168c0dSopenharmony_ci@@ -205,7 +206,6 @@ WIDTH1_LEFT:
190be168c0dSopenharmony_ci         st1 {v24.4s}, [x0]
191be168c0dSopenharmony_ci 
192be168c0dSopenharmony_ci End:
193be168c0dSopenharmony_ci-    sub sp, sp, #128
194be168c0dSopenharmony_ci     ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
195be168c0dSopenharmony_ci     ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
196be168c0dSopenharmony_ci     ret
197be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8.S
198be168c0dSopenharmony_ciindex bfb9b8f6..5187d368 100644
199be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8.S
200be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8.S
201be168c0dSopenharmony_ci@@ -44,22 +44,23 @@
202be168c0dSopenharmony_ci 
203be168c0dSopenharmony_ci asm_function ConvDw3x3Int8Neon64
204be168c0dSopenharmony_ci   sub sp, sp, #192
205be168c0dSopenharmony_ci-  st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
206be168c0dSopenharmony_ci-  st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
207be168c0dSopenharmony_ci-  stp x19, x20, [sp], #16
208be168c0dSopenharmony_ci-  stp x21, x22, [sp], #16
209be168c0dSopenharmony_ci-  stp x23, x24, [sp], #16
210be168c0dSopenharmony_ci-  stp x25, x26, [sp], #16
211be168c0dSopenharmony_ci-
212be168c0dSopenharmony_ci-  ldr x8, [sp]
213be168c0dSopenharmony_ci-  ldr x9, [sp, #8]
214be168c0dSopenharmony_ci-  ldr x10, [sp, #16]
215be168c0dSopenharmony_ci-  ldr x11, [sp, #24]
216be168c0dSopenharmony_ci-  ldr x12, [sp, #32]
217be168c0dSopenharmony_ci-  ldr x13, [sp, #40]
218be168c0dSopenharmony_ci-  ldr x14, [sp, #48]
219be168c0dSopenharmony_ci-  ldr x15, [sp, #56]
220be168c0dSopenharmony_ci-  ldr x23, [sp, #64]  // per_channel
221be168c0dSopenharmony_ci+  st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
222be168c0dSopenharmony_ci+  add x9, sp, #64
223be168c0dSopenharmony_ci+  st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
224be168c0dSopenharmony_ci+  stp x19, x20, [sp, #128]
225be168c0dSopenharmony_ci+  stp x21, x22, [sp, #144]
226be168c0dSopenharmony_ci+  stp x23, x24, [sp, #160]
227be168c0dSopenharmony_ci+  stp x25, x26, [sp, #176]
228be168c0dSopenharmony_ci+
229be168c0dSopenharmony_ci+  ldr x8, [sp, #192]
230be168c0dSopenharmony_ci+  ldr x9, [sp, #200]
231be168c0dSopenharmony_ci+  ldr x10, [sp, #208]
232be168c0dSopenharmony_ci+  ldr x11, [sp, #216]
233be168c0dSopenharmony_ci+  ldr x12, [sp, #224]
234be168c0dSopenharmony_ci+  ldr x13, [sp, #232]
235be168c0dSopenharmony_ci+  ldr x14, [sp, #240]
236be168c0dSopenharmony_ci+  ldr x15, [sp, #248]
237be168c0dSopenharmony_ci+  ldr x23, [sp, #256]  // per_channel
238be168c0dSopenharmony_ci 
239be168c0dSopenharmony_ci   add x19, x3, #16
240be168c0dSopenharmony_ci   add w20, w6, w6   // channel * 2
241be168c0dSopenharmony_ci@@ -488,7 +489,6 @@ OUTZP3:
242be168c0dSopenharmony_ci   st1 {v21.8b}, [x0], x6
243be168c0dSopenharmony_ci 
244be168c0dSopenharmony_ci End:
245be168c0dSopenharmony_ci-  sub sp, sp, #192
246be168c0dSopenharmony_ci   ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
247be168c0dSopenharmony_ci   ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
248be168c0dSopenharmony_ci   ldp x19, x20, [sp], #16
249be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Corner.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Corner.S
250be168c0dSopenharmony_ciindex b07ac01b..416e1a3a 100644
251be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Corner.S
252be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Corner.S
253be168c0dSopenharmony_ci@@ -32,20 +32,20 @@ asm_function ConvDw3x3Int8Corner
254be168c0dSopenharmony_ci     // x19 ~ x29 should be also preserved
255be168c0dSopenharmony_ci     // whereas our coding style do not permit such amount of parameters
256be168c0dSopenharmony_ci     sub sp, sp, #32
257be168c0dSopenharmony_ci-    stp x19, x20, [sp], #16
258be168c0dSopenharmony_ci-    stp x21, x22, [sp], #16
259be168c0dSopenharmony_ci+    stp x19, x20, [sp]
260be168c0dSopenharmony_ci+    stp x21, x22, [sp, #16]
261be168c0dSopenharmony_ci 
262be168c0dSopenharmony_ci     dup v25.8b, w7                      // in_zp
263be168c0dSopenharmony_ci-    ldr x8, [sp]
264be168c0dSopenharmony_ci+    ldr x8, [sp, #32]
265be168c0dSopenharmony_ci     dup v26.4s, w8                      // out_zp
266be168c0dSopenharmony_ci-    ldr x9, [sp, #8]                    // out_multiplier
267be168c0dSopenharmony_ci-    ldr x10, [sp, #16]                  // left_shift
268be168c0dSopenharmony_ci-    ldr x11, [sp, #24]                  // right_shift
269be168c0dSopenharmony_ci-    ldr x12, [sp, #32]
270be168c0dSopenharmony_ci+    ldr x9, [sp, #40]                    // out_multiplier
271be168c0dSopenharmony_ci+    ldr x10, [sp, #48]                  // left_shift
272be168c0dSopenharmony_ci+    ldr x11, [sp, #56]                  // right_shift
273be168c0dSopenharmony_ci+    ldr x12, [sp, #64]
274be168c0dSopenharmony_ci     dup v30.4s, w12                     // acc_min
275be168c0dSopenharmony_ci-    ldr x13, [sp, #40]
276be168c0dSopenharmony_ci+    ldr x13, [sp, #72]
277be168c0dSopenharmony_ci     dup v31.4s, w13                     // acc_max
278be168c0dSopenharmony_ci-    ldr x14, [sp, #48]                  // per_channel
279be168c0dSopenharmony_ci+    ldr x14, [sp, #80]                  // per_channel
280be168c0dSopenharmony_ci     cbnz x14, PerChannelDump
281be168c0dSopenharmony_ci     PerLayerDump:
282be168c0dSopenharmony_ci         ld1r {v27.4s}, [x9]
283be168c0dSopenharmony_ci@@ -216,7 +216,6 @@ asm_function ConvDw3x3Int8Corner
284be168c0dSopenharmony_ci         st1 {v23.s}[0], [x0], #4
285be168c0dSopenharmony_ci         st1 {v24.s}[0], [x0], #4
286be168c0dSopenharmony_ci 
287be168c0dSopenharmony_ci-    sub sp, sp, #32
288be168c0dSopenharmony_ci     ldp x19, x20, [sp], #16
289be168c0dSopenharmony_ci     ldp x21, x22, [sp], #16
290be168c0dSopenharmony_ci     ret
291be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Horizontal.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Horizontal.S
292be168c0dSopenharmony_ciindex 92eeffea..379154e6 100644
293be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Horizontal.S
294be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Horizontal.S
295be168c0dSopenharmony_ci@@ -32,21 +32,21 @@ asm_function ConvDw3x3Int8Horizontal
296be168c0dSopenharmony_ci     // x19 ~ x29 should be also preserved
297be168c0dSopenharmony_ci     // whereas our coding style do not permit such amount of parameters
298be168c0dSopenharmony_ci     sub sp, sp, #48
299be168c0dSopenharmony_ci-    stp x19, x20, [sp], #16
300be168c0dSopenharmony_ci-    stp x21, x22, [sp], #16
301be168c0dSopenharmony_ci-    stp x23, x24, [sp], #16
302be168c0dSopenharmony_ci+    stp x19, x20, [sp]
303be168c0dSopenharmony_ci+    stp x21, x22, [sp, #16]
304be168c0dSopenharmony_ci+    stp x23, x24, [sp, #32]
305be168c0dSopenharmony_ci 
306be168c0dSopenharmony_ci     dup v25.8b, w7                      // in_zp
307be168c0dSopenharmony_ci-    ldr x8, [sp]
308be168c0dSopenharmony_ci+    ldr x8, [sp, #48]
309be168c0dSopenharmony_ci     dup v26.4s, w8                      // out_zp
310be168c0dSopenharmony_ci-    ldr x9, [sp, #8]                    // out_multiplier
311be168c0dSopenharmony_ci-    ldr x10, [sp, #16]                  // left_shift
312be168c0dSopenharmony_ci-    ldr x11, [sp, #24]                  // right_shift
313be168c0dSopenharmony_ci-    ldr x12, [sp, #32]
314be168c0dSopenharmony_ci+    ldr x9, [sp, #56]                    // out_multiplier
315be168c0dSopenharmony_ci+    ldr x10, [sp, #64]                  // left_shift
316be168c0dSopenharmony_ci+    ldr x11, [sp, #72]                  // right_shift
317be168c0dSopenharmony_ci+    ldr x12, [sp, #80]
318be168c0dSopenharmony_ci     dup v30.4s, w12                     // acc_min
319be168c0dSopenharmony_ci-    ldr x13, [sp, #40]
320be168c0dSopenharmony_ci+    ldr x13, [sp, #88]
321be168c0dSopenharmony_ci     dup v31.4s, w13                     // acc_max
322be168c0dSopenharmony_ci-    ldr x14, [sp, #48]                  // per_channel
323be168c0dSopenharmony_ci+    ldr x14, [sp, #96]                  // per_channel
324be168c0dSopenharmony_ci     cbnz x14, PerChannelDump
325be168c0dSopenharmony_ci     PerLayerDump:
326be168c0dSopenharmony_ci         ld1r {v27.4s}, [x9]
327be168c0dSopenharmony_ci@@ -58,9 +58,9 @@ asm_function ConvDw3x3Int8Horizontal
328be168c0dSopenharmony_ci         ld1 {v28.4s}, [x10], #16
329be168c0dSopenharmony_ci         ld1 {v29.4s}, [x11], #16
330be168c0dSopenharmony_ci     ContinueFunc:
331be168c0dSopenharmony_ci-    ldr x12, [sp, #32]
332be168c0dSopenharmony_ci+    ldr x12, [sp, #80]
333be168c0dSopenharmony_ci     dup v30.4s, w12                     // acc_min
334be168c0dSopenharmony_ci-    ldr x13, [sp, #40]
335be168c0dSopenharmony_ci+    ldr x13, [sp, #88]
336be168c0dSopenharmony_ci     dup v31.4s, w13                     // acc_max
337be168c0dSopenharmony_ci 
338be168c0dSopenharmony_ci     mov x12, #2
339be168c0dSopenharmony_ci@@ -248,7 +248,6 @@ asm_function ConvDw3x3Int8Horizontal
340be168c0dSopenharmony_ci 
341be168c0dSopenharmony_ci         st1 {v23.s}[0], [x0], #4
342be168c0dSopenharmony_ci         st1 {v24.s}[0], [x0], #4
343be168c0dSopenharmony_ci-    sub sp, sp, #48
344be168c0dSopenharmony_ci     ldp x19, x20, [sp], #16
345be168c0dSopenharmony_ci     ldp x21, x22, [sp], #16
346be168c0dSopenharmony_ci     ldp x23, x24, [sp], #16
347be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Stride2.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Stride2.S
348be168c0dSopenharmony_ciindex cc1b3e9b..8643a536 100644
349be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Stride2.S
350be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Stride2.S
351be168c0dSopenharmony_ci@@ -44,22 +44,23 @@
352be168c0dSopenharmony_ci 
353be168c0dSopenharmony_ci asm_function ConvDw3x3Int8Stride2
354be168c0dSopenharmony_ci     sub sp, sp, #192
355be168c0dSopenharmony_ci-    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
356be168c0dSopenharmony_ci-    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
357be168c0dSopenharmony_ci-    stp x19, x20, [sp], #16
358be168c0dSopenharmony_ci-    stp x21, x22, [sp], #16
359be168c0dSopenharmony_ci-    stp x23, x24, [sp], #16
360be168c0dSopenharmony_ci-    stp x25, x26, [sp], #16
361be168c0dSopenharmony_ci-
362be168c0dSopenharmony_ci-    ldr x8, [sp]
363be168c0dSopenharmony_ci-    ldr x9, [sp, #8]
364be168c0dSopenharmony_ci-    ldr x10, [sp, #16]
365be168c0dSopenharmony_ci-    ldr x11, [sp, #24]
366be168c0dSopenharmony_ci-    ldr x12, [sp, #32]
367be168c0dSopenharmony_ci-    ldr x13, [sp, #40]
368be168c0dSopenharmony_ci-    ldr x14, [sp, #48]
369be168c0dSopenharmony_ci-    ldr x15, [sp, #56]
370be168c0dSopenharmony_ci-    ldr x23, [sp, #64]  // per_channel
371be168c0dSopenharmony_ci+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
372be168c0dSopenharmony_ci+    add x9, sp, #64
373be168c0dSopenharmony_ci+    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
374be168c0dSopenharmony_ci+    stp x19, x20, [sp, #128]
375be168c0dSopenharmony_ci+    stp x21, x22, [sp, #144]
376be168c0dSopenharmony_ci+    stp x23, x24, [sp, #160]
377be168c0dSopenharmony_ci+    stp x25, x26, [sp, #176]
378be168c0dSopenharmony_ci+
379be168c0dSopenharmony_ci+    ldr x8, [sp, #192]
380be168c0dSopenharmony_ci+    ldr x9, [sp, #200]
381be168c0dSopenharmony_ci+    ldr x10, [sp, #208]
382be168c0dSopenharmony_ci+    ldr x11, [sp, #216]
383be168c0dSopenharmony_ci+    ldr x12, [sp, #224]
384be168c0dSopenharmony_ci+    ldr x13, [sp, #232]
385be168c0dSopenharmony_ci+    ldr x14, [sp, #240]
386be168c0dSopenharmony_ci+    ldr x15, [sp, #248]
387be168c0dSopenharmony_ci+    ldr x23, [sp, #256]  // per_channel
388be168c0dSopenharmony_ci 
389be168c0dSopenharmony_ci     add x19, x3, #16
390be168c0dSopenharmony_ci     add w20, w6, w6   // channel * 2
391be168c0dSopenharmony_ci@@ -463,7 +464,6 @@ OUTZP3:
392be168c0dSopenharmony_ci     st1 {v24.8b}, [x0], x6
393be168c0dSopenharmony_ci 
394be168c0dSopenharmony_ci End:
395be168c0dSopenharmony_ci-    sub sp, sp, #192
396be168c0dSopenharmony_ci     ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
397be168c0dSopenharmony_ci     ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
398be168c0dSopenharmony_ci     ldp x19, x20, [sp], #16
399be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Vertical.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Vertical.S
400be168c0dSopenharmony_ciindex 67151534..706bc9fe 100644
401be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Vertical.S
402be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Vertical.S
403be168c0dSopenharmony_ci@@ -32,20 +32,20 @@ asm_function ConvDw3x3Int8Vertical
404be168c0dSopenharmony_ci     // x19 ~ x29 should be also preserved
405be168c0dSopenharmony_ci     // whereas our coding style do not permit such amount of parameters
406be168c0dSopenharmony_ci     sub sp, sp, #32
407be168c0dSopenharmony_ci-    stp x19, x20, [sp], #16
408be168c0dSopenharmony_ci-    stp x21, x22, [sp], #16
409be168c0dSopenharmony_ci+    stp x19, x20, [sp]
410be168c0dSopenharmony_ci+    stp x21, x22, [sp, #16]
411be168c0dSopenharmony_ci 
412be168c0dSopenharmony_ci     dup v25.8b, w7                      // in_zp
413be168c0dSopenharmony_ci-    ldr x8, [sp]
414be168c0dSopenharmony_ci+    ldr x8, [sp, #32]
415be168c0dSopenharmony_ci     dup v26.4s, w8                      // out_zp
416be168c0dSopenharmony_ci-    ldr x9, [sp, #8]                    // out_multiplier
417be168c0dSopenharmony_ci-    ldr x10, [sp, #16]                  // left_shift
418be168c0dSopenharmony_ci-    ldr x11, [sp, #24]                  // right_shift
419be168c0dSopenharmony_ci-    ldr x12, [sp, #32]
420be168c0dSopenharmony_ci+    ldr x9, [sp, #40]                    // out_multiplier
421be168c0dSopenharmony_ci+    ldr x10, [sp, #48]                  // left_shift
422be168c0dSopenharmony_ci+    ldr x11, [sp, #56]                  // right_shift
423be168c0dSopenharmony_ci+    ldr x12, [sp, #64]
424be168c0dSopenharmony_ci     dup v30.4s, w12                     // acc_min
425be168c0dSopenharmony_ci-    ldr x13, [sp, #40]
426be168c0dSopenharmony_ci+    ldr x13, [sp, #72]
427be168c0dSopenharmony_ci     dup v31.4s, w13                     // acc_max
428be168c0dSopenharmony_ci-    ldr x14, [sp, #48]                  // per_channel
429be168c0dSopenharmony_ci+    ldr x14, [sp, #80]                  // per_channel
430be168c0dSopenharmony_ci     cbnz x14, PerChannelDump
431be168c0dSopenharmony_ci     PerLayerDump:
432be168c0dSopenharmony_ci         ld1r {v27.4s}, [x9]
433be168c0dSopenharmony_ci@@ -239,7 +239,6 @@ asm_function ConvDw3x3Int8Vertical
434be168c0dSopenharmony_ci 
435be168c0dSopenharmony_ci         st1 {v23.s}[0], [x0], #4
436be168c0dSopenharmony_ci         st1 {v24.s}[0], [x0], #4
437be168c0dSopenharmony_ci-    sub sp, sp, #32
438be168c0dSopenharmony_ci     ldp x19, x20, [sp], #16
439be168c0dSopenharmony_ci     ldp x21, x22, [sp], #16
440be168c0dSopenharmony_ci     ret
441be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Line.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Line.S
442be168c0dSopenharmony_ciindex 6157848e..f939ec62 100644
443be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Line.S
444be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Line.S
445be168c0dSopenharmony_ci@@ -29,8 +29,9 @@ asm_function ConvDw3x3Line
446be168c0dSopenharmony_ci     // x19 ~ x29 should be also preserved
447be168c0dSopenharmony_ci     // whereas our coding style do not permit such amount of parameters
448be168c0dSopenharmony_ci     sub sp, sp, #128
449be168c0dSopenharmony_ci-    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
450be168c0dSopenharmony_ci-    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
451be168c0dSopenharmony_ci+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
452be168c0dSopenharmony_ci+    add x9, sp, #64
453be168c0dSopenharmony_ci+    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
454be168c0dSopenharmony_ci 
455be168c0dSopenharmony_ci     ldr x8, [x1]
456be168c0dSopenharmony_ci     ldr x9, [x1, #8]
457be168c0dSopenharmony_ci@@ -196,7 +197,6 @@ asm_function ConvDw3x3Line
458be168c0dSopenharmony_ci         add x0, x0, #16
459be168c0dSopenharmony_ci         bgt LoopC4
460be168c0dSopenharmony_ci 
461be168c0dSopenharmony_ci-    sub sp, sp, #128
462be168c0dSopenharmony_ci     ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
463be168c0dSopenharmony_ci     ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
464be168c0dSopenharmony_ci     ret
465be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDwFp32Center.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDwFp32Center.S
466be168c0dSopenharmony_ciindex e9ddd65a..6f30c3ac 100644
467be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDwFp32Center.S
468be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDwFp32Center.S
469be168c0dSopenharmony_ci@@ -31,21 +31,22 @@ asm_function ConvDwFp32Center
470be168c0dSopenharmony_ci     // x19 ~ x29 should be also preserved
471be168c0dSopenharmony_ci     // whereas our coding style do not permit such amount of parameters
472be168c0dSopenharmony_ci     sub sp, sp, #192
473be168c0dSopenharmony_ci-    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
474be168c0dSopenharmony_ci-    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
475be168c0dSopenharmony_ci-    stp x19, x20, [sp], #16
476be168c0dSopenharmony_ci-    stp x21, x22, [sp], #16
477be168c0dSopenharmony_ci-    stp x23, x24, [sp], #16
478be168c0dSopenharmony_ci-    stp x25, x26, [sp], #16
479be168c0dSopenharmony_ci+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
480be168c0dSopenharmony_ci+    add x9, sp, #64
481be168c0dSopenharmony_ci+    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
482be168c0dSopenharmony_ci+    stp x19, x20, [sp, #128]
483be168c0dSopenharmony_ci+    stp x21, x22, [sp, #144]
484be168c0dSopenharmony_ci+    stp x23, x24, [sp, #160]
485be168c0dSopenharmony_ci+    stp x25, x26, [sp, #176]
486be168c0dSopenharmony_ci 
487be168c0dSopenharmony_ci-    ldr x8, [sp]
488be168c0dSopenharmony_ci-    ldr x9, [sp, #8]
489be168c0dSopenharmony_ci-    ldr x10, [sp, #16]
490be168c0dSopenharmony_ci-    ldr x11, [sp, #24]
491be168c0dSopenharmony_ci-    ldr x12, [sp, #32]
492be168c0dSopenharmony_ci-    ldr x13, [sp, #40]
493be168c0dSopenharmony_ci-    ldr x14, [sp, #48]
494be168c0dSopenharmony_ci-    ldr x15, [sp, #56]
495be168c0dSopenharmony_ci+    ldr x8, [sp, #192]
496be168c0dSopenharmony_ci+    ldr x9, [sp, #200]
497be168c0dSopenharmony_ci+    ldr x10, [sp, #208]
498be168c0dSopenharmony_ci+    ldr x11, [sp, #216]
499be168c0dSopenharmony_ci+    ldr x12, [sp, #224]
500be168c0dSopenharmony_ci+    ldr x13, [sp, #232]
501be168c0dSopenharmony_ci+    ldr x14, [sp, #240]
502be168c0dSopenharmony_ci+    ldr x15, [sp, #248]
503be168c0dSopenharmony_ci 
504be168c0dSopenharmony_ci     ld1 {v24.4s}, [x3]
505be168c0dSopenharmony_ci     movi v26.4s, #6
506be168c0dSopenharmony_ci@@ -302,7 +303,6 @@ asm_function ConvDwFp32Center
507be168c0dSopenharmony_ci         subs x4, x4, #1
508be168c0dSopenharmony_ci         bne LoopH
509be168c0dSopenharmony_ci 
510be168c0dSopenharmony_ci-    sub sp, sp, #192
511be168c0dSopenharmony_ci     ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
512be168c0dSopenharmony_ci     ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
513be168c0dSopenharmony_ci     ldp x19, x20, [sp], #16
514be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDwFp32Indirect3x3.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDwFp32Indirect3x3.S
515be168c0dSopenharmony_ciindex 34cc9037..ca93dc7d 100644
516be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDwFp32Indirect3x3.S
517be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDwFp32Indirect3x3.S
518be168c0dSopenharmony_ci@@ -25,14 +25,14 @@
519be168c0dSopenharmony_ci 
520be168c0dSopenharmony_ci asm_function ConvDwFp32Indirect3x3
521be168c0dSopenharmony_ci     sub sp, sp, #32
522be168c0dSopenharmony_ci-    stp x19, x20, [sp], #16
523be168c0dSopenharmony_ci-    stp x21, x22, [sp], #16
524be168c0dSopenharmony_ci+    stp x19, x20, [sp]
525be168c0dSopenharmony_ci+    stp x21, x22, [sp, #16]
526be168c0dSopenharmony_ci 
527be168c0dSopenharmony_ci     movi v31.4s, #6
528be168c0dSopenharmony_ci     scvtf v31.4s, v31.4s
529be168c0dSopenharmony_ci     dup v30.4s, wzr
530be168c0dSopenharmony_ci 
531be168c0dSopenharmony_ci-    ldr x8, [sp]
532be168c0dSopenharmony_ci+    ldr x8, [sp, #32]
533be168c0dSopenharmony_ci     cmp x5, #0
534be168c0dSopenharmony_ci     beq End
535be168c0dSopenharmony_ci 
536be168c0dSopenharmony_ci@@ -153,7 +153,6 @@ asm_function ConvDwFp32Indirect3x3
537be168c0dSopenharmony_ci         cmp x5, #0
538be168c0dSopenharmony_ci         bgt LoopPixel
539be168c0dSopenharmony_ci End:
540be168c0dSopenharmony_ci-    sub sp, sp, #32
541be168c0dSopenharmony_ci     ldp x19, x20, [sp], #16
542be168c0dSopenharmony_ci     ldp x21, x22, [sp], #16
543be168c0dSopenharmony_ci ret
544be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDwInt8Center.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDwInt8Center.S
545be168c0dSopenharmony_ciindex 7ed94e6b..328250f3 100644
546be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDwInt8Center.S
547be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDwInt8Center.S
548be168c0dSopenharmony_ci@@ -34,44 +34,45 @@ asm_function ConvDwInt8Center
549be168c0dSopenharmony_ci     // x19 ~ x29 should be also preserved
550be168c0dSopenharmony_ci     // whereas our coding style do not permit such amount of parameters
551be168c0dSopenharmony_ci     sub sp, sp, #192
552be168c0dSopenharmony_ci-    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
553be168c0dSopenharmony_ci-    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
554be168c0dSopenharmony_ci-    stp x19, x20, [sp], #16
555be168c0dSopenharmony_ci-    stp x21, x22, [sp], #16
556be168c0dSopenharmony_ci-    stp x23, x24, [sp], #16
557be168c0dSopenharmony_ci-    stp x25, x26, [sp], #16
558be168c0dSopenharmony_ci-
559be168c0dSopenharmony_ci-    ldr x8, [sp]
560be168c0dSopenharmony_ci-    ldr x9, [sp, #8]
561be168c0dSopenharmony_ci-    ldr x10, [sp, #16]
562be168c0dSopenharmony_ci-    ldr x11, [sp, #24]
563be168c0dSopenharmony_ci-    ldr x12, [sp, #32]
564be168c0dSopenharmony_ci-    ldr x13, [sp, #40]
565be168c0dSopenharmony_ci-
566be168c0dSopenharmony_ci-    ldr x14, [sp, #48] // input_zp
567be168c0dSopenharmony_ci+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
568be168c0dSopenharmony_ci+    add x9, sp, #64
569be168c0dSopenharmony_ci+    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
570be168c0dSopenharmony_ci+    stp x19, x20, [sp, #128]
571be168c0dSopenharmony_ci+    stp x21, x22, [sp, #144]
572be168c0dSopenharmony_ci+    stp x23, x24, [sp, #160]
573be168c0dSopenharmony_ci+    stp x25, x26, [sp, #176]
574be168c0dSopenharmony_ci+
575be168c0dSopenharmony_ci+    ldr x8, [sp, #192]
576be168c0dSopenharmony_ci+    ldr x9, [sp, #200]
577be168c0dSopenharmony_ci+    ldr x10, [sp, #208]
578be168c0dSopenharmony_ci+    ldr x11, [sp, #216]
579be168c0dSopenharmony_ci+    ldr x12, [sp, #224]
580be168c0dSopenharmony_ci+    ldr x13, [sp, #232]
581be168c0dSopenharmony_ci+
582be168c0dSopenharmony_ci+    ldr x14, [sp, #240] // input_zp
583be168c0dSopenharmony_ci     ld1 {v19.8b}, [x14], #8
584be168c0dSopenharmony_ci 
585be168c0dSopenharmony_ci-    ldr x15, [sp, #56] // output_zp
586be168c0dSopenharmony_ci+    ldr x15, [sp, #248] // output_zp
587be168c0dSopenharmony_ci     ld1 {v20.4s}, [x15], #16
588be168c0dSopenharmony_ci     ld1 {v21.4s}, [x15], #16
589be168c0dSopenharmony_ci 
590be168c0dSopenharmony_ci-    ldr x16, [sp, #64] // out_multiplier
591be168c0dSopenharmony_ci+    ldr x16, [sp, #256] // out_multiplier
592be168c0dSopenharmony_ci     ld1 {v22.4s}, [x16], #16
593be168c0dSopenharmony_ci     ld1 {v23.4s}, [x16], #16
594be168c0dSopenharmony_ci 
595be168c0dSopenharmony_ci-    ldr x17, [sp, #72] // left_shift
596be168c0dSopenharmony_ci+    ldr x17, [sp, #264] // left_shift
597be168c0dSopenharmony_ci     ld1 {v24.4s}, [x17], #16
598be168c0dSopenharmony_ci     ld1 {v25.4s}, [x17], #16
599be168c0dSopenharmony_ci 
600be168c0dSopenharmony_ci-    ldr x25, [sp, #80] // right shift
601be168c0dSopenharmony_ci+    ldr x25, [sp, #272] // right shift
602be168c0dSopenharmony_ci     ld1 {v26.4s}, [x25], #16
603be168c0dSopenharmony_ci     ld1 {v27.4s}, [x25], #16
604be168c0dSopenharmony_ci 
605be168c0dSopenharmony_ci-    ldr x19, [sp, #88] // acc_min
606be168c0dSopenharmony_ci+    ldr x19, [sp, #280] // acc_min
607be168c0dSopenharmony_ci     ld1 {v28.4s}, [x19], #16
608be168c0dSopenharmony_ci     ld1 {v29.4s}, [x19], #16
609be168c0dSopenharmony_ci 
610be168c0dSopenharmony_ci-    ldr x20, [sp, #96] // acc_max
611be168c0dSopenharmony_ci+    ldr x20, [sp, #288] // acc_max
612be168c0dSopenharmony_ci     ld1 {v30.4s}, [x20], #16
613be168c0dSopenharmony_ci     ld1 {v31.4s}, [x20], #16
614be168c0dSopenharmony_ci 
615be168c0dSopenharmony_ci@@ -283,7 +284,6 @@ asm_function ConvDwInt8Center
616be168c0dSopenharmony_ci         subs x4, x4, #1
617be168c0dSopenharmony_ci         bne LoopH
618be168c0dSopenharmony_ci 
619be168c0dSopenharmony_ci-    sub sp, sp, #192
620be168c0dSopenharmony_ci     ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
621be168c0dSopenharmony_ci     ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
622be168c0dSopenharmony_ci     ldp x19, x20, [sp], #16
623be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvFp32Center.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvFp32Center.S
624be168c0dSopenharmony_ciindex 2cc456f6..0a9d3265 100644
625be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvFp32Center.S
626be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvFp32Center.S
627be168c0dSopenharmony_ci@@ -31,21 +31,22 @@ asm_function ConvSwFp32Center
628be168c0dSopenharmony_ci     // x19 ~ x29 should be also preserved
629be168c0dSopenharmony_ci     // whereas our coding style do not permit such amount of parameters
630be168c0dSopenharmony_ci     sub sp, sp, #208
631be168c0dSopenharmony_ci-    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
632be168c0dSopenharmony_ci-    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
633be168c0dSopenharmony_ci-    stp x19, x20, [sp], #16
634be168c0dSopenharmony_ci-    stp x21, x22, [sp], #16
635be168c0dSopenharmony_ci-    stp x23, x24, [sp], #16
636be168c0dSopenharmony_ci-    stp x25, x26, [sp], #16
637be168c0dSopenharmony_ci-    stp x27, x28, [sp], #16
638be168c0dSopenharmony_ci+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
639be168c0dSopenharmony_ci+    add x9, sp, #64
640be168c0dSopenharmony_ci+    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
641be168c0dSopenharmony_ci+    stp x19, x20, [sp, #128]
642be168c0dSopenharmony_ci+    stp x21, x22, [sp, #144]
643be168c0dSopenharmony_ci+    stp x23, x24, [sp, #160]
644be168c0dSopenharmony_ci+    stp x25, x26, [sp, #176]
645be168c0dSopenharmony_ci+    stp x27, x28, [sp, #192]
646be168c0dSopenharmony_ci 
647be168c0dSopenharmony_ci-    ldr x8, [sp]
648be168c0dSopenharmony_ci-    ldr x9, [sp, #8]
649be168c0dSopenharmony_ci-    ldr x10, [sp, #16]
650be168c0dSopenharmony_ci-    ldr x11, [sp, #24]
651be168c0dSopenharmony_ci-    ldr x12, [sp, #32]
652be168c0dSopenharmony_ci-    ldr x13, [sp, #40]
653be168c0dSopenharmony_ci-    ldr x14, [sp, #48]
654be168c0dSopenharmony_ci+    ldr x8, [sp, #208]
655be168c0dSopenharmony_ci+    ldr x9, [sp, #216]
656be168c0dSopenharmony_ci+    ldr x10, [sp, #224]
657be168c0dSopenharmony_ci+    ldr x11, [sp, #232]
658be168c0dSopenharmony_ci+    ldr x12, [sp, #240]
659be168c0dSopenharmony_ci+    ldr x13, [sp, #248]
660be168c0dSopenharmony_ci+    ldr x14, [sp, #256]
661be168c0dSopenharmony_ci     mul x15, x6, x7
662be168c0dSopenharmony_ci     mul x15, x10, x15
663be168c0dSopenharmony_ci     mov x16, #16
664be168c0dSopenharmony_ci@@ -198,9 +199,9 @@ asm_function ConvSwFp32Center
665be168c0dSopenharmony_ci                 add x20, x20, x13
666be168c0dSopenharmony_ci                 subs x22, x22, #1
667be168c0dSopenharmony_ci                 bne LoopKh16
668be168c0dSopenharmony_ci-            ldr x16, [sp, #64]
669be168c0dSopenharmony_ci+            ldr x16, [sp, #272]
670be168c0dSopenharmony_ci             cbnz x16, Relu616
671be168c0dSopenharmony_ci-            ldr x26, [sp, #56]
672be168c0dSopenharmony_ci+            ldr x26, [sp, #264]
673be168c0dSopenharmony_ci             cbnz x26, Relu16
674be168c0dSopenharmony_ci             b Write16
675be168c0dSopenharmony_ci         Relu616:
676be168c0dSopenharmony_ci@@ -347,9 +348,9 @@ asm_function ConvSwFp32Center
677be168c0dSopenharmony_ci                 add x20, x20, x13
678be168c0dSopenharmony_ci                 subs x22, x22, #1
679be168c0dSopenharmony_ci                 bne LoopKh8
680be168c0dSopenharmony_ci-            ldr x16, [sp, #64]
681be168c0dSopenharmony_ci+            ldr x16, [sp, #272]
682be168c0dSopenharmony_ci             cbnz x16, Relu68
683be168c0dSopenharmony_ci-            ldr x26, [sp, #56]
684be168c0dSopenharmony_ci+            ldr x26, [sp, #264]
685be168c0dSopenharmony_ci             cbnz x26, Relu8
686be168c0dSopenharmony_ci             b Write8
687be168c0dSopenharmony_ci         Relu68:
688be168c0dSopenharmony_ci@@ -426,9 +427,9 @@ asm_function ConvSwFp32Center
689be168c0dSopenharmony_ci                 add x20, x20, x13
690be168c0dSopenharmony_ci                 subs x22, x22, #1
691be168c0dSopenharmony_ci                 bne LoopKh
692be168c0dSopenharmony_ci-            ldr x16, [sp, #64]
693be168c0dSopenharmony_ci+            ldr x16, [sp, #272]
694be168c0dSopenharmony_ci             cbnz x16, Relu6
695be168c0dSopenharmony_ci-            ldr x26, [sp, #56]
696be168c0dSopenharmony_ci+            ldr x26, [sp, #264]
697be168c0dSopenharmony_ci             cbnz x26, Relu
698be168c0dSopenharmony_ci             b Write
699be168c0dSopenharmony_ci         Relu6:
700be168c0dSopenharmony_ci@@ -446,7 +447,6 @@ asm_function ConvSwFp32Center
701be168c0dSopenharmony_ci         subs x4, x4, #1
702be168c0dSopenharmony_ci         bne LoopH
703be168c0dSopenharmony_ci 
704be168c0dSopenharmony_ci-    sub sp, sp, #208
705be168c0dSopenharmony_ci     ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
706be168c0dSopenharmony_ci     ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
707be168c0dSopenharmony_ci     ldp x19, x20, [sp], #16
708be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW1x16Kernel.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW1x16Kernel.S
709be168c0dSopenharmony_ciindex 2267e776..3b436c17 100644
710be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW1x16Kernel.S
711be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW1x16Kernel.S
712be168c0dSopenharmony_ci@@ -30,17 +30,17 @@ asm_function SWConv1x16Kernel
713be168c0dSopenharmony_ci     // x19 ~ x29 should be also preserved
714be168c0dSopenharmony_ci     // whereas our coding style do not permit such amount of parameters
715be168c0dSopenharmony_ci     sub sp, sp, #64
716be168c0dSopenharmony_ci-    stp x19, x20, [sp], #16
717be168c0dSopenharmony_ci-    stp x21, x22, [sp], #16
718be168c0dSopenharmony_ci-    stp x23, x24, [sp], #16
719be168c0dSopenharmony_ci-    stp x25, x26, [sp], #16
720be168c0dSopenharmony_ci-
721be168c0dSopenharmony_ci-    ldr x10, [sp]
722be168c0dSopenharmony_ci-    ldr x11, [sp, #8]
723be168c0dSopenharmony_ci-    ldr x12, [sp, #16]
724be168c0dSopenharmony_ci-    ldr x13, [sp, #24]
725be168c0dSopenharmony_ci-    ldr x14, [sp, #32]
726be168c0dSopenharmony_ci-    ldr x15, [sp, #40]
727be168c0dSopenharmony_ci+    stp x19, x20, [sp]
728be168c0dSopenharmony_ci+    stp x21, x22, [sp, #16]
729be168c0dSopenharmony_ci+    stp x23, x24, [sp, #32]
730be168c0dSopenharmony_ci+    stp x25, x26, [sp, #48]
731be168c0dSopenharmony_ci+
732be168c0dSopenharmony_ci+    ldr x10, [sp, #64]
733be168c0dSopenharmony_ci+    ldr x11, [sp, #72]
734be168c0dSopenharmony_ci+    ldr x12, [sp, #80]
735be168c0dSopenharmony_ci+    ldr x13, [sp, #88]
736be168c0dSopenharmony_ci+    ldr x14, [sp, #96]
737be168c0dSopenharmony_ci+    ldr x15, [sp, #104]
738be168c0dSopenharmony_ci     lsl x7, x7, #2
739be168c0dSopenharmony_ci     lsl x11, x11, #2
740be168c0dSopenharmony_ci     lsl x12, x12, #2
741be168c0dSopenharmony_ci@@ -413,7 +413,6 @@ asm_function SWConv1x16Kernel
742be168c0dSopenharmony_ci             st1 {v2.4s}, [x21]
743be168c0dSopenharmony_ci             st1 {v3.4s}, [x22]
744be168c0dSopenharmony_ci     End:
745be168c0dSopenharmony_ci-    sub sp, sp, #64
746be168c0dSopenharmony_ci     ldp x19, x20, [sp], #16
747be168c0dSopenharmony_ci     ldp x21, x22, [sp], #16
748be168c0dSopenharmony_ci     ldp x23, x24, [sp], #16
749be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW1x8Kernel.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW1x8Kernel.S
750be168c0dSopenharmony_ciindex fa8bb63d..6a29e95e 100644
751be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW1x8Kernel.S
752be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW1x8Kernel.S
753be168c0dSopenharmony_ci@@ -30,17 +30,17 @@ asm_function SWConv1x8Kernel
754be168c0dSopenharmony_ci     // x19 ~ x29 should be also preserved
755be168c0dSopenharmony_ci     // whereas our coding style do not permit such amount of parameters
756be168c0dSopenharmony_ci     sub sp, sp, #64
757be168c0dSopenharmony_ci-    stp x19, x20, [sp], #16
758be168c0dSopenharmony_ci-    stp x21, x22, [sp], #16
759be168c0dSopenharmony_ci-    stp x23, x24, [sp], #16
760be168c0dSopenharmony_ci-    stp x25, x26, [sp], #16
761be168c0dSopenharmony_ci+    stp x19, x20, [sp]
762be168c0dSopenharmony_ci+    stp x21, x22, [sp, #16]
763be168c0dSopenharmony_ci+    stp x23, x24, [sp, #32]
764be168c0dSopenharmony_ci+    stp x25, x26, [sp, #48]
765be168c0dSopenharmony_ci 
766be168c0dSopenharmony_ci-    ldr x10, [sp]
767be168c0dSopenharmony_ci-    ldr x11, [sp, #8]
768be168c0dSopenharmony_ci-    ldr x12, [sp, #16]
769be168c0dSopenharmony_ci-    ldr x13, [sp, #24]
770be168c0dSopenharmony_ci-    ldr x14, [sp, #32]
771be168c0dSopenharmony_ci-    ldr x15, [sp, #40]
772be168c0dSopenharmony_ci+    ldr x10, [sp, #64]
773be168c0dSopenharmony_ci+    ldr x11, [sp, #72]
774be168c0dSopenharmony_ci+    ldr x12, [sp, #80]
775be168c0dSopenharmony_ci+    ldr x13, [sp, #88]
776be168c0dSopenharmony_ci+    ldr x14, [sp, #96]
777be168c0dSopenharmony_ci+    ldr x15, [sp, #104]
778be168c0dSopenharmony_ci     lsl x7, x7, #2
779be168c0dSopenharmony_ci     lsl x11, x11, #2
780be168c0dSopenharmony_ci     lsl x12, x12, #2
781be168c0dSopenharmony_ci@@ -270,7 +270,6 @@ asm_function SWConv1x8Kernel
782be168c0dSopenharmony_ci             st1 {v0.4s}, [x0]
783be168c0dSopenharmony_ci             st1 {v1.4s}, [x20]
784be168c0dSopenharmony_ci     End:
785be168c0dSopenharmony_ci-    sub sp, sp, #64
786be168c0dSopenharmony_ci     ldp x19, x20, [sp], #16
787be168c0dSopenharmony_ci     ldp x21, x22, [sp], #16
788be168c0dSopenharmony_ci     ldp x23, x24, [sp], #16
789be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW2x16Kernel.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW2x16Kernel.S
790be168c0dSopenharmony_ciindex 69624af6..8a5dd83a 100644
791be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW2x16Kernel.S
792be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW2x16Kernel.S
793be168c0dSopenharmony_ci@@ -30,17 +30,17 @@ asm_function SWConv2x16Kernel
794be168c0dSopenharmony_ci     // x19 ~ x29 should be also preserved
795be168c0dSopenharmony_ci     // whereas our coding style do not permit such amount of parameters
796be168c0dSopenharmony_ci     sub sp, sp, #64
797be168c0dSopenharmony_ci-    stp x19, x20, [sp], #16
798be168c0dSopenharmony_ci-    stp x21, x22, [sp], #16
799be168c0dSopenharmony_ci-    stp x23, x24, [sp], #16
800be168c0dSopenharmony_ci-    stp x25, x26, [sp], #16
801be168c0dSopenharmony_ci+    stp x19, x20, [sp]
802be168c0dSopenharmony_ci+    stp x21, x22, [sp, #16]
803be168c0dSopenharmony_ci+    stp x23, x24, [sp, #32]
804be168c0dSopenharmony_ci+    stp x25, x26, [sp, #48]
805be168c0dSopenharmony_ci 
806be168c0dSopenharmony_ci-    ldr x10, [sp]
807be168c0dSopenharmony_ci-    ldr x11, [sp, #8]
808be168c0dSopenharmony_ci-    ldr x12, [sp, #16]
809be168c0dSopenharmony_ci-    ldr x13, [sp, #24]
810be168c0dSopenharmony_ci-    ldr x14, [sp, #32]
811be168c0dSopenharmony_ci-    ldr x15, [sp, #40]
812be168c0dSopenharmony_ci+    ldr x10, [sp, #64]
813be168c0dSopenharmony_ci+    ldr x11, [sp, #72]
814be168c0dSopenharmony_ci+    ldr x12, [sp, #80]
815be168c0dSopenharmony_ci+    ldr x13, [sp, #88]
816be168c0dSopenharmony_ci+    ldr x14, [sp, #96]
817be168c0dSopenharmony_ci+    ldr x15, [sp, #104]
818be168c0dSopenharmony_ci     lsl x7, x7, #2
819be168c0dSopenharmony_ci     lsl x11, x11, #2
820be168c0dSopenharmony_ci     lsl x12, x12, #2
821be168c0dSopenharmony_ci@@ -399,7 +399,6 @@ asm_function SWConv2x16Kernel
822be168c0dSopenharmony_ci             st1 {v3.4s}, [x22], #16
823be168c0dSopenharmony_ci             st1 {v7.4s}, [x22]
824be168c0dSopenharmony_ci     End:
825be168c0dSopenharmony_ci-    sub sp, sp, #64
826be168c0dSopenharmony_ci     ldp x19, x20, [sp], #16
827be168c0dSopenharmony_ci     ldp x21, x22, [sp], #16
828be168c0dSopenharmony_ci     ldp x23, x24, [sp], #16
829be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW2x8Kernel.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW2x8Kernel.S
830be168c0dSopenharmony_ciindex 8fefa4be..6efd21d0 100644
831be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW2x8Kernel.S
832be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW2x8Kernel.S
833be168c0dSopenharmony_ci@@ -30,17 +30,17 @@ asm_function SWConv2x8Kernel
834be168c0dSopenharmony_ci     // x19 ~ x29 should be also preserved
835be168c0dSopenharmony_ci     // whereas our coding style do not permit such amount of parameters
836be168c0dSopenharmony_ci     sub sp, sp, #64
837be168c0dSopenharmony_ci-    stp x19, x20, [sp], #16
838be168c0dSopenharmony_ci-    stp x21, x22, [sp], #16
839be168c0dSopenharmony_ci-    stp x23, x24, [sp], #16
840be168c0dSopenharmony_ci-    stp x25, x26, [sp], #16
841be168c0dSopenharmony_ci+    stp x19, x20, [sp]
842be168c0dSopenharmony_ci+    stp x21, x22, [sp, #16]
843be168c0dSopenharmony_ci+    stp x23, x24, [sp, #32]
844be168c0dSopenharmony_ci+    stp x25, x26, [sp, #48]
845be168c0dSopenharmony_ci 
846be168c0dSopenharmony_ci-    ldr x10, [sp]
847be168c0dSopenharmony_ci-    ldr x11, [sp, #8]
848be168c0dSopenharmony_ci-    ldr x12, [sp, #16]
849be168c0dSopenharmony_ci-    ldr x13, [sp, #24]
850be168c0dSopenharmony_ci-    ldr x14, [sp, #32]
851be168c0dSopenharmony_ci-    ldr x15, [sp, #40]
852be168c0dSopenharmony_ci+    ldr x10, [sp, #64]
853be168c0dSopenharmony_ci+    ldr x11, [sp, #72]
854be168c0dSopenharmony_ci+    ldr x12, [sp, #80]
855be168c0dSopenharmony_ci+    ldr x13, [sp, #88]
856be168c0dSopenharmony_ci+    ldr x14, [sp, #96]
857be168c0dSopenharmony_ci+    ldr x15, [sp, #104]
858be168c0dSopenharmony_ci     lsl x7, x7, #2
859be168c0dSopenharmony_ci     lsl x11, x11, #2
860be168c0dSopenharmony_ci     lsl x12, x12, #2
861be168c0dSopenharmony_ci@@ -257,7 +257,6 @@ asm_function SWConv2x8Kernel
862be168c0dSopenharmony_ci             st1 {v1.4s}, [x20], #16
863be168c0dSopenharmony_ci             st1 {v3.4s}, [x20]
864be168c0dSopenharmony_ci     End:
865be168c0dSopenharmony_ci-    sub sp, sp, #64
866be168c0dSopenharmony_ci     ldp x19, x20, [sp], #16
867be168c0dSopenharmony_ci     ldp x21, x22, [sp], #16
868be168c0dSopenharmony_ci     ldp x23, x24, [sp], #16
869be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW3x16Kernel.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW3x16Kernel.S
870be168c0dSopenharmony_ciindex 61efd444..428dea69 100644
871be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW3x16Kernel.S
872be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW3x16Kernel.S
873be168c0dSopenharmony_ci@@ -30,18 +30,18 @@ asm_function SWConv3x16Kernel
874be168c0dSopenharmony_ci     // x19 ~ x29 should be also preserved
875be168c0dSopenharmony_ci     // whereas our coding style do not permit such amount of parameters
876be168c0dSopenharmony_ci     sub sp, sp, #128
877be168c0dSopenharmony_ci-    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
878be168c0dSopenharmony_ci-    stp x19, x20, [sp], #16
879be168c0dSopenharmony_ci-    stp x21, x22, [sp], #16
880be168c0dSopenharmony_ci-    stp x23, x24, [sp], #16
881be168c0dSopenharmony_ci-    stp x25, x26, [sp], #16
882be168c0dSopenharmony_ci+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
883be168c0dSopenharmony_ci+    stp x19, x20, [sp, #64]
884be168c0dSopenharmony_ci+    stp x21, x22, [sp, #80]
885be168c0dSopenharmony_ci+    stp x23, x24, [sp, #96]
886be168c0dSopenharmony_ci+    stp x25, x26, [sp, #112]
887be168c0dSopenharmony_ci 
888be168c0dSopenharmony_ci-    ldr x10, [sp]
889be168c0dSopenharmony_ci-    ldr x11, [sp, #8]
890be168c0dSopenharmony_ci-    ldr x12, [sp, #16]
891be168c0dSopenharmony_ci-    ldr x13, [sp, #24]
892be168c0dSopenharmony_ci-    ldr x14, [sp, #32]
893be168c0dSopenharmony_ci-    ldr x15, [sp, #40]
894be168c0dSopenharmony_ci+    ldr x10, [sp, #128]
895be168c0dSopenharmony_ci+    ldr x11, [sp, #136]
896be168c0dSopenharmony_ci+    ldr x12, [sp, #144]
897be168c0dSopenharmony_ci+    ldr x13, [sp, #152]
898be168c0dSopenharmony_ci+    ldr x14, [sp, #160]
899be168c0dSopenharmony_ci+    ldr x15, [sp, #168]
900be168c0dSopenharmony_ci     lsl x7, x7, #2
901be168c0dSopenharmony_ci     lsl x11, x11, #2
902be168c0dSopenharmony_ci     lsl x12, x12, #2
903be168c0dSopenharmony_ci@@ -524,7 +524,6 @@ asm_function SWConv3x16Kernel
904be168c0dSopenharmony_ci             st1 {v7.4s}, [x22], #16
905be168c0dSopenharmony_ci             st1 {v11.4s}, [x22]
906be168c0dSopenharmony_ci     End:
907be168c0dSopenharmony_ci-    sub sp, sp, #128
908be168c0dSopenharmony_ci     ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
909be168c0dSopenharmony_ci     ldp x19, x20, [sp], #16
910be168c0dSopenharmony_ci     ldp x21, x22, [sp], #16
911be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW3x8Kernel.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW3x8Kernel.S
912be168c0dSopenharmony_ciindex 1e958572..472e50b9 100644
913be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW3x8Kernel.S
914be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW3x8Kernel.S
915be168c0dSopenharmony_ci@@ -30,17 +30,17 @@ asm_function SWConv3x8Kernel
916be168c0dSopenharmony_ci     // x19 ~ x29 should be also preserved
917be168c0dSopenharmony_ci     // whereas our coding style do not permit such amount of parameters
918be168c0dSopenharmony_ci     sub sp, sp, #64
919be168c0dSopenharmony_ci-    stp x19, x20, [sp], #16
920be168c0dSopenharmony_ci-    stp x21, x22, [sp], #16
921be168c0dSopenharmony_ci-    stp x23, x24, [sp], #16
922be168c0dSopenharmony_ci-    stp x25, x26, [sp], #16
923be168c0dSopenharmony_ci+    stp x19, x20, [sp]
924be168c0dSopenharmony_ci+    stp x21, x22, [sp, #16]
925be168c0dSopenharmony_ci+    stp x23, x24, [sp, #32]
926be168c0dSopenharmony_ci+    stp x25, x26, [sp, #48]
927be168c0dSopenharmony_ci 
928be168c0dSopenharmony_ci-    ldr x10, [sp]
929be168c0dSopenharmony_ci-    ldr x11, [sp, #8]
930be168c0dSopenharmony_ci-    ldr x12, [sp, #16]
931be168c0dSopenharmony_ci-    ldr x13, [sp, #24]
932be168c0dSopenharmony_ci-    ldr x14, [sp, #32]
933be168c0dSopenharmony_ci-    ldr x15, [sp, #40]
934be168c0dSopenharmony_ci+    ldr x10, [sp, #64]
935be168c0dSopenharmony_ci+    ldr x11, [sp, #72]
936be168c0dSopenharmony_ci+    ldr x12, [sp, #80]
937be168c0dSopenharmony_ci+    ldr x13, [sp, #88]
938be168c0dSopenharmony_ci+    ldr x14, [sp, #96]
939be168c0dSopenharmony_ci+    ldr x15, [sp, #104]
940be168c0dSopenharmony_ci     lsl x7, x7, #2
941be168c0dSopenharmony_ci     lsl x11, x11, #2
942be168c0dSopenharmony_ci     lsl x12, x12, #2
943be168c0dSopenharmony_ci@@ -324,7 +324,6 @@ asm_function SWConv3x8Kernel
944be168c0dSopenharmony_ci             st1 {v3.4s}, [x20], #16
945be168c0dSopenharmony_ci             st1 {v5.4s}, [x20]
946be168c0dSopenharmony_ci     End:
947be168c0dSopenharmony_ci-    sub sp, sp, #64
948be168c0dSopenharmony_ci     ldp x19, x20, [sp], #16
949be168c0dSopenharmony_ci     ldp x21, x22, [sp], #16
950be168c0dSopenharmony_ci     ldp x23, x24, [sp], #16
951be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW4x16Kernel.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW4x16Kernel.S
952be168c0dSopenharmony_ciindex 1cd5e124..076724a7 100644
953be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW4x16Kernel.S
954be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW4x16Kernel.S
955be168c0dSopenharmony_ci@@ -30,20 +30,21 @@ asm_function SWConv4x16Kernel
956be168c0dSopenharmony_ci     // x19 ~ x29 should be also preserved
957be168c0dSopenharmony_ci     // whereas our coding style do not permit such amount of parameters
958be168c0dSopenharmony_ci     sub sp, sp, #208
959be168c0dSopenharmony_ci-    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
960be168c0dSopenharmony_ci-    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
961be168c0dSopenharmony_ci-    stp x19, x20, [sp], #16
962be168c0dSopenharmony_ci-    stp x21, x22, [sp], #16
963be168c0dSopenharmony_ci-    stp x23, x24, [sp], #16
964be168c0dSopenharmony_ci-    stp x25, x26, [sp], #16
965be168c0dSopenharmony_ci-    stp x27, x28, [sp], #16
966be168c0dSopenharmony_ci+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
967be168c0dSopenharmony_ci+    add x9, sp, #64
968be168c0dSopenharmony_ci+    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
969be168c0dSopenharmony_ci+    stp x19, x20, [sp, #128]
970be168c0dSopenharmony_ci+    stp x21, x22, [sp, #144]
971be168c0dSopenharmony_ci+    stp x23, x24, [sp, #160]
972be168c0dSopenharmony_ci+    stp x25, x26, [sp, #176]
973be168c0dSopenharmony_ci+    stp x27, x28, [sp, #192]
974be168c0dSopenharmony_ci 
975be168c0dSopenharmony_ci-    ldr x10, [sp]
976be168c0dSopenharmony_ci-    ldr x11, [sp, #8]
977be168c0dSopenharmony_ci-    ldr x12, [sp, #16]
978be168c0dSopenharmony_ci-    ldr x13, [sp, #24]
979be168c0dSopenharmony_ci-    ldr x14, [sp, #32]
980be168c0dSopenharmony_ci-    ldr x15, [sp, #40]
981be168c0dSopenharmony_ci+    ldr x10, [sp, #208]
982be168c0dSopenharmony_ci+    ldr x11, [sp, #216]
983be168c0dSopenharmony_ci+    ldr x12, [sp, #224]
984be168c0dSopenharmony_ci+    ldr x13, [sp, #232]
985be168c0dSopenharmony_ci+    ldr x14, [sp, #240]
986be168c0dSopenharmony_ci+    ldr x15, [sp, #248]
987be168c0dSopenharmony_ci     lsl x7, x7, #2
988be168c0dSopenharmony_ci     lsl x11, x11, #2
989be168c0dSopenharmony_ci     lsl x12, x12, #2
990be168c0dSopenharmony_ci@@ -650,7 +651,6 @@ asm_function SWConv4x16Kernel
991be168c0dSopenharmony_ci             st1 {v11.4s}, [x22], #16
992be168c0dSopenharmony_ci             st1 {v15.4s}, [x22]
993be168c0dSopenharmony_ci     End:
994be168c0dSopenharmony_ci-    sub sp, sp, #208
995be168c0dSopenharmony_ci     ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
996be168c0dSopenharmony_ci     ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
997be168c0dSopenharmony_ci     ldp x19, x20, [sp], #16
998be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW4x8Kernel.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW4x8Kernel.S
999be168c0dSopenharmony_ciindex 28109031..6b24de97 100644
1000be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW4x8Kernel.S
1001be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW4x8Kernel.S
1002be168c0dSopenharmony_ci@@ -30,20 +30,21 @@ asm_function SWConv4x8Kernel
1003be168c0dSopenharmony_ci     // x19 ~ x29 should be also preserved
1004be168c0dSopenharmony_ci     // whereas our coding style do not permit such amount of parameters
1005be168c0dSopenharmony_ci     sub sp, sp, #208
1006be168c0dSopenharmony_ci-    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1007be168c0dSopenharmony_ci-    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1008be168c0dSopenharmony_ci-    stp x19, x20, [sp], #16
1009be168c0dSopenharmony_ci-    stp x21, x22, [sp], #16
1010be168c0dSopenharmony_ci-    stp x23, x24, [sp], #16
1011be168c0dSopenharmony_ci-    stp x25, x26, [sp], #16
1012be168c0dSopenharmony_ci-    stp x27, x28, [sp], #16
1013be168c0dSopenharmony_ci+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
1014be168c0dSopenharmony_ci+    add x9, sp, #64
1015be168c0dSopenharmony_ci+    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
1016be168c0dSopenharmony_ci+    stp x19, x20, [sp, #128]
1017be168c0dSopenharmony_ci+    stp x21, x22, [sp, #144]
1018be168c0dSopenharmony_ci+    stp x23, x24, [sp, #160]
1019be168c0dSopenharmony_ci+    stp x25, x26, [sp, #176]
1020be168c0dSopenharmony_ci+    stp x27, x28, [sp, #192]
1021be168c0dSopenharmony_ci 
1022be168c0dSopenharmony_ci-    ldr x10, [sp]
1023be168c0dSopenharmony_ci-    ldr x11, [sp, #8]
1024be168c0dSopenharmony_ci-    ldr x12, [sp, #16]
1025be168c0dSopenharmony_ci-    ldr x13, [sp, #24]
1026be168c0dSopenharmony_ci-    ldr x14, [sp, #32]
1027be168c0dSopenharmony_ci-    ldr x15, [sp, #40]
1028be168c0dSopenharmony_ci+    ldr x10, [sp, #208]
1029be168c0dSopenharmony_ci+    ldr x11, [sp, #216]
1030be168c0dSopenharmony_ci+    ldr x12, [sp, #224]
1031be168c0dSopenharmony_ci+    ldr x13, [sp, #232]
1032be168c0dSopenharmony_ci+    ldr x14, [sp, #240]
1033be168c0dSopenharmony_ci+    ldr x15, [sp, #248]
1034be168c0dSopenharmony_ci     lsl x7, x7, #2
1035be168c0dSopenharmony_ci     lsl x11, x11, #2
1036be168c0dSopenharmony_ci     lsl x12, x12, #2
1037be168c0dSopenharmony_ci@@ -394,7 +395,6 @@ asm_function SWConv4x8Kernel
1038be168c0dSopenharmony_ci             st1 {v5.4s}, [x20], #16
1039be168c0dSopenharmony_ci             st1 {v7.4s}, [x20]
1040be168c0dSopenharmony_ci     End:
1041be168c0dSopenharmony_ci-    sub sp, sp, #208
1042be168c0dSopenharmony_ci     ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1043be168c0dSopenharmony_ci     ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1044be168c0dSopenharmony_ci     ldp x19, x20, [sp], #16
1045be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW5x16Kernel.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW5x16Kernel.S
1046be168c0dSopenharmony_ciindex 302e5a3d..a2b7ea2c 100644
1047be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW5x16Kernel.S
1048be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW5x16Kernel.S
1049be168c0dSopenharmony_ci@@ -30,20 +30,21 @@ asm_function SWConv5x16Kernel
1050be168c0dSopenharmony_ci     // x19 ~ x29 should be also preserved
1051be168c0dSopenharmony_ci     // whereas our coding style do not permit such amount of parameters
1052be168c0dSopenharmony_ci     sub sp, sp, #208
1053be168c0dSopenharmony_ci-    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1054be168c0dSopenharmony_ci-    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1055be168c0dSopenharmony_ci-    stp x19, x20, [sp], #16
1056be168c0dSopenharmony_ci-    stp x21, x22, [sp], #16
1057be168c0dSopenharmony_ci-    stp x23, x24, [sp], #16
1058be168c0dSopenharmony_ci-    stp x25, x26, [sp], #16
1059be168c0dSopenharmony_ci-    stp x27, x28, [sp], #16
1060be168c0dSopenharmony_ci+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
1061be168c0dSopenharmony_ci+    add x9, sp, #64
1062be168c0dSopenharmony_ci+    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
1063be168c0dSopenharmony_ci+    stp x19, x20, [sp, #128]
1064be168c0dSopenharmony_ci+    stp x21, x22, [sp, #144]
1065be168c0dSopenharmony_ci+    stp x23, x24, [sp, #160]
1066be168c0dSopenharmony_ci+    stp x25, x26, [sp, #176]
1067be168c0dSopenharmony_ci+    stp x27, x28, [sp, #192]
1068be168c0dSopenharmony_ci 
1069be168c0dSopenharmony_ci-    ldr x10, [sp]
1070be168c0dSopenharmony_ci-    ldr x11, [sp, #8]
1071be168c0dSopenharmony_ci-    ldr x12, [sp, #16]
1072be168c0dSopenharmony_ci-    ldr x13, [sp, #24]
1073be168c0dSopenharmony_ci-    ldr x14, [sp, #32]
1074be168c0dSopenharmony_ci-    ldr x15, [sp, #40]
1075be168c0dSopenharmony_ci+    ldr x10, [sp, #208]
1076be168c0dSopenharmony_ci+    ldr x11, [sp, #216]
1077be168c0dSopenharmony_ci+    ldr x12, [sp, #224]
1078be168c0dSopenharmony_ci+    ldr x13, [sp, #232]
1079be168c0dSopenharmony_ci+    ldr x14, [sp, #240]
1080be168c0dSopenharmony_ci+    ldr x15, [sp, #248]
1081be168c0dSopenharmony_ci     lsl x7, x7, #2
1082be168c0dSopenharmony_ci     lsl x11, x11, #2
1083be168c0dSopenharmony_ci     lsl x12, x12, #2
1084be168c0dSopenharmony_ci@@ -445,7 +446,6 @@ asm_function SWConv5x16Kernel
1085be168c0dSopenharmony_ci             st1 {v15.4s}, [x22], #16
1086be168c0dSopenharmony_ci             st1 {v19.4s}, [x22]
1087be168c0dSopenharmony_ci     End:
1088be168c0dSopenharmony_ci-    sub sp, sp, #208
1089be168c0dSopenharmony_ci     ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1090be168c0dSopenharmony_ci     ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1091be168c0dSopenharmony_ci     ldp x19, x20, [sp], #16
1092be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW5x8Kernel.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW5x8Kernel.S
1093be168c0dSopenharmony_ciindex 059cc7fc..b7e48480 100644
1094be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW5x8Kernel.S
1095be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW5x8Kernel.S
1096be168c0dSopenharmony_ci@@ -30,20 +30,21 @@ asm_function SWConv5x8Kernel
1097be168c0dSopenharmony_ci     // x19 ~ x29 should be also preserved
1098be168c0dSopenharmony_ci     // whereas our coding style do not permit such amount of parameters
1099be168c0dSopenharmony_ci     sub sp, sp, #208
1100be168c0dSopenharmony_ci-    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1101be168c0dSopenharmony_ci-    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1102be168c0dSopenharmony_ci-    stp x19, x20, [sp], #16
1103be168c0dSopenharmony_ci-    stp x21, x22, [sp], #16
1104be168c0dSopenharmony_ci-    stp x23, x24, [sp], #16
1105be168c0dSopenharmony_ci-    stp x25, x26, [sp], #16
1106be168c0dSopenharmony_ci-    stp x27, x28, [sp], #16
1107be168c0dSopenharmony_ci+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
1108be168c0dSopenharmony_ci+    add x9, sp, #64
1109be168c0dSopenharmony_ci+    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
1110be168c0dSopenharmony_ci+    stp x19, x20, [sp, #128]
1111be168c0dSopenharmony_ci+    stp x21, x22, [sp, #144]
1112be168c0dSopenharmony_ci+    stp x23, x24, [sp, #160]
1113be168c0dSopenharmony_ci+    stp x25, x26, [sp, #176]
1114be168c0dSopenharmony_ci+    stp x27, x28, [sp, #192]
1115be168c0dSopenharmony_ci 
1116be168c0dSopenharmony_ci-    ldr x10, [sp]
1117be168c0dSopenharmony_ci-    ldr x11, [sp, #8]
1118be168c0dSopenharmony_ci-    ldr x12, [sp, #16]
1119be168c0dSopenharmony_ci-    ldr x13, [sp, #24]
1120be168c0dSopenharmony_ci-    ldr x14, [sp, #32]
1121be168c0dSopenharmony_ci-    ldr x15, [sp, #40]
1122be168c0dSopenharmony_ci+    ldr x10, [sp, #208]
1123be168c0dSopenharmony_ci+    ldr x11, [sp, #216]
1124be168c0dSopenharmony_ci+    ldr x12, [sp, #224]
1125be168c0dSopenharmony_ci+    ldr x13, [sp, #232]
1126be168c0dSopenharmony_ci+    ldr x14, [sp, #240]
1127be168c0dSopenharmony_ci+    ldr x15, [sp, #248]
1128be168c0dSopenharmony_ci     lsl x7, x7, #2
1129be168c0dSopenharmony_ci     lsl x11, x11, #2
1130be168c0dSopenharmony_ci     lsl x12, x12, #2
1131be168c0dSopenharmony_ci@@ -296,7 +297,6 @@ asm_function SWConv5x8Kernel
1132be168c0dSopenharmony_ci             st1 {v7.4s}, [x20], #16
1133be168c0dSopenharmony_ci             st1 {v9.4s}, [x20]
1134be168c0dSopenharmony_ci     End:
1135be168c0dSopenharmony_ci-    sub sp, sp, #208
1136be168c0dSopenharmony_ci     ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1137be168c0dSopenharmony_ci     ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1138be168c0dSopenharmony_ci     ldp x19, x20, [sp], #16
1139be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/DeconvDwFp32Center.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/DeconvDwFp32Center.S
1140be168c0dSopenharmony_ciindex e6875bb1..11722e71 100644
1141be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/DeconvDwFp32Center.S
1142be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/DeconvDwFp32Center.S
1143be168c0dSopenharmony_ci@@ -30,14 +30,14 @@ asm_function DeconvDwFp32Center
1144be168c0dSopenharmony_ci     // x19 ~ x29 should be also preserved
1145be168c0dSopenharmony_ci     // whereas our coding style do not permit such amount of parameters
1146be168c0dSopenharmony_ci     sub sp, sp, #32
1147be168c0dSopenharmony_ci-    stp x19, x20, [sp], #16
1148be168c0dSopenharmony_ci-    stp x21, x22, [sp], #16
1149be168c0dSopenharmony_ci+    stp x19, x20, [sp]
1150be168c0dSopenharmony_ci+    stp x21, x22, [sp, #16]
1151be168c0dSopenharmony_ci 
1152be168c0dSopenharmony_ci-    ldr x8, [sp]
1153be168c0dSopenharmony_ci-    ldr x9, [sp, #8]
1154be168c0dSopenharmony_ci-    ldr x10, [sp, #16]
1155be168c0dSopenharmony_ci-    ldr x11, [sp, #24]
1156be168c0dSopenharmony_ci-    ldr x12, [sp, #32]
1157be168c0dSopenharmony_ci+    ldr x8, [sp, #32]
1158be168c0dSopenharmony_ci+    ldr x9, [sp, #40]
1159be168c0dSopenharmony_ci+    ldr x10, [sp, #48]
1160be168c0dSopenharmony_ci+    ldr x11, [sp, #56]
1161be168c0dSopenharmony_ci+    ldr x12, [sp, #64]
1162be168c0dSopenharmony_ci 
1163be168c0dSopenharmony_ci     LoopH:
1164be168c0dSopenharmony_ci         mov x15, x0
1165be168c0dSopenharmony_ci@@ -69,7 +69,6 @@ asm_function DeconvDwFp32Center
1166be168c0dSopenharmony_ci         subs x3, x3, #1
1167be168c0dSopenharmony_ci         bne LoopH
1168be168c0dSopenharmony_ci 
1169be168c0dSopenharmony_ci-    sub sp, sp, #32
1170be168c0dSopenharmony_ci     ldp x19, x20, [sp], #16
1171be168c0dSopenharmony_ci     ldp x21, x22, [sp], #16
1172be168c0dSopenharmony_ci     ret
1173be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/DeconvDwInt8Center.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/DeconvDwInt8Center.S
1174be168c0dSopenharmony_ciindex aaf210f0..1c3723fa 100644
1175be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/DeconvDwInt8Center.S
1176be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/DeconvDwInt8Center.S
1177be168c0dSopenharmony_ci@@ -30,14 +30,14 @@ asm_function DeconvDwInt8Center
1178be168c0dSopenharmony_ci     // x19 ~ x29 should be also preserved
1179be168c0dSopenharmony_ci     // whereas our coding style do not permit such amount of parameters
1180be168c0dSopenharmony_ci     sub sp, sp, #32
1181be168c0dSopenharmony_ci-    stp x19, x20, [sp], #16
1182be168c0dSopenharmony_ci-    stp x21, x22, [sp], #16
1183be168c0dSopenharmony_ci+    stp x19, x20, [sp]
1184be168c0dSopenharmony_ci+    stp x21, x22, [sp, #16]
1185be168c0dSopenharmony_ci 
1186be168c0dSopenharmony_ci-    ldr x8, [sp]
1187be168c0dSopenharmony_ci-    ldr x9, [sp, #8]
1188be168c0dSopenharmony_ci-    ldr x10, [sp, #16]
1189be168c0dSopenharmony_ci-    ldr x11, [sp, #24]
1190be168c0dSopenharmony_ci-    ldr x12, [sp, #32]
1191be168c0dSopenharmony_ci+    ldr x8, [sp, #32]
1192be168c0dSopenharmony_ci+    ldr x9, [sp, #40]
1193be168c0dSopenharmony_ci+    ldr x10, [sp, #48]
1194be168c0dSopenharmony_ci+    ldr x11, [sp, #56]
1195be168c0dSopenharmony_ci+    ldr x12, [sp, #64]
1196be168c0dSopenharmony_ci 
1197be168c0dSopenharmony_ci     LoopH:
1198be168c0dSopenharmony_ci         mov x15, x0
1199be168c0dSopenharmony_ci@@ -69,7 +69,6 @@ asm_function DeconvDwInt8Center
1200be168c0dSopenharmony_ci         subs x3, x3, #1
1201be168c0dSopenharmony_ci         bne LoopH
1202be168c0dSopenharmony_ci 
1203be168c0dSopenharmony_ci-    sub sp, sp, #32
1204be168c0dSopenharmony_ci     ldp x19, x20, [sp], #16
1205be168c0dSopenharmony_ci     ldp x21, x22, [sp], #16
1206be168c0dSopenharmony_ci     ret
1207be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatVecMulFp32.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatVecMulFp32.S
1208be168c0dSopenharmony_ciindex 71a7f0f1..36c8d8ec 100644
1209be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatVecMulFp32.S
1210be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatVecMulFp32.S
1211be168c0dSopenharmony_ci@@ -15,7 +15,7 @@
1212be168c0dSopenharmony_ci  */
1213be168c0dSopenharmony_ci #ifdef ENABLE_ARM64
1214be168c0dSopenharmony_ci #include "nnacl/assembly_global.h"
1215be168c0dSopenharmony_ci-    
1216be168c0dSopenharmony_ci+
1217be168c0dSopenharmony_ci .text
1218be168c0dSopenharmony_ci .align 5
1219be168c0dSopenharmony_ci 
1220be168c0dSopenharmony_ci@@ -30,24 +30,25 @@
1221be168c0dSopenharmony_ci 
1222be168c0dSopenharmony_ci asm_default_function MatVecMulFp32
1223be168c0dSopenharmony_ci   sub sp, sp, #128
1224be168c0dSopenharmony_ci-  st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
1225be168c0dSopenharmony_ci-  st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64
1226be168c0dSopenharmony_ci+  st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp]
1227be168c0dSopenharmony_ci+  add x9, sp, #64
1228be168c0dSopenharmony_ci+  st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x9]
1229be168c0dSopenharmony_ci 
1230be168c0dSopenharmony_ci   mov w14, #4      // sizeof(float)
1231be168c0dSopenharmony_ci   mul w8, w14, w5  // rhs depthx1 block stride
1232be168c0dSopenharmony_ci   mov w14, #4
1233be168c0dSopenharmony_ci-  mul w13, w8, w14 // rhs depthx4 block stride 
1234be168c0dSopenharmony_ci+  mul w13, w8, w14 // rhs depthx4 block stride
1235be168c0dSopenharmony_ci 
1236be168c0dSopenharmony_ci Loop:
1237be168c0dSopenharmony_ci   mov x15, x0     // reload a ptr
1238be168c0dSopenharmony_ci   mov x7, x1      // reload b ptr
1239be168c0dSopenharmony_ci   mov w9, w5      // reload depth
1240be168c0dSopenharmony_ci   cmp w6, #4
1241be168c0dSopenharmony_ci-  blt Loop1x1  
1242be168c0dSopenharmony_ci+  blt Loop1x1
1243be168c0dSopenharmony_ci 
1244be168c0dSopenharmony_ci-Loop1x4: 
1245be168c0dSopenharmony_ci-  dup v10.8h, wzr  
1246be168c0dSopenharmony_ci-  dup v11.8h, wzr  
1247be168c0dSopenharmony_ci+Loop1x4:
1248be168c0dSopenharmony_ci+  dup v10.8h, wzr
1249be168c0dSopenharmony_ci+  dup v11.8h, wzr
1250be168c0dSopenharmony_ci   dup v12.8h, wzr
1251be168c0dSopenharmony_ci   dup v13.8h, wzr
1252be168c0dSopenharmony_ci   dup v14.8h, wzr
1253be168c0dSopenharmony_ci@@ -150,7 +151,7 @@ End1x4:
1254be168c0dSopenharmony_ci 
1255be168c0dSopenharmony_ci   cbz x3, Act1x4
1256be168c0dSopenharmony_ci   ld1 {v15.4s}, [x3], #16
1257be168c0dSopenharmony_ci-  fadd v14.4s, v14.4s, v15.4s   // add bias 
1258be168c0dSopenharmony_ci+  fadd v14.4s, v14.4s, v15.4s   // add bias
1259be168c0dSopenharmony_ci 
1260be168c0dSopenharmony_ci Act1x4:
1261be168c0dSopenharmony_ci   cmp w4, #3
1262be168c0dSopenharmony_ci@@ -214,8 +215,8 @@ Depth1_1x1:
1263be168c0dSopenharmony_ci   b Depth1_1x1
1264be168c0dSopenharmony_ci 
1265be168c0dSopenharmony_ci End1x1:
1266be168c0dSopenharmony_ci-  faddp v6.4s, v4.4s, v4.4s  
1267be168c0dSopenharmony_ci-  faddp v7.4s, v6.4s, v6.4s  
1268be168c0dSopenharmony_ci+  faddp v6.4s, v4.4s, v4.4s
1269be168c0dSopenharmony_ci+  faddp v7.4s, v6.4s, v6.4s
1270be168c0dSopenharmony_ci   fadd v7.4s, v7.4s, v5.4s
1271be168c0dSopenharmony_ci 
1272be168c0dSopenharmony_ci   cbz x3, Act1x1
1273be168c0dSopenharmony_ci@@ -245,7 +246,6 @@ Write1x1:
1274be168c0dSopenharmony_ci   b Loop
1275be168c0dSopenharmony_ci 
1276be168c0dSopenharmony_ci End:
1277be168c0dSopenharmony_ci-  sub sp, sp, #128
1278be168c0dSopenharmony_ci   ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
1279be168c0dSopenharmony_ci   ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64
1280be168c0dSopenharmony_ci   ret
1281be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatVecMulPackFp32.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatVecMulPackFp32.S
1282be168c0dSopenharmony_ciindex d485b012..b013f48a 100644
1283be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatVecMulPackFp32.S
1284be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatVecMulPackFp32.S
1285be168c0dSopenharmony_ci@@ -30,8 +30,8 @@
1286be168c0dSopenharmony_ci 
1287be168c0dSopenharmony_ci asm_default_function MatVecMulPackFp32
1288be168c0dSopenharmony_ci     sub sp, sp, #16
1289be168c0dSopenharmony_ci-    stp x29, x30, [sp], #16
1290be168c0dSopenharmony_ci-  
1291be168c0dSopenharmony_ci+    stp x29, x30, [sp]
1292be168c0dSopenharmony_ci+
1293be168c0dSopenharmony_ci     dup v1.2d, xzr
1294be168c0dSopenharmony_ci     mov w7, #6
1295be168c0dSopenharmony_ci     dup v2.4s, w7
1296be168c0dSopenharmony_ci@@ -43,7 +43,7 @@ asm_default_function MatVecMulPackFp32
1297be168c0dSopenharmony_ci         st1 {v24.4s, v25.4s}, [x2], #32
1298be168c0dSopenharmony_ci         subs w6, w6, #8
1299be168c0dSopenharmony_ci         bge Loop1x8Start
1300be168c0dSopenharmony_ci-                
1301be168c0dSopenharmony_ci+
1302be168c0dSopenharmony_ci     Loop1xNStart:
1303be168c0dSopenharmony_ci         add w6, w6, #8
1304be168c0dSopenharmony_ci         cbz w6, End
1305be168c0dSopenharmony_ci@@ -59,7 +59,7 @@ asm_default_function MatVecMulPackFp32
1306be168c0dSopenharmony_ci         beq End
1307be168c0dSopenharmony_ci         st1 {v25.s}[2], [x2]
1308be168c0dSopenharmony_ci         b End
1309be168c0dSopenharmony_ci-            
1310be168c0dSopenharmony_ci+
1311be168c0dSopenharmony_ci     Loop1x4Start:
1312be168c0dSopenharmony_ci         add w6, w6, #4
1313be168c0dSopenharmony_ci         cbz w6, End
1314be168c0dSopenharmony_ci@@ -75,7 +75,7 @@ asm_default_function MatVecMulPackFp32
1315be168c0dSopenharmony_ci         beq End
1316be168c0dSopenharmony_ci         st1 {v24.s}[3], [x2], #4
1317be168c0dSopenharmony_ci         b End
1318be168c0dSopenharmony_ci-        
1319be168c0dSopenharmony_ci+
1320be168c0dSopenharmony_ci     Compute1x8Unit:
1321be168c0dSopenharmony_ci         mov x7, x0     // reload a-ptr
1322be168c0dSopenharmony_ci         mov w8, w5     // reset depth
1323be168c0dSopenharmony_ci@@ -140,7 +140,7 @@ asm_default_function MatVecMulPackFp32
1324be168c0dSopenharmony_ci                     fmax v25.4s, v25.4s, v1.4s
1325be168c0dSopenharmony_ci                 Return1x8:
1326be168c0dSopenharmony_ci                     ret
1327be168c0dSopenharmony_ci-  
1328be168c0dSopenharmony_ci+
1329be168c0dSopenharmony_ci     Compute1x4Unit:
1330be168c0dSopenharmony_ci         mov x7, x0     // reload a-ptr
1331be168c0dSopenharmony_ci         mov w8, w5     // reset depth
1332be168c0dSopenharmony_ci@@ -191,9 +191,8 @@ asm_default_function MatVecMulPackFp32
1333be168c0dSopenharmony_ci                     fmax v24.4s, v24.4s, v1.4s
1334be168c0dSopenharmony_ci                 Return1x4:
1335be168c0dSopenharmony_ci                     ret
1336be168c0dSopenharmony_ci-  
1337be168c0dSopenharmony_ci+
1338be168c0dSopenharmony_ci     End:
1339be168c0dSopenharmony_ci-        sub sp, sp, #16
1340be168c0dSopenharmony_ci         ldp x29, x30, [sp], #16
1341be168c0dSopenharmony_ci         ret
1342be168c0dSopenharmony_ci #endif
1343be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32.S
1344be168c0dSopenharmony_ciindex 67d20dcc..2dedccd0 100644
1345be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32.S
1346be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32.S
1347be168c0dSopenharmony_ci@@ -34,17 +34,18 @@
1348be168c0dSopenharmony_ci 
1349be168c0dSopenharmony_ci asm_function MatmulFloatNeon64
1350be168c0dSopenharmony_ci   sub sp, sp, #144
1351be168c0dSopenharmony_ci-  st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1352be168c0dSopenharmony_ci-  st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1353be168c0dSopenharmony_ci-  stp x19, x20, [sp], #16
1354be168c0dSopenharmony_ci+  st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
1355be168c0dSopenharmony_ci+  add x9, sp, #64
1356be168c0dSopenharmony_ci+  st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
1357be168c0dSopenharmony_ci+  stp x19, x20, [sp, #128]
1358be168c0dSopenharmony_ci 
1359be168c0dSopenharmony_ci-  ldr x9, [sp, #8]
1360be168c0dSopenharmony_ci-  ldr x14, [sp, #16]
1361be168c0dSopenharmony_ci+  ldr x9, [sp, #152]
1362be168c0dSopenharmony_ci+  ldr x14, [sp, #160]
1363be168c0dSopenharmony_ci 
1364be168c0dSopenharmony_ci   mov w19, #32 // sizeof(float) * 8
1365be168c0dSopenharmony_ci   mul w15, w5, w19 // block stride of lhs/rhs: sizeof(float) * 8 * depth
1366be168c0dSopenharmony_ci   mov x19, #4
1367be168c0dSopenharmony_ci-  ldr x17, [sp]
1368be168c0dSopenharmony_ci+  ldr x17, [sp, #144]
1369be168c0dSopenharmony_ci   cbz x14, NoWinoSteps
1370be168c0dSopenharmony_ci   mul x8, x7, x17
1371be168c0dSopenharmony_ci   mov x11, #8
1372be168c0dSopenharmony_ci@@ -779,7 +780,6 @@ NoDstStep:
1373be168c0dSopenharmony_ci   bgt L1
1374be168c0dSopenharmony_ci 
1375be168c0dSopenharmony_ci End1:
1376be168c0dSopenharmony_ci-  sub sp, sp, #144
1377be168c0dSopenharmony_ci   ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1378be168c0dSopenharmony_ci   ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1379be168c0dSopenharmony_ci   ldp x19, x20, [sp], #16
1380be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32Opt.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32Opt.S
1381be168c0dSopenharmony_ciindex 6937f4ba..51d107c8 100644
1382be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32Opt.S
1383be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32Opt.S
1384be168c0dSopenharmony_ci@@ -19,7 +19,7 @@
1385be168c0dSopenharmony_ci .text
1386be168c0dSopenharmony_ci .align 5
1387be168c0dSopenharmony_ci 
1388be168c0dSopenharmony_ci-// void MatmulFloatNeon64(const float *a, const float *b, float *c, const float *bias, int act_type, int depth
1389be168c0dSopenharmony_ci+// void MatmulFloatNeon64Opt(const float *a, const float *b, float *c, const float *bias, int act_type, int depth
1390be168c0dSopenharmony_ci //                        int row, int col, size_t stride, size_t writeMode)
1391be168c0dSopenharmony_ci // x0: a
1392be168c0dSopenharmony_ci // x1: b
1393be168c0dSopenharmony_ci@@ -34,13 +34,14 @@
1394be168c0dSopenharmony_ci 
1395be168c0dSopenharmony_ci asm_function MatmulFloatNeon64Opt
1396be168c0dSopenharmony_ci     sub sp, sp, #160
1397be168c0dSopenharmony_ci-    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1398be168c0dSopenharmony_ci-    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1399be168c0dSopenharmony_ci-    stp x19, x20, [sp], #16
1400be168c0dSopenharmony_ci-    stp x21, x22, [sp], #16
1401be168c0dSopenharmony_ci+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
1402be168c0dSopenharmony_ci+    add x9, sp, #64
1403be168c0dSopenharmony_ci+    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
1404be168c0dSopenharmony_ci+    stp x19, x20, [sp, #128]
1405be168c0dSopenharmony_ci+    stp x21, x22, [sp, #144]
1406be168c0dSopenharmony_ci 
1407be168c0dSopenharmony_ci-    ldr x8, [sp]
1408be168c0dSopenharmony_ci-    ldr x9, [sp, #8]
1409be168c0dSopenharmony_ci+    ldr x8, [sp, #160]
1410be168c0dSopenharmony_ci+    ldr x9, [sp, #168]
1411be168c0dSopenharmony_ci 
1412be168c0dSopenharmony_ci     mov x21, #48 // sizeof(float) * 12
1413be168c0dSopenharmony_ci     mul x17, x5, x21 // block stride of lhs/rhs: sizeof(float) * 12 * depth
1414be168c0dSopenharmony_ci@@ -1659,7 +1660,6 @@ LoopColEnd:
1415be168c0dSopenharmony_ci         subs x6, x6, #12
1416be168c0dSopenharmony_ci         bgt LoopRowStart
1417be168c0dSopenharmony_ci 
1418be168c0dSopenharmony_ci-  sub sp, sp, #160
1419be168c0dSopenharmony_ci   ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1420be168c0dSopenharmony_ci   ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1421be168c0dSopenharmony_ci   ldp x19, x20, [sp], #16
1422be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow12.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow12.S
1423be168c0dSopenharmony_ciindex c9151a99..05465bd1 100644
1424be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow12.S
1425be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow12.S
1426be168c0dSopenharmony_ci@@ -34,13 +34,14 @@
1427be168c0dSopenharmony_ci 
1428be168c0dSopenharmony_ci asm_function MatmulFloatNeon64OptRow12
1429be168c0dSopenharmony_ci     sub sp, sp, #160
1430be168c0dSopenharmony_ci-    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1431be168c0dSopenharmony_ci-    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1432be168c0dSopenharmony_ci-    stp x19, x20, [sp], #16
1433be168c0dSopenharmony_ci-    stp x21, x22, [sp], #16
1434be168c0dSopenharmony_ci+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
1435be168c0dSopenharmony_ci+    add x9, sp, #64
1436be168c0dSopenharmony_ci+    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
1437be168c0dSopenharmony_ci+    stp x19, x20, [sp, #128]
1438be168c0dSopenharmony_ci+    stp x21, x22, [sp, #144]
1439be168c0dSopenharmony_ci 
1440be168c0dSopenharmony_ci-    ldr x8, [sp]
1441be168c0dSopenharmony_ci-    ldr x9, [sp, #8]
1442be168c0dSopenharmony_ci+    ldr x8, [sp, #160]
1443be168c0dSopenharmony_ci+    ldr x9, [sp, #168]
1444be168c0dSopenharmony_ci 
1445be168c0dSopenharmony_ci     mov x21, #48 // sizeof(float) * 12
1446be168c0dSopenharmony_ci     mul x17, x5, x21 // block stride of lhs/rhs: sizeof(float) * 12 * depth
1447be168c0dSopenharmony_ci@@ -1220,7 +1221,6 @@ LoopColEnd:
1448be168c0dSopenharmony_ci         subs x6, x6, #12
1449be168c0dSopenharmony_ci         bgt LoopRow
1450be168c0dSopenharmony_ci 
1451be168c0dSopenharmony_ci-  sub sp, sp, #160
1452be168c0dSopenharmony_ci   ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1453be168c0dSopenharmony_ci   ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1454be168c0dSopenharmony_ci   ldp x19, x20, [sp], #16
1455be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow4.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow4.S
1456be168c0dSopenharmony_ciindex 0cc49fb9..b984c494 100644
1457be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow4.S
1458be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow4.S
1459be168c0dSopenharmony_ci@@ -19,7 +19,7 @@
1460be168c0dSopenharmony_ci .text
1461be168c0dSopenharmony_ci .align 5
1462be168c0dSopenharmony_ci 
1463be168c0dSopenharmony_ci-// void MatmulFloatNeon64(const float *a, const float *b, float *c, const float *bias, int act_type, int depth
1464be168c0dSopenharmony_ci+// void MatmulFloatNeon64OptRow4(const float *a, const float *b, float *c, const float *bias, int act_type, int depth
1465be168c0dSopenharmony_ci //                        int row, int col, size_t stride, size_t writeMode)
1466be168c0dSopenharmony_ci // x0: a
1467be168c0dSopenharmony_ci // x1: b
1468be168c0dSopenharmony_ci@@ -34,13 +34,14 @@
1469be168c0dSopenharmony_ci 
1470be168c0dSopenharmony_ci asm_function MatmulFloatNeon64OptRow4
1471be168c0dSopenharmony_ci     sub sp, sp, #160
1472be168c0dSopenharmony_ci-    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1473be168c0dSopenharmony_ci-    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1474be168c0dSopenharmony_ci-    stp x19, x20, [sp], #16
1475be168c0dSopenharmony_ci-    stp x21, x22, [sp], #16
1476be168c0dSopenharmony_ci+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
1477be168c0dSopenharmony_ci+    add x9, sp, #64
1478be168c0dSopenharmony_ci+    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
1479be168c0dSopenharmony_ci+    stp x19, x20, [sp, #128]
1480be168c0dSopenharmony_ci+    stp x21, x22, [sp, #144]
1481be168c0dSopenharmony_ci 
1482be168c0dSopenharmony_ci-    ldr x8, [sp]
1483be168c0dSopenharmony_ci-    ldr x9, [sp, #8]
1484be168c0dSopenharmony_ci+    ldr x8, [sp, #160]
1485be168c0dSopenharmony_ci+    ldr x9, [sp, #168]
1486be168c0dSopenharmony_ci 
1487be168c0dSopenharmony_ci     mov x21, #48 // sizeof(float) * 12
1488be168c0dSopenharmony_ci 
1489be168c0dSopenharmony_ci@@ -588,7 +589,6 @@ LoopColEnd:
1490be168c0dSopenharmony_ci         subs x6, x6, #12
1491be168c0dSopenharmony_ci         bgt LoopRow4
1492be168c0dSopenharmony_ci 
1493be168c0dSopenharmony_ci-  sub sp, sp, #160
1494be168c0dSopenharmony_ci   ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1495be168c0dSopenharmony_ci   ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1496be168c0dSopenharmony_ci   ldp x19, x20, [sp], #16
1497be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow8.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow8.S
1498be168c0dSopenharmony_ciindex a9e42a54..c5b260c0 100644
1499be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow8.S
1500be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow8.S
1501be168c0dSopenharmony_ci@@ -34,13 +34,14 @@
1502be168c0dSopenharmony_ci 
1503be168c0dSopenharmony_ci asm_function MatmulFloatNeon64OptRow8
1504be168c0dSopenharmony_ci     sub sp, sp, #160
1505be168c0dSopenharmony_ci-    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1506be168c0dSopenharmony_ci-    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1507be168c0dSopenharmony_ci-    stp x19, x20, [sp], #16
1508be168c0dSopenharmony_ci-    stp x21, x22, [sp], #16
1509be168c0dSopenharmony_ci+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
1510be168c0dSopenharmony_ci+    add x9, sp, #64
1511be168c0dSopenharmony_ci+    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
1512be168c0dSopenharmony_ci+    stp x19, x20, [sp, #128]
1513be168c0dSopenharmony_ci+    stp x21, x22, [sp, #144]
1514be168c0dSopenharmony_ci 
1515be168c0dSopenharmony_ci-    ldr x8, [sp]
1516be168c0dSopenharmony_ci-    ldr x9, [sp, #8]
1517be168c0dSopenharmony_ci+    ldr x8, [sp, #160]
1518be168c0dSopenharmony_ci+    ldr x9, [sp, #168]
1519be168c0dSopenharmony_ci 
1520be168c0dSopenharmony_ci     mov x21, #48 // sizeof(float) * 12
1521be168c0dSopenharmony_ci     mul x17, x5, x21 // block stride of lhs/rhs: sizeof(float) * 12 * depth
1522be168c0dSopenharmony_ci@@ -902,7 +903,6 @@ LoopColEnd:
1523be168c0dSopenharmony_ci         subs x6, x6, #12
1524be168c0dSopenharmony_ci         bgt LoopCol8
1525be168c0dSopenharmony_ci 
1526be168c0dSopenharmony_ci-  sub sp, sp, #160
1527be168c0dSopenharmony_ci   ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1528be168c0dSopenharmony_ci   ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1529be168c0dSopenharmony_ci   ldp x19, x20, [sp], #16
1530be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulInt8.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulInt8.S
1531be168c0dSopenharmony_ciindex a0e94c5f..731bac4b 100644
1532be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulInt8.S
1533be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulInt8.S
1534be168c0dSopenharmony_ci@@ -44,24 +44,25 @@
1535be168c0dSopenharmony_ci 
1536be168c0dSopenharmony_ci asm_function MatmulInt8Neon64
1537be168c0dSopenharmony_ci   sub sp, sp, #208
1538be168c0dSopenharmony_ci-  st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1539be168c0dSopenharmony_ci-  st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1540be168c0dSopenharmony_ci-  stp x19, x20, [sp], #16
1541be168c0dSopenharmony_ci-  stp x21, x22, [sp], #16
1542be168c0dSopenharmony_ci-  stp x23, x24, [sp], #16
1543be168c0dSopenharmony_ci-  stp x25, x26, [sp], #16
1544be168c0dSopenharmony_ci-  stp x27, x28, [sp], #16
1545be168c0dSopenharmony_ci-
1546be168c0dSopenharmony_ci-  ldr w8, [sp]
1547be168c0dSopenharmony_ci-  ldr w9, [sp, #8]
1548be168c0dSopenharmony_ci-  ldr w10, [sp, #16]
1549be168c0dSopenharmony_ci-  ldr x11, [sp, #24]
1550be168c0dSopenharmony_ci-  ldr x12, [sp, #32]
1551be168c0dSopenharmony_ci-  ldr x13, [sp, #40]
1552be168c0dSopenharmony_ci-  ldr w14, [sp, #48]
1553be168c0dSopenharmony_ci-  ldr w15, [sp, #56]
1554be168c0dSopenharmony_ci-  ldr w24, [sp, #64]
1555be168c0dSopenharmony_ci-  ldr w27, [sp, #72]
1556be168c0dSopenharmony_ci+  st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
1557be168c0dSopenharmony_ci+  add x9, sp, #64
1558be168c0dSopenharmony_ci+  st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
1559be168c0dSopenharmony_ci+  stp x19, x20, [sp, #128]
1560be168c0dSopenharmony_ci+  stp x21, x22, [sp, #144]
1561be168c0dSopenharmony_ci+  stp x23, x24, [sp, #160]
1562be168c0dSopenharmony_ci+  stp x25, x26, [sp, #176]
1563be168c0dSopenharmony_ci+  stp x27, x28, [sp, #192]
1564be168c0dSopenharmony_ci+
1565be168c0dSopenharmony_ci+  ldr w8, [sp, #208]
1566be168c0dSopenharmony_ci+  ldr w9, [sp, #216]
1567be168c0dSopenharmony_ci+  ldr w10, [sp, #224]
1568be168c0dSopenharmony_ci+  ldr x11, [sp, #232]
1569be168c0dSopenharmony_ci+  ldr x12, [sp, #240]
1570be168c0dSopenharmony_ci+  ldr x13, [sp, #248]
1571be168c0dSopenharmony_ci+  ldr w14, [sp, #256]
1572be168c0dSopenharmony_ci+  ldr w15, [sp, #264]
1573be168c0dSopenharmony_ci+  ldr w24, [sp, #272]
1574be168c0dSopenharmony_ci+  ldr w27, [sp, #280]
1575be168c0dSopenharmony_ci 
1576be168c0dSopenharmony_ci   mov w17, #4       // sizeof(int8)*4
1577be168c0dSopenharmony_ci   mul w21, w5, w17  // the stride of a/b: sizeof(int8)*4*deep16
1578be168c0dSopenharmony_ci@@ -408,7 +409,6 @@ PerTEnd2:
1579be168c0dSopenharmony_ci   b L1
1580be168c0dSopenharmony_ci 
1581be168c0dSopenharmony_ci End1:
1582be168c0dSopenharmony_ci-  sub sp, sp, #208
1583be168c0dSopenharmony_ci   ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1584be168c0dSopenharmony_ci   ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1585be168c0dSopenharmony_ci   ldp x19, x20, [sp], #16
1586be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulInt8Opt.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulInt8Opt.S
1587be168c0dSopenharmony_ciindex 64be8a14..a54ee5b8 100644
1588be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulInt8Opt.S
1589be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulInt8Opt.S
1590be168c0dSopenharmony_ci@@ -43,23 +43,24 @@
1591be168c0dSopenharmony_ci 
1592be168c0dSopenharmony_ci asm_function MatmulInt8Opt
1593be168c0dSopenharmony_ci     sub sp, sp, #224
1594be168c0dSopenharmony_ci-    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1595be168c0dSopenharmony_ci-    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1596be168c0dSopenharmony_ci-    stp x19, x20, [sp], #16
1597be168c0dSopenharmony_ci-    stp x21, x22, [sp], #16
1598be168c0dSopenharmony_ci-    stp x23, x24, [sp], #16
1599be168c0dSopenharmony_ci-    stp x25, x26, [sp], #16
1600be168c0dSopenharmony_ci-    stp x27, x28, [sp], #16
1601be168c0dSopenharmony_ci-    stp x29, x30, [sp], #16
1602be168c0dSopenharmony_ci-
1603be168c0dSopenharmony_ci-    ldr w8, [sp]
1604be168c0dSopenharmony_ci-    ldr w9, [sp, #8]
1605be168c0dSopenharmony_ci-    ldr w10, [sp, #16]
1606be168c0dSopenharmony_ci-    ldr x11, [sp, #24]
1607be168c0dSopenharmony_ci-    ldr x12, [sp, #32]
1608be168c0dSopenharmony_ci-    ldr x13, [sp, #40]
1609be168c0dSopenharmony_ci-    ldr x14, [sp, #48]
1610be168c0dSopenharmony_ci-    ldr x15, [sp, #56]
1611be168c0dSopenharmony_ci+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
1612be168c0dSopenharmony_ci+    add x9, sp, #64
1613be168c0dSopenharmony_ci+    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
1614be168c0dSopenharmony_ci+    stp x19, x20, [sp, #128]
1615be168c0dSopenharmony_ci+    stp x21, x22, [sp, #144]
1616be168c0dSopenharmony_ci+    stp x23, x24, [sp, #160]
1617be168c0dSopenharmony_ci+    stp x25, x26, [sp, #176]
1618be168c0dSopenharmony_ci+    stp x27, x28, [sp, #192]
1619be168c0dSopenharmony_ci+    stp x29, x30, [sp, #208]
1620be168c0dSopenharmony_ci+
1621be168c0dSopenharmony_ci+    ldr w8, [sp, #224]
1622be168c0dSopenharmony_ci+    ldr w9, [sp, #232]
1623be168c0dSopenharmony_ci+    ldr w10, [sp, #240]
1624be168c0dSopenharmony_ci+    ldr x11, [sp, #248]
1625be168c0dSopenharmony_ci+    ldr x12, [sp, #256]
1626be168c0dSopenharmony_ci+    ldr x13, [sp, #264]
1627be168c0dSopenharmony_ci+    ldr x14, [sp, #272]
1628be168c0dSopenharmony_ci+    ldr x15, [sp, #280]
1629be168c0dSopenharmony_ci 
1630be168c0dSopenharmony_ci     mov x23, #4
1631be168c0dSopenharmony_ci     mul x23, x23, x5  // lhs step
1632be168c0dSopenharmony_ci@@ -70,7 +71,7 @@ LoopRow:
1633be168c0dSopenharmony_ci     mov x17, x4 // reload rhs col
1634be168c0dSopenharmony_ci     mov x29, x7 // reload bias ptr
1635be168c0dSopenharmony_ci     mov x27, x2 // reload dst ptr
1636be168c0dSopenharmony_ci-    ldr x28, [sp, #64] // reload filter_zp
1637be168c0dSopenharmony_ci+    ldr x28, [sp, #288] // reload filter_zp
1638be168c0dSopenharmony_ci 
1639be168c0dSopenharmony_ci     LoopCol:
1640be168c0dSopenharmony_ci         mov x25, x6 // reload a_sums ptr
1641be168c0dSopenharmony_ci@@ -334,16 +335,15 @@ LoopRow:
1642be168c0dSopenharmony_ci LoopColEnd:
1643be168c0dSopenharmony_ci     subs x3, x3, #4
1644be168c0dSopenharmony_ci     ble LoopRowEnd
1645be168c0dSopenharmony_ci-    ldr x11, [sp, #24]
1646be168c0dSopenharmony_ci-    ldr x12, [sp, #32]
1647be168c0dSopenharmony_ci-    ldr x13, [sp, #40]
1648be168c0dSopenharmony_ci+    ldr x11, [sp, #248]
1649be168c0dSopenharmony_ci+    ldr x12, [sp, #256]
1650be168c0dSopenharmony_ci+    ldr x13, [sp, #264]
1651be168c0dSopenharmony_ci     add x6, x6, #16
1652be168c0dSopenharmony_ci     add x0, x0, x23
1653be168c0dSopenharmony_ci     add x2, x2, x24
1654be168c0dSopenharmony_ci     b LoopRow
1655be168c0dSopenharmony_ci 
1656be168c0dSopenharmony_ci LoopRowEnd:
1657be168c0dSopenharmony_ci-    sub sp, sp, #224
1658be168c0dSopenharmony_ci     ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1659be168c0dSopenharmony_ci     ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1660be168c0dSopenharmony_ci     ldp x19, x20, [sp], #16
1661be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulR4Int8.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulR4Int8.S
1662be168c0dSopenharmony_ciindex fe5207ad..adb0a42c 100644
1663be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulR4Int8.S
1664be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulR4Int8.S
1665be168c0dSopenharmony_ci@@ -33,9 +33,10 @@
1666be168c0dSopenharmony_ci 
1667be168c0dSopenharmony_ci asm_function MatMulR4Int8Neon64
1668be168c0dSopenharmony_ci   sub sp, sp, #144
1669be168c0dSopenharmony_ci-  st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1670be168c0dSopenharmony_ci-  st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1671be168c0dSopenharmony_ci-  stp x19, x20, [sp], #16
1672be168c0dSopenharmony_ci+  st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
1673be168c0dSopenharmony_ci+  add x9, sp, #64
1674be168c0dSopenharmony_ci+  st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
1675be168c0dSopenharmony_ci+  stp x19, x20, [sp, #128]
1676be168c0dSopenharmony_ci 
1677be168c0dSopenharmony_ci   mov w15, #0       // b col index
1678be168c0dSopenharmony_ci   mov w16, #0       // a row index
1679be168c0dSopenharmony_ci@@ -185,7 +186,6 @@ End2:
1680be168c0dSopenharmony_ci   b L1
1681be168c0dSopenharmony_ci 
1682be168c0dSopenharmony_ci End1:
1683be168c0dSopenharmony_ci-  sub sp, sp, #144
1684be168c0dSopenharmony_ci   ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1685be168c0dSopenharmony_ci   ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1686be168c0dSopenharmony_ci   ldp x19, x20, [sp], #16
1687be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulWinogradFp32.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulWinogradFp32.S
1688be168c0dSopenharmony_ciindex 0b814ce4..23032ab9 100644
1689be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulWinogradFp32.S
1690be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulWinogradFp32.S
1691be168c0dSopenharmony_ci@@ -27,9 +27,9 @@ asm_function MatrixMultiplyWinograd
1692be168c0dSopenharmony_ci     // x19 ~ x29 should be also preserved
1693be168c0dSopenharmony_ci     // whereas our coding style do not permit such amount of parameters
1694be168c0dSopenharmony_ci     sub sp, sp, #48
1695be168c0dSopenharmony_ci-    st1 {v8.4s}, [sp], #16
1696be168c0dSopenharmony_ci-    stp x19, x20, [sp], #16
1697be168c0dSopenharmony_ci-    stp x21, x22, [sp], #16
1698be168c0dSopenharmony_ci+    st1 {v8.4s}, [sp]
1699be168c0dSopenharmony_ci+    stp x19, x20, [sp, #16]
1700be168c0dSopenharmony_ci+    stp x21, x22, [sp, #32]
1701be168c0dSopenharmony_ci     mov x8, #4
1702be168c0dSopenharmony_ci     mul x10, x5, x8
1703be168c0dSopenharmony_ci     mov x17, x3  // m
1704be168c0dSopenharmony_ci@@ -176,7 +176,6 @@ asm_function MatrixMultiplyWinograd
1705be168c0dSopenharmony_ci             add x0, x0, x21
1706be168c0dSopenharmony_ci             b LoopM
1707be168c0dSopenharmony_ci     EndLoopM:
1708be168c0dSopenharmony_ci-        sub sp, sp, #48
1709be168c0dSopenharmony_ci         ld1 {v8.4s}, [sp], #16
1710be168c0dSopenharmony_ci         ldp x19, x20, [sp], #16
1711be168c0dSopenharmony_ci         ldp x21, x22, [sp], #16
1712be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/PostFuncBiasReluC8.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/PostFuncBiasReluC8.S
1713be168c0dSopenharmony_ciindex 5355d302..1392ab4a 100644
1714be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/PostFuncBiasReluC8.S
1715be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/PostFuncBiasReluC8.S
1716be168c0dSopenharmony_ci@@ -34,8 +34,9 @@
1717be168c0dSopenharmony_ci 
1718be168c0dSopenharmony_ci asm_function PostFuncBiasReluC8
1719be168c0dSopenharmony_ci   sub sp, sp, #128
1720be168c0dSopenharmony_ci-  st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1721be168c0dSopenharmony_ci-  st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1722be168c0dSopenharmony_ci+  st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
1723be168c0dSopenharmony_ci+  add x9, sp, #64
1724be168c0dSopenharmony_ci+  st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
1725be168c0dSopenharmony_ci 
1726be168c0dSopenharmony_ci   movi v26.4s, #6
1727be168c0dSopenharmony_ci   scvtf v26.4s, v26.4s
1728be168c0dSopenharmony_ci@@ -546,7 +547,6 @@ Loop_C1_7_Write:
1729be168c0dSopenharmony_ci   b Loop_C1_7_Write
1730be168c0dSopenharmony_ci 
1731be168c0dSopenharmony_ci End:
1732be168c0dSopenharmony_ci-  sub sp, sp, #128
1733be168c0dSopenharmony_ci   ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1734be168c0dSopenharmony_ci   ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1735be168c0dSopenharmony_ci   ret
1736be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/PostFuncInt8C4Neon64.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/PostFuncInt8C4Neon64.S
1737be168c0dSopenharmony_ciindex 0818d74e..a240b64d 100644
1738be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/PostFuncInt8C4Neon64.S
1739be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/PostFuncInt8C4Neon64.S
1740be168c0dSopenharmony_ci@@ -54,14 +54,14 @@
1741be168c0dSopenharmony_ci 
1742be168c0dSopenharmony_ci asm_function PostFuncInt8C4Neon64
1743be168c0dSopenharmony_ci   sub sp, sp, #16
1744be168c0dSopenharmony_ci-  stp x24, x25, [sp], #16
1745be168c0dSopenharmony_ci+  stp x24, x25, [sp]
1746be168c0dSopenharmony_ci 
1747be168c0dSopenharmony_ci-  ldr w8, [sp]
1748be168c0dSopenharmony_ci-  ldr w9, [sp, #8]
1749be168c0dSopenharmony_ci-  ldr w10, [sp, #16]
1750be168c0dSopenharmony_ci-  ldr w11, [sp, #24]
1751be168c0dSopenharmony_ci-  ldr w12, [sp, #32]
1752be168c0dSopenharmony_ci-  ldr w13, [sp, #40]
1753be168c0dSopenharmony_ci+  ldr w8, [sp, #16]
1754be168c0dSopenharmony_ci+  ldr w9, [sp, #24]
1755be168c0dSopenharmony_ci+  ldr w10, [sp, #32]
1756be168c0dSopenharmony_ci+  ldr w11, [sp, #40]
1757be168c0dSopenharmony_ci+  ldr w12, [sp, #48]
1758be168c0dSopenharmony_ci+  ldr w13, [sp, #56]
1759be168c0dSopenharmony_ci 
1760be168c0dSopenharmony_ci   dup v26.4s, w7
1761be168c0dSopenharmony_ci   dup v27.4s, w8
1762be168c0dSopenharmony_ci@@ -254,7 +254,6 @@ Loop_C1_3:
1763be168c0dSopenharmony_ci 
1764be168c0dSopenharmony_ci 
1765be168c0dSopenharmony_ci End:
1766be168c0dSopenharmony_ci-  sub sp, sp, #16
1767be168c0dSopenharmony_ci   ldp x24, x25, [sp], #16
1768be168c0dSopenharmony_ci   ret
1769be168c0dSopenharmony_ci #endif
1770be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/SPMM8x8Fp32.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/SPMM8x8Fp32.S
1771be168c0dSopenharmony_ciindex cfa9bdf8..614d83f8 100644
1772be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/SPMM8x8Fp32.S
1773be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/SPMM8x8Fp32.S
1774be168c0dSopenharmony_ci@@ -55,9 +55,10 @@
1775be168c0dSopenharmony_ci 
1776be168c0dSopenharmony_ci asm_function SPMM8x8Fp32
1777be168c0dSopenharmony_ci     sub sp, sp, #144
1778be168c0dSopenharmony_ci-    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1779be168c0dSopenharmony_ci-    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1780be168c0dSopenharmony_ci-    stp x19, x20, [sp], #16
1781be168c0dSopenharmony_ci+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
1782be168c0dSopenharmony_ci+    add x9, sp, #64
1783be168c0dSopenharmony_ci+    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
1784be168c0dSopenharmony_ci+    stp x19, x20, [sp, #128]
1785be168c0dSopenharmony_ci 
1786be168c0dSopenharmony_ci     // init output with bias
1787be168c0dSopenharmony_ci     ldr w8, [x5], #4
1788be168c0dSopenharmony_ci@@ -286,7 +287,6 @@ WRITE_OUT:
1789be168c0dSopenharmony_ci     st1 {v14.4s, v15.4s}, [x4]
1790be168c0dSopenharmony_ci 
1791be168c0dSopenharmony_ci End:
1792be168c0dSopenharmony_ci-  sub sp, sp, #144
1793be168c0dSopenharmony_ci   ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1794be168c0dSopenharmony_ci   ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1795be168c0dSopenharmony_ci   ldp x19, x20, [sp], #16
1796be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/TiledC4MatmulFp32.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/TiledC4MatmulFp32.S
1797be168c0dSopenharmony_ciindex 5987e68a..e0efc7b2 100644
1798be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/TiledC4MatmulFp32.S
1799be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/TiledC4MatmulFp32.S
1800be168c0dSopenharmony_ci@@ -29,8 +29,9 @@ asm_function TiledC4MatmulFp32
1801be168c0dSopenharmony_ci //x5: oc4
1802be168c0dSopenharmony_ci 
1803be168c0dSopenharmony_ci sub sp, sp, #128
1804be168c0dSopenharmony_ci-st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1805be168c0dSopenharmony_ci-st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1806be168c0dSopenharmony_ci+st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
1807be168c0dSopenharmony_ci+add x9, sp, #64
1808be168c0dSopenharmony_ci+st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
1809be168c0dSopenharmony_ci 
1810be168c0dSopenharmony_ci mov x7, #4 //sizeof(float)
1811be168c0dSopenharmony_ci mul x3, x3, x7
1812be168c0dSopenharmony_ci@@ -272,7 +273,6 @@ LoopOcHalf:
1813be168c0dSopenharmony_ci     st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x0], #64
1814be168c0dSopenharmony_ci 
1815be168c0dSopenharmony_ci LoopOcEnd:
1816be168c0dSopenharmony_ci-    sub sp, sp, #128
1817be168c0dSopenharmony_ci     ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1818be168c0dSopenharmony_ci     ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1819be168c0dSopenharmony_ci     ret
1820be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/WinogradTransLeft.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/WinogradTransLeft.S
1821be168c0dSopenharmony_ciindex 4a26b251..243b19de 100644
1822be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/WinogradTransLeft.S
1823be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/WinogradTransLeft.S
1824be168c0dSopenharmony_ci@@ -30,7 +30,7 @@ asm_function WinogradTransLeft
1825be168c0dSopenharmony_ci //x6:length
1826be168c0dSopenharmony_ci 
1827be168c0dSopenharmony_ci sub sp, sp, #32
1828be168c0dSopenharmony_ci-stp x19, x20, [sp], #32
1829be168c0dSopenharmony_ci+stp x19, x20, [sp]
1830be168c0dSopenharmony_ci 
1831be168c0dSopenharmony_ci mov x8, #16 // 4 * sizeof(float)
1832be168c0dSopenharmony_ci mul x8, x6, x8
1833be168c0dSopenharmony_ci@@ -152,7 +152,6 @@ LoopH:
1834be168c0dSopenharmony_ci     subs x4, x4, #1
1835be168c0dSopenharmony_ci     bne LoopH
1836be168c0dSopenharmony_ci 
1837be168c0dSopenharmony_ci-    sub sp, sp, #32
1838be168c0dSopenharmony_ci     ldp x19, x20, [sp], #32
1839be168c0dSopenharmony_ci     ret
1840be168c0dSopenharmony_ci 
1841be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/WinogradTransRight.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/WinogradTransRight.S
1842be168c0dSopenharmony_ciindex 931fa016..95ee50a5 100644
1843be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/WinogradTransRight.S
1844be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/WinogradTransRight.S
1845be168c0dSopenharmony_ci@@ -30,7 +30,7 @@ asm_function WinogradTransRight
1846be168c0dSopenharmony_ci //x6: length
1847be168c0dSopenharmony_ci 
1848be168c0dSopenharmony_ci sub sp, sp, #16
1849be168c0dSopenharmony_ci-stp x19, x20, [sp], #16
1850be168c0dSopenharmony_ci+stp x19, x20, [sp]
1851be168c0dSopenharmony_ci 
1852be168c0dSopenharmony_ci mov x8, #16 // 4 * sizeof(float)
1853be168c0dSopenharmony_ci mul x8, x6, x8
1854be168c0dSopenharmony_ci@@ -155,7 +155,6 @@ LoopH:
1855be168c0dSopenharmony_ci     subs x4, x4, #1
1856be168c0dSopenharmony_ci     bne LoopH
1857be168c0dSopenharmony_ci 
1858be168c0dSopenharmony_ci-    sub sp, sp, #16
1859be168c0dSopenharmony_ci     ldp x19, x20, [sp], #16
1860be168c0dSopenharmony_ci     ret
1861be168c0dSopenharmony_ci #endif
1862be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/ConvDwFp16Center.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/ConvDwFp16Center.S
1863be168c0dSopenharmony_ciindex 221a1609..56f03dbd 100644
1864be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/ConvDwFp16Center.S
1865be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/ConvDwFp16Center.S
1866be168c0dSopenharmony_ci@@ -31,21 +31,22 @@ asm_function ConvDwFp16Center
1867be168c0dSopenharmony_ci     // x19 ~ x29 should be also preserved
1868be168c0dSopenharmony_ci     // whereas our coding style do not permit such amount of parameters
1869be168c0dSopenharmony_ci     sub sp, sp, #192
1870be168c0dSopenharmony_ci-    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1871be168c0dSopenharmony_ci-    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1872be168c0dSopenharmony_ci-    stp x19, x20, [sp], #16
1873be168c0dSopenharmony_ci-    stp x21, x22, [sp], #16
1874be168c0dSopenharmony_ci-    stp x23, x24, [sp], #16
1875be168c0dSopenharmony_ci-    stp x25, x26, [sp], #16
1876be168c0dSopenharmony_ci+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
1877be168c0dSopenharmony_ci+    add x9, sp, #64
1878be168c0dSopenharmony_ci+    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
1879be168c0dSopenharmony_ci+    stp x19, x20, [sp, #128]
1880be168c0dSopenharmony_ci+    stp x21, x22, [sp, #144]
1881be168c0dSopenharmony_ci+    stp x23, x24, [sp, #160]
1882be168c0dSopenharmony_ci+    stp x25, x26, [sp, #176]
1883be168c0dSopenharmony_ci 
1884be168c0dSopenharmony_ci-    ldr x8, [sp]
1885be168c0dSopenharmony_ci-    ldr x9, [sp, #8]
1886be168c0dSopenharmony_ci-    ldr x10, [sp, #16]
1887be168c0dSopenharmony_ci-    ldr x11, [sp, #24]
1888be168c0dSopenharmony_ci-    ldr x12, [sp, #32]
1889be168c0dSopenharmony_ci-    ldr x13, [sp, #40]
1890be168c0dSopenharmony_ci-    ldr x14, [sp, #48]
1891be168c0dSopenharmony_ci-    ldr x15, [sp, #56]
1892be168c0dSopenharmony_ci+    ldr x8, [sp, #192]
1893be168c0dSopenharmony_ci+    ldr x9, [sp, #200]
1894be168c0dSopenharmony_ci+    ldr x10, [sp, #208]
1895be168c0dSopenharmony_ci+    ldr x11, [sp, #216]
1896be168c0dSopenharmony_ci+    ldr x12, [sp, #224]
1897be168c0dSopenharmony_ci+    ldr x13, [sp, #232]
1898be168c0dSopenharmony_ci+    ldr x14, [sp, #240]
1899be168c0dSopenharmony_ci+    ldr x15, [sp, #248]
1900be168c0dSopenharmony_ci 
1901be168c0dSopenharmony_ci     ld1 {v24.8h}, [x3]
1902be168c0dSopenharmony_ci     movi v26.8h, #0x46, lsl #8
1903be168c0dSopenharmony_ci@@ -301,7 +302,6 @@ asm_function ConvDwFp16Center
1904be168c0dSopenharmony_ci         subs x4, x4, #1
1905be168c0dSopenharmony_ci         bne LoopH
1906be168c0dSopenharmony_ci 
1907be168c0dSopenharmony_ci-    sub sp, sp, #192
1908be168c0dSopenharmony_ci     ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
1909be168c0dSopenharmony_ci     ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
1910be168c0dSopenharmony_ci     ldp x19, x20, [sp], #16
1911be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/DeconvDwFp16Center.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/DeconvDwFp16Center.S
1912be168c0dSopenharmony_ciindex 1266b160..bb37a913 100644
1913be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/DeconvDwFp16Center.S
1914be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/DeconvDwFp16Center.S
1915be168c0dSopenharmony_ci@@ -30,14 +30,14 @@ asm_function DeconvDwFp16Center
1916be168c0dSopenharmony_ci     // x19 ~ x29 should be also preserved
1917be168c0dSopenharmony_ci     // whereas our coding style do not permit such amount of parameters
1918be168c0dSopenharmony_ci     sub sp, sp, #32
1919be168c0dSopenharmony_ci-    stp x19, x20, [sp], #16
1920be168c0dSopenharmony_ci-    stp x21, x22, [sp], #16
1921be168c0dSopenharmony_ci+    stp x19, x20, [sp]
1922be168c0dSopenharmony_ci+    stp x21, x22, [sp, #16]
1923be168c0dSopenharmony_ci 
1924be168c0dSopenharmony_ci-    ldr x8, [sp]
1925be168c0dSopenharmony_ci-    ldr x9, [sp, #8]
1926be168c0dSopenharmony_ci-    ldr x10, [sp, #16]
1927be168c0dSopenharmony_ci-    ldr x11, [sp, #24]
1928be168c0dSopenharmony_ci-    ldr x12, [sp, #32]
1929be168c0dSopenharmony_ci+    ldr x8, [sp, #32]
1930be168c0dSopenharmony_ci+    ldr x9, [sp, #40]
1931be168c0dSopenharmony_ci+    ldr x10, [sp, #48]
1932be168c0dSopenharmony_ci+    ldr x11, [sp, #56]
1933be168c0dSopenharmony_ci+    ldr x12, [sp, #64]
1934be168c0dSopenharmony_ci 
1935be168c0dSopenharmony_ci     LoopH:
1936be168c0dSopenharmony_ci         mov x15, x0
1937be168c0dSopenharmony_ci@@ -69,7 +69,6 @@ asm_function DeconvDwFp16Center
1938be168c0dSopenharmony_ci         subs x3, x3, #1
1939be168c0dSopenharmony_ci         bne LoopH
1940be168c0dSopenharmony_ci 
1941be168c0dSopenharmony_ci-    sub sp, sp, #32
1942be168c0dSopenharmony_ci     ldp x19, x20, [sp], #16
1943be168c0dSopenharmony_ci     ldp x21, x22, [sp], #16
1944be168c0dSopenharmony_ci     ret
1945be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatVecMulFp16.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatVecMulFp16.S
1946be168c0dSopenharmony_ciindex 80a55b75..4f5441a3 100644
1947be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatVecMulFp16.S
1948be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatVecMulFp16.S
1949be168c0dSopenharmony_ci@@ -30,8 +30,9 @@
1950be168c0dSopenharmony_ci 
1951be168c0dSopenharmony_ci asm_function MatVecMulFp16Neon64
1952be168c0dSopenharmony_ci   sub sp, sp, #128
1953be168c0dSopenharmony_ci-  st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
1954be168c0dSopenharmony_ci-  st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64
1955be168c0dSopenharmony_ci+  st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp]
1956be168c0dSopenharmony_ci+  add x9, sp, #64
1957be168c0dSopenharmony_ci+  st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x9]
1958be168c0dSopenharmony_ci 
1959be168c0dSopenharmony_ci   mov w14, #2      // sizeof(float16)
1960be168c0dSopenharmony_ci   mul w8, w14, w5  // rhs depthx1 block stride
1961be168c0dSopenharmony_ci@@ -184,7 +185,6 @@ Write1x1:
1962be168c0dSopenharmony_ci   b Loop
1963be168c0dSopenharmony_ci 
1964be168c0dSopenharmony_ci End:
1965be168c0dSopenharmony_ci-  sub sp, sp, #128
1966be168c0dSopenharmony_ci   ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
1967be168c0dSopenharmony_ci   ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64
1968be168c0dSopenharmony_ci   ret
1969be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/Matmul12X16Fp16.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/Matmul12X16Fp16.S
1970be168c0dSopenharmony_ciindex a0e28b74..9f804fd3 100644
1971be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/Matmul12X16Fp16.S
1972be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/Matmul12X16Fp16.S
1973be168c0dSopenharmony_ci@@ -36,13 +36,14 @@
1974be168c0dSopenharmony_ci 
1975be168c0dSopenharmony_ci asm_function MatMul12x16Fp16Opt
1976be168c0dSopenharmony_ci     sub sp, sp, #160
1977be168c0dSopenharmony_ci-    st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
1978be168c0dSopenharmony_ci-    st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64
1979be168c0dSopenharmony_ci-    stp x19, x20, [sp], #16
1980be168c0dSopenharmony_ci-    stp x21, x22, [sp], #16
1981be168c0dSopenharmony_ci+    st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp]
1982be168c0dSopenharmony_ci+    add x9, sp, #64
1983be168c0dSopenharmony_ci+    st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x9]
1984be168c0dSopenharmony_ci+    stp x19, x20, [sp, #128]
1985be168c0dSopenharmony_ci+    stp x21, x22, [sp, #144]
1986be168c0dSopenharmony_ci 
1987be168c0dSopenharmony_ci-    ldr x8, [sp]
1988be168c0dSopenharmony_ci-    ldr x9, [sp, #8]
1989be168c0dSopenharmony_ci+    ldr x8, [sp, #160]
1990be168c0dSopenharmony_ci+    ldr x9, [sp, #168]
1991be168c0dSopenharmony_ci 
1992be168c0dSopenharmony_ci .macro CLEAR_OUTPUT_V8_V9
1993be168c0dSopenharmony_ci     dup v8.4s, wzr
1994be168c0dSopenharmony_ci@@ -1694,7 +1695,6 @@ LoopColEnd:
1995be168c0dSopenharmony_ci         subs x6, x6, #12
1996be168c0dSopenharmony_ci         bgt LoopRowStart
1997be168c0dSopenharmony_ci 
1998be168c0dSopenharmony_ci-    sub sp, sp, #160
1999be168c0dSopenharmony_ci     ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
2000be168c0dSopenharmony_ci     ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64
2001be168c0dSopenharmony_ci     ldp x19, x20, [sp], #16
2002be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulBaseFp16Neon.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulBaseFp16Neon.S
2003be168c0dSopenharmony_ciindex 79fa12bc..31f1adbd 100644
2004be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulBaseFp16Neon.S
2005be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulBaseFp16Neon.S
2006be168c0dSopenharmony_ci@@ -34,13 +34,14 @@
2007be168c0dSopenharmony_ci 
2008be168c0dSopenharmony_ci asm_function MatmulBaseFp16Neon
2009be168c0dSopenharmony_ci     sub sp, sp, #160
2010be168c0dSopenharmony_ci-    st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
2011be168c0dSopenharmony_ci-    st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64
2012be168c0dSopenharmony_ci-    stp x19, x20, [sp], #16
2013be168c0dSopenharmony_ci-    stp x21, x22, [sp], #16
2014be168c0dSopenharmony_ci+    st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp]
2015be168c0dSopenharmony_ci+    add x9, sp, #64
2016be168c0dSopenharmony_ci+    st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x9]
2017be168c0dSopenharmony_ci+    stp x19, x20, [sp, #128]
2018be168c0dSopenharmony_ci+    stp x21, x22, [sp, #144]
2019be168c0dSopenharmony_ci 
2020be168c0dSopenharmony_ci-    ldr x8, [sp]
2021be168c0dSopenharmony_ci-    ldr x9, [sp, #8]  // act
2022be168c0dSopenharmony_ci+    ldr x8, [sp, #160]
2023be168c0dSopenharmony_ci+    ldr x9, [sp, #168]  // act
2024be168c0dSopenharmony_ci     add x8, x8, x8  // stride * sizeof(float16_t)
2025be168c0dSopenharmony_ci 
2026be168c0dSopenharmony_ci     add x16, x7, x7 // col * sizeof(float16_t)
2027be168c0dSopenharmony_ci@@ -951,7 +952,6 @@ LoopColEnd:
2028be168c0dSopenharmony_ci     add x0, x0, x15
2029be168c0dSopenharmony_ci     bgt LoopRowStart
2030be168c0dSopenharmony_ci 
2031be168c0dSopenharmony_ci-    sub sp, sp, #160
2032be168c0dSopenharmony_ci     ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
2033be168c0dSopenharmony_ci     ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64
2034be168c0dSopenharmony_ci     ldp x19, x20, [sp], #16
2035be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulFp16.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulFp16.S
2036be168c0dSopenharmony_ciindex 6bb93f99..1d6b69a6 100644
2037be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulFp16.S
2038be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulFp16.S
2039be168c0dSopenharmony_ci@@ -34,15 +34,16 @@
2040be168c0dSopenharmony_ci 
2041be168c0dSopenharmony_ci asm_function MatmulFp16Neon64
2042be168c0dSopenharmony_ci   sub sp, sp, #144
2043be168c0dSopenharmony_ci-  st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
2044be168c0dSopenharmony_ci-  st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64
2045be168c0dSopenharmony_ci-  stp x19, x20, [sp], #16
2046be168c0dSopenharmony_ci+  st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp]
2047be168c0dSopenharmony_ci+  add x9, sp, #64
2048be168c0dSopenharmony_ci+  st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x9]
2049be168c0dSopenharmony_ci+  stp x19, x20, [sp, #128]
2050be168c0dSopenharmony_ci 
2051be168c0dSopenharmony_ci   mov w18, #16 // sizeof(float16) * 8
2052be168c0dSopenharmony_ci   mul w15, w5, w18 // block stride of lhs/rhs: sizeof(float16) * 8 * depth
2053be168c0dSopenharmony_ci   mov x11, x3 // bias flag
2054be168c0dSopenharmony_ci   mov x19, #2
2055be168c0dSopenharmony_ci-  ldr x17, [sp]
2056be168c0dSopenharmony_ci+  ldr x17, [sp, #144]
2057be168c0dSopenharmony_ci   mul x17, x17, x19
2058be168c0dSopenharmony_ci 
2059be168c0dSopenharmony_ci L1:
2060be168c0dSopenharmony_ci@@ -308,7 +309,7 @@ Relu:
2061be168c0dSopenharmony_ci   fmax v31.8h, v31.8h, v14.8h
2062be168c0dSopenharmony_ci 
2063be168c0dSopenharmony_ci Write:
2064be168c0dSopenharmony_ci-  ldrb w13, [sp, #8]
2065be168c0dSopenharmony_ci+  ldrb w13, [sp, #152]
2066be168c0dSopenharmony_ci   cbz w13, WriteC8
2067be168c0dSopenharmony_ci   cmp w7, #1
2068be168c0dSopenharmony_ci   beq Write1
2069be168c0dSopenharmony_ci@@ -877,14 +878,13 @@ End2:
2070be168c0dSopenharmony_ci   subs w7, w7, #8 // rhs col - 8
2071be168c0dSopenharmony_ci   add x1, x1, x15 // rhs ptr + stride
2072be168c0dSopenharmony_ci   add x3, x3, #16 // bias ptr + stride
2073be168c0dSopenharmony_ci-  ldrb w13, [sp, #8]
2074be168c0dSopenharmony_ci+  ldrb w13, [sp, #152]
2075be168c0dSopenharmony_ci   cbz w13, NoDstStep
2076be168c0dSopenharmony_ci   add x2, x2, #16 // dst ptr + stride
2077be168c0dSopenharmony_ci NoDstStep:
2078be168c0dSopenharmony_ci   bgt L1
2079be168c0dSopenharmony_ci 
2080be168c0dSopenharmony_ci End1:
2081be168c0dSopenharmony_ci-  sub sp, sp, #144
2082be168c0dSopenharmony_ci   ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
2083be168c0dSopenharmony_ci   ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64
2084be168c0dSopenharmony_ci   ldp x19, x20, [sp], #16
2085be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulFp16Opt.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulFp16Opt.S
2086be168c0dSopenharmony_ciindex 4a111066..21348f80 100644
2087be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulFp16Opt.S
2088be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulFp16Opt.S
2089be168c0dSopenharmony_ci@@ -34,12 +34,12 @@
2090be168c0dSopenharmony_ci 
2091be168c0dSopenharmony_ci asm_function MatmulFp16Neon64Opt
2092be168c0dSopenharmony_ci     sub sp, sp, #96
2093be168c0dSopenharmony_ci-    st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
2094be168c0dSopenharmony_ci-    stp x19, x20, [sp], #16
2095be168c0dSopenharmony_ci-    stp x21, x22, [sp], #16
2096be168c0dSopenharmony_ci+    st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp]
2097be168c0dSopenharmony_ci+    stp x19, x20, [sp, #64]
2098be168c0dSopenharmony_ci+    stp x21, x22, [sp, #80]
2099be168c0dSopenharmony_ci 
2100be168c0dSopenharmony_ci-    ldr x8, [sp]
2101be168c0dSopenharmony_ci-    ldr x9, [sp, #8]
2102be168c0dSopenharmony_ci+    ldr x8, [sp, #96]
2103be168c0dSopenharmony_ci+    ldr x9, [sp, #104]
2104be168c0dSopenharmony_ci 
2105be168c0dSopenharmony_ci     mov x21, #32 // sizeof(float16_t) * 16
2106be168c0dSopenharmony_ci     mul x17, x5, x21 // block stride of lhs/rhs: sizeof(float16_t) * 16 * depth
2107be168c0dSopenharmony_ci@@ -1178,7 +1178,6 @@ LoopColEnd:
2108be168c0dSopenharmony_ci         subs x6, x6, #16
2109be168c0dSopenharmony_ci         bgt LoopRowStart
2110be168c0dSopenharmony_ci 
2111be168c0dSopenharmony_ci-    sub sp, sp, #96
2112be168c0dSopenharmony_ci     ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
2113be168c0dSopenharmony_ci     ldp x19, x20, [sp], #16
2114be168c0dSopenharmony_ci     ldp x21, x22, [sp], #16
2115be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulFp16OptV2.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulFp16OptV2.S
2116be168c0dSopenharmony_ciindex 2d901a3d..40b788c9 100644
2117be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulFp16OptV2.S
2118be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulFp16OptV2.S
2119be168c0dSopenharmony_ci@@ -34,15 +34,16 @@
2120be168c0dSopenharmony_ci 
2121be168c0dSopenharmony_ci asm_function MatmulFp16OptV2
2122be168c0dSopenharmony_ci     sub sp, sp, #192
2123be168c0dSopenharmony_ci-    st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
2124be168c0dSopenharmony_ci-    st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64
2125be168c0dSopenharmony_ci-    stp x19, x20, [sp], #16
2126be168c0dSopenharmony_ci-    stp x21, x22, [sp], #16
2127be168c0dSopenharmony_ci-    stp x23, x24, [sp], #16
2128be168c0dSopenharmony_ci-    stp x29, x30, [sp], #16
2129be168c0dSopenharmony_ci-
2130be168c0dSopenharmony_ci-    ldr x8, [sp]
2131be168c0dSopenharmony_ci-    ldr x9, [sp, #8]  // writeMode
2132be168c0dSopenharmony_ci+    st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp]
2133be168c0dSopenharmony_ci+    add x9, sp, #64
2134be168c0dSopenharmony_ci+    st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x9]
2135be168c0dSopenharmony_ci+    stp x19, x20, [sp, #128]
2136be168c0dSopenharmony_ci+    stp x21, x22, [sp, #144]
2137be168c0dSopenharmony_ci+    stp x23, x24, [sp, #160]
2138be168c0dSopenharmony_ci+    stp x29, x30, [sp, #176]
2139be168c0dSopenharmony_ci+
2140be168c0dSopenharmony_ci+    ldr x8, [sp, #192]
2141be168c0dSopenharmony_ci+    ldr x9, [sp, #200]  // writeMode
2142be168c0dSopenharmony_ci     lsl x8, x8, #1  // stride * sizeof(float16_t)
2143be168c0dSopenharmony_ci 
2144be168c0dSopenharmony_ci     lsl x15, x7, #1 // col * sizeof(float16_t)
2145be168c0dSopenharmony_ci@@ -2955,7 +2956,6 @@ Compute1x4Unit:
2146be168c0dSopenharmony_ci         ret
2147be168c0dSopenharmony_ci 
2148be168c0dSopenharmony_ci End:
2149be168c0dSopenharmony_ci-  sub sp, sp, #192
2150be168c0dSopenharmony_ci   ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
2151be168c0dSopenharmony_ci   ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
2152be168c0dSopenharmony_ci   ldp x19, x20, [sp], #16
2153be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulWinogradFp16.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulWinogradFp16.S
2154be168c0dSopenharmony_ciindex 9ee3c4d5..ca0542da 100644
2155be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulWinogradFp16.S
2156be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulWinogradFp16.S
2157be168c0dSopenharmony_ci@@ -27,9 +27,9 @@ asm_function MatrixMultiplyWinogradFp16
2158be168c0dSopenharmony_ci     // x19 ~ x29 should be also preserved
2159be168c0dSopenharmony_ci     // whereas our coding style do not permit such amount of parameters
2160be168c0dSopenharmony_ci     sub sp, sp, #48
2161be168c0dSopenharmony_ci-    st1 {v8.8h}, [sp], #16
2162be168c0dSopenharmony_ci-    stp x19, x20, [sp], #16
2163be168c0dSopenharmony_ci-    stp x21, x22, [sp], #16
2164be168c0dSopenharmony_ci+    st1 {v8.8h}, [sp]
2165be168c0dSopenharmony_ci+    stp x19, x20, [sp, #16]
2166be168c0dSopenharmony_ci+    stp x21, x22, [sp, #32]
2167be168c0dSopenharmony_ci 
2168be168c0dSopenharmony_ci     mov x8, #2
2169be168c0dSopenharmony_ci     mul x10, x5, x8    // n * 2
2170be168c0dSopenharmony_ci@@ -210,7 +210,6 @@ asm_function MatrixMultiplyWinogradFp16
2171be168c0dSopenharmony_ci             b LoopM
2172be168c0dSopenharmony_ci 
2173be168c0dSopenharmony_ci     EndLoopM:
2174be168c0dSopenharmony_ci-        sub sp, sp, #48
2175be168c0dSopenharmony_ci         ld1 {v8.8h}, [sp], #16
2176be168c0dSopenharmony_ci         ldp x19, x20, [sp], #16
2177be168c0dSopenharmony_ci         ldp x21, x22, [sp], #16
2178be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/TiledC4MatmulFp16.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/TiledC4MatmulFp16.S
2179be168c0dSopenharmony_ciindex d7570d18..5b616ae7 100644
2180be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/TiledC4MatmulFp16.S
2181be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/TiledC4MatmulFp16.S
2182be168c0dSopenharmony_ci@@ -22,8 +22,9 @@
2183be168c0dSopenharmony_ci asm_function TiledC4MatmulFp16
2184be168c0dSopenharmony_ci 
2185be168c0dSopenharmony_ci sub sp, sp, #128
2186be168c0dSopenharmony_ci-st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
2187be168c0dSopenharmony_ci-st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
2188be168c0dSopenharmony_ci+st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
2189be168c0dSopenharmony_ci+add x9, sp, #64
2190be168c0dSopenharmony_ci+st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
2191be168c0dSopenharmony_ci 
2192be168c0dSopenharmony_ci mov x7, #2 //sizeof(float)
2193be168c0dSopenharmony_ci mul x3, x3, x7
2194be168c0dSopenharmony_ci@@ -265,7 +266,6 @@ LoopOcHalf:
2195be168c0dSopenharmony_ci     st1 {v20.4h, v21.4h, v22.4h, v23.4h}, [x0], #32
2196be168c0dSopenharmony_ci 
2197be168c0dSopenharmony_ci LoopOcEnd:
2198be168c0dSopenharmony_ci-    sub sp, sp, #128
2199be168c0dSopenharmony_ci     ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
2200be168c0dSopenharmony_ci     ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
2201be168c0dSopenharmony_ci     ret
2202be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/VecMatmulFp16.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/VecMatmulFp16.S
2203be168c0dSopenharmony_ciindex d11dd472..0df891d3 100644
2204be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/VecMatmulFp16.S
2205be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/VecMatmulFp16.S
2206be168c0dSopenharmony_ci@@ -31,8 +31,9 @@
2207be168c0dSopenharmony_ci 
2208be168c0dSopenharmony_ci asm_function VecMatmulFp16Neon64_2
2209be168c0dSopenharmony_ci   sub sp, sp, #128
2210be168c0dSopenharmony_ci-  st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
2211be168c0dSopenharmony_ci-  st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64
2212be168c0dSopenharmony_ci+  st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp]
2213be168c0dSopenharmony_ci+  add x9, sp, #64
2214be168c0dSopenharmony_ci+  st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x9]
2215be168c0dSopenharmony_ci 
2216be168c0dSopenharmony_ci LoopCol:
2217be168c0dSopenharmony_ci   mov x15, x0   // reload a ptr
2218be168c0dSopenharmony_ci@@ -174,7 +175,6 @@ Write7:
2219be168c0dSopenharmony_ci   b End
2220be168c0dSopenharmony_ci 
2221be168c0dSopenharmony_ci End:
2222be168c0dSopenharmony_ci-  sub sp, sp, #128
2223be168c0dSopenharmony_ci   ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
2224be168c0dSopenharmony_ci   ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64
2225be168c0dSopenharmony_ci   ret
2226be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/WinogradTransLeftFp16.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/WinogradTransLeftFp16.S
2227be168c0dSopenharmony_ciindex 1970c16a..c9b4104e 100644
2228be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/WinogradTransLeftFp16.S
2229be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/WinogradTransLeftFp16.S
2230be168c0dSopenharmony_ci@@ -22,7 +22,7 @@
2231be168c0dSopenharmony_ci asm_function WinogradTransLeftFp16
2232be168c0dSopenharmony_ci 
2233be168c0dSopenharmony_ci sub sp, sp, #16
2234be168c0dSopenharmony_ci-stp x19, x20, [sp], #16
2235be168c0dSopenharmony_ci+stp x19, x20, [sp]
2236be168c0dSopenharmony_ci 
2237be168c0dSopenharmony_ci mov x8, #8 // 4 * sizeof(float16)
2238be168c0dSopenharmony_ci mul x8, x6, x8
2239be168c0dSopenharmony_ci@@ -144,7 +144,6 @@ LoopH:
2240be168c0dSopenharmony_ci     subs x4, x4, #1
2241be168c0dSopenharmony_ci     bne LoopH
2242be168c0dSopenharmony_ci 
2243be168c0dSopenharmony_ci-    sub sp, sp, #16
2244be168c0dSopenharmony_ci     ldp x19, x20, [sp], #16
2245be168c0dSopenharmony_ci     ret
2246be168c0dSopenharmony_ci 
2247be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/WinogradTransRightFp16.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/WinogradTransRightFp16.S
2248be168c0dSopenharmony_ciindex c575f504..46c3cd84 100644
2249be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/WinogradTransRightFp16.S
2250be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/WinogradTransRightFp16.S
2251be168c0dSopenharmony_ci@@ -22,7 +22,7 @@
2252be168c0dSopenharmony_ci asm_function WinogradTransRightFp16
2253be168c0dSopenharmony_ci 
2254be168c0dSopenharmony_ci sub sp, sp, #16
2255be168c0dSopenharmony_ci-stp x19, x20, [sp], #16
2256be168c0dSopenharmony_ci+stp x19, x20, [sp]
2257be168c0dSopenharmony_ci 
2258be168c0dSopenharmony_ci mov x8, #8 // 4 * sizeof(float16)
2259be168c0dSopenharmony_ci mul x8, x6, x8
2260be168c0dSopenharmony_ci@@ -147,7 +147,6 @@ LoopH:
2261be168c0dSopenharmony_ci     subs x4, x4, #1
2262be168c0dSopenharmony_ci     bne LoopH
2263be168c0dSopenharmony_ci 
2264be168c0dSopenharmony_ci-    sub sp, sp, #16
2265be168c0dSopenharmony_ci     ldp x19, x20, [sp], #16
2266be168c0dSopenharmony_ci 
2267be168c0dSopenharmony_ci     ret
2268be168c0dSopenharmony_ci-- 
2269be168c0dSopenharmony_ci2.17.1
2270be168c0dSopenharmony_ci
2271