1From c580b97cbfea388ac393f617c4d960021bf11322 Mon Sep 17 00:00:00 2001 2From: chengfeng27 <chengfeng27@huawei.com> 3Date: Mon, 12 Aug 2024 11:42:12 +0800 4Subject: [PATCH] fix arm64/fp16 assemble can not protect stack in mutil-thread 5 switch case 6 7--- 8 .../kernel/nnacl/assembly/arm64/AdderFp32.S | 10 ++--- 9 .../nnacl/assembly/arm64/BigMatmulFp32Opt.S | 22 +++++----- 10 .../assembly/arm64/ConvDw3x3Fp32Stride1.S | 12 ++--- 11 .../assembly/arm64/ConvDw3x3Fp32Stride2.S | 12 ++--- 12 .../nnacl/assembly/arm64/ConvDw3x3Int8.S | 34 +++++++------- 13 .../assembly/arm64/ConvDw3x3Int8Corner.S | 19 ++++---- 14 .../assembly/arm64/ConvDw3x3Int8Horizontal.S | 25 +++++------ 15 .../assembly/arm64/ConvDw3x3Int8Stride2.S | 34 +++++++------- 16 .../assembly/arm64/ConvDw3x3Int8Vertical.S | 19 ++++---- 17 .../nnacl/assembly/arm64/ConvDw3x3Line.S | 6 +-- 18 .../nnacl/assembly/arm64/ConvDwFp32Center.S | 30 ++++++------- 19 .../assembly/arm64/ConvDwFp32Indirect3x3.S | 7 ++- 20 .../nnacl/assembly/arm64/ConvDwInt8Center.S | 44 +++++++++---------- 21 .../nnacl/assembly/arm64/ConvFp32Center.S | 42 +++++++++--------- 22 .../nnacl/assembly/arm64/ConvSW1x16Kernel.S | 23 +++++----- 23 .../nnacl/assembly/arm64/ConvSW1x8Kernel.S | 21 +++++---- 24 .../nnacl/assembly/arm64/ConvSW2x16Kernel.S | 21 +++++---- 25 .../nnacl/assembly/arm64/ConvSW2x8Kernel.S | 21 +++++---- 26 .../nnacl/assembly/arm64/ConvSW3x16Kernel.S | 23 +++++----- 27 .../nnacl/assembly/arm64/ConvSW3x8Kernel.S | 21 +++++---- 28 .../nnacl/assembly/arm64/ConvSW4x16Kernel.S | 28 ++++++------ 29 .../nnacl/assembly/arm64/ConvSW4x8Kernel.S | 28 ++++++------ 30 .../nnacl/assembly/arm64/ConvSW5x16Kernel.S | 28 ++++++------ 31 .../nnacl/assembly/arm64/ConvSW5x8Kernel.S | 28 ++++++------ 32 .../nnacl/assembly/arm64/DeconvDwFp32Center.S | 15 +++---- 33 .../nnacl/assembly/arm64/DeconvDwInt8Center.S | 15 +++---- 34 .../nnacl/assembly/arm64/MatVecMulFp32.S | 24 +++++----- 35 .../nnacl/assembly/arm64/MatVecMulPackFp32.S | 15 +++---- 36 .../kernel/nnacl/assembly/arm64/MatmulFp32.S | 14 +++--- 37 .../nnacl/assembly/arm64/MatmulFp32Opt.S | 16 +++---- 38 .../nnacl/assembly/arm64/MatmulFp32OptRow12.S | 14 +++--- 39 .../nnacl/assembly/arm64/MatmulFp32OptRow4.S | 16 +++---- 40 .../nnacl/assembly/arm64/MatmulFp32OptRow8.S | 14 +++--- 41 .../kernel/nnacl/assembly/arm64/MatmulInt8.S | 38 ++++++++-------- 42 .../nnacl/assembly/arm64/MatmulInt8Opt.S | 44 +++++++++---------- 43 .../nnacl/assembly/arm64/MatmulR4Int8.S | 8 ++-- 44 .../nnacl/assembly/arm64/MatmulWinogradFp32.S | 7 ++- 45 .../nnacl/assembly/arm64/PostFuncBiasReluC8.S | 6 +-- 46 .../assembly/arm64/PostFuncInt8C4Neon64.S | 15 +++---- 47 .../kernel/nnacl/assembly/arm64/SPMM8x8Fp32.S | 8 ++-- 48 .../nnacl/assembly/arm64/TiledC4MatmulFp32.S | 6 +-- 49 .../nnacl/assembly/arm64/WinogradTransLeft.S | 3 +- 50 .../nnacl/assembly/arm64/WinogradTransRight.S | 3 +- 51 .../nnacl/assembly/fp16/ConvDwFp16Center.S | 30 ++++++------- 52 .../nnacl/assembly/fp16/DeconvDwFp16Center.S | 15 +++---- 53 .../nnacl/assembly/fp16/MatVecMulFp16.S | 6 +-- 54 .../nnacl/assembly/fp16/Matmul12X16Fp16.S | 14 +++--- 55 .../nnacl/assembly/fp16/MatmulBaseFp16Neon.S | 14 +++--- 56 .../kernel/nnacl/assembly/fp16/MatmulFp16.S | 14 +++--- 57 .../nnacl/assembly/fp16/MatmulFp16Opt.S | 11 +++-- 58 .../nnacl/assembly/fp16/MatmulFp16OptV2.S | 20 ++++----- 59 .../nnacl/assembly/fp16/MatmulWinogradFp16.S | 7 ++- 60 .../nnacl/assembly/fp16/TiledC4MatmulFp16.S | 6 +-- 61 .../nnacl/assembly/fp16/VecMatmulFp16.S | 6 +-- 62 .../assembly/fp16/WinogradTransLeftFp16.S | 3 +- 63 .../assembly/fp16/WinogradTransRightFp16.S | 3 +- 64 56 files changed, 483 insertions(+), 505 deletions(-) 65 66diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/AdderFp32.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/AdderFp32.S 67index 66136f42..9123d88c 100644 68--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/AdderFp32.S 69+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/AdderFp32.S 70@@ -34,11 +34,12 @@ 71 72 asm_function AdderFloatNeon64 73 sub sp, sp, #144 74- st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 75- st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 76- stp x19, x20, [sp], #16 77+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp] 78+ add x9, sp, #64 79+ st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9] 80+ stp x19, x20, [sp, #128] 81 82- ldr x8, [sp] 83+ ldr x8, [sp, #144] 84 85 mov x20, #48 // sizeof(float) * 12 86 mul x17, x5, x20 // block stride of lhs/rhs: sizeof(float) * 12 * depth 87@@ -614,7 +615,6 @@ LoopColEnd: 88 subs x6, x6, #12 89 bgt LoopRowStart 90 91- sub sp, sp, #144 92 ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 93 ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 94 ldp x19, x20, [sp], #16 95diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/BigMatmulFp32Opt.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/BigMatmulFp32Opt.S 96index 498038ff..03898585 100644 97--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/BigMatmulFp32Opt.S 98+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/BigMatmulFp32Opt.S 99@@ -33,16 +33,17 @@ 100 101 asm_function BigMatmulFloatNeon64Opt 102 sub sp, sp, #224 103- st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 104- st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 105- stp x19, x20, [sp], #16 106- stp x21, x22, [sp], #16 107- stp x23, x24, [sp], #16 108- stp x25, x26, [sp], #16 109- stp x27, x28, [sp], #16 110- stp x29, x30, [sp], #16 111- 112- ldr x8, [sp] 113+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp] 114+ add x9, sp, #64 115+ st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9] 116+ stp x19, x20, [sp, #128] 117+ stp x21, x22, [sp, #144] 118+ stp x23, x24, [sp, #160] 119+ stp x25, x26, [sp, #176] 120+ stp x27, x28, [sp, #192] 121+ stp x29, x30, [sp, #208] 122+ 123+ ldr x8, [sp, #224] 124 mov x20, #1 125 mov x22, #32 126 mov x23, #48 127@@ -2515,7 +2516,6 @@ Compute4x4Unit: 128 ret 129 130 End: 131- sub sp, sp, #224 132 ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 133 ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 134 ldp x19, x20, [sp], #16 135diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Fp32Stride1.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Fp32Stride1.S 136index f04d9082..b96efd64 100644 137--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Fp32Stride1.S 138+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Fp32Stride1.S 139@@ -36,12 +36,13 @@ 140 141 asm_function ConvDw3x3Stride1 142 sub sp, sp, #128 143- st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 144- st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 145+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp] 146+ add x9, sp, #64 147+ st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9] 148 149- ldr w8, [sp] 150- ldr w9, [sp, #8] 151- ldr w10, [sp, #16] 152+ ldr w8, [sp, #128] 153+ ldr w9, [sp, #136] 154+ ldr w10, [sp, #144] 155 156 mov w11, #4 157 mul w15, w4, w11 // col_size * 4 158@@ -203,7 +204,6 @@ WIDTH1_LEFT: 159 st1 {v21.4s}, [x0] 160 161 End: 162- sub sp, sp, #128 163 ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 164 ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 165 ret 166diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Fp32Stride2.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Fp32Stride2.S 167index 0dd075dd..7632d48e 100644 168--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Fp32Stride2.S 169+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Fp32Stride2.S 170@@ -36,12 +36,13 @@ 171 172 asm_function ConvDw3x3Stride2 173 sub sp, sp, #128 174- st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 175- st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 176+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp] 177+ add x9, sp, #64 178+ st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9] 179 180- ldr w8, [sp] 181- ldr w9, [sp, #8] 182- ldr w10, [sp, #16] 183+ ldr w8, [sp, #128] 184+ ldr w9, [sp, #136] 185+ ldr w10, [sp, #144] 186 187 mov w11, #4 188 mul w15, w4, w11 // col_size * 4 189@@ -205,7 +206,6 @@ WIDTH1_LEFT: 190 st1 {v24.4s}, [x0] 191 192 End: 193- sub sp, sp, #128 194 ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 195 ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 196 ret 197diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8.S 198index bfb9b8f6..5187d368 100644 199--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8.S 200+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8.S 201@@ -44,22 +44,23 @@ 202 203 asm_function ConvDw3x3Int8Neon64 204 sub sp, sp, #192 205- st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 206- st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 207- stp x19, x20, [sp], #16 208- stp x21, x22, [sp], #16 209- stp x23, x24, [sp], #16 210- stp x25, x26, [sp], #16 211- 212- ldr x8, [sp] 213- ldr x9, [sp, #8] 214- ldr x10, [sp, #16] 215- ldr x11, [sp, #24] 216- ldr x12, [sp, #32] 217- ldr x13, [sp, #40] 218- ldr x14, [sp, #48] 219- ldr x15, [sp, #56] 220- ldr x23, [sp, #64] // per_channel 221+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp] 222+ add x9, sp, #64 223+ st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9] 224+ stp x19, x20, [sp, #128] 225+ stp x21, x22, [sp, #144] 226+ stp x23, x24, [sp, #160] 227+ stp x25, x26, [sp, #176] 228+ 229+ ldr x8, [sp, #192] 230+ ldr x9, [sp, #200] 231+ ldr x10, [sp, #208] 232+ ldr x11, [sp, #216] 233+ ldr x12, [sp, #224] 234+ ldr x13, [sp, #232] 235+ ldr x14, [sp, #240] 236+ ldr x15, [sp, #248] 237+ ldr x23, [sp, #256] // per_channel 238 239 add x19, x3, #16 240 add w20, w6, w6 // channel * 2 241@@ -488,7 +489,6 @@ OUTZP3: 242 st1 {v21.8b}, [x0], x6 243 244 End: 245- sub sp, sp, #192 246 ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 247 ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 248 ldp x19, x20, [sp], #16 249diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Corner.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Corner.S 250index b07ac01b..416e1a3a 100644 251--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Corner.S 252+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Corner.S 253@@ -32,20 +32,20 @@ asm_function ConvDw3x3Int8Corner 254 // x19 ~ x29 should be also preserved 255 // whereas our coding style do not permit such amount of parameters 256 sub sp, sp, #32 257- stp x19, x20, [sp], #16 258- stp x21, x22, [sp], #16 259+ stp x19, x20, [sp] 260+ stp x21, x22, [sp, #16] 261 262 dup v25.8b, w7 // in_zp 263- ldr x8, [sp] 264+ ldr x8, [sp, #32] 265 dup v26.4s, w8 // out_zp 266- ldr x9, [sp, #8] // out_multiplier 267- ldr x10, [sp, #16] // left_shift 268- ldr x11, [sp, #24] // right_shift 269- ldr x12, [sp, #32] 270+ ldr x9, [sp, #40] // out_multiplier 271+ ldr x10, [sp, #48] // left_shift 272+ ldr x11, [sp, #56] // right_shift 273+ ldr x12, [sp, #64] 274 dup v30.4s, w12 // acc_min 275- ldr x13, [sp, #40] 276+ ldr x13, [sp, #72] 277 dup v31.4s, w13 // acc_max 278- ldr x14, [sp, #48] // per_channel 279+ ldr x14, [sp, #80] // per_channel 280 cbnz x14, PerChannelDump 281 PerLayerDump: 282 ld1r {v27.4s}, [x9] 283@@ -216,7 +216,6 @@ asm_function ConvDw3x3Int8Corner 284 st1 {v23.s}[0], [x0], #4 285 st1 {v24.s}[0], [x0], #4 286 287- sub sp, sp, #32 288 ldp x19, x20, [sp], #16 289 ldp x21, x22, [sp], #16 290 ret 291diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Horizontal.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Horizontal.S 292index 92eeffea..379154e6 100644 293--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Horizontal.S 294+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Horizontal.S 295@@ -32,21 +32,21 @@ asm_function ConvDw3x3Int8Horizontal 296 // x19 ~ x29 should be also preserved 297 // whereas our coding style do not permit such amount of parameters 298 sub sp, sp, #48 299- stp x19, x20, [sp], #16 300- stp x21, x22, [sp], #16 301- stp x23, x24, [sp], #16 302+ stp x19, x20, [sp] 303+ stp x21, x22, [sp, #16] 304+ stp x23, x24, [sp, #32] 305 306 dup v25.8b, w7 // in_zp 307- ldr x8, [sp] 308+ ldr x8, [sp, #48] 309 dup v26.4s, w8 // out_zp 310- ldr x9, [sp, #8] // out_multiplier 311- ldr x10, [sp, #16] // left_shift 312- ldr x11, [sp, #24] // right_shift 313- ldr x12, [sp, #32] 314+ ldr x9, [sp, #56] // out_multiplier 315+ ldr x10, [sp, #64] // left_shift 316+ ldr x11, [sp, #72] // right_shift 317+ ldr x12, [sp, #80] 318 dup v30.4s, w12 // acc_min 319- ldr x13, [sp, #40] 320+ ldr x13, [sp, #88] 321 dup v31.4s, w13 // acc_max 322- ldr x14, [sp, #48] // per_channel 323+ ldr x14, [sp, #96] // per_channel 324 cbnz x14, PerChannelDump 325 PerLayerDump: 326 ld1r {v27.4s}, [x9] 327@@ -58,9 +58,9 @@ asm_function ConvDw3x3Int8Horizontal 328 ld1 {v28.4s}, [x10], #16 329 ld1 {v29.4s}, [x11], #16 330 ContinueFunc: 331- ldr x12, [sp, #32] 332+ ldr x12, [sp, #80] 333 dup v30.4s, w12 // acc_min 334- ldr x13, [sp, #40] 335+ ldr x13, [sp, #88] 336 dup v31.4s, w13 // acc_max 337 338 mov x12, #2 339@@ -248,7 +248,6 @@ asm_function ConvDw3x3Int8Horizontal 340 341 st1 {v23.s}[0], [x0], #4 342 st1 {v24.s}[0], [x0], #4 343- sub sp, sp, #48 344 ldp x19, x20, [sp], #16 345 ldp x21, x22, [sp], #16 346 ldp x23, x24, [sp], #16 347diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Stride2.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Stride2.S 348index cc1b3e9b..8643a536 100644 349--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Stride2.S 350+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Stride2.S 351@@ -44,22 +44,23 @@ 352 353 asm_function ConvDw3x3Int8Stride2 354 sub sp, sp, #192 355- st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 356- st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 357- stp x19, x20, [sp], #16 358- stp x21, x22, [sp], #16 359- stp x23, x24, [sp], #16 360- stp x25, x26, [sp], #16 361- 362- ldr x8, [sp] 363- ldr x9, [sp, #8] 364- ldr x10, [sp, #16] 365- ldr x11, [sp, #24] 366- ldr x12, [sp, #32] 367- ldr x13, [sp, #40] 368- ldr x14, [sp, #48] 369- ldr x15, [sp, #56] 370- ldr x23, [sp, #64] // per_channel 371+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp] 372+ add x9, sp, #64 373+ st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9] 374+ stp x19, x20, [sp, #128] 375+ stp x21, x22, [sp, #144] 376+ stp x23, x24, [sp, #160] 377+ stp x25, x26, [sp, #176] 378+ 379+ ldr x8, [sp, #192] 380+ ldr x9, [sp, #200] 381+ ldr x10, [sp, #208] 382+ ldr x11, [sp, #216] 383+ ldr x12, [sp, #224] 384+ ldr x13, [sp, #232] 385+ ldr x14, [sp, #240] 386+ ldr x15, [sp, #248] 387+ ldr x23, [sp, #256] // per_channel 388 389 add x19, x3, #16 390 add w20, w6, w6 // channel * 2 391@@ -463,7 +464,6 @@ OUTZP3: 392 st1 {v24.8b}, [x0], x6 393 394 End: 395- sub sp, sp, #192 396 ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 397 ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 398 ldp x19, x20, [sp], #16 399diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Vertical.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Vertical.S 400index 67151534..706bc9fe 100644 401--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Vertical.S 402+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Vertical.S 403@@ -32,20 +32,20 @@ asm_function ConvDw3x3Int8Vertical 404 // x19 ~ x29 should be also preserved 405 // whereas our coding style do not permit such amount of parameters 406 sub sp, sp, #32 407- stp x19, x20, [sp], #16 408- stp x21, x22, [sp], #16 409+ stp x19, x20, [sp] 410+ stp x21, x22, [sp, #16] 411 412 dup v25.8b, w7 // in_zp 413- ldr x8, [sp] 414+ ldr x8, [sp, #32] 415 dup v26.4s, w8 // out_zp 416- ldr x9, [sp, #8] // out_multiplier 417- ldr x10, [sp, #16] // left_shift 418- ldr x11, [sp, #24] // right_shift 419- ldr x12, [sp, #32] 420+ ldr x9, [sp, #40] // out_multiplier 421+ ldr x10, [sp, #48] // left_shift 422+ ldr x11, [sp, #56] // right_shift 423+ ldr x12, [sp, #64] 424 dup v30.4s, w12 // acc_min 425- ldr x13, [sp, #40] 426+ ldr x13, [sp, #72] 427 dup v31.4s, w13 // acc_max 428- ldr x14, [sp, #48] // per_channel 429+ ldr x14, [sp, #80] // per_channel 430 cbnz x14, PerChannelDump 431 PerLayerDump: 432 ld1r {v27.4s}, [x9] 433@@ -239,7 +239,6 @@ asm_function ConvDw3x3Int8Vertical 434 435 st1 {v23.s}[0], [x0], #4 436 st1 {v24.s}[0], [x0], #4 437- sub sp, sp, #32 438 ldp x19, x20, [sp], #16 439 ldp x21, x22, [sp], #16 440 ret 441diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Line.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Line.S 442index 6157848e..f939ec62 100644 443--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Line.S 444+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Line.S 445@@ -29,8 +29,9 @@ asm_function ConvDw3x3Line 446 // x19 ~ x29 should be also preserved 447 // whereas our coding style do not permit such amount of parameters 448 sub sp, sp, #128 449- st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 450- st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 451+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp] 452+ add x9, sp, #64 453+ st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9] 454 455 ldr x8, [x1] 456 ldr x9, [x1, #8] 457@@ -196,7 +197,6 @@ asm_function ConvDw3x3Line 458 add x0, x0, #16 459 bgt LoopC4 460 461- sub sp, sp, #128 462 ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 463 ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 464 ret 465diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDwFp32Center.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDwFp32Center.S 466index e9ddd65a..6f30c3ac 100644 467--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDwFp32Center.S 468+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDwFp32Center.S 469@@ -31,21 +31,22 @@ asm_function ConvDwFp32Center 470 // x19 ~ x29 should be also preserved 471 // whereas our coding style do not permit such amount of parameters 472 sub sp, sp, #192 473- st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 474- st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 475- stp x19, x20, [sp], #16 476- stp x21, x22, [sp], #16 477- stp x23, x24, [sp], #16 478- stp x25, x26, [sp], #16 479+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp] 480+ add x9, sp, #64 481+ st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9] 482+ stp x19, x20, [sp, #128] 483+ stp x21, x22, [sp, #144] 484+ stp x23, x24, [sp, #160] 485+ stp x25, x26, [sp, #176] 486 487- ldr x8, [sp] 488- ldr x9, [sp, #8] 489- ldr x10, [sp, #16] 490- ldr x11, [sp, #24] 491- ldr x12, [sp, #32] 492- ldr x13, [sp, #40] 493- ldr x14, [sp, #48] 494- ldr x15, [sp, #56] 495+ ldr x8, [sp, #192] 496+ ldr x9, [sp, #200] 497+ ldr x10, [sp, #208] 498+ ldr x11, [sp, #216] 499+ ldr x12, [sp, #224] 500+ ldr x13, [sp, #232] 501+ ldr x14, [sp, #240] 502+ ldr x15, [sp, #248] 503 504 ld1 {v24.4s}, [x3] 505 movi v26.4s, #6 506@@ -302,7 +303,6 @@ asm_function ConvDwFp32Center 507 subs x4, x4, #1 508 bne LoopH 509 510- sub sp, sp, #192 511 ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 512 ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 513 ldp x19, x20, [sp], #16 514diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDwFp32Indirect3x3.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDwFp32Indirect3x3.S 515index 34cc9037..ca93dc7d 100644 516--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDwFp32Indirect3x3.S 517+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDwFp32Indirect3x3.S 518@@ -25,14 +25,14 @@ 519 520 asm_function ConvDwFp32Indirect3x3 521 sub sp, sp, #32 522- stp x19, x20, [sp], #16 523- stp x21, x22, [sp], #16 524+ stp x19, x20, [sp] 525+ stp x21, x22, [sp, #16] 526 527 movi v31.4s, #6 528 scvtf v31.4s, v31.4s 529 dup v30.4s, wzr 530 531- ldr x8, [sp] 532+ ldr x8, [sp, #32] 533 cmp x5, #0 534 beq End 535 536@@ -153,7 +153,6 @@ asm_function ConvDwFp32Indirect3x3 537 cmp x5, #0 538 bgt LoopPixel 539 End: 540- sub sp, sp, #32 541 ldp x19, x20, [sp], #16 542 ldp x21, x22, [sp], #16 543 ret 544diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDwInt8Center.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDwInt8Center.S 545index 7ed94e6b..328250f3 100644 546--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDwInt8Center.S 547+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDwInt8Center.S 548@@ -34,44 +34,45 @@ asm_function ConvDwInt8Center 549 // x19 ~ x29 should be also preserved 550 // whereas our coding style do not permit such amount of parameters 551 sub sp, sp, #192 552- st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 553- st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 554- stp x19, x20, [sp], #16 555- stp x21, x22, [sp], #16 556- stp x23, x24, [sp], #16 557- stp x25, x26, [sp], #16 558- 559- ldr x8, [sp] 560- ldr x9, [sp, #8] 561- ldr x10, [sp, #16] 562- ldr x11, [sp, #24] 563- ldr x12, [sp, #32] 564- ldr x13, [sp, #40] 565- 566- ldr x14, [sp, #48] // input_zp 567+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp] 568+ add x9, sp, #64 569+ st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9] 570+ stp x19, x20, [sp, #128] 571+ stp x21, x22, [sp, #144] 572+ stp x23, x24, [sp, #160] 573+ stp x25, x26, [sp, #176] 574+ 575+ ldr x8, [sp, #192] 576+ ldr x9, [sp, #200] 577+ ldr x10, [sp, #208] 578+ ldr x11, [sp, #216] 579+ ldr x12, [sp, #224] 580+ ldr x13, [sp, #232] 581+ 582+ ldr x14, [sp, #240] // input_zp 583 ld1 {v19.8b}, [x14], #8 584 585- ldr x15, [sp, #56] // output_zp 586+ ldr x15, [sp, #248] // output_zp 587 ld1 {v20.4s}, [x15], #16 588 ld1 {v21.4s}, [x15], #16 589 590- ldr x16, [sp, #64] // out_multiplier 591+ ldr x16, [sp, #256] // out_multiplier 592 ld1 {v22.4s}, [x16], #16 593 ld1 {v23.4s}, [x16], #16 594 595- ldr x17, [sp, #72] // left_shift 596+ ldr x17, [sp, #264] // left_shift 597 ld1 {v24.4s}, [x17], #16 598 ld1 {v25.4s}, [x17], #16 599 600- ldr x25, [sp, #80] // right shift 601+ ldr x25, [sp, #272] // right shift 602 ld1 {v26.4s}, [x25], #16 603 ld1 {v27.4s}, [x25], #16 604 605- ldr x19, [sp, #88] // acc_min 606+ ldr x19, [sp, #280] // acc_min 607 ld1 {v28.4s}, [x19], #16 608 ld1 {v29.4s}, [x19], #16 609 610- ldr x20, [sp, #96] // acc_max 611+ ldr x20, [sp, #288] // acc_max 612 ld1 {v30.4s}, [x20], #16 613 ld1 {v31.4s}, [x20], #16 614 615@@ -283,7 +284,6 @@ asm_function ConvDwInt8Center 616 subs x4, x4, #1 617 bne LoopH 618 619- sub sp, sp, #192 620 ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 621 ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 622 ldp x19, x20, [sp], #16 623diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvFp32Center.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvFp32Center.S 624index 2cc456f6..0a9d3265 100644 625--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvFp32Center.S 626+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvFp32Center.S 627@@ -31,21 +31,22 @@ asm_function ConvSwFp32Center 628 // x19 ~ x29 should be also preserved 629 // whereas our coding style do not permit such amount of parameters 630 sub sp, sp, #208 631- st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 632- st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 633- stp x19, x20, [sp], #16 634- stp x21, x22, [sp], #16 635- stp x23, x24, [sp], #16 636- stp x25, x26, [sp], #16 637- stp x27, x28, [sp], #16 638+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp] 639+ add x9, sp, #64 640+ st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9] 641+ stp x19, x20, [sp, #128] 642+ stp x21, x22, [sp, #144] 643+ stp x23, x24, [sp, #160] 644+ stp x25, x26, [sp, #176] 645+ stp x27, x28, [sp, #192] 646 647- ldr x8, [sp] 648- ldr x9, [sp, #8] 649- ldr x10, [sp, #16] 650- ldr x11, [sp, #24] 651- ldr x12, [sp, #32] 652- ldr x13, [sp, #40] 653- ldr x14, [sp, #48] 654+ ldr x8, [sp, #208] 655+ ldr x9, [sp, #216] 656+ ldr x10, [sp, #224] 657+ ldr x11, [sp, #232] 658+ ldr x12, [sp, #240] 659+ ldr x13, [sp, #248] 660+ ldr x14, [sp, #256] 661 mul x15, x6, x7 662 mul x15, x10, x15 663 mov x16, #16 664@@ -198,9 +199,9 @@ asm_function ConvSwFp32Center 665 add x20, x20, x13 666 subs x22, x22, #1 667 bne LoopKh16 668- ldr x16, [sp, #64] 669+ ldr x16, [sp, #272] 670 cbnz x16, Relu616 671- ldr x26, [sp, #56] 672+ ldr x26, [sp, #264] 673 cbnz x26, Relu16 674 b Write16 675 Relu616: 676@@ -347,9 +348,9 @@ asm_function ConvSwFp32Center 677 add x20, x20, x13 678 subs x22, x22, #1 679 bne LoopKh8 680- ldr x16, [sp, #64] 681+ ldr x16, [sp, #272] 682 cbnz x16, Relu68 683- ldr x26, [sp, #56] 684+ ldr x26, [sp, #264] 685 cbnz x26, Relu8 686 b Write8 687 Relu68: 688@@ -426,9 +427,9 @@ asm_function ConvSwFp32Center 689 add x20, x20, x13 690 subs x22, x22, #1 691 bne LoopKh 692- ldr x16, [sp, #64] 693+ ldr x16, [sp, #272] 694 cbnz x16, Relu6 695- ldr x26, [sp, #56] 696+ ldr x26, [sp, #264] 697 cbnz x26, Relu 698 b Write 699 Relu6: 700@@ -446,7 +447,6 @@ asm_function ConvSwFp32Center 701 subs x4, x4, #1 702 bne LoopH 703 704- sub sp, sp, #208 705 ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 706 ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 707 ldp x19, x20, [sp], #16 708diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW1x16Kernel.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW1x16Kernel.S 709index 2267e776..3b436c17 100644 710--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW1x16Kernel.S 711+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW1x16Kernel.S 712@@ -30,17 +30,17 @@ asm_function SWConv1x16Kernel 713 // x19 ~ x29 should be also preserved 714 // whereas our coding style do not permit such amount of parameters 715 sub sp, sp, #64 716- stp x19, x20, [sp], #16 717- stp x21, x22, [sp], #16 718- stp x23, x24, [sp], #16 719- stp x25, x26, [sp], #16 720- 721- ldr x10, [sp] 722- ldr x11, [sp, #8] 723- ldr x12, [sp, #16] 724- ldr x13, [sp, #24] 725- ldr x14, [sp, #32] 726- ldr x15, [sp, #40] 727+ stp x19, x20, [sp] 728+ stp x21, x22, [sp, #16] 729+ stp x23, x24, [sp, #32] 730+ stp x25, x26, [sp, #48] 731+ 732+ ldr x10, [sp, #64] 733+ ldr x11, [sp, #72] 734+ ldr x12, [sp, #80] 735+ ldr x13, [sp, #88] 736+ ldr x14, [sp, #96] 737+ ldr x15, [sp, #104] 738 lsl x7, x7, #2 739 lsl x11, x11, #2 740 lsl x12, x12, #2 741@@ -413,7 +413,6 @@ asm_function SWConv1x16Kernel 742 st1 {v2.4s}, [x21] 743 st1 {v3.4s}, [x22] 744 End: 745- sub sp, sp, #64 746 ldp x19, x20, [sp], #16 747 ldp x21, x22, [sp], #16 748 ldp x23, x24, [sp], #16 749diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW1x8Kernel.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW1x8Kernel.S 750index fa8bb63d..6a29e95e 100644 751--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW1x8Kernel.S 752+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW1x8Kernel.S 753@@ -30,17 +30,17 @@ asm_function SWConv1x8Kernel 754 // x19 ~ x29 should be also preserved 755 // whereas our coding style do not permit such amount of parameters 756 sub sp, sp, #64 757- stp x19, x20, [sp], #16 758- stp x21, x22, [sp], #16 759- stp x23, x24, [sp], #16 760- stp x25, x26, [sp], #16 761+ stp x19, x20, [sp] 762+ stp x21, x22, [sp, #16] 763+ stp x23, x24, [sp, #32] 764+ stp x25, x26, [sp, #48] 765 766- ldr x10, [sp] 767- ldr x11, [sp, #8] 768- ldr x12, [sp, #16] 769- ldr x13, [sp, #24] 770- ldr x14, [sp, #32] 771- ldr x15, [sp, #40] 772+ ldr x10, [sp, #64] 773+ ldr x11, [sp, #72] 774+ ldr x12, [sp, #80] 775+ ldr x13, [sp, #88] 776+ ldr x14, [sp, #96] 777+ ldr x15, [sp, #104] 778 lsl x7, x7, #2 779 lsl x11, x11, #2 780 lsl x12, x12, #2 781@@ -270,7 +270,6 @@ asm_function SWConv1x8Kernel 782 st1 {v0.4s}, [x0] 783 st1 {v1.4s}, [x20] 784 End: 785- sub sp, sp, #64 786 ldp x19, x20, [sp], #16 787 ldp x21, x22, [sp], #16 788 ldp x23, x24, [sp], #16 789diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW2x16Kernel.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW2x16Kernel.S 790index 69624af6..8a5dd83a 100644 791--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW2x16Kernel.S 792+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW2x16Kernel.S 793@@ -30,17 +30,17 @@ asm_function SWConv2x16Kernel 794 // x19 ~ x29 should be also preserved 795 // whereas our coding style do not permit such amount of parameters 796 sub sp, sp, #64 797- stp x19, x20, [sp], #16 798- stp x21, x22, [sp], #16 799- stp x23, x24, [sp], #16 800- stp x25, x26, [sp], #16 801+ stp x19, x20, [sp] 802+ stp x21, x22, [sp, #16] 803+ stp x23, x24, [sp, #32] 804+ stp x25, x26, [sp, #48] 805 806- ldr x10, [sp] 807- ldr x11, [sp, #8] 808- ldr x12, [sp, #16] 809- ldr x13, [sp, #24] 810- ldr x14, [sp, #32] 811- ldr x15, [sp, #40] 812+ ldr x10, [sp, #64] 813+ ldr x11, [sp, #72] 814+ ldr x12, [sp, #80] 815+ ldr x13, [sp, #88] 816+ ldr x14, [sp, #96] 817+ ldr x15, [sp, #104] 818 lsl x7, x7, #2 819 lsl x11, x11, #2 820 lsl x12, x12, #2 821@@ -399,7 +399,6 @@ asm_function SWConv2x16Kernel 822 st1 {v3.4s}, [x22], #16 823 st1 {v7.4s}, [x22] 824 End: 825- sub sp, sp, #64 826 ldp x19, x20, [sp], #16 827 ldp x21, x22, [sp], #16 828 ldp x23, x24, [sp], #16 829diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW2x8Kernel.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW2x8Kernel.S 830index 8fefa4be..6efd21d0 100644 831--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW2x8Kernel.S 832+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW2x8Kernel.S 833@@ -30,17 +30,17 @@ asm_function SWConv2x8Kernel 834 // x19 ~ x29 should be also preserved 835 // whereas our coding style do not permit such amount of parameters 836 sub sp, sp, #64 837- stp x19, x20, [sp], #16 838- stp x21, x22, [sp], #16 839- stp x23, x24, [sp], #16 840- stp x25, x26, [sp], #16 841+ stp x19, x20, [sp] 842+ stp x21, x22, [sp, #16] 843+ stp x23, x24, [sp, #32] 844+ stp x25, x26, [sp, #48] 845 846- ldr x10, [sp] 847- ldr x11, [sp, #8] 848- ldr x12, [sp, #16] 849- ldr x13, [sp, #24] 850- ldr x14, [sp, #32] 851- ldr x15, [sp, #40] 852+ ldr x10, [sp, #64] 853+ ldr x11, [sp, #72] 854+ ldr x12, [sp, #80] 855+ ldr x13, [sp, #88] 856+ ldr x14, [sp, #96] 857+ ldr x15, [sp, #104] 858 lsl x7, x7, #2 859 lsl x11, x11, #2 860 lsl x12, x12, #2 861@@ -257,7 +257,6 @@ asm_function SWConv2x8Kernel 862 st1 {v1.4s}, [x20], #16 863 st1 {v3.4s}, [x20] 864 End: 865- sub sp, sp, #64 866 ldp x19, x20, [sp], #16 867 ldp x21, x22, [sp], #16 868 ldp x23, x24, [sp], #16 869diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW3x16Kernel.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW3x16Kernel.S 870index 61efd444..428dea69 100644 871--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW3x16Kernel.S 872+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW3x16Kernel.S 873@@ -30,18 +30,18 @@ asm_function SWConv3x16Kernel 874 // x19 ~ x29 should be also preserved 875 // whereas our coding style do not permit such amount of parameters 876 sub sp, sp, #128 877- st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 878- stp x19, x20, [sp], #16 879- stp x21, x22, [sp], #16 880- stp x23, x24, [sp], #16 881- stp x25, x26, [sp], #16 882+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp] 883+ stp x19, x20, [sp, #64] 884+ stp x21, x22, [sp, #80] 885+ stp x23, x24, [sp, #96] 886+ stp x25, x26, [sp, #112] 887 888- ldr x10, [sp] 889- ldr x11, [sp, #8] 890- ldr x12, [sp, #16] 891- ldr x13, [sp, #24] 892- ldr x14, [sp, #32] 893- ldr x15, [sp, #40] 894+ ldr x10, [sp, #128] 895+ ldr x11, [sp, #136] 896+ ldr x12, [sp, #144] 897+ ldr x13, [sp, #152] 898+ ldr x14, [sp, #160] 899+ ldr x15, [sp, #168] 900 lsl x7, x7, #2 901 lsl x11, x11, #2 902 lsl x12, x12, #2 903@@ -524,7 +524,6 @@ asm_function SWConv3x16Kernel 904 st1 {v7.4s}, [x22], #16 905 st1 {v11.4s}, [x22] 906 End: 907- sub sp, sp, #128 908 ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 909 ldp x19, x20, [sp], #16 910 ldp x21, x22, [sp], #16 911diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW3x8Kernel.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW3x8Kernel.S 912index 1e958572..472e50b9 100644 913--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW3x8Kernel.S 914+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW3x8Kernel.S 915@@ -30,17 +30,17 @@ asm_function SWConv3x8Kernel 916 // x19 ~ x29 should be also preserved 917 // whereas our coding style do not permit such amount of parameters 918 sub sp, sp, #64 919- stp x19, x20, [sp], #16 920- stp x21, x22, [sp], #16 921- stp x23, x24, [sp], #16 922- stp x25, x26, [sp], #16 923+ stp x19, x20, [sp] 924+ stp x21, x22, [sp, #16] 925+ stp x23, x24, [sp, #32] 926+ stp x25, x26, [sp, #48] 927 928- ldr x10, [sp] 929- ldr x11, [sp, #8] 930- ldr x12, [sp, #16] 931- ldr x13, [sp, #24] 932- ldr x14, [sp, #32] 933- ldr x15, [sp, #40] 934+ ldr x10, [sp, #64] 935+ ldr x11, [sp, #72] 936+ ldr x12, [sp, #80] 937+ ldr x13, [sp, #88] 938+ ldr x14, [sp, #96] 939+ ldr x15, [sp, #104] 940 lsl x7, x7, #2 941 lsl x11, x11, #2 942 lsl x12, x12, #2 943@@ -324,7 +324,6 @@ asm_function SWConv3x8Kernel 944 st1 {v3.4s}, [x20], #16 945 st1 {v5.4s}, [x20] 946 End: 947- sub sp, sp, #64 948 ldp x19, x20, [sp], #16 949 ldp x21, x22, [sp], #16 950 ldp x23, x24, [sp], #16 951diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW4x16Kernel.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW4x16Kernel.S 952index 1cd5e124..076724a7 100644 953--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW4x16Kernel.S 954+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW4x16Kernel.S 955@@ -30,20 +30,21 @@ asm_function SWConv4x16Kernel 956 // x19 ~ x29 should be also preserved 957 // whereas our coding style do not permit such amount of parameters 958 sub sp, sp, #208 959- st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 960- st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 961- stp x19, x20, [sp], #16 962- stp x21, x22, [sp], #16 963- stp x23, x24, [sp], #16 964- stp x25, x26, [sp], #16 965- stp x27, x28, [sp], #16 966+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp] 967+ add x9, sp, #64 968+ st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9] 969+ stp x19, x20, [sp, #128] 970+ stp x21, x22, [sp, #144] 971+ stp x23, x24, [sp, #160] 972+ stp x25, x26, [sp, #176] 973+ stp x27, x28, [sp, #192] 974 975- ldr x10, [sp] 976- ldr x11, [sp, #8] 977- ldr x12, [sp, #16] 978- ldr x13, [sp, #24] 979- ldr x14, [sp, #32] 980- ldr x15, [sp, #40] 981+ ldr x10, [sp, #208] 982+ ldr x11, [sp, #216] 983+ ldr x12, [sp, #224] 984+ ldr x13, [sp, #232] 985+ ldr x14, [sp, #240] 986+ ldr x15, [sp, #248] 987 lsl x7, x7, #2 988 lsl x11, x11, #2 989 lsl x12, x12, #2 990@@ -650,7 +651,6 @@ asm_function SWConv4x16Kernel 991 st1 {v11.4s}, [x22], #16 992 st1 {v15.4s}, [x22] 993 End: 994- sub sp, sp, #208 995 ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 996 ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 997 ldp x19, x20, [sp], #16 998diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW4x8Kernel.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW4x8Kernel.S 999index 28109031..6b24de97 100644 1000--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW4x8Kernel.S 1001+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW4x8Kernel.S 1002@@ -30,20 +30,21 @@ asm_function SWConv4x8Kernel 1003 // x19 ~ x29 should be also preserved 1004 // whereas our coding style do not permit such amount of parameters 1005 sub sp, sp, #208 1006- st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1007- st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1008- stp x19, x20, [sp], #16 1009- stp x21, x22, [sp], #16 1010- stp x23, x24, [sp], #16 1011- stp x25, x26, [sp], #16 1012- stp x27, x28, [sp], #16 1013+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp] 1014+ add x9, sp, #64 1015+ st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9] 1016+ stp x19, x20, [sp, #128] 1017+ stp x21, x22, [sp, #144] 1018+ stp x23, x24, [sp, #160] 1019+ stp x25, x26, [sp, #176] 1020+ stp x27, x28, [sp, #192] 1021 1022- ldr x10, [sp] 1023- ldr x11, [sp, #8] 1024- ldr x12, [sp, #16] 1025- ldr x13, [sp, #24] 1026- ldr x14, [sp, #32] 1027- ldr x15, [sp, #40] 1028+ ldr x10, [sp, #208] 1029+ ldr x11, [sp, #216] 1030+ ldr x12, [sp, #224] 1031+ ldr x13, [sp, #232] 1032+ ldr x14, [sp, #240] 1033+ ldr x15, [sp, #248] 1034 lsl x7, x7, #2 1035 lsl x11, x11, #2 1036 lsl x12, x12, #2 1037@@ -394,7 +395,6 @@ asm_function SWConv4x8Kernel 1038 st1 {v5.4s}, [x20], #16 1039 st1 {v7.4s}, [x20] 1040 End: 1041- sub sp, sp, #208 1042 ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1043 ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1044 ldp x19, x20, [sp], #16 1045diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW5x16Kernel.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW5x16Kernel.S 1046index 302e5a3d..a2b7ea2c 100644 1047--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW5x16Kernel.S 1048+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW5x16Kernel.S 1049@@ -30,20 +30,21 @@ asm_function SWConv5x16Kernel 1050 // x19 ~ x29 should be also preserved 1051 // whereas our coding style do not permit such amount of parameters 1052 sub sp, sp, #208 1053- st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1054- st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1055- stp x19, x20, [sp], #16 1056- stp x21, x22, [sp], #16 1057- stp x23, x24, [sp], #16 1058- stp x25, x26, [sp], #16 1059- stp x27, x28, [sp], #16 1060+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp] 1061+ add x9, sp, #64 1062+ st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9] 1063+ stp x19, x20, [sp, #128] 1064+ stp x21, x22, [sp, #144] 1065+ stp x23, x24, [sp, #160] 1066+ stp x25, x26, [sp, #176] 1067+ stp x27, x28, [sp, #192] 1068 1069- ldr x10, [sp] 1070- ldr x11, [sp, #8] 1071- ldr x12, [sp, #16] 1072- ldr x13, [sp, #24] 1073- ldr x14, [sp, #32] 1074- ldr x15, [sp, #40] 1075+ ldr x10, [sp, #208] 1076+ ldr x11, [sp, #216] 1077+ ldr x12, [sp, #224] 1078+ ldr x13, [sp, #232] 1079+ ldr x14, [sp, #240] 1080+ ldr x15, [sp, #248] 1081 lsl x7, x7, #2 1082 lsl x11, x11, #2 1083 lsl x12, x12, #2 1084@@ -445,7 +446,6 @@ asm_function SWConv5x16Kernel 1085 st1 {v15.4s}, [x22], #16 1086 st1 {v19.4s}, [x22] 1087 End: 1088- sub sp, sp, #208 1089 ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1090 ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1091 ldp x19, x20, [sp], #16 1092diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW5x8Kernel.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW5x8Kernel.S 1093index 059cc7fc..b7e48480 100644 1094--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW5x8Kernel.S 1095+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW5x8Kernel.S 1096@@ -30,20 +30,21 @@ asm_function SWConv5x8Kernel 1097 // x19 ~ x29 should be also preserved 1098 // whereas our coding style do not permit such amount of parameters 1099 sub sp, sp, #208 1100- st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1101- st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1102- stp x19, x20, [sp], #16 1103- stp x21, x22, [sp], #16 1104- stp x23, x24, [sp], #16 1105- stp x25, x26, [sp], #16 1106- stp x27, x28, [sp], #16 1107+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp] 1108+ add x9, sp, #64 1109+ st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9] 1110+ stp x19, x20, [sp, #128] 1111+ stp x21, x22, [sp, #144] 1112+ stp x23, x24, [sp, #160] 1113+ stp x25, x26, [sp, #176] 1114+ stp x27, x28, [sp, #192] 1115 1116- ldr x10, [sp] 1117- ldr x11, [sp, #8] 1118- ldr x12, [sp, #16] 1119- ldr x13, [sp, #24] 1120- ldr x14, [sp, #32] 1121- ldr x15, [sp, #40] 1122+ ldr x10, [sp, #208] 1123+ ldr x11, [sp, #216] 1124+ ldr x12, [sp, #224] 1125+ ldr x13, [sp, #232] 1126+ ldr x14, [sp, #240] 1127+ ldr x15, [sp, #248] 1128 lsl x7, x7, #2 1129 lsl x11, x11, #2 1130 lsl x12, x12, #2 1131@@ -296,7 +297,6 @@ asm_function SWConv5x8Kernel 1132 st1 {v7.4s}, [x20], #16 1133 st1 {v9.4s}, [x20] 1134 End: 1135- sub sp, sp, #208 1136 ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1137 ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1138 ldp x19, x20, [sp], #16 1139diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/DeconvDwFp32Center.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/DeconvDwFp32Center.S 1140index e6875bb1..11722e71 100644 1141--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/DeconvDwFp32Center.S 1142+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/DeconvDwFp32Center.S 1143@@ -30,14 +30,14 @@ asm_function DeconvDwFp32Center 1144 // x19 ~ x29 should be also preserved 1145 // whereas our coding style do not permit such amount of parameters 1146 sub sp, sp, #32 1147- stp x19, x20, [sp], #16 1148- stp x21, x22, [sp], #16 1149+ stp x19, x20, [sp] 1150+ stp x21, x22, [sp, #16] 1151 1152- ldr x8, [sp] 1153- ldr x9, [sp, #8] 1154- ldr x10, [sp, #16] 1155- ldr x11, [sp, #24] 1156- ldr x12, [sp, #32] 1157+ ldr x8, [sp, #32] 1158+ ldr x9, [sp, #40] 1159+ ldr x10, [sp, #48] 1160+ ldr x11, [sp, #56] 1161+ ldr x12, [sp, #64] 1162 1163 LoopH: 1164 mov x15, x0 1165@@ -69,7 +69,6 @@ asm_function DeconvDwFp32Center 1166 subs x3, x3, #1 1167 bne LoopH 1168 1169- sub sp, sp, #32 1170 ldp x19, x20, [sp], #16 1171 ldp x21, x22, [sp], #16 1172 ret 1173diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/DeconvDwInt8Center.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/DeconvDwInt8Center.S 1174index aaf210f0..1c3723fa 100644 1175--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/DeconvDwInt8Center.S 1176+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/DeconvDwInt8Center.S 1177@@ -30,14 +30,14 @@ asm_function DeconvDwInt8Center 1178 // x19 ~ x29 should be also preserved 1179 // whereas our coding style do not permit such amount of parameters 1180 sub sp, sp, #32 1181- stp x19, x20, [sp], #16 1182- stp x21, x22, [sp], #16 1183+ stp x19, x20, [sp] 1184+ stp x21, x22, [sp, #16] 1185 1186- ldr x8, [sp] 1187- ldr x9, [sp, #8] 1188- ldr x10, [sp, #16] 1189- ldr x11, [sp, #24] 1190- ldr x12, [sp, #32] 1191+ ldr x8, [sp, #32] 1192+ ldr x9, [sp, #40] 1193+ ldr x10, [sp, #48] 1194+ ldr x11, [sp, #56] 1195+ ldr x12, [sp, #64] 1196 1197 LoopH: 1198 mov x15, x0 1199@@ -69,7 +69,6 @@ asm_function DeconvDwInt8Center 1200 subs x3, x3, #1 1201 bne LoopH 1202 1203- sub sp, sp, #32 1204 ldp x19, x20, [sp], #16 1205 ldp x21, x22, [sp], #16 1206 ret 1207diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatVecMulFp32.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatVecMulFp32.S 1208index 71a7f0f1..36c8d8ec 100644 1209--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatVecMulFp32.S 1210+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatVecMulFp32.S 1211@@ -15,7 +15,7 @@ 1212 */ 1213 #ifdef ENABLE_ARM64 1214 #include "nnacl/assembly_global.h" 1215- 1216+ 1217 .text 1218 .align 5 1219 1220@@ -30,24 +30,25 @@ 1221 1222 asm_default_function MatVecMulFp32 1223 sub sp, sp, #128 1224- st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 1225- st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64 1226+ st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp] 1227+ add x9, sp, #64 1228+ st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x9] 1229 1230 mov w14, #4 // sizeof(float) 1231 mul w8, w14, w5 // rhs depthx1 block stride 1232 mov w14, #4 1233- mul w13, w8, w14 // rhs depthx4 block stride 1234+ mul w13, w8, w14 // rhs depthx4 block stride 1235 1236 Loop: 1237 mov x15, x0 // reload a ptr 1238 mov x7, x1 // reload b ptr 1239 mov w9, w5 // reload depth 1240 cmp w6, #4 1241- blt Loop1x1 1242+ blt Loop1x1 1243 1244-Loop1x4: 1245- dup v10.8h, wzr 1246- dup v11.8h, wzr 1247+Loop1x4: 1248+ dup v10.8h, wzr 1249+ dup v11.8h, wzr 1250 dup v12.8h, wzr 1251 dup v13.8h, wzr 1252 dup v14.8h, wzr 1253@@ -150,7 +151,7 @@ End1x4: 1254 1255 cbz x3, Act1x4 1256 ld1 {v15.4s}, [x3], #16 1257- fadd v14.4s, v14.4s, v15.4s // add bias 1258+ fadd v14.4s, v14.4s, v15.4s // add bias 1259 1260 Act1x4: 1261 cmp w4, #3 1262@@ -214,8 +215,8 @@ Depth1_1x1: 1263 b Depth1_1x1 1264 1265 End1x1: 1266- faddp v6.4s, v4.4s, v4.4s 1267- faddp v7.4s, v6.4s, v6.4s 1268+ faddp v6.4s, v4.4s, v4.4s 1269+ faddp v7.4s, v6.4s, v6.4s 1270 fadd v7.4s, v7.4s, v5.4s 1271 1272 cbz x3, Act1x1 1273@@ -245,7 +246,6 @@ Write1x1: 1274 b Loop 1275 1276 End: 1277- sub sp, sp, #128 1278 ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 1279 ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64 1280 ret 1281diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatVecMulPackFp32.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatVecMulPackFp32.S 1282index d485b012..b013f48a 100644 1283--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatVecMulPackFp32.S 1284+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatVecMulPackFp32.S 1285@@ -30,8 +30,8 @@ 1286 1287 asm_default_function MatVecMulPackFp32 1288 sub sp, sp, #16 1289- stp x29, x30, [sp], #16 1290- 1291+ stp x29, x30, [sp] 1292+ 1293 dup v1.2d, xzr 1294 mov w7, #6 1295 dup v2.4s, w7 1296@@ -43,7 +43,7 @@ asm_default_function MatVecMulPackFp32 1297 st1 {v24.4s, v25.4s}, [x2], #32 1298 subs w6, w6, #8 1299 bge Loop1x8Start 1300- 1301+ 1302 Loop1xNStart: 1303 add w6, w6, #8 1304 cbz w6, End 1305@@ -59,7 +59,7 @@ asm_default_function MatVecMulPackFp32 1306 beq End 1307 st1 {v25.s}[2], [x2] 1308 b End 1309- 1310+ 1311 Loop1x4Start: 1312 add w6, w6, #4 1313 cbz w6, End 1314@@ -75,7 +75,7 @@ asm_default_function MatVecMulPackFp32 1315 beq End 1316 st1 {v24.s}[3], [x2], #4 1317 b End 1318- 1319+ 1320 Compute1x8Unit: 1321 mov x7, x0 // reload a-ptr 1322 mov w8, w5 // reset depth 1323@@ -140,7 +140,7 @@ asm_default_function MatVecMulPackFp32 1324 fmax v25.4s, v25.4s, v1.4s 1325 Return1x8: 1326 ret 1327- 1328+ 1329 Compute1x4Unit: 1330 mov x7, x0 // reload a-ptr 1331 mov w8, w5 // reset depth 1332@@ -191,9 +191,8 @@ asm_default_function MatVecMulPackFp32 1333 fmax v24.4s, v24.4s, v1.4s 1334 Return1x4: 1335 ret 1336- 1337+ 1338 End: 1339- sub sp, sp, #16 1340 ldp x29, x30, [sp], #16 1341 ret 1342 #endif 1343diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32.S 1344index 67d20dcc..2dedccd0 100644 1345--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32.S 1346+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32.S 1347@@ -34,17 +34,18 @@ 1348 1349 asm_function MatmulFloatNeon64 1350 sub sp, sp, #144 1351- st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1352- st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1353- stp x19, x20, [sp], #16 1354+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp] 1355+ add x9, sp, #64 1356+ st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9] 1357+ stp x19, x20, [sp, #128] 1358 1359- ldr x9, [sp, #8] 1360- ldr x14, [sp, #16] 1361+ ldr x9, [sp, #152] 1362+ ldr x14, [sp, #160] 1363 1364 mov w19, #32 // sizeof(float) * 8 1365 mul w15, w5, w19 // block stride of lhs/rhs: sizeof(float) * 8 * depth 1366 mov x19, #4 1367- ldr x17, [sp] 1368+ ldr x17, [sp, #144] 1369 cbz x14, NoWinoSteps 1370 mul x8, x7, x17 1371 mov x11, #8 1372@@ -779,7 +780,6 @@ NoDstStep: 1373 bgt L1 1374 1375 End1: 1376- sub sp, sp, #144 1377 ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1378 ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1379 ldp x19, x20, [sp], #16 1380diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32Opt.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32Opt.S 1381index 6937f4ba..51d107c8 100644 1382--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32Opt.S 1383+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32Opt.S 1384@@ -19,7 +19,7 @@ 1385 .text 1386 .align 5 1387 1388-// void MatmulFloatNeon64(const float *a, const float *b, float *c, const float *bias, int act_type, int depth 1389+// void MatmulFloatNeon64Opt(const float *a, const float *b, float *c, const float *bias, int act_type, int depth 1390 // int row, int col, size_t stride, size_t writeMode) 1391 // x0: a 1392 // x1: b 1393@@ -34,13 +34,14 @@ 1394 1395 asm_function MatmulFloatNeon64Opt 1396 sub sp, sp, #160 1397- st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1398- st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1399- stp x19, x20, [sp], #16 1400- stp x21, x22, [sp], #16 1401+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp] 1402+ add x9, sp, #64 1403+ st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9] 1404+ stp x19, x20, [sp, #128] 1405+ stp x21, x22, [sp, #144] 1406 1407- ldr x8, [sp] 1408- ldr x9, [sp, #8] 1409+ ldr x8, [sp, #160] 1410+ ldr x9, [sp, #168] 1411 1412 mov x21, #48 // sizeof(float) * 12 1413 mul x17, x5, x21 // block stride of lhs/rhs: sizeof(float) * 12 * depth 1414@@ -1659,7 +1660,6 @@ LoopColEnd: 1415 subs x6, x6, #12 1416 bgt LoopRowStart 1417 1418- sub sp, sp, #160 1419 ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1420 ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1421 ldp x19, x20, [sp], #16 1422diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow12.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow12.S 1423index c9151a99..05465bd1 100644 1424--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow12.S 1425+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow12.S 1426@@ -34,13 +34,14 @@ 1427 1428 asm_function MatmulFloatNeon64OptRow12 1429 sub sp, sp, #160 1430- st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1431- st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1432- stp x19, x20, [sp], #16 1433- stp x21, x22, [sp], #16 1434+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp] 1435+ add x9, sp, #64 1436+ st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9] 1437+ stp x19, x20, [sp, #128] 1438+ stp x21, x22, [sp, #144] 1439 1440- ldr x8, [sp] 1441- ldr x9, [sp, #8] 1442+ ldr x8, [sp, #160] 1443+ ldr x9, [sp, #168] 1444 1445 mov x21, #48 // sizeof(float) * 12 1446 mul x17, x5, x21 // block stride of lhs/rhs: sizeof(float) * 12 * depth 1447@@ -1220,7 +1221,6 @@ LoopColEnd: 1448 subs x6, x6, #12 1449 bgt LoopRow 1450 1451- sub sp, sp, #160 1452 ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1453 ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1454 ldp x19, x20, [sp], #16 1455diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow4.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow4.S 1456index 0cc49fb9..b984c494 100644 1457--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow4.S 1458+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow4.S 1459@@ -19,7 +19,7 @@ 1460 .text 1461 .align 5 1462 1463-// void MatmulFloatNeon64(const float *a, const float *b, float *c, const float *bias, int act_type, int depth 1464+// void MatmulFloatNeon64OptRow4(const float *a, const float *b, float *c, const float *bias, int act_type, int depth 1465 // int row, int col, size_t stride, size_t writeMode) 1466 // x0: a 1467 // x1: b 1468@@ -34,13 +34,14 @@ 1469 1470 asm_function MatmulFloatNeon64OptRow4 1471 sub sp, sp, #160 1472- st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1473- st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1474- stp x19, x20, [sp], #16 1475- stp x21, x22, [sp], #16 1476+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp] 1477+ add x9, sp, #64 1478+ st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9] 1479+ stp x19, x20, [sp, #128] 1480+ stp x21, x22, [sp, #144] 1481 1482- ldr x8, [sp] 1483- ldr x9, [sp, #8] 1484+ ldr x8, [sp, #160] 1485+ ldr x9, [sp, #168] 1486 1487 mov x21, #48 // sizeof(float) * 12 1488 1489@@ -588,7 +589,6 @@ LoopColEnd: 1490 subs x6, x6, #12 1491 bgt LoopRow4 1492 1493- sub sp, sp, #160 1494 ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1495 ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1496 ldp x19, x20, [sp], #16 1497diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow8.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow8.S 1498index a9e42a54..c5b260c0 100644 1499--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow8.S 1500+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow8.S 1501@@ -34,13 +34,14 @@ 1502 1503 asm_function MatmulFloatNeon64OptRow8 1504 sub sp, sp, #160 1505- st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1506- st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1507- stp x19, x20, [sp], #16 1508- stp x21, x22, [sp], #16 1509+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp] 1510+ add x9, sp, #64 1511+ st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9] 1512+ stp x19, x20, [sp, #128] 1513+ stp x21, x22, [sp, #144] 1514 1515- ldr x8, [sp] 1516- ldr x9, [sp, #8] 1517+ ldr x8, [sp, #160] 1518+ ldr x9, [sp, #168] 1519 1520 mov x21, #48 // sizeof(float) * 12 1521 mul x17, x5, x21 // block stride of lhs/rhs: sizeof(float) * 12 * depth 1522@@ -902,7 +903,6 @@ LoopColEnd: 1523 subs x6, x6, #12 1524 bgt LoopCol8 1525 1526- sub sp, sp, #160 1527 ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1528 ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1529 ldp x19, x20, [sp], #16 1530diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulInt8.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulInt8.S 1531index a0e94c5f..731bac4b 100644 1532--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulInt8.S 1533+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulInt8.S 1534@@ -44,24 +44,25 @@ 1535 1536 asm_function MatmulInt8Neon64 1537 sub sp, sp, #208 1538- st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1539- st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1540- stp x19, x20, [sp], #16 1541- stp x21, x22, [sp], #16 1542- stp x23, x24, [sp], #16 1543- stp x25, x26, [sp], #16 1544- stp x27, x28, [sp], #16 1545- 1546- ldr w8, [sp] 1547- ldr w9, [sp, #8] 1548- ldr w10, [sp, #16] 1549- ldr x11, [sp, #24] 1550- ldr x12, [sp, #32] 1551- ldr x13, [sp, #40] 1552- ldr w14, [sp, #48] 1553- ldr w15, [sp, #56] 1554- ldr w24, [sp, #64] 1555- ldr w27, [sp, #72] 1556+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp] 1557+ add x9, sp, #64 1558+ st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9] 1559+ stp x19, x20, [sp, #128] 1560+ stp x21, x22, [sp, #144] 1561+ stp x23, x24, [sp, #160] 1562+ stp x25, x26, [sp, #176] 1563+ stp x27, x28, [sp, #192] 1564+ 1565+ ldr w8, [sp, #208] 1566+ ldr w9, [sp, #216] 1567+ ldr w10, [sp, #224] 1568+ ldr x11, [sp, #232] 1569+ ldr x12, [sp, #240] 1570+ ldr x13, [sp, #248] 1571+ ldr w14, [sp, #256] 1572+ ldr w15, [sp, #264] 1573+ ldr w24, [sp, #272] 1574+ ldr w27, [sp, #280] 1575 1576 mov w17, #4 // sizeof(int8)*4 1577 mul w21, w5, w17 // the stride of a/b: sizeof(int8)*4*deep16 1578@@ -408,7 +409,6 @@ PerTEnd2: 1579 b L1 1580 1581 End1: 1582- sub sp, sp, #208 1583 ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1584 ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1585 ldp x19, x20, [sp], #16 1586diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulInt8Opt.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulInt8Opt.S 1587index 64be8a14..a54ee5b8 100644 1588--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulInt8Opt.S 1589+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulInt8Opt.S 1590@@ -43,23 +43,24 @@ 1591 1592 asm_function MatmulInt8Opt 1593 sub sp, sp, #224 1594- st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1595- st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1596- stp x19, x20, [sp], #16 1597- stp x21, x22, [sp], #16 1598- stp x23, x24, [sp], #16 1599- stp x25, x26, [sp], #16 1600- stp x27, x28, [sp], #16 1601- stp x29, x30, [sp], #16 1602- 1603- ldr w8, [sp] 1604- ldr w9, [sp, #8] 1605- ldr w10, [sp, #16] 1606- ldr x11, [sp, #24] 1607- ldr x12, [sp, #32] 1608- ldr x13, [sp, #40] 1609- ldr x14, [sp, #48] 1610- ldr x15, [sp, #56] 1611+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp] 1612+ add x9, sp, #64 1613+ st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9] 1614+ stp x19, x20, [sp, #128] 1615+ stp x21, x22, [sp, #144] 1616+ stp x23, x24, [sp, #160] 1617+ stp x25, x26, [sp, #176] 1618+ stp x27, x28, [sp, #192] 1619+ stp x29, x30, [sp, #208] 1620+ 1621+ ldr w8, [sp, #224] 1622+ ldr w9, [sp, #232] 1623+ ldr w10, [sp, #240] 1624+ ldr x11, [sp, #248] 1625+ ldr x12, [sp, #256] 1626+ ldr x13, [sp, #264] 1627+ ldr x14, [sp, #272] 1628+ ldr x15, [sp, #280] 1629 1630 mov x23, #4 1631 mul x23, x23, x5 // lhs step 1632@@ -70,7 +71,7 @@ LoopRow: 1633 mov x17, x4 // reload rhs col 1634 mov x29, x7 // reload bias ptr 1635 mov x27, x2 // reload dst ptr 1636- ldr x28, [sp, #64] // reload filter_zp 1637+ ldr x28, [sp, #288] // reload filter_zp 1638 1639 LoopCol: 1640 mov x25, x6 // reload a_sums ptr 1641@@ -334,16 +335,15 @@ LoopRow: 1642 LoopColEnd: 1643 subs x3, x3, #4 1644 ble LoopRowEnd 1645- ldr x11, [sp, #24] 1646- ldr x12, [sp, #32] 1647- ldr x13, [sp, #40] 1648+ ldr x11, [sp, #248] 1649+ ldr x12, [sp, #256] 1650+ ldr x13, [sp, #264] 1651 add x6, x6, #16 1652 add x0, x0, x23 1653 add x2, x2, x24 1654 b LoopRow 1655 1656 LoopRowEnd: 1657- sub sp, sp, #224 1658 ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1659 ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1660 ldp x19, x20, [sp], #16 1661diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulR4Int8.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulR4Int8.S 1662index fe5207ad..adb0a42c 100644 1663--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulR4Int8.S 1664+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulR4Int8.S 1665@@ -33,9 +33,10 @@ 1666 1667 asm_function MatMulR4Int8Neon64 1668 sub sp, sp, #144 1669- st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1670- st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1671- stp x19, x20, [sp], #16 1672+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp] 1673+ add x9, sp, #64 1674+ st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9] 1675+ stp x19, x20, [sp, #128] 1676 1677 mov w15, #0 // b col index 1678 mov w16, #0 // a row index 1679@@ -185,7 +186,6 @@ End2: 1680 b L1 1681 1682 End1: 1683- sub sp, sp, #144 1684 ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1685 ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1686 ldp x19, x20, [sp], #16 1687diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulWinogradFp32.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulWinogradFp32.S 1688index 0b814ce4..23032ab9 100644 1689--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulWinogradFp32.S 1690+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulWinogradFp32.S 1691@@ -27,9 +27,9 @@ asm_function MatrixMultiplyWinograd 1692 // x19 ~ x29 should be also preserved 1693 // whereas our coding style do not permit such amount of parameters 1694 sub sp, sp, #48 1695- st1 {v8.4s}, [sp], #16 1696- stp x19, x20, [sp], #16 1697- stp x21, x22, [sp], #16 1698+ st1 {v8.4s}, [sp] 1699+ stp x19, x20, [sp, #16] 1700+ stp x21, x22, [sp, #32] 1701 mov x8, #4 1702 mul x10, x5, x8 1703 mov x17, x3 // m 1704@@ -176,7 +176,6 @@ asm_function MatrixMultiplyWinograd 1705 add x0, x0, x21 1706 b LoopM 1707 EndLoopM: 1708- sub sp, sp, #48 1709 ld1 {v8.4s}, [sp], #16 1710 ldp x19, x20, [sp], #16 1711 ldp x21, x22, [sp], #16 1712diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/PostFuncBiasReluC8.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/PostFuncBiasReluC8.S 1713index 5355d302..1392ab4a 100644 1714--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/PostFuncBiasReluC8.S 1715+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/PostFuncBiasReluC8.S 1716@@ -34,8 +34,9 @@ 1717 1718 asm_function PostFuncBiasReluC8 1719 sub sp, sp, #128 1720- st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1721- st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1722+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp] 1723+ add x9, sp, #64 1724+ st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9] 1725 1726 movi v26.4s, #6 1727 scvtf v26.4s, v26.4s 1728@@ -546,7 +547,6 @@ Loop_C1_7_Write: 1729 b Loop_C1_7_Write 1730 1731 End: 1732- sub sp, sp, #128 1733 ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1734 ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1735 ret 1736diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/PostFuncInt8C4Neon64.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/PostFuncInt8C4Neon64.S 1737index 0818d74e..a240b64d 100644 1738--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/PostFuncInt8C4Neon64.S 1739+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/PostFuncInt8C4Neon64.S 1740@@ -54,14 +54,14 @@ 1741 1742 asm_function PostFuncInt8C4Neon64 1743 sub sp, sp, #16 1744- stp x24, x25, [sp], #16 1745+ stp x24, x25, [sp] 1746 1747- ldr w8, [sp] 1748- ldr w9, [sp, #8] 1749- ldr w10, [sp, #16] 1750- ldr w11, [sp, #24] 1751- ldr w12, [sp, #32] 1752- ldr w13, [sp, #40] 1753+ ldr w8, [sp, #16] 1754+ ldr w9, [sp, #24] 1755+ ldr w10, [sp, #32] 1756+ ldr w11, [sp, #40] 1757+ ldr w12, [sp, #48] 1758+ ldr w13, [sp, #56] 1759 1760 dup v26.4s, w7 1761 dup v27.4s, w8 1762@@ -254,7 +254,6 @@ Loop_C1_3: 1763 1764 1765 End: 1766- sub sp, sp, #16 1767 ldp x24, x25, [sp], #16 1768 ret 1769 #endif 1770diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/SPMM8x8Fp32.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/SPMM8x8Fp32.S 1771index cfa9bdf8..614d83f8 100644 1772--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/SPMM8x8Fp32.S 1773+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/SPMM8x8Fp32.S 1774@@ -55,9 +55,10 @@ 1775 1776 asm_function SPMM8x8Fp32 1777 sub sp, sp, #144 1778- st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1779- st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1780- stp x19, x20, [sp], #16 1781+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp] 1782+ add x9, sp, #64 1783+ st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9] 1784+ stp x19, x20, [sp, #128] 1785 1786 // init output with bias 1787 ldr w8, [x5], #4 1788@@ -286,7 +287,6 @@ WRITE_OUT: 1789 st1 {v14.4s, v15.4s}, [x4] 1790 1791 End: 1792- sub sp, sp, #144 1793 ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1794 ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1795 ldp x19, x20, [sp], #16 1796diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/TiledC4MatmulFp32.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/TiledC4MatmulFp32.S 1797index 5987e68a..e0efc7b2 100644 1798--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/TiledC4MatmulFp32.S 1799+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/TiledC4MatmulFp32.S 1800@@ -29,8 +29,9 @@ asm_function TiledC4MatmulFp32 1801 //x5: oc4 1802 1803 sub sp, sp, #128 1804-st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1805-st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1806+st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp] 1807+add x9, sp, #64 1808+st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9] 1809 1810 mov x7, #4 //sizeof(float) 1811 mul x3, x3, x7 1812@@ -272,7 +273,6 @@ LoopOcHalf: 1813 st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x0], #64 1814 1815 LoopOcEnd: 1816- sub sp, sp, #128 1817 ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1818 ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1819 ret 1820diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/WinogradTransLeft.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/WinogradTransLeft.S 1821index 4a26b251..243b19de 100644 1822--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/WinogradTransLeft.S 1823+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/WinogradTransLeft.S 1824@@ -30,7 +30,7 @@ asm_function WinogradTransLeft 1825 //x6:length 1826 1827 sub sp, sp, #32 1828-stp x19, x20, [sp], #32 1829+stp x19, x20, [sp] 1830 1831 mov x8, #16 // 4 * sizeof(float) 1832 mul x8, x6, x8 1833@@ -152,7 +152,6 @@ LoopH: 1834 subs x4, x4, #1 1835 bne LoopH 1836 1837- sub sp, sp, #32 1838 ldp x19, x20, [sp], #32 1839 ret 1840 1841diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/WinogradTransRight.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/WinogradTransRight.S 1842index 931fa016..95ee50a5 100644 1843--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/WinogradTransRight.S 1844+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/WinogradTransRight.S 1845@@ -30,7 +30,7 @@ asm_function WinogradTransRight 1846 //x6: length 1847 1848 sub sp, sp, #16 1849-stp x19, x20, [sp], #16 1850+stp x19, x20, [sp] 1851 1852 mov x8, #16 // 4 * sizeof(float) 1853 mul x8, x6, x8 1854@@ -155,7 +155,6 @@ LoopH: 1855 subs x4, x4, #1 1856 bne LoopH 1857 1858- sub sp, sp, #16 1859 ldp x19, x20, [sp], #16 1860 ret 1861 #endif 1862diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/ConvDwFp16Center.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/ConvDwFp16Center.S 1863index 221a1609..56f03dbd 100644 1864--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/ConvDwFp16Center.S 1865+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/ConvDwFp16Center.S 1866@@ -31,21 +31,22 @@ asm_function ConvDwFp16Center 1867 // x19 ~ x29 should be also preserved 1868 // whereas our coding style do not permit such amount of parameters 1869 sub sp, sp, #192 1870- st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1871- st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1872- stp x19, x20, [sp], #16 1873- stp x21, x22, [sp], #16 1874- stp x23, x24, [sp], #16 1875- stp x25, x26, [sp], #16 1876+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp] 1877+ add x9, sp, #64 1878+ st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9] 1879+ stp x19, x20, [sp, #128] 1880+ stp x21, x22, [sp, #144] 1881+ stp x23, x24, [sp, #160] 1882+ stp x25, x26, [sp, #176] 1883 1884- ldr x8, [sp] 1885- ldr x9, [sp, #8] 1886- ldr x10, [sp, #16] 1887- ldr x11, [sp, #24] 1888- ldr x12, [sp, #32] 1889- ldr x13, [sp, #40] 1890- ldr x14, [sp, #48] 1891- ldr x15, [sp, #56] 1892+ ldr x8, [sp, #192] 1893+ ldr x9, [sp, #200] 1894+ ldr x10, [sp, #208] 1895+ ldr x11, [sp, #216] 1896+ ldr x12, [sp, #224] 1897+ ldr x13, [sp, #232] 1898+ ldr x14, [sp, #240] 1899+ ldr x15, [sp, #248] 1900 1901 ld1 {v24.8h}, [x3] 1902 movi v26.8h, #0x46, lsl #8 1903@@ -301,7 +302,6 @@ asm_function ConvDwFp16Center 1904 subs x4, x4, #1 1905 bne LoopH 1906 1907- sub sp, sp, #192 1908 ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1909 ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1910 ldp x19, x20, [sp], #16 1911diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/DeconvDwFp16Center.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/DeconvDwFp16Center.S 1912index 1266b160..bb37a913 100644 1913--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/DeconvDwFp16Center.S 1914+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/DeconvDwFp16Center.S 1915@@ -30,14 +30,14 @@ asm_function DeconvDwFp16Center 1916 // x19 ~ x29 should be also preserved 1917 // whereas our coding style do not permit such amount of parameters 1918 sub sp, sp, #32 1919- stp x19, x20, [sp], #16 1920- stp x21, x22, [sp], #16 1921+ stp x19, x20, [sp] 1922+ stp x21, x22, [sp, #16] 1923 1924- ldr x8, [sp] 1925- ldr x9, [sp, #8] 1926- ldr x10, [sp, #16] 1927- ldr x11, [sp, #24] 1928- ldr x12, [sp, #32] 1929+ ldr x8, [sp, #32] 1930+ ldr x9, [sp, #40] 1931+ ldr x10, [sp, #48] 1932+ ldr x11, [sp, #56] 1933+ ldr x12, [sp, #64] 1934 1935 LoopH: 1936 mov x15, x0 1937@@ -69,7 +69,6 @@ asm_function DeconvDwFp16Center 1938 subs x3, x3, #1 1939 bne LoopH 1940 1941- sub sp, sp, #32 1942 ldp x19, x20, [sp], #16 1943 ldp x21, x22, [sp], #16 1944 ret 1945diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatVecMulFp16.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatVecMulFp16.S 1946index 80a55b75..4f5441a3 100644 1947--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatVecMulFp16.S 1948+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatVecMulFp16.S 1949@@ -30,8 +30,9 @@ 1950 1951 asm_function MatVecMulFp16Neon64 1952 sub sp, sp, #128 1953- st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 1954- st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64 1955+ st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp] 1956+ add x9, sp, #64 1957+ st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x9] 1958 1959 mov w14, #2 // sizeof(float16) 1960 mul w8, w14, w5 // rhs depthx1 block stride 1961@@ -184,7 +185,6 @@ Write1x1: 1962 b Loop 1963 1964 End: 1965- sub sp, sp, #128 1966 ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 1967 ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64 1968 ret 1969diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/Matmul12X16Fp16.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/Matmul12X16Fp16.S 1970index a0e28b74..9f804fd3 100644 1971--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/Matmul12X16Fp16.S 1972+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/Matmul12X16Fp16.S 1973@@ -36,13 +36,14 @@ 1974 1975 asm_function MatMul12x16Fp16Opt 1976 sub sp, sp, #160 1977- st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 1978- st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64 1979- stp x19, x20, [sp], #16 1980- stp x21, x22, [sp], #16 1981+ st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp] 1982+ add x9, sp, #64 1983+ st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x9] 1984+ stp x19, x20, [sp, #128] 1985+ stp x21, x22, [sp, #144] 1986 1987- ldr x8, [sp] 1988- ldr x9, [sp, #8] 1989+ ldr x8, [sp, #160] 1990+ ldr x9, [sp, #168] 1991 1992 .macro CLEAR_OUTPUT_V8_V9 1993 dup v8.4s, wzr 1994@@ -1694,7 +1695,6 @@ LoopColEnd: 1995 subs x6, x6, #12 1996 bgt LoopRowStart 1997 1998- sub sp, sp, #160 1999 ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 2000 ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64 2001 ldp x19, x20, [sp], #16 2002diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulBaseFp16Neon.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulBaseFp16Neon.S 2003index 79fa12bc..31f1adbd 100644 2004--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulBaseFp16Neon.S 2005+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulBaseFp16Neon.S 2006@@ -34,13 +34,14 @@ 2007 2008 asm_function MatmulBaseFp16Neon 2009 sub sp, sp, #160 2010- st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 2011- st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64 2012- stp x19, x20, [sp], #16 2013- stp x21, x22, [sp], #16 2014+ st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp] 2015+ add x9, sp, #64 2016+ st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x9] 2017+ stp x19, x20, [sp, #128] 2018+ stp x21, x22, [sp, #144] 2019 2020- ldr x8, [sp] 2021- ldr x9, [sp, #8] // act 2022+ ldr x8, [sp, #160] 2023+ ldr x9, [sp, #168] // act 2024 add x8, x8, x8 // stride * sizeof(float16_t) 2025 2026 add x16, x7, x7 // col * sizeof(float16_t) 2027@@ -951,7 +952,6 @@ LoopColEnd: 2028 add x0, x0, x15 2029 bgt LoopRowStart 2030 2031- sub sp, sp, #160 2032 ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 2033 ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64 2034 ldp x19, x20, [sp], #16 2035diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulFp16.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulFp16.S 2036index 6bb93f99..1d6b69a6 100644 2037--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulFp16.S 2038+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulFp16.S 2039@@ -34,15 +34,16 @@ 2040 2041 asm_function MatmulFp16Neon64 2042 sub sp, sp, #144 2043- st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 2044- st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64 2045- stp x19, x20, [sp], #16 2046+ st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp] 2047+ add x9, sp, #64 2048+ st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x9] 2049+ stp x19, x20, [sp, #128] 2050 2051 mov w18, #16 // sizeof(float16) * 8 2052 mul w15, w5, w18 // block stride of lhs/rhs: sizeof(float16) * 8 * depth 2053 mov x11, x3 // bias flag 2054 mov x19, #2 2055- ldr x17, [sp] 2056+ ldr x17, [sp, #144] 2057 mul x17, x17, x19 2058 2059 L1: 2060@@ -308,7 +309,7 @@ Relu: 2061 fmax v31.8h, v31.8h, v14.8h 2062 2063 Write: 2064- ldrb w13, [sp, #8] 2065+ ldrb w13, [sp, #152] 2066 cbz w13, WriteC8 2067 cmp w7, #1 2068 beq Write1 2069@@ -877,14 +878,13 @@ End2: 2070 subs w7, w7, #8 // rhs col - 8 2071 add x1, x1, x15 // rhs ptr + stride 2072 add x3, x3, #16 // bias ptr + stride 2073- ldrb w13, [sp, #8] 2074+ ldrb w13, [sp, #152] 2075 cbz w13, NoDstStep 2076 add x2, x2, #16 // dst ptr + stride 2077 NoDstStep: 2078 bgt L1 2079 2080 End1: 2081- sub sp, sp, #144 2082 ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 2083 ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64 2084 ldp x19, x20, [sp], #16 2085diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulFp16Opt.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulFp16Opt.S 2086index 4a111066..21348f80 100644 2087--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulFp16Opt.S 2088+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulFp16Opt.S 2089@@ -34,12 +34,12 @@ 2090 2091 asm_function MatmulFp16Neon64Opt 2092 sub sp, sp, #96 2093- st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 2094- stp x19, x20, [sp], #16 2095- stp x21, x22, [sp], #16 2096+ st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp] 2097+ stp x19, x20, [sp, #64] 2098+ stp x21, x22, [sp, #80] 2099 2100- ldr x8, [sp] 2101- ldr x9, [sp, #8] 2102+ ldr x8, [sp, #96] 2103+ ldr x9, [sp, #104] 2104 2105 mov x21, #32 // sizeof(float16_t) * 16 2106 mul x17, x5, x21 // block stride of lhs/rhs: sizeof(float16_t) * 16 * depth 2107@@ -1178,7 +1178,6 @@ LoopColEnd: 2108 subs x6, x6, #16 2109 bgt LoopRowStart 2110 2111- sub sp, sp, #96 2112 ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 2113 ldp x19, x20, [sp], #16 2114 ldp x21, x22, [sp], #16 2115diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulFp16OptV2.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulFp16OptV2.S 2116index 2d901a3d..40b788c9 100644 2117--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulFp16OptV2.S 2118+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulFp16OptV2.S 2119@@ -34,15 +34,16 @@ 2120 2121 asm_function MatmulFp16OptV2 2122 sub sp, sp, #192 2123- st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 2124- st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64 2125- stp x19, x20, [sp], #16 2126- stp x21, x22, [sp], #16 2127- stp x23, x24, [sp], #16 2128- stp x29, x30, [sp], #16 2129- 2130- ldr x8, [sp] 2131- ldr x9, [sp, #8] // writeMode 2132+ st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp] 2133+ add x9, sp, #64 2134+ st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x9] 2135+ stp x19, x20, [sp, #128] 2136+ stp x21, x22, [sp, #144] 2137+ stp x23, x24, [sp, #160] 2138+ stp x29, x30, [sp, #176] 2139+ 2140+ ldr x8, [sp, #192] 2141+ ldr x9, [sp, #200] // writeMode 2142 lsl x8, x8, #1 // stride * sizeof(float16_t) 2143 2144 lsl x15, x7, #1 // col * sizeof(float16_t) 2145@@ -2955,7 +2956,6 @@ Compute1x4Unit: 2146 ret 2147 2148 End: 2149- sub sp, sp, #192 2150 ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 2151 ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 2152 ldp x19, x20, [sp], #16 2153diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulWinogradFp16.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulWinogradFp16.S 2154index 9ee3c4d5..ca0542da 100644 2155--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulWinogradFp16.S 2156+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulWinogradFp16.S 2157@@ -27,9 +27,9 @@ asm_function MatrixMultiplyWinogradFp16 2158 // x19 ~ x29 should be also preserved 2159 // whereas our coding style do not permit such amount of parameters 2160 sub sp, sp, #48 2161- st1 {v8.8h}, [sp], #16 2162- stp x19, x20, [sp], #16 2163- stp x21, x22, [sp], #16 2164+ st1 {v8.8h}, [sp] 2165+ stp x19, x20, [sp, #16] 2166+ stp x21, x22, [sp, #32] 2167 2168 mov x8, #2 2169 mul x10, x5, x8 // n * 2 2170@@ -210,7 +210,6 @@ asm_function MatrixMultiplyWinogradFp16 2171 b LoopM 2172 2173 EndLoopM: 2174- sub sp, sp, #48 2175 ld1 {v8.8h}, [sp], #16 2176 ldp x19, x20, [sp], #16 2177 ldp x21, x22, [sp], #16 2178diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/TiledC4MatmulFp16.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/TiledC4MatmulFp16.S 2179index d7570d18..5b616ae7 100644 2180--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/TiledC4MatmulFp16.S 2181+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/TiledC4MatmulFp16.S 2182@@ -22,8 +22,9 @@ 2183 asm_function TiledC4MatmulFp16 2184 2185 sub sp, sp, #128 2186-st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 2187-st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 2188+st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp] 2189+add x9, sp, #64 2190+st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9] 2191 2192 mov x7, #2 //sizeof(float) 2193 mul x3, x3, x7 2194@@ -265,7 +266,6 @@ LoopOcHalf: 2195 st1 {v20.4h, v21.4h, v22.4h, v23.4h}, [x0], #32 2196 2197 LoopOcEnd: 2198- sub sp, sp, #128 2199 ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 2200 ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 2201 ret 2202diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/VecMatmulFp16.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/VecMatmulFp16.S 2203index d11dd472..0df891d3 100644 2204--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/VecMatmulFp16.S 2205+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/VecMatmulFp16.S 2206@@ -31,8 +31,9 @@ 2207 2208 asm_function VecMatmulFp16Neon64_2 2209 sub sp, sp, #128 2210- st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 2211- st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64 2212+ st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp] 2213+ add x9, sp, #64 2214+ st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x9] 2215 2216 LoopCol: 2217 mov x15, x0 // reload a ptr 2218@@ -174,7 +175,6 @@ Write7: 2219 b End 2220 2221 End: 2222- sub sp, sp, #128 2223 ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 2224 ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64 2225 ret 2226diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/WinogradTransLeftFp16.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/WinogradTransLeftFp16.S 2227index 1970c16a..c9b4104e 100644 2228--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/WinogradTransLeftFp16.S 2229+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/WinogradTransLeftFp16.S 2230@@ -22,7 +22,7 @@ 2231 asm_function WinogradTransLeftFp16 2232 2233 sub sp, sp, #16 2234-stp x19, x20, [sp], #16 2235+stp x19, x20, [sp] 2236 2237 mov x8, #8 // 4 * sizeof(float16) 2238 mul x8, x6, x8 2239@@ -144,7 +144,6 @@ LoopH: 2240 subs x4, x4, #1 2241 bne LoopH 2242 2243- sub sp, sp, #16 2244 ldp x19, x20, [sp], #16 2245 ret 2246 2247diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/WinogradTransRightFp16.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/WinogradTransRightFp16.S 2248index c575f504..46c3cd84 100644 2249--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/WinogradTransRightFp16.S 2250+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/WinogradTransRightFp16.S 2251@@ -22,7 +22,7 @@ 2252 asm_function WinogradTransRightFp16 2253 2254 sub sp, sp, #16 2255-stp x19, x20, [sp], #16 2256+stp x19, x20, [sp] 2257 2258 mov x8, #8 // 4 * sizeof(float16) 2259 mul x8, x6, x8 2260@@ -147,7 +147,6 @@ LoopH: 2261 subs x4, x4, #1 2262 bne LoopH 2263 2264- sub sp, sp, #16 2265 ldp x19, x20, [sp], #16 2266 2267 ret 2268-- 22692.17.1 2270 2271