1be168c0dSopenharmony_ciFrom c580b97cbfea388ac393f617c4d960021bf11322 Mon Sep 17 00:00:00 2001 2be168c0dSopenharmony_ciFrom: chengfeng27 <chengfeng27@huawei.com> 3be168c0dSopenharmony_ciDate: Mon, 12 Aug 2024 11:42:12 +0800 4be168c0dSopenharmony_ciSubject: [PATCH] fix arm64/fp16 assemble can not protect stack in mutil-thread 5be168c0dSopenharmony_ci switch case 6be168c0dSopenharmony_ci 7be168c0dSopenharmony_ci--- 8be168c0dSopenharmony_ci .../kernel/nnacl/assembly/arm64/AdderFp32.S | 10 ++--- 9be168c0dSopenharmony_ci .../nnacl/assembly/arm64/BigMatmulFp32Opt.S | 22 +++++----- 10be168c0dSopenharmony_ci .../assembly/arm64/ConvDw3x3Fp32Stride1.S | 12 ++--- 11be168c0dSopenharmony_ci .../assembly/arm64/ConvDw3x3Fp32Stride2.S | 12 ++--- 12be168c0dSopenharmony_ci .../nnacl/assembly/arm64/ConvDw3x3Int8.S | 34 +++++++------- 13be168c0dSopenharmony_ci .../assembly/arm64/ConvDw3x3Int8Corner.S | 19 ++++---- 14be168c0dSopenharmony_ci .../assembly/arm64/ConvDw3x3Int8Horizontal.S | 25 +++++------ 15be168c0dSopenharmony_ci .../assembly/arm64/ConvDw3x3Int8Stride2.S | 34 +++++++------- 16be168c0dSopenharmony_ci .../assembly/arm64/ConvDw3x3Int8Vertical.S | 19 ++++---- 17be168c0dSopenharmony_ci .../nnacl/assembly/arm64/ConvDw3x3Line.S | 6 +-- 18be168c0dSopenharmony_ci .../nnacl/assembly/arm64/ConvDwFp32Center.S | 30 ++++++------- 19be168c0dSopenharmony_ci .../assembly/arm64/ConvDwFp32Indirect3x3.S | 7 ++- 20be168c0dSopenharmony_ci .../nnacl/assembly/arm64/ConvDwInt8Center.S | 44 +++++++++---------- 21be168c0dSopenharmony_ci .../nnacl/assembly/arm64/ConvFp32Center.S | 42 +++++++++--------- 22be168c0dSopenharmony_ci .../nnacl/assembly/arm64/ConvSW1x16Kernel.S | 23 +++++----- 23be168c0dSopenharmony_ci .../nnacl/assembly/arm64/ConvSW1x8Kernel.S | 21 +++++---- 24be168c0dSopenharmony_ci .../nnacl/assembly/arm64/ConvSW2x16Kernel.S | 21 +++++---- 25be168c0dSopenharmony_ci .../nnacl/assembly/arm64/ConvSW2x8Kernel.S | 21 +++++---- 26be168c0dSopenharmony_ci .../nnacl/assembly/arm64/ConvSW3x16Kernel.S | 23 +++++----- 27be168c0dSopenharmony_ci .../nnacl/assembly/arm64/ConvSW3x8Kernel.S | 21 +++++---- 28be168c0dSopenharmony_ci .../nnacl/assembly/arm64/ConvSW4x16Kernel.S | 28 ++++++------ 29be168c0dSopenharmony_ci .../nnacl/assembly/arm64/ConvSW4x8Kernel.S | 28 ++++++------ 30be168c0dSopenharmony_ci .../nnacl/assembly/arm64/ConvSW5x16Kernel.S | 28 ++++++------ 31be168c0dSopenharmony_ci .../nnacl/assembly/arm64/ConvSW5x8Kernel.S | 28 ++++++------ 32be168c0dSopenharmony_ci .../nnacl/assembly/arm64/DeconvDwFp32Center.S | 15 +++---- 33be168c0dSopenharmony_ci .../nnacl/assembly/arm64/DeconvDwInt8Center.S | 15 +++---- 34be168c0dSopenharmony_ci .../nnacl/assembly/arm64/MatVecMulFp32.S | 24 +++++----- 35be168c0dSopenharmony_ci .../nnacl/assembly/arm64/MatVecMulPackFp32.S | 15 +++---- 36be168c0dSopenharmony_ci .../kernel/nnacl/assembly/arm64/MatmulFp32.S | 14 +++--- 37be168c0dSopenharmony_ci .../nnacl/assembly/arm64/MatmulFp32Opt.S | 16 +++---- 38be168c0dSopenharmony_ci .../nnacl/assembly/arm64/MatmulFp32OptRow12.S | 14 +++--- 39be168c0dSopenharmony_ci .../nnacl/assembly/arm64/MatmulFp32OptRow4.S | 16 +++---- 40be168c0dSopenharmony_ci .../nnacl/assembly/arm64/MatmulFp32OptRow8.S | 14 +++--- 41be168c0dSopenharmony_ci .../kernel/nnacl/assembly/arm64/MatmulInt8.S | 38 ++++++++-------- 42be168c0dSopenharmony_ci .../nnacl/assembly/arm64/MatmulInt8Opt.S | 44 +++++++++---------- 43be168c0dSopenharmony_ci .../nnacl/assembly/arm64/MatmulR4Int8.S | 8 ++-- 44be168c0dSopenharmony_ci .../nnacl/assembly/arm64/MatmulWinogradFp32.S | 7 ++- 45be168c0dSopenharmony_ci .../nnacl/assembly/arm64/PostFuncBiasReluC8.S | 6 +-- 46be168c0dSopenharmony_ci .../assembly/arm64/PostFuncInt8C4Neon64.S | 15 +++---- 47be168c0dSopenharmony_ci .../kernel/nnacl/assembly/arm64/SPMM8x8Fp32.S | 8 ++-- 48be168c0dSopenharmony_ci .../nnacl/assembly/arm64/TiledC4MatmulFp32.S | 6 +-- 49be168c0dSopenharmony_ci .../nnacl/assembly/arm64/WinogradTransLeft.S | 3 +- 50be168c0dSopenharmony_ci .../nnacl/assembly/arm64/WinogradTransRight.S | 3 +- 51be168c0dSopenharmony_ci .../nnacl/assembly/fp16/ConvDwFp16Center.S | 30 ++++++------- 52be168c0dSopenharmony_ci .../nnacl/assembly/fp16/DeconvDwFp16Center.S | 15 +++---- 53be168c0dSopenharmony_ci .../nnacl/assembly/fp16/MatVecMulFp16.S | 6 +-- 54be168c0dSopenharmony_ci .../nnacl/assembly/fp16/Matmul12X16Fp16.S | 14 +++--- 55be168c0dSopenharmony_ci .../nnacl/assembly/fp16/MatmulBaseFp16Neon.S | 14 +++--- 56be168c0dSopenharmony_ci .../kernel/nnacl/assembly/fp16/MatmulFp16.S | 14 +++--- 57be168c0dSopenharmony_ci .../nnacl/assembly/fp16/MatmulFp16Opt.S | 11 +++-- 58be168c0dSopenharmony_ci .../nnacl/assembly/fp16/MatmulFp16OptV2.S | 20 ++++----- 59be168c0dSopenharmony_ci .../nnacl/assembly/fp16/MatmulWinogradFp16.S | 7 ++- 60be168c0dSopenharmony_ci .../nnacl/assembly/fp16/TiledC4MatmulFp16.S | 6 +-- 61be168c0dSopenharmony_ci .../nnacl/assembly/fp16/VecMatmulFp16.S | 6 +-- 62be168c0dSopenharmony_ci .../assembly/fp16/WinogradTransLeftFp16.S | 3 +- 63be168c0dSopenharmony_ci .../assembly/fp16/WinogradTransRightFp16.S | 3 +- 64be168c0dSopenharmony_ci 56 files changed, 483 insertions(+), 505 deletions(-) 65be168c0dSopenharmony_ci 66be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/AdderFp32.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/AdderFp32.S 67be168c0dSopenharmony_ciindex 66136f42..9123d88c 100644 68be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/AdderFp32.S 69be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/AdderFp32.S 70be168c0dSopenharmony_ci@@ -34,11 +34,12 @@ 71be168c0dSopenharmony_ci 72be168c0dSopenharmony_ci asm_function AdderFloatNeon64 73be168c0dSopenharmony_ci sub sp, sp, #144 74be168c0dSopenharmony_ci- st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 75be168c0dSopenharmony_ci- st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 76be168c0dSopenharmony_ci- stp x19, x20, [sp], #16 77be168c0dSopenharmony_ci+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp] 78be168c0dSopenharmony_ci+ add x9, sp, #64 79be168c0dSopenharmony_ci+ st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9] 80be168c0dSopenharmony_ci+ stp x19, x20, [sp, #128] 81be168c0dSopenharmony_ci 82be168c0dSopenharmony_ci- ldr x8, [sp] 83be168c0dSopenharmony_ci+ ldr x8, [sp, #144] 84be168c0dSopenharmony_ci 85be168c0dSopenharmony_ci mov x20, #48 // sizeof(float) * 12 86be168c0dSopenharmony_ci mul x17, x5, x20 // block stride of lhs/rhs: sizeof(float) * 12 * depth 87be168c0dSopenharmony_ci@@ -614,7 +615,6 @@ LoopColEnd: 88be168c0dSopenharmony_ci subs x6, x6, #12 89be168c0dSopenharmony_ci bgt LoopRowStart 90be168c0dSopenharmony_ci 91be168c0dSopenharmony_ci- sub sp, sp, #144 92be168c0dSopenharmony_ci ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 93be168c0dSopenharmony_ci ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 94be168c0dSopenharmony_ci ldp x19, x20, [sp], #16 95be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/BigMatmulFp32Opt.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/BigMatmulFp32Opt.S 96be168c0dSopenharmony_ciindex 498038ff..03898585 100644 97be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/BigMatmulFp32Opt.S 98be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/BigMatmulFp32Opt.S 99be168c0dSopenharmony_ci@@ -33,16 +33,17 @@ 100be168c0dSopenharmony_ci 101be168c0dSopenharmony_ci asm_function BigMatmulFloatNeon64Opt 102be168c0dSopenharmony_ci sub sp, sp, #224 103be168c0dSopenharmony_ci- st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 104be168c0dSopenharmony_ci- st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 105be168c0dSopenharmony_ci- stp x19, x20, [sp], #16 106be168c0dSopenharmony_ci- stp x21, x22, [sp], #16 107be168c0dSopenharmony_ci- stp x23, x24, [sp], #16 108be168c0dSopenharmony_ci- stp x25, x26, [sp], #16 109be168c0dSopenharmony_ci- stp x27, x28, [sp], #16 110be168c0dSopenharmony_ci- stp x29, x30, [sp], #16 111be168c0dSopenharmony_ci- 112be168c0dSopenharmony_ci- ldr x8, [sp] 113be168c0dSopenharmony_ci+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp] 114be168c0dSopenharmony_ci+ add x9, sp, #64 115be168c0dSopenharmony_ci+ st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9] 116be168c0dSopenharmony_ci+ stp x19, x20, [sp, #128] 117be168c0dSopenharmony_ci+ stp x21, x22, [sp, #144] 118be168c0dSopenharmony_ci+ stp x23, x24, [sp, #160] 119be168c0dSopenharmony_ci+ stp x25, x26, [sp, #176] 120be168c0dSopenharmony_ci+ stp x27, x28, [sp, #192] 121be168c0dSopenharmony_ci+ stp x29, x30, [sp, #208] 122be168c0dSopenharmony_ci+ 123be168c0dSopenharmony_ci+ ldr x8, [sp, #224] 124be168c0dSopenharmony_ci mov x20, #1 125be168c0dSopenharmony_ci mov x22, #32 126be168c0dSopenharmony_ci mov x23, #48 127be168c0dSopenharmony_ci@@ -2515,7 +2516,6 @@ Compute4x4Unit: 128be168c0dSopenharmony_ci ret 129be168c0dSopenharmony_ci 130be168c0dSopenharmony_ci End: 131be168c0dSopenharmony_ci- sub sp, sp, #224 132be168c0dSopenharmony_ci ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 133be168c0dSopenharmony_ci ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 134be168c0dSopenharmony_ci ldp x19, x20, [sp], #16 135be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Fp32Stride1.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Fp32Stride1.S 136be168c0dSopenharmony_ciindex f04d9082..b96efd64 100644 137be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Fp32Stride1.S 138be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Fp32Stride1.S 139be168c0dSopenharmony_ci@@ -36,12 +36,13 @@ 140be168c0dSopenharmony_ci 141be168c0dSopenharmony_ci asm_function ConvDw3x3Stride1 142be168c0dSopenharmony_ci sub sp, sp, #128 143be168c0dSopenharmony_ci- st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 144be168c0dSopenharmony_ci- st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 145be168c0dSopenharmony_ci+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp] 146be168c0dSopenharmony_ci+ add x9, sp, #64 147be168c0dSopenharmony_ci+ st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9] 148be168c0dSopenharmony_ci 149be168c0dSopenharmony_ci- ldr w8, [sp] 150be168c0dSopenharmony_ci- ldr w9, [sp, #8] 151be168c0dSopenharmony_ci- ldr w10, [sp, #16] 152be168c0dSopenharmony_ci+ ldr w8, [sp, #128] 153be168c0dSopenharmony_ci+ ldr w9, [sp, #136] 154be168c0dSopenharmony_ci+ ldr w10, [sp, #144] 155be168c0dSopenharmony_ci 156be168c0dSopenharmony_ci mov w11, #4 157be168c0dSopenharmony_ci mul w15, w4, w11 // col_size * 4 158be168c0dSopenharmony_ci@@ -203,7 +204,6 @@ WIDTH1_LEFT: 159be168c0dSopenharmony_ci st1 {v21.4s}, [x0] 160be168c0dSopenharmony_ci 161be168c0dSopenharmony_ci End: 162be168c0dSopenharmony_ci- sub sp, sp, #128 163be168c0dSopenharmony_ci ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 164be168c0dSopenharmony_ci ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 165be168c0dSopenharmony_ci ret 166be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Fp32Stride2.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Fp32Stride2.S 167be168c0dSopenharmony_ciindex 0dd075dd..7632d48e 100644 168be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Fp32Stride2.S 169be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Fp32Stride2.S 170be168c0dSopenharmony_ci@@ -36,12 +36,13 @@ 171be168c0dSopenharmony_ci 172be168c0dSopenharmony_ci asm_function ConvDw3x3Stride2 173be168c0dSopenharmony_ci sub sp, sp, #128 174be168c0dSopenharmony_ci- st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 175be168c0dSopenharmony_ci- st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 176be168c0dSopenharmony_ci+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp] 177be168c0dSopenharmony_ci+ add x9, sp, #64 178be168c0dSopenharmony_ci+ st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9] 179be168c0dSopenharmony_ci 180be168c0dSopenharmony_ci- ldr w8, [sp] 181be168c0dSopenharmony_ci- ldr w9, [sp, #8] 182be168c0dSopenharmony_ci- ldr w10, [sp, #16] 183be168c0dSopenharmony_ci+ ldr w8, [sp, #128] 184be168c0dSopenharmony_ci+ ldr w9, [sp, #136] 185be168c0dSopenharmony_ci+ ldr w10, [sp, #144] 186be168c0dSopenharmony_ci 187be168c0dSopenharmony_ci mov w11, #4 188be168c0dSopenharmony_ci mul w15, w4, w11 // col_size * 4 189be168c0dSopenharmony_ci@@ -205,7 +206,6 @@ WIDTH1_LEFT: 190be168c0dSopenharmony_ci st1 {v24.4s}, [x0] 191be168c0dSopenharmony_ci 192be168c0dSopenharmony_ci End: 193be168c0dSopenharmony_ci- sub sp, sp, #128 194be168c0dSopenharmony_ci ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 195be168c0dSopenharmony_ci ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 196be168c0dSopenharmony_ci ret 197be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8.S 198be168c0dSopenharmony_ciindex bfb9b8f6..5187d368 100644 199be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8.S 200be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8.S 201be168c0dSopenharmony_ci@@ -44,22 +44,23 @@ 202be168c0dSopenharmony_ci 203be168c0dSopenharmony_ci asm_function ConvDw3x3Int8Neon64 204be168c0dSopenharmony_ci sub sp, sp, #192 205be168c0dSopenharmony_ci- st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 206be168c0dSopenharmony_ci- st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 207be168c0dSopenharmony_ci- stp x19, x20, [sp], #16 208be168c0dSopenharmony_ci- stp x21, x22, [sp], #16 209be168c0dSopenharmony_ci- stp x23, x24, [sp], #16 210be168c0dSopenharmony_ci- stp x25, x26, [sp], #16 211be168c0dSopenharmony_ci- 212be168c0dSopenharmony_ci- ldr x8, [sp] 213be168c0dSopenharmony_ci- ldr x9, [sp, #8] 214be168c0dSopenharmony_ci- ldr x10, [sp, #16] 215be168c0dSopenharmony_ci- ldr x11, [sp, #24] 216be168c0dSopenharmony_ci- ldr x12, [sp, #32] 217be168c0dSopenharmony_ci- ldr x13, [sp, #40] 218be168c0dSopenharmony_ci- ldr x14, [sp, #48] 219be168c0dSopenharmony_ci- ldr x15, [sp, #56] 220be168c0dSopenharmony_ci- ldr x23, [sp, #64] // per_channel 221be168c0dSopenharmony_ci+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp] 222be168c0dSopenharmony_ci+ add x9, sp, #64 223be168c0dSopenharmony_ci+ st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9] 224be168c0dSopenharmony_ci+ stp x19, x20, [sp, #128] 225be168c0dSopenharmony_ci+ stp x21, x22, [sp, #144] 226be168c0dSopenharmony_ci+ stp x23, x24, [sp, #160] 227be168c0dSopenharmony_ci+ stp x25, x26, [sp, #176] 228be168c0dSopenharmony_ci+ 229be168c0dSopenharmony_ci+ ldr x8, [sp, #192] 230be168c0dSopenharmony_ci+ ldr x9, [sp, #200] 231be168c0dSopenharmony_ci+ ldr x10, [sp, #208] 232be168c0dSopenharmony_ci+ ldr x11, [sp, #216] 233be168c0dSopenharmony_ci+ ldr x12, [sp, #224] 234be168c0dSopenharmony_ci+ ldr x13, [sp, #232] 235be168c0dSopenharmony_ci+ ldr x14, [sp, #240] 236be168c0dSopenharmony_ci+ ldr x15, [sp, #248] 237be168c0dSopenharmony_ci+ ldr x23, [sp, #256] // per_channel 238be168c0dSopenharmony_ci 239be168c0dSopenharmony_ci add x19, x3, #16 240be168c0dSopenharmony_ci add w20, w6, w6 // channel * 2 241be168c0dSopenharmony_ci@@ -488,7 +489,6 @@ OUTZP3: 242be168c0dSopenharmony_ci st1 {v21.8b}, [x0], x6 243be168c0dSopenharmony_ci 244be168c0dSopenharmony_ci End: 245be168c0dSopenharmony_ci- sub sp, sp, #192 246be168c0dSopenharmony_ci ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 247be168c0dSopenharmony_ci ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 248be168c0dSopenharmony_ci ldp x19, x20, [sp], #16 249be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Corner.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Corner.S 250be168c0dSopenharmony_ciindex b07ac01b..416e1a3a 100644 251be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Corner.S 252be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Corner.S 253be168c0dSopenharmony_ci@@ -32,20 +32,20 @@ asm_function ConvDw3x3Int8Corner 254be168c0dSopenharmony_ci // x19 ~ x29 should be also preserved 255be168c0dSopenharmony_ci // whereas our coding style do not permit such amount of parameters 256be168c0dSopenharmony_ci sub sp, sp, #32 257be168c0dSopenharmony_ci- stp x19, x20, [sp], #16 258be168c0dSopenharmony_ci- stp x21, x22, [sp], #16 259be168c0dSopenharmony_ci+ stp x19, x20, [sp] 260be168c0dSopenharmony_ci+ stp x21, x22, [sp, #16] 261be168c0dSopenharmony_ci 262be168c0dSopenharmony_ci dup v25.8b, w7 // in_zp 263be168c0dSopenharmony_ci- ldr x8, [sp] 264be168c0dSopenharmony_ci+ ldr x8, [sp, #32] 265be168c0dSopenharmony_ci dup v26.4s, w8 // out_zp 266be168c0dSopenharmony_ci- ldr x9, [sp, #8] // out_multiplier 267be168c0dSopenharmony_ci- ldr x10, [sp, #16] // left_shift 268be168c0dSopenharmony_ci- ldr x11, [sp, #24] // right_shift 269be168c0dSopenharmony_ci- ldr x12, [sp, #32] 270be168c0dSopenharmony_ci+ ldr x9, [sp, #40] // out_multiplier 271be168c0dSopenharmony_ci+ ldr x10, [sp, #48] // left_shift 272be168c0dSopenharmony_ci+ ldr x11, [sp, #56] // right_shift 273be168c0dSopenharmony_ci+ ldr x12, [sp, #64] 274be168c0dSopenharmony_ci dup v30.4s, w12 // acc_min 275be168c0dSopenharmony_ci- ldr x13, [sp, #40] 276be168c0dSopenharmony_ci+ ldr x13, [sp, #72] 277be168c0dSopenharmony_ci dup v31.4s, w13 // acc_max 278be168c0dSopenharmony_ci- ldr x14, [sp, #48] // per_channel 279be168c0dSopenharmony_ci+ ldr x14, [sp, #80] // per_channel 280be168c0dSopenharmony_ci cbnz x14, PerChannelDump 281be168c0dSopenharmony_ci PerLayerDump: 282be168c0dSopenharmony_ci ld1r {v27.4s}, [x9] 283be168c0dSopenharmony_ci@@ -216,7 +216,6 @@ asm_function ConvDw3x3Int8Corner 284be168c0dSopenharmony_ci st1 {v23.s}[0], [x0], #4 285be168c0dSopenharmony_ci st1 {v24.s}[0], [x0], #4 286be168c0dSopenharmony_ci 287be168c0dSopenharmony_ci- sub sp, sp, #32 288be168c0dSopenharmony_ci ldp x19, x20, [sp], #16 289be168c0dSopenharmony_ci ldp x21, x22, [sp], #16 290be168c0dSopenharmony_ci ret 291be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Horizontal.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Horizontal.S 292be168c0dSopenharmony_ciindex 92eeffea..379154e6 100644 293be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Horizontal.S 294be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Horizontal.S 295be168c0dSopenharmony_ci@@ -32,21 +32,21 @@ asm_function ConvDw3x3Int8Horizontal 296be168c0dSopenharmony_ci // x19 ~ x29 should be also preserved 297be168c0dSopenharmony_ci // whereas our coding style do not permit such amount of parameters 298be168c0dSopenharmony_ci sub sp, sp, #48 299be168c0dSopenharmony_ci- stp x19, x20, [sp], #16 300be168c0dSopenharmony_ci- stp x21, x22, [sp], #16 301be168c0dSopenharmony_ci- stp x23, x24, [sp], #16 302be168c0dSopenharmony_ci+ stp x19, x20, [sp] 303be168c0dSopenharmony_ci+ stp x21, x22, [sp, #16] 304be168c0dSopenharmony_ci+ stp x23, x24, [sp, #32] 305be168c0dSopenharmony_ci 306be168c0dSopenharmony_ci dup v25.8b, w7 // in_zp 307be168c0dSopenharmony_ci- ldr x8, [sp] 308be168c0dSopenharmony_ci+ ldr x8, [sp, #48] 309be168c0dSopenharmony_ci dup v26.4s, w8 // out_zp 310be168c0dSopenharmony_ci- ldr x9, [sp, #8] // out_multiplier 311be168c0dSopenharmony_ci- ldr x10, [sp, #16] // left_shift 312be168c0dSopenharmony_ci- ldr x11, [sp, #24] // right_shift 313be168c0dSopenharmony_ci- ldr x12, [sp, #32] 314be168c0dSopenharmony_ci+ ldr x9, [sp, #56] // out_multiplier 315be168c0dSopenharmony_ci+ ldr x10, [sp, #64] // left_shift 316be168c0dSopenharmony_ci+ ldr x11, [sp, #72] // right_shift 317be168c0dSopenharmony_ci+ ldr x12, [sp, #80] 318be168c0dSopenharmony_ci dup v30.4s, w12 // acc_min 319be168c0dSopenharmony_ci- ldr x13, [sp, #40] 320be168c0dSopenharmony_ci+ ldr x13, [sp, #88] 321be168c0dSopenharmony_ci dup v31.4s, w13 // acc_max 322be168c0dSopenharmony_ci- ldr x14, [sp, #48] // per_channel 323be168c0dSopenharmony_ci+ ldr x14, [sp, #96] // per_channel 324be168c0dSopenharmony_ci cbnz x14, PerChannelDump 325be168c0dSopenharmony_ci PerLayerDump: 326be168c0dSopenharmony_ci ld1r {v27.4s}, [x9] 327be168c0dSopenharmony_ci@@ -58,9 +58,9 @@ asm_function ConvDw3x3Int8Horizontal 328be168c0dSopenharmony_ci ld1 {v28.4s}, [x10], #16 329be168c0dSopenharmony_ci ld1 {v29.4s}, [x11], #16 330be168c0dSopenharmony_ci ContinueFunc: 331be168c0dSopenharmony_ci- ldr x12, [sp, #32] 332be168c0dSopenharmony_ci+ ldr x12, [sp, #80] 333be168c0dSopenharmony_ci dup v30.4s, w12 // acc_min 334be168c0dSopenharmony_ci- ldr x13, [sp, #40] 335be168c0dSopenharmony_ci+ ldr x13, [sp, #88] 336be168c0dSopenharmony_ci dup v31.4s, w13 // acc_max 337be168c0dSopenharmony_ci 338be168c0dSopenharmony_ci mov x12, #2 339be168c0dSopenharmony_ci@@ -248,7 +248,6 @@ asm_function ConvDw3x3Int8Horizontal 340be168c0dSopenharmony_ci 341be168c0dSopenharmony_ci st1 {v23.s}[0], [x0], #4 342be168c0dSopenharmony_ci st1 {v24.s}[0], [x0], #4 343be168c0dSopenharmony_ci- sub sp, sp, #48 344be168c0dSopenharmony_ci ldp x19, x20, [sp], #16 345be168c0dSopenharmony_ci ldp x21, x22, [sp], #16 346be168c0dSopenharmony_ci ldp x23, x24, [sp], #16 347be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Stride2.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Stride2.S 348be168c0dSopenharmony_ciindex cc1b3e9b..8643a536 100644 349be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Stride2.S 350be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Stride2.S 351be168c0dSopenharmony_ci@@ -44,22 +44,23 @@ 352be168c0dSopenharmony_ci 353be168c0dSopenharmony_ci asm_function ConvDw3x3Int8Stride2 354be168c0dSopenharmony_ci sub sp, sp, #192 355be168c0dSopenharmony_ci- st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 356be168c0dSopenharmony_ci- st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 357be168c0dSopenharmony_ci- stp x19, x20, [sp], #16 358be168c0dSopenharmony_ci- stp x21, x22, [sp], #16 359be168c0dSopenharmony_ci- stp x23, x24, [sp], #16 360be168c0dSopenharmony_ci- stp x25, x26, [sp], #16 361be168c0dSopenharmony_ci- 362be168c0dSopenharmony_ci- ldr x8, [sp] 363be168c0dSopenharmony_ci- ldr x9, [sp, #8] 364be168c0dSopenharmony_ci- ldr x10, [sp, #16] 365be168c0dSopenharmony_ci- ldr x11, [sp, #24] 366be168c0dSopenharmony_ci- ldr x12, [sp, #32] 367be168c0dSopenharmony_ci- ldr x13, [sp, #40] 368be168c0dSopenharmony_ci- ldr x14, [sp, #48] 369be168c0dSopenharmony_ci- ldr x15, [sp, #56] 370be168c0dSopenharmony_ci- ldr x23, [sp, #64] // per_channel 371be168c0dSopenharmony_ci+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp] 372be168c0dSopenharmony_ci+ add x9, sp, #64 373be168c0dSopenharmony_ci+ st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9] 374be168c0dSopenharmony_ci+ stp x19, x20, [sp, #128] 375be168c0dSopenharmony_ci+ stp x21, x22, [sp, #144] 376be168c0dSopenharmony_ci+ stp x23, x24, [sp, #160] 377be168c0dSopenharmony_ci+ stp x25, x26, [sp, #176] 378be168c0dSopenharmony_ci+ 379be168c0dSopenharmony_ci+ ldr x8, [sp, #192] 380be168c0dSopenharmony_ci+ ldr x9, [sp, #200] 381be168c0dSopenharmony_ci+ ldr x10, [sp, #208] 382be168c0dSopenharmony_ci+ ldr x11, [sp, #216] 383be168c0dSopenharmony_ci+ ldr x12, [sp, #224] 384be168c0dSopenharmony_ci+ ldr x13, [sp, #232] 385be168c0dSopenharmony_ci+ ldr x14, [sp, #240] 386be168c0dSopenharmony_ci+ ldr x15, [sp, #248] 387be168c0dSopenharmony_ci+ ldr x23, [sp, #256] // per_channel 388be168c0dSopenharmony_ci 389be168c0dSopenharmony_ci add x19, x3, #16 390be168c0dSopenharmony_ci add w20, w6, w6 // channel * 2 391be168c0dSopenharmony_ci@@ -463,7 +464,6 @@ OUTZP3: 392be168c0dSopenharmony_ci st1 {v24.8b}, [x0], x6 393be168c0dSopenharmony_ci 394be168c0dSopenharmony_ci End: 395be168c0dSopenharmony_ci- sub sp, sp, #192 396be168c0dSopenharmony_ci ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 397be168c0dSopenharmony_ci ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 398be168c0dSopenharmony_ci ldp x19, x20, [sp], #16 399be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Vertical.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Vertical.S 400be168c0dSopenharmony_ciindex 67151534..706bc9fe 100644 401be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Vertical.S 402be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Int8Vertical.S 403be168c0dSopenharmony_ci@@ -32,20 +32,20 @@ asm_function ConvDw3x3Int8Vertical 404be168c0dSopenharmony_ci // x19 ~ x29 should be also preserved 405be168c0dSopenharmony_ci // whereas our coding style do not permit such amount of parameters 406be168c0dSopenharmony_ci sub sp, sp, #32 407be168c0dSopenharmony_ci- stp x19, x20, [sp], #16 408be168c0dSopenharmony_ci- stp x21, x22, [sp], #16 409be168c0dSopenharmony_ci+ stp x19, x20, [sp] 410be168c0dSopenharmony_ci+ stp x21, x22, [sp, #16] 411be168c0dSopenharmony_ci 412be168c0dSopenharmony_ci dup v25.8b, w7 // in_zp 413be168c0dSopenharmony_ci- ldr x8, [sp] 414be168c0dSopenharmony_ci+ ldr x8, [sp, #32] 415be168c0dSopenharmony_ci dup v26.4s, w8 // out_zp 416be168c0dSopenharmony_ci- ldr x9, [sp, #8] // out_multiplier 417be168c0dSopenharmony_ci- ldr x10, [sp, #16] // left_shift 418be168c0dSopenharmony_ci- ldr x11, [sp, #24] // right_shift 419be168c0dSopenharmony_ci- ldr x12, [sp, #32] 420be168c0dSopenharmony_ci+ ldr x9, [sp, #40] // out_multiplier 421be168c0dSopenharmony_ci+ ldr x10, [sp, #48] // left_shift 422be168c0dSopenharmony_ci+ ldr x11, [sp, #56] // right_shift 423be168c0dSopenharmony_ci+ ldr x12, [sp, #64] 424be168c0dSopenharmony_ci dup v30.4s, w12 // acc_min 425be168c0dSopenharmony_ci- ldr x13, [sp, #40] 426be168c0dSopenharmony_ci+ ldr x13, [sp, #72] 427be168c0dSopenharmony_ci dup v31.4s, w13 // acc_max 428be168c0dSopenharmony_ci- ldr x14, [sp, #48] // per_channel 429be168c0dSopenharmony_ci+ ldr x14, [sp, #80] // per_channel 430be168c0dSopenharmony_ci cbnz x14, PerChannelDump 431be168c0dSopenharmony_ci PerLayerDump: 432be168c0dSopenharmony_ci ld1r {v27.4s}, [x9] 433be168c0dSopenharmony_ci@@ -239,7 +239,6 @@ asm_function ConvDw3x3Int8Vertical 434be168c0dSopenharmony_ci 435be168c0dSopenharmony_ci st1 {v23.s}[0], [x0], #4 436be168c0dSopenharmony_ci st1 {v24.s}[0], [x0], #4 437be168c0dSopenharmony_ci- sub sp, sp, #32 438be168c0dSopenharmony_ci ldp x19, x20, [sp], #16 439be168c0dSopenharmony_ci ldp x21, x22, [sp], #16 440be168c0dSopenharmony_ci ret 441be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Line.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Line.S 442be168c0dSopenharmony_ciindex 6157848e..f939ec62 100644 443be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Line.S 444be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDw3x3Line.S 445be168c0dSopenharmony_ci@@ -29,8 +29,9 @@ asm_function ConvDw3x3Line 446be168c0dSopenharmony_ci // x19 ~ x29 should be also preserved 447be168c0dSopenharmony_ci // whereas our coding style do not permit such amount of parameters 448be168c0dSopenharmony_ci sub sp, sp, #128 449be168c0dSopenharmony_ci- st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 450be168c0dSopenharmony_ci- st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 451be168c0dSopenharmony_ci+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp] 452be168c0dSopenharmony_ci+ add x9, sp, #64 453be168c0dSopenharmony_ci+ st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9] 454be168c0dSopenharmony_ci 455be168c0dSopenharmony_ci ldr x8, [x1] 456be168c0dSopenharmony_ci ldr x9, [x1, #8] 457be168c0dSopenharmony_ci@@ -196,7 +197,6 @@ asm_function ConvDw3x3Line 458be168c0dSopenharmony_ci add x0, x0, #16 459be168c0dSopenharmony_ci bgt LoopC4 460be168c0dSopenharmony_ci 461be168c0dSopenharmony_ci- sub sp, sp, #128 462be168c0dSopenharmony_ci ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 463be168c0dSopenharmony_ci ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 464be168c0dSopenharmony_ci ret 465be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDwFp32Center.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDwFp32Center.S 466be168c0dSopenharmony_ciindex e9ddd65a..6f30c3ac 100644 467be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDwFp32Center.S 468be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDwFp32Center.S 469be168c0dSopenharmony_ci@@ -31,21 +31,22 @@ asm_function ConvDwFp32Center 470be168c0dSopenharmony_ci // x19 ~ x29 should be also preserved 471be168c0dSopenharmony_ci // whereas our coding style do not permit such amount of parameters 472be168c0dSopenharmony_ci sub sp, sp, #192 473be168c0dSopenharmony_ci- st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 474be168c0dSopenharmony_ci- st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 475be168c0dSopenharmony_ci- stp x19, x20, [sp], #16 476be168c0dSopenharmony_ci- stp x21, x22, [sp], #16 477be168c0dSopenharmony_ci- stp x23, x24, [sp], #16 478be168c0dSopenharmony_ci- stp x25, x26, [sp], #16 479be168c0dSopenharmony_ci+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp] 480be168c0dSopenharmony_ci+ add x9, sp, #64 481be168c0dSopenharmony_ci+ st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9] 482be168c0dSopenharmony_ci+ stp x19, x20, [sp, #128] 483be168c0dSopenharmony_ci+ stp x21, x22, [sp, #144] 484be168c0dSopenharmony_ci+ stp x23, x24, [sp, #160] 485be168c0dSopenharmony_ci+ stp x25, x26, [sp, #176] 486be168c0dSopenharmony_ci 487be168c0dSopenharmony_ci- ldr x8, [sp] 488be168c0dSopenharmony_ci- ldr x9, [sp, #8] 489be168c0dSopenharmony_ci- ldr x10, [sp, #16] 490be168c0dSopenharmony_ci- ldr x11, [sp, #24] 491be168c0dSopenharmony_ci- ldr x12, [sp, #32] 492be168c0dSopenharmony_ci- ldr x13, [sp, #40] 493be168c0dSopenharmony_ci- ldr x14, [sp, #48] 494be168c0dSopenharmony_ci- ldr x15, [sp, #56] 495be168c0dSopenharmony_ci+ ldr x8, [sp, #192] 496be168c0dSopenharmony_ci+ ldr x9, [sp, #200] 497be168c0dSopenharmony_ci+ ldr x10, [sp, #208] 498be168c0dSopenharmony_ci+ ldr x11, [sp, #216] 499be168c0dSopenharmony_ci+ ldr x12, [sp, #224] 500be168c0dSopenharmony_ci+ ldr x13, [sp, #232] 501be168c0dSopenharmony_ci+ ldr x14, [sp, #240] 502be168c0dSopenharmony_ci+ ldr x15, [sp, #248] 503be168c0dSopenharmony_ci 504be168c0dSopenharmony_ci ld1 {v24.4s}, [x3] 505be168c0dSopenharmony_ci movi v26.4s, #6 506be168c0dSopenharmony_ci@@ -302,7 +303,6 @@ asm_function ConvDwFp32Center 507be168c0dSopenharmony_ci subs x4, x4, #1 508be168c0dSopenharmony_ci bne LoopH 509be168c0dSopenharmony_ci 510be168c0dSopenharmony_ci- sub sp, sp, #192 511be168c0dSopenharmony_ci ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 512be168c0dSopenharmony_ci ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 513be168c0dSopenharmony_ci ldp x19, x20, [sp], #16 514be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDwFp32Indirect3x3.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDwFp32Indirect3x3.S 515be168c0dSopenharmony_ciindex 34cc9037..ca93dc7d 100644 516be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDwFp32Indirect3x3.S 517be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDwFp32Indirect3x3.S 518be168c0dSopenharmony_ci@@ -25,14 +25,14 @@ 519be168c0dSopenharmony_ci 520be168c0dSopenharmony_ci asm_function ConvDwFp32Indirect3x3 521be168c0dSopenharmony_ci sub sp, sp, #32 522be168c0dSopenharmony_ci- stp x19, x20, [sp], #16 523be168c0dSopenharmony_ci- stp x21, x22, [sp], #16 524be168c0dSopenharmony_ci+ stp x19, x20, [sp] 525be168c0dSopenharmony_ci+ stp x21, x22, [sp, #16] 526be168c0dSopenharmony_ci 527be168c0dSopenharmony_ci movi v31.4s, #6 528be168c0dSopenharmony_ci scvtf v31.4s, v31.4s 529be168c0dSopenharmony_ci dup v30.4s, wzr 530be168c0dSopenharmony_ci 531be168c0dSopenharmony_ci- ldr x8, [sp] 532be168c0dSopenharmony_ci+ ldr x8, [sp, #32] 533be168c0dSopenharmony_ci cmp x5, #0 534be168c0dSopenharmony_ci beq End 535be168c0dSopenharmony_ci 536be168c0dSopenharmony_ci@@ -153,7 +153,6 @@ asm_function ConvDwFp32Indirect3x3 537be168c0dSopenharmony_ci cmp x5, #0 538be168c0dSopenharmony_ci bgt LoopPixel 539be168c0dSopenharmony_ci End: 540be168c0dSopenharmony_ci- sub sp, sp, #32 541be168c0dSopenharmony_ci ldp x19, x20, [sp], #16 542be168c0dSopenharmony_ci ldp x21, x22, [sp], #16 543be168c0dSopenharmony_ci ret 544be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDwInt8Center.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDwInt8Center.S 545be168c0dSopenharmony_ciindex 7ed94e6b..328250f3 100644 546be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDwInt8Center.S 547be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvDwInt8Center.S 548be168c0dSopenharmony_ci@@ -34,44 +34,45 @@ asm_function ConvDwInt8Center 549be168c0dSopenharmony_ci // x19 ~ x29 should be also preserved 550be168c0dSopenharmony_ci // whereas our coding style do not permit such amount of parameters 551be168c0dSopenharmony_ci sub sp, sp, #192 552be168c0dSopenharmony_ci- st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 553be168c0dSopenharmony_ci- st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 554be168c0dSopenharmony_ci- stp x19, x20, [sp], #16 555be168c0dSopenharmony_ci- stp x21, x22, [sp], #16 556be168c0dSopenharmony_ci- stp x23, x24, [sp], #16 557be168c0dSopenharmony_ci- stp x25, x26, [sp], #16 558be168c0dSopenharmony_ci- 559be168c0dSopenharmony_ci- ldr x8, [sp] 560be168c0dSopenharmony_ci- ldr x9, [sp, #8] 561be168c0dSopenharmony_ci- ldr x10, [sp, #16] 562be168c0dSopenharmony_ci- ldr x11, [sp, #24] 563be168c0dSopenharmony_ci- ldr x12, [sp, #32] 564be168c0dSopenharmony_ci- ldr x13, [sp, #40] 565be168c0dSopenharmony_ci- 566be168c0dSopenharmony_ci- ldr x14, [sp, #48] // input_zp 567be168c0dSopenharmony_ci+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp] 568be168c0dSopenharmony_ci+ add x9, sp, #64 569be168c0dSopenharmony_ci+ st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9] 570be168c0dSopenharmony_ci+ stp x19, x20, [sp, #128] 571be168c0dSopenharmony_ci+ stp x21, x22, [sp, #144] 572be168c0dSopenharmony_ci+ stp x23, x24, [sp, #160] 573be168c0dSopenharmony_ci+ stp x25, x26, [sp, #176] 574be168c0dSopenharmony_ci+ 575be168c0dSopenharmony_ci+ ldr x8, [sp, #192] 576be168c0dSopenharmony_ci+ ldr x9, [sp, #200] 577be168c0dSopenharmony_ci+ ldr x10, [sp, #208] 578be168c0dSopenharmony_ci+ ldr x11, [sp, #216] 579be168c0dSopenharmony_ci+ ldr x12, [sp, #224] 580be168c0dSopenharmony_ci+ ldr x13, [sp, #232] 581be168c0dSopenharmony_ci+ 582be168c0dSopenharmony_ci+ ldr x14, [sp, #240] // input_zp 583be168c0dSopenharmony_ci ld1 {v19.8b}, [x14], #8 584be168c0dSopenharmony_ci 585be168c0dSopenharmony_ci- ldr x15, [sp, #56] // output_zp 586be168c0dSopenharmony_ci+ ldr x15, [sp, #248] // output_zp 587be168c0dSopenharmony_ci ld1 {v20.4s}, [x15], #16 588be168c0dSopenharmony_ci ld1 {v21.4s}, [x15], #16 589be168c0dSopenharmony_ci 590be168c0dSopenharmony_ci- ldr x16, [sp, #64] // out_multiplier 591be168c0dSopenharmony_ci+ ldr x16, [sp, #256] // out_multiplier 592be168c0dSopenharmony_ci ld1 {v22.4s}, [x16], #16 593be168c0dSopenharmony_ci ld1 {v23.4s}, [x16], #16 594be168c0dSopenharmony_ci 595be168c0dSopenharmony_ci- ldr x17, [sp, #72] // left_shift 596be168c0dSopenharmony_ci+ ldr x17, [sp, #264] // left_shift 597be168c0dSopenharmony_ci ld1 {v24.4s}, [x17], #16 598be168c0dSopenharmony_ci ld1 {v25.4s}, [x17], #16 599be168c0dSopenharmony_ci 600be168c0dSopenharmony_ci- ldr x25, [sp, #80] // right shift 601be168c0dSopenharmony_ci+ ldr x25, [sp, #272] // right shift 602be168c0dSopenharmony_ci ld1 {v26.4s}, [x25], #16 603be168c0dSopenharmony_ci ld1 {v27.4s}, [x25], #16 604be168c0dSopenharmony_ci 605be168c0dSopenharmony_ci- ldr x19, [sp, #88] // acc_min 606be168c0dSopenharmony_ci+ ldr x19, [sp, #280] // acc_min 607be168c0dSopenharmony_ci ld1 {v28.4s}, [x19], #16 608be168c0dSopenharmony_ci ld1 {v29.4s}, [x19], #16 609be168c0dSopenharmony_ci 610be168c0dSopenharmony_ci- ldr x20, [sp, #96] // acc_max 611be168c0dSopenharmony_ci+ ldr x20, [sp, #288] // acc_max 612be168c0dSopenharmony_ci ld1 {v30.4s}, [x20], #16 613be168c0dSopenharmony_ci ld1 {v31.4s}, [x20], #16 614be168c0dSopenharmony_ci 615be168c0dSopenharmony_ci@@ -283,7 +284,6 @@ asm_function ConvDwInt8Center 616be168c0dSopenharmony_ci subs x4, x4, #1 617be168c0dSopenharmony_ci bne LoopH 618be168c0dSopenharmony_ci 619be168c0dSopenharmony_ci- sub sp, sp, #192 620be168c0dSopenharmony_ci ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 621be168c0dSopenharmony_ci ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 622be168c0dSopenharmony_ci ldp x19, x20, [sp], #16 623be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvFp32Center.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvFp32Center.S 624be168c0dSopenharmony_ciindex 2cc456f6..0a9d3265 100644 625be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvFp32Center.S 626be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvFp32Center.S 627be168c0dSopenharmony_ci@@ -31,21 +31,22 @@ asm_function ConvSwFp32Center 628be168c0dSopenharmony_ci // x19 ~ x29 should be also preserved 629be168c0dSopenharmony_ci // whereas our coding style do not permit such amount of parameters 630be168c0dSopenharmony_ci sub sp, sp, #208 631be168c0dSopenharmony_ci- st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 632be168c0dSopenharmony_ci- st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 633be168c0dSopenharmony_ci- stp x19, x20, [sp], #16 634be168c0dSopenharmony_ci- stp x21, x22, [sp], #16 635be168c0dSopenharmony_ci- stp x23, x24, [sp], #16 636be168c0dSopenharmony_ci- stp x25, x26, [sp], #16 637be168c0dSopenharmony_ci- stp x27, x28, [sp], #16 638be168c0dSopenharmony_ci+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp] 639be168c0dSopenharmony_ci+ add x9, sp, #64 640be168c0dSopenharmony_ci+ st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9] 641be168c0dSopenharmony_ci+ stp x19, x20, [sp, #128] 642be168c0dSopenharmony_ci+ stp x21, x22, [sp, #144] 643be168c0dSopenharmony_ci+ stp x23, x24, [sp, #160] 644be168c0dSopenharmony_ci+ stp x25, x26, [sp, #176] 645be168c0dSopenharmony_ci+ stp x27, x28, [sp, #192] 646be168c0dSopenharmony_ci 647be168c0dSopenharmony_ci- ldr x8, [sp] 648be168c0dSopenharmony_ci- ldr x9, [sp, #8] 649be168c0dSopenharmony_ci- ldr x10, [sp, #16] 650be168c0dSopenharmony_ci- ldr x11, [sp, #24] 651be168c0dSopenharmony_ci- ldr x12, [sp, #32] 652be168c0dSopenharmony_ci- ldr x13, [sp, #40] 653be168c0dSopenharmony_ci- ldr x14, [sp, #48] 654be168c0dSopenharmony_ci+ ldr x8, [sp, #208] 655be168c0dSopenharmony_ci+ ldr x9, [sp, #216] 656be168c0dSopenharmony_ci+ ldr x10, [sp, #224] 657be168c0dSopenharmony_ci+ ldr x11, [sp, #232] 658be168c0dSopenharmony_ci+ ldr x12, [sp, #240] 659be168c0dSopenharmony_ci+ ldr x13, [sp, #248] 660be168c0dSopenharmony_ci+ ldr x14, [sp, #256] 661be168c0dSopenharmony_ci mul x15, x6, x7 662be168c0dSopenharmony_ci mul x15, x10, x15 663be168c0dSopenharmony_ci mov x16, #16 664be168c0dSopenharmony_ci@@ -198,9 +199,9 @@ asm_function ConvSwFp32Center 665be168c0dSopenharmony_ci add x20, x20, x13 666be168c0dSopenharmony_ci subs x22, x22, #1 667be168c0dSopenharmony_ci bne LoopKh16 668be168c0dSopenharmony_ci- ldr x16, [sp, #64] 669be168c0dSopenharmony_ci+ ldr x16, [sp, #272] 670be168c0dSopenharmony_ci cbnz x16, Relu616 671be168c0dSopenharmony_ci- ldr x26, [sp, #56] 672be168c0dSopenharmony_ci+ ldr x26, [sp, #264] 673be168c0dSopenharmony_ci cbnz x26, Relu16 674be168c0dSopenharmony_ci b Write16 675be168c0dSopenharmony_ci Relu616: 676be168c0dSopenharmony_ci@@ -347,9 +348,9 @@ asm_function ConvSwFp32Center 677be168c0dSopenharmony_ci add x20, x20, x13 678be168c0dSopenharmony_ci subs x22, x22, #1 679be168c0dSopenharmony_ci bne LoopKh8 680be168c0dSopenharmony_ci- ldr x16, [sp, #64] 681be168c0dSopenharmony_ci+ ldr x16, [sp, #272] 682be168c0dSopenharmony_ci cbnz x16, Relu68 683be168c0dSopenharmony_ci- ldr x26, [sp, #56] 684be168c0dSopenharmony_ci+ ldr x26, [sp, #264] 685be168c0dSopenharmony_ci cbnz x26, Relu8 686be168c0dSopenharmony_ci b Write8 687be168c0dSopenharmony_ci Relu68: 688be168c0dSopenharmony_ci@@ -426,9 +427,9 @@ asm_function ConvSwFp32Center 689be168c0dSopenharmony_ci add x20, x20, x13 690be168c0dSopenharmony_ci subs x22, x22, #1 691be168c0dSopenharmony_ci bne LoopKh 692be168c0dSopenharmony_ci- ldr x16, [sp, #64] 693be168c0dSopenharmony_ci+ ldr x16, [sp, #272] 694be168c0dSopenharmony_ci cbnz x16, Relu6 695be168c0dSopenharmony_ci- ldr x26, [sp, #56] 696be168c0dSopenharmony_ci+ ldr x26, [sp, #264] 697be168c0dSopenharmony_ci cbnz x26, Relu 698be168c0dSopenharmony_ci b Write 699be168c0dSopenharmony_ci Relu6: 700be168c0dSopenharmony_ci@@ -446,7 +447,6 @@ asm_function ConvSwFp32Center 701be168c0dSopenharmony_ci subs x4, x4, #1 702be168c0dSopenharmony_ci bne LoopH 703be168c0dSopenharmony_ci 704be168c0dSopenharmony_ci- sub sp, sp, #208 705be168c0dSopenharmony_ci ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 706be168c0dSopenharmony_ci ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 707be168c0dSopenharmony_ci ldp x19, x20, [sp], #16 708be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW1x16Kernel.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW1x16Kernel.S 709be168c0dSopenharmony_ciindex 2267e776..3b436c17 100644 710be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW1x16Kernel.S 711be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW1x16Kernel.S 712be168c0dSopenharmony_ci@@ -30,17 +30,17 @@ asm_function SWConv1x16Kernel 713be168c0dSopenharmony_ci // x19 ~ x29 should be also preserved 714be168c0dSopenharmony_ci // whereas our coding style do not permit such amount of parameters 715be168c0dSopenharmony_ci sub sp, sp, #64 716be168c0dSopenharmony_ci- stp x19, x20, [sp], #16 717be168c0dSopenharmony_ci- stp x21, x22, [sp], #16 718be168c0dSopenharmony_ci- stp x23, x24, [sp], #16 719be168c0dSopenharmony_ci- stp x25, x26, [sp], #16 720be168c0dSopenharmony_ci- 721be168c0dSopenharmony_ci- ldr x10, [sp] 722be168c0dSopenharmony_ci- ldr x11, [sp, #8] 723be168c0dSopenharmony_ci- ldr x12, [sp, #16] 724be168c0dSopenharmony_ci- ldr x13, [sp, #24] 725be168c0dSopenharmony_ci- ldr x14, [sp, #32] 726be168c0dSopenharmony_ci- ldr x15, [sp, #40] 727be168c0dSopenharmony_ci+ stp x19, x20, [sp] 728be168c0dSopenharmony_ci+ stp x21, x22, [sp, #16] 729be168c0dSopenharmony_ci+ stp x23, x24, [sp, #32] 730be168c0dSopenharmony_ci+ stp x25, x26, [sp, #48] 731be168c0dSopenharmony_ci+ 732be168c0dSopenharmony_ci+ ldr x10, [sp, #64] 733be168c0dSopenharmony_ci+ ldr x11, [sp, #72] 734be168c0dSopenharmony_ci+ ldr x12, [sp, #80] 735be168c0dSopenharmony_ci+ ldr x13, [sp, #88] 736be168c0dSopenharmony_ci+ ldr x14, [sp, #96] 737be168c0dSopenharmony_ci+ ldr x15, [sp, #104] 738be168c0dSopenharmony_ci lsl x7, x7, #2 739be168c0dSopenharmony_ci lsl x11, x11, #2 740be168c0dSopenharmony_ci lsl x12, x12, #2 741be168c0dSopenharmony_ci@@ -413,7 +413,6 @@ asm_function SWConv1x16Kernel 742be168c0dSopenharmony_ci st1 {v2.4s}, [x21] 743be168c0dSopenharmony_ci st1 {v3.4s}, [x22] 744be168c0dSopenharmony_ci End: 745be168c0dSopenharmony_ci- sub sp, sp, #64 746be168c0dSopenharmony_ci ldp x19, x20, [sp], #16 747be168c0dSopenharmony_ci ldp x21, x22, [sp], #16 748be168c0dSopenharmony_ci ldp x23, x24, [sp], #16 749be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW1x8Kernel.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW1x8Kernel.S 750be168c0dSopenharmony_ciindex fa8bb63d..6a29e95e 100644 751be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW1x8Kernel.S 752be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW1x8Kernel.S 753be168c0dSopenharmony_ci@@ -30,17 +30,17 @@ asm_function SWConv1x8Kernel 754be168c0dSopenharmony_ci // x19 ~ x29 should be also preserved 755be168c0dSopenharmony_ci // whereas our coding style do not permit such amount of parameters 756be168c0dSopenharmony_ci sub sp, sp, #64 757be168c0dSopenharmony_ci- stp x19, x20, [sp], #16 758be168c0dSopenharmony_ci- stp x21, x22, [sp], #16 759be168c0dSopenharmony_ci- stp x23, x24, [sp], #16 760be168c0dSopenharmony_ci- stp x25, x26, [sp], #16 761be168c0dSopenharmony_ci+ stp x19, x20, [sp] 762be168c0dSopenharmony_ci+ stp x21, x22, [sp, #16] 763be168c0dSopenharmony_ci+ stp x23, x24, [sp, #32] 764be168c0dSopenharmony_ci+ stp x25, x26, [sp, #48] 765be168c0dSopenharmony_ci 766be168c0dSopenharmony_ci- ldr x10, [sp] 767be168c0dSopenharmony_ci- ldr x11, [sp, #8] 768be168c0dSopenharmony_ci- ldr x12, [sp, #16] 769be168c0dSopenharmony_ci- ldr x13, [sp, #24] 770be168c0dSopenharmony_ci- ldr x14, [sp, #32] 771be168c0dSopenharmony_ci- ldr x15, [sp, #40] 772be168c0dSopenharmony_ci+ ldr x10, [sp, #64] 773be168c0dSopenharmony_ci+ ldr x11, [sp, #72] 774be168c0dSopenharmony_ci+ ldr x12, [sp, #80] 775be168c0dSopenharmony_ci+ ldr x13, [sp, #88] 776be168c0dSopenharmony_ci+ ldr x14, [sp, #96] 777be168c0dSopenharmony_ci+ ldr x15, [sp, #104] 778be168c0dSopenharmony_ci lsl x7, x7, #2 779be168c0dSopenharmony_ci lsl x11, x11, #2 780be168c0dSopenharmony_ci lsl x12, x12, #2 781be168c0dSopenharmony_ci@@ -270,7 +270,6 @@ asm_function SWConv1x8Kernel 782be168c0dSopenharmony_ci st1 {v0.4s}, [x0] 783be168c0dSopenharmony_ci st1 {v1.4s}, [x20] 784be168c0dSopenharmony_ci End: 785be168c0dSopenharmony_ci- sub sp, sp, #64 786be168c0dSopenharmony_ci ldp x19, x20, [sp], #16 787be168c0dSopenharmony_ci ldp x21, x22, [sp], #16 788be168c0dSopenharmony_ci ldp x23, x24, [sp], #16 789be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW2x16Kernel.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW2x16Kernel.S 790be168c0dSopenharmony_ciindex 69624af6..8a5dd83a 100644 791be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW2x16Kernel.S 792be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW2x16Kernel.S 793be168c0dSopenharmony_ci@@ -30,17 +30,17 @@ asm_function SWConv2x16Kernel 794be168c0dSopenharmony_ci // x19 ~ x29 should be also preserved 795be168c0dSopenharmony_ci // whereas our coding style do not permit such amount of parameters 796be168c0dSopenharmony_ci sub sp, sp, #64 797be168c0dSopenharmony_ci- stp x19, x20, [sp], #16 798be168c0dSopenharmony_ci- stp x21, x22, [sp], #16 799be168c0dSopenharmony_ci- stp x23, x24, [sp], #16 800be168c0dSopenharmony_ci- stp x25, x26, [sp], #16 801be168c0dSopenharmony_ci+ stp x19, x20, [sp] 802be168c0dSopenharmony_ci+ stp x21, x22, [sp, #16] 803be168c0dSopenharmony_ci+ stp x23, x24, [sp, #32] 804be168c0dSopenharmony_ci+ stp x25, x26, [sp, #48] 805be168c0dSopenharmony_ci 806be168c0dSopenharmony_ci- ldr x10, [sp] 807be168c0dSopenharmony_ci- ldr x11, [sp, #8] 808be168c0dSopenharmony_ci- ldr x12, [sp, #16] 809be168c0dSopenharmony_ci- ldr x13, [sp, #24] 810be168c0dSopenharmony_ci- ldr x14, [sp, #32] 811be168c0dSopenharmony_ci- ldr x15, [sp, #40] 812be168c0dSopenharmony_ci+ ldr x10, [sp, #64] 813be168c0dSopenharmony_ci+ ldr x11, [sp, #72] 814be168c0dSopenharmony_ci+ ldr x12, [sp, #80] 815be168c0dSopenharmony_ci+ ldr x13, [sp, #88] 816be168c0dSopenharmony_ci+ ldr x14, [sp, #96] 817be168c0dSopenharmony_ci+ ldr x15, [sp, #104] 818be168c0dSopenharmony_ci lsl x7, x7, #2 819be168c0dSopenharmony_ci lsl x11, x11, #2 820be168c0dSopenharmony_ci lsl x12, x12, #2 821be168c0dSopenharmony_ci@@ -399,7 +399,6 @@ asm_function SWConv2x16Kernel 822be168c0dSopenharmony_ci st1 {v3.4s}, [x22], #16 823be168c0dSopenharmony_ci st1 {v7.4s}, [x22] 824be168c0dSopenharmony_ci End: 825be168c0dSopenharmony_ci- sub sp, sp, #64 826be168c0dSopenharmony_ci ldp x19, x20, [sp], #16 827be168c0dSopenharmony_ci ldp x21, x22, [sp], #16 828be168c0dSopenharmony_ci ldp x23, x24, [sp], #16 829be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW2x8Kernel.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW2x8Kernel.S 830be168c0dSopenharmony_ciindex 8fefa4be..6efd21d0 100644 831be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW2x8Kernel.S 832be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW2x8Kernel.S 833be168c0dSopenharmony_ci@@ -30,17 +30,17 @@ asm_function SWConv2x8Kernel 834be168c0dSopenharmony_ci // x19 ~ x29 should be also preserved 835be168c0dSopenharmony_ci // whereas our coding style do not permit such amount of parameters 836be168c0dSopenharmony_ci sub sp, sp, #64 837be168c0dSopenharmony_ci- stp x19, x20, [sp], #16 838be168c0dSopenharmony_ci- stp x21, x22, [sp], #16 839be168c0dSopenharmony_ci- stp x23, x24, [sp], #16 840be168c0dSopenharmony_ci- stp x25, x26, [sp], #16 841be168c0dSopenharmony_ci+ stp x19, x20, [sp] 842be168c0dSopenharmony_ci+ stp x21, x22, [sp, #16] 843be168c0dSopenharmony_ci+ stp x23, x24, [sp, #32] 844be168c0dSopenharmony_ci+ stp x25, x26, [sp, #48] 845be168c0dSopenharmony_ci 846be168c0dSopenharmony_ci- ldr x10, [sp] 847be168c0dSopenharmony_ci- ldr x11, [sp, #8] 848be168c0dSopenharmony_ci- ldr x12, [sp, #16] 849be168c0dSopenharmony_ci- ldr x13, [sp, #24] 850be168c0dSopenharmony_ci- ldr x14, [sp, #32] 851be168c0dSopenharmony_ci- ldr x15, [sp, #40] 852be168c0dSopenharmony_ci+ ldr x10, [sp, #64] 853be168c0dSopenharmony_ci+ ldr x11, [sp, #72] 854be168c0dSopenharmony_ci+ ldr x12, [sp, #80] 855be168c0dSopenharmony_ci+ ldr x13, [sp, #88] 856be168c0dSopenharmony_ci+ ldr x14, [sp, #96] 857be168c0dSopenharmony_ci+ ldr x15, [sp, #104] 858be168c0dSopenharmony_ci lsl x7, x7, #2 859be168c0dSopenharmony_ci lsl x11, x11, #2 860be168c0dSopenharmony_ci lsl x12, x12, #2 861be168c0dSopenharmony_ci@@ -257,7 +257,6 @@ asm_function SWConv2x8Kernel 862be168c0dSopenharmony_ci st1 {v1.4s}, [x20], #16 863be168c0dSopenharmony_ci st1 {v3.4s}, [x20] 864be168c0dSopenharmony_ci End: 865be168c0dSopenharmony_ci- sub sp, sp, #64 866be168c0dSopenharmony_ci ldp x19, x20, [sp], #16 867be168c0dSopenharmony_ci ldp x21, x22, [sp], #16 868be168c0dSopenharmony_ci ldp x23, x24, [sp], #16 869be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW3x16Kernel.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW3x16Kernel.S 870be168c0dSopenharmony_ciindex 61efd444..428dea69 100644 871be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW3x16Kernel.S 872be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW3x16Kernel.S 873be168c0dSopenharmony_ci@@ -30,18 +30,18 @@ asm_function SWConv3x16Kernel 874be168c0dSopenharmony_ci // x19 ~ x29 should be also preserved 875be168c0dSopenharmony_ci // whereas our coding style do not permit such amount of parameters 876be168c0dSopenharmony_ci sub sp, sp, #128 877be168c0dSopenharmony_ci- st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 878be168c0dSopenharmony_ci- stp x19, x20, [sp], #16 879be168c0dSopenharmony_ci- stp x21, x22, [sp], #16 880be168c0dSopenharmony_ci- stp x23, x24, [sp], #16 881be168c0dSopenharmony_ci- stp x25, x26, [sp], #16 882be168c0dSopenharmony_ci+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp] 883be168c0dSopenharmony_ci+ stp x19, x20, [sp, #64] 884be168c0dSopenharmony_ci+ stp x21, x22, [sp, #80] 885be168c0dSopenharmony_ci+ stp x23, x24, [sp, #96] 886be168c0dSopenharmony_ci+ stp x25, x26, [sp, #112] 887be168c0dSopenharmony_ci 888be168c0dSopenharmony_ci- ldr x10, [sp] 889be168c0dSopenharmony_ci- ldr x11, [sp, #8] 890be168c0dSopenharmony_ci- ldr x12, [sp, #16] 891be168c0dSopenharmony_ci- ldr x13, [sp, #24] 892be168c0dSopenharmony_ci- ldr x14, [sp, #32] 893be168c0dSopenharmony_ci- ldr x15, [sp, #40] 894be168c0dSopenharmony_ci+ ldr x10, [sp, #128] 895be168c0dSopenharmony_ci+ ldr x11, [sp, #136] 896be168c0dSopenharmony_ci+ ldr x12, [sp, #144] 897be168c0dSopenharmony_ci+ ldr x13, [sp, #152] 898be168c0dSopenharmony_ci+ ldr x14, [sp, #160] 899be168c0dSopenharmony_ci+ ldr x15, [sp, #168] 900be168c0dSopenharmony_ci lsl x7, x7, #2 901be168c0dSopenharmony_ci lsl x11, x11, #2 902be168c0dSopenharmony_ci lsl x12, x12, #2 903be168c0dSopenharmony_ci@@ -524,7 +524,6 @@ asm_function SWConv3x16Kernel 904be168c0dSopenharmony_ci st1 {v7.4s}, [x22], #16 905be168c0dSopenharmony_ci st1 {v11.4s}, [x22] 906be168c0dSopenharmony_ci End: 907be168c0dSopenharmony_ci- sub sp, sp, #128 908be168c0dSopenharmony_ci ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 909be168c0dSopenharmony_ci ldp x19, x20, [sp], #16 910be168c0dSopenharmony_ci ldp x21, x22, [sp], #16 911be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW3x8Kernel.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW3x8Kernel.S 912be168c0dSopenharmony_ciindex 1e958572..472e50b9 100644 913be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW3x8Kernel.S 914be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW3x8Kernel.S 915be168c0dSopenharmony_ci@@ -30,17 +30,17 @@ asm_function SWConv3x8Kernel 916be168c0dSopenharmony_ci // x19 ~ x29 should be also preserved 917be168c0dSopenharmony_ci // whereas our coding style do not permit such amount of parameters 918be168c0dSopenharmony_ci sub sp, sp, #64 919be168c0dSopenharmony_ci- stp x19, x20, [sp], #16 920be168c0dSopenharmony_ci- stp x21, x22, [sp], #16 921be168c0dSopenharmony_ci- stp x23, x24, [sp], #16 922be168c0dSopenharmony_ci- stp x25, x26, [sp], #16 923be168c0dSopenharmony_ci+ stp x19, x20, [sp] 924be168c0dSopenharmony_ci+ stp x21, x22, [sp, #16] 925be168c0dSopenharmony_ci+ stp x23, x24, [sp, #32] 926be168c0dSopenharmony_ci+ stp x25, x26, [sp, #48] 927be168c0dSopenharmony_ci 928be168c0dSopenharmony_ci- ldr x10, [sp] 929be168c0dSopenharmony_ci- ldr x11, [sp, #8] 930be168c0dSopenharmony_ci- ldr x12, [sp, #16] 931be168c0dSopenharmony_ci- ldr x13, [sp, #24] 932be168c0dSopenharmony_ci- ldr x14, [sp, #32] 933be168c0dSopenharmony_ci- ldr x15, [sp, #40] 934be168c0dSopenharmony_ci+ ldr x10, [sp, #64] 935be168c0dSopenharmony_ci+ ldr x11, [sp, #72] 936be168c0dSopenharmony_ci+ ldr x12, [sp, #80] 937be168c0dSopenharmony_ci+ ldr x13, [sp, #88] 938be168c0dSopenharmony_ci+ ldr x14, [sp, #96] 939be168c0dSopenharmony_ci+ ldr x15, [sp, #104] 940be168c0dSopenharmony_ci lsl x7, x7, #2 941be168c0dSopenharmony_ci lsl x11, x11, #2 942be168c0dSopenharmony_ci lsl x12, x12, #2 943be168c0dSopenharmony_ci@@ -324,7 +324,6 @@ asm_function SWConv3x8Kernel 944be168c0dSopenharmony_ci st1 {v3.4s}, [x20], #16 945be168c0dSopenharmony_ci st1 {v5.4s}, [x20] 946be168c0dSopenharmony_ci End: 947be168c0dSopenharmony_ci- sub sp, sp, #64 948be168c0dSopenharmony_ci ldp x19, x20, [sp], #16 949be168c0dSopenharmony_ci ldp x21, x22, [sp], #16 950be168c0dSopenharmony_ci ldp x23, x24, [sp], #16 951be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW4x16Kernel.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW4x16Kernel.S 952be168c0dSopenharmony_ciindex 1cd5e124..076724a7 100644 953be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW4x16Kernel.S 954be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW4x16Kernel.S 955be168c0dSopenharmony_ci@@ -30,20 +30,21 @@ asm_function SWConv4x16Kernel 956be168c0dSopenharmony_ci // x19 ~ x29 should be also preserved 957be168c0dSopenharmony_ci // whereas our coding style do not permit such amount of parameters 958be168c0dSopenharmony_ci sub sp, sp, #208 959be168c0dSopenharmony_ci- st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 960be168c0dSopenharmony_ci- st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 961be168c0dSopenharmony_ci- stp x19, x20, [sp], #16 962be168c0dSopenharmony_ci- stp x21, x22, [sp], #16 963be168c0dSopenharmony_ci- stp x23, x24, [sp], #16 964be168c0dSopenharmony_ci- stp x25, x26, [sp], #16 965be168c0dSopenharmony_ci- stp x27, x28, [sp], #16 966be168c0dSopenharmony_ci+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp] 967be168c0dSopenharmony_ci+ add x9, sp, #64 968be168c0dSopenharmony_ci+ st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9] 969be168c0dSopenharmony_ci+ stp x19, x20, [sp, #128] 970be168c0dSopenharmony_ci+ stp x21, x22, [sp, #144] 971be168c0dSopenharmony_ci+ stp x23, x24, [sp, #160] 972be168c0dSopenharmony_ci+ stp x25, x26, [sp, #176] 973be168c0dSopenharmony_ci+ stp x27, x28, [sp, #192] 974be168c0dSopenharmony_ci 975be168c0dSopenharmony_ci- ldr x10, [sp] 976be168c0dSopenharmony_ci- ldr x11, [sp, #8] 977be168c0dSopenharmony_ci- ldr x12, [sp, #16] 978be168c0dSopenharmony_ci- ldr x13, [sp, #24] 979be168c0dSopenharmony_ci- ldr x14, [sp, #32] 980be168c0dSopenharmony_ci- ldr x15, [sp, #40] 981be168c0dSopenharmony_ci+ ldr x10, [sp, #208] 982be168c0dSopenharmony_ci+ ldr x11, [sp, #216] 983be168c0dSopenharmony_ci+ ldr x12, [sp, #224] 984be168c0dSopenharmony_ci+ ldr x13, [sp, #232] 985be168c0dSopenharmony_ci+ ldr x14, [sp, #240] 986be168c0dSopenharmony_ci+ ldr x15, [sp, #248] 987be168c0dSopenharmony_ci lsl x7, x7, #2 988be168c0dSopenharmony_ci lsl x11, x11, #2 989be168c0dSopenharmony_ci lsl x12, x12, #2 990be168c0dSopenharmony_ci@@ -650,7 +651,6 @@ asm_function SWConv4x16Kernel 991be168c0dSopenharmony_ci st1 {v11.4s}, [x22], #16 992be168c0dSopenharmony_ci st1 {v15.4s}, [x22] 993be168c0dSopenharmony_ci End: 994be168c0dSopenharmony_ci- sub sp, sp, #208 995be168c0dSopenharmony_ci ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 996be168c0dSopenharmony_ci ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 997be168c0dSopenharmony_ci ldp x19, x20, [sp], #16 998be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW4x8Kernel.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW4x8Kernel.S 999be168c0dSopenharmony_ciindex 28109031..6b24de97 100644 1000be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW4x8Kernel.S 1001be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW4x8Kernel.S 1002be168c0dSopenharmony_ci@@ -30,20 +30,21 @@ asm_function SWConv4x8Kernel 1003be168c0dSopenharmony_ci // x19 ~ x29 should be also preserved 1004be168c0dSopenharmony_ci // whereas our coding style do not permit such amount of parameters 1005be168c0dSopenharmony_ci sub sp, sp, #208 1006be168c0dSopenharmony_ci- st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1007be168c0dSopenharmony_ci- st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1008be168c0dSopenharmony_ci- stp x19, x20, [sp], #16 1009be168c0dSopenharmony_ci- stp x21, x22, [sp], #16 1010be168c0dSopenharmony_ci- stp x23, x24, [sp], #16 1011be168c0dSopenharmony_ci- stp x25, x26, [sp], #16 1012be168c0dSopenharmony_ci- stp x27, x28, [sp], #16 1013be168c0dSopenharmony_ci+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp] 1014be168c0dSopenharmony_ci+ add x9, sp, #64 1015be168c0dSopenharmony_ci+ st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9] 1016be168c0dSopenharmony_ci+ stp x19, x20, [sp, #128] 1017be168c0dSopenharmony_ci+ stp x21, x22, [sp, #144] 1018be168c0dSopenharmony_ci+ stp x23, x24, [sp, #160] 1019be168c0dSopenharmony_ci+ stp x25, x26, [sp, #176] 1020be168c0dSopenharmony_ci+ stp x27, x28, [sp, #192] 1021be168c0dSopenharmony_ci 1022be168c0dSopenharmony_ci- ldr x10, [sp] 1023be168c0dSopenharmony_ci- ldr x11, [sp, #8] 1024be168c0dSopenharmony_ci- ldr x12, [sp, #16] 1025be168c0dSopenharmony_ci- ldr x13, [sp, #24] 1026be168c0dSopenharmony_ci- ldr x14, [sp, #32] 1027be168c0dSopenharmony_ci- ldr x15, [sp, #40] 1028be168c0dSopenharmony_ci+ ldr x10, [sp, #208] 1029be168c0dSopenharmony_ci+ ldr x11, [sp, #216] 1030be168c0dSopenharmony_ci+ ldr x12, [sp, #224] 1031be168c0dSopenharmony_ci+ ldr x13, [sp, #232] 1032be168c0dSopenharmony_ci+ ldr x14, [sp, #240] 1033be168c0dSopenharmony_ci+ ldr x15, [sp, #248] 1034be168c0dSopenharmony_ci lsl x7, x7, #2 1035be168c0dSopenharmony_ci lsl x11, x11, #2 1036be168c0dSopenharmony_ci lsl x12, x12, #2 1037be168c0dSopenharmony_ci@@ -394,7 +395,6 @@ asm_function SWConv4x8Kernel 1038be168c0dSopenharmony_ci st1 {v5.4s}, [x20], #16 1039be168c0dSopenharmony_ci st1 {v7.4s}, [x20] 1040be168c0dSopenharmony_ci End: 1041be168c0dSopenharmony_ci- sub sp, sp, #208 1042be168c0dSopenharmony_ci ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1043be168c0dSopenharmony_ci ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1044be168c0dSopenharmony_ci ldp x19, x20, [sp], #16 1045be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW5x16Kernel.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW5x16Kernel.S 1046be168c0dSopenharmony_ciindex 302e5a3d..a2b7ea2c 100644 1047be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW5x16Kernel.S 1048be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW5x16Kernel.S 1049be168c0dSopenharmony_ci@@ -30,20 +30,21 @@ asm_function SWConv5x16Kernel 1050be168c0dSopenharmony_ci // x19 ~ x29 should be also preserved 1051be168c0dSopenharmony_ci // whereas our coding style do not permit such amount of parameters 1052be168c0dSopenharmony_ci sub sp, sp, #208 1053be168c0dSopenharmony_ci- st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1054be168c0dSopenharmony_ci- st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1055be168c0dSopenharmony_ci- stp x19, x20, [sp], #16 1056be168c0dSopenharmony_ci- stp x21, x22, [sp], #16 1057be168c0dSopenharmony_ci- stp x23, x24, [sp], #16 1058be168c0dSopenharmony_ci- stp x25, x26, [sp], #16 1059be168c0dSopenharmony_ci- stp x27, x28, [sp], #16 1060be168c0dSopenharmony_ci+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp] 1061be168c0dSopenharmony_ci+ add x9, sp, #64 1062be168c0dSopenharmony_ci+ st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9] 1063be168c0dSopenharmony_ci+ stp x19, x20, [sp, #128] 1064be168c0dSopenharmony_ci+ stp x21, x22, [sp, #144] 1065be168c0dSopenharmony_ci+ stp x23, x24, [sp, #160] 1066be168c0dSopenharmony_ci+ stp x25, x26, [sp, #176] 1067be168c0dSopenharmony_ci+ stp x27, x28, [sp, #192] 1068be168c0dSopenharmony_ci 1069be168c0dSopenharmony_ci- ldr x10, [sp] 1070be168c0dSopenharmony_ci- ldr x11, [sp, #8] 1071be168c0dSopenharmony_ci- ldr x12, [sp, #16] 1072be168c0dSopenharmony_ci- ldr x13, [sp, #24] 1073be168c0dSopenharmony_ci- ldr x14, [sp, #32] 1074be168c0dSopenharmony_ci- ldr x15, [sp, #40] 1075be168c0dSopenharmony_ci+ ldr x10, [sp, #208] 1076be168c0dSopenharmony_ci+ ldr x11, [sp, #216] 1077be168c0dSopenharmony_ci+ ldr x12, [sp, #224] 1078be168c0dSopenharmony_ci+ ldr x13, [sp, #232] 1079be168c0dSopenharmony_ci+ ldr x14, [sp, #240] 1080be168c0dSopenharmony_ci+ ldr x15, [sp, #248] 1081be168c0dSopenharmony_ci lsl x7, x7, #2 1082be168c0dSopenharmony_ci lsl x11, x11, #2 1083be168c0dSopenharmony_ci lsl x12, x12, #2 1084be168c0dSopenharmony_ci@@ -445,7 +446,6 @@ asm_function SWConv5x16Kernel 1085be168c0dSopenharmony_ci st1 {v15.4s}, [x22], #16 1086be168c0dSopenharmony_ci st1 {v19.4s}, [x22] 1087be168c0dSopenharmony_ci End: 1088be168c0dSopenharmony_ci- sub sp, sp, #208 1089be168c0dSopenharmony_ci ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1090be168c0dSopenharmony_ci ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1091be168c0dSopenharmony_ci ldp x19, x20, [sp], #16 1092be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW5x8Kernel.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW5x8Kernel.S 1093be168c0dSopenharmony_ciindex 059cc7fc..b7e48480 100644 1094be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW5x8Kernel.S 1095be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/ConvSW5x8Kernel.S 1096be168c0dSopenharmony_ci@@ -30,20 +30,21 @@ asm_function SWConv5x8Kernel 1097be168c0dSopenharmony_ci // x19 ~ x29 should be also preserved 1098be168c0dSopenharmony_ci // whereas our coding style do not permit such amount of parameters 1099be168c0dSopenharmony_ci sub sp, sp, #208 1100be168c0dSopenharmony_ci- st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1101be168c0dSopenharmony_ci- st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1102be168c0dSopenharmony_ci- stp x19, x20, [sp], #16 1103be168c0dSopenharmony_ci- stp x21, x22, [sp], #16 1104be168c0dSopenharmony_ci- stp x23, x24, [sp], #16 1105be168c0dSopenharmony_ci- stp x25, x26, [sp], #16 1106be168c0dSopenharmony_ci- stp x27, x28, [sp], #16 1107be168c0dSopenharmony_ci+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp] 1108be168c0dSopenharmony_ci+ add x9, sp, #64 1109be168c0dSopenharmony_ci+ st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9] 1110be168c0dSopenharmony_ci+ stp x19, x20, [sp, #128] 1111be168c0dSopenharmony_ci+ stp x21, x22, [sp, #144] 1112be168c0dSopenharmony_ci+ stp x23, x24, [sp, #160] 1113be168c0dSopenharmony_ci+ stp x25, x26, [sp, #176] 1114be168c0dSopenharmony_ci+ stp x27, x28, [sp, #192] 1115be168c0dSopenharmony_ci 1116be168c0dSopenharmony_ci- ldr x10, [sp] 1117be168c0dSopenharmony_ci- ldr x11, [sp, #8] 1118be168c0dSopenharmony_ci- ldr x12, [sp, #16] 1119be168c0dSopenharmony_ci- ldr x13, [sp, #24] 1120be168c0dSopenharmony_ci- ldr x14, [sp, #32] 1121be168c0dSopenharmony_ci- ldr x15, [sp, #40] 1122be168c0dSopenharmony_ci+ ldr x10, [sp, #208] 1123be168c0dSopenharmony_ci+ ldr x11, [sp, #216] 1124be168c0dSopenharmony_ci+ ldr x12, [sp, #224] 1125be168c0dSopenharmony_ci+ ldr x13, [sp, #232] 1126be168c0dSopenharmony_ci+ ldr x14, [sp, #240] 1127be168c0dSopenharmony_ci+ ldr x15, [sp, #248] 1128be168c0dSopenharmony_ci lsl x7, x7, #2 1129be168c0dSopenharmony_ci lsl x11, x11, #2 1130be168c0dSopenharmony_ci lsl x12, x12, #2 1131be168c0dSopenharmony_ci@@ -296,7 +297,6 @@ asm_function SWConv5x8Kernel 1132be168c0dSopenharmony_ci st1 {v7.4s}, [x20], #16 1133be168c0dSopenharmony_ci st1 {v9.4s}, [x20] 1134be168c0dSopenharmony_ci End: 1135be168c0dSopenharmony_ci- sub sp, sp, #208 1136be168c0dSopenharmony_ci ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1137be168c0dSopenharmony_ci ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1138be168c0dSopenharmony_ci ldp x19, x20, [sp], #16 1139be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/DeconvDwFp32Center.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/DeconvDwFp32Center.S 1140be168c0dSopenharmony_ciindex e6875bb1..11722e71 100644 1141be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/DeconvDwFp32Center.S 1142be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/DeconvDwFp32Center.S 1143be168c0dSopenharmony_ci@@ -30,14 +30,14 @@ asm_function DeconvDwFp32Center 1144be168c0dSopenharmony_ci // x19 ~ x29 should be also preserved 1145be168c0dSopenharmony_ci // whereas our coding style do not permit such amount of parameters 1146be168c0dSopenharmony_ci sub sp, sp, #32 1147be168c0dSopenharmony_ci- stp x19, x20, [sp], #16 1148be168c0dSopenharmony_ci- stp x21, x22, [sp], #16 1149be168c0dSopenharmony_ci+ stp x19, x20, [sp] 1150be168c0dSopenharmony_ci+ stp x21, x22, [sp, #16] 1151be168c0dSopenharmony_ci 1152be168c0dSopenharmony_ci- ldr x8, [sp] 1153be168c0dSopenharmony_ci- ldr x9, [sp, #8] 1154be168c0dSopenharmony_ci- ldr x10, [sp, #16] 1155be168c0dSopenharmony_ci- ldr x11, [sp, #24] 1156be168c0dSopenharmony_ci- ldr x12, [sp, #32] 1157be168c0dSopenharmony_ci+ ldr x8, [sp, #32] 1158be168c0dSopenharmony_ci+ ldr x9, [sp, #40] 1159be168c0dSopenharmony_ci+ ldr x10, [sp, #48] 1160be168c0dSopenharmony_ci+ ldr x11, [sp, #56] 1161be168c0dSopenharmony_ci+ ldr x12, [sp, #64] 1162be168c0dSopenharmony_ci 1163be168c0dSopenharmony_ci LoopH: 1164be168c0dSopenharmony_ci mov x15, x0 1165be168c0dSopenharmony_ci@@ -69,7 +69,6 @@ asm_function DeconvDwFp32Center 1166be168c0dSopenharmony_ci subs x3, x3, #1 1167be168c0dSopenharmony_ci bne LoopH 1168be168c0dSopenharmony_ci 1169be168c0dSopenharmony_ci- sub sp, sp, #32 1170be168c0dSopenharmony_ci ldp x19, x20, [sp], #16 1171be168c0dSopenharmony_ci ldp x21, x22, [sp], #16 1172be168c0dSopenharmony_ci ret 1173be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/DeconvDwInt8Center.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/DeconvDwInt8Center.S 1174be168c0dSopenharmony_ciindex aaf210f0..1c3723fa 100644 1175be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/DeconvDwInt8Center.S 1176be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/DeconvDwInt8Center.S 1177be168c0dSopenharmony_ci@@ -30,14 +30,14 @@ asm_function DeconvDwInt8Center 1178be168c0dSopenharmony_ci // x19 ~ x29 should be also preserved 1179be168c0dSopenharmony_ci // whereas our coding style do not permit such amount of parameters 1180be168c0dSopenharmony_ci sub sp, sp, #32 1181be168c0dSopenharmony_ci- stp x19, x20, [sp], #16 1182be168c0dSopenharmony_ci- stp x21, x22, [sp], #16 1183be168c0dSopenharmony_ci+ stp x19, x20, [sp] 1184be168c0dSopenharmony_ci+ stp x21, x22, [sp, #16] 1185be168c0dSopenharmony_ci 1186be168c0dSopenharmony_ci- ldr x8, [sp] 1187be168c0dSopenharmony_ci- ldr x9, [sp, #8] 1188be168c0dSopenharmony_ci- ldr x10, [sp, #16] 1189be168c0dSopenharmony_ci- ldr x11, [sp, #24] 1190be168c0dSopenharmony_ci- ldr x12, [sp, #32] 1191be168c0dSopenharmony_ci+ ldr x8, [sp, #32] 1192be168c0dSopenharmony_ci+ ldr x9, [sp, #40] 1193be168c0dSopenharmony_ci+ ldr x10, [sp, #48] 1194be168c0dSopenharmony_ci+ ldr x11, [sp, #56] 1195be168c0dSopenharmony_ci+ ldr x12, [sp, #64] 1196be168c0dSopenharmony_ci 1197be168c0dSopenharmony_ci LoopH: 1198be168c0dSopenharmony_ci mov x15, x0 1199be168c0dSopenharmony_ci@@ -69,7 +69,6 @@ asm_function DeconvDwInt8Center 1200be168c0dSopenharmony_ci subs x3, x3, #1 1201be168c0dSopenharmony_ci bne LoopH 1202be168c0dSopenharmony_ci 1203be168c0dSopenharmony_ci- sub sp, sp, #32 1204be168c0dSopenharmony_ci ldp x19, x20, [sp], #16 1205be168c0dSopenharmony_ci ldp x21, x22, [sp], #16 1206be168c0dSopenharmony_ci ret 1207be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatVecMulFp32.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatVecMulFp32.S 1208be168c0dSopenharmony_ciindex 71a7f0f1..36c8d8ec 100644 1209be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatVecMulFp32.S 1210be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatVecMulFp32.S 1211be168c0dSopenharmony_ci@@ -15,7 +15,7 @@ 1212be168c0dSopenharmony_ci */ 1213be168c0dSopenharmony_ci #ifdef ENABLE_ARM64 1214be168c0dSopenharmony_ci #include "nnacl/assembly_global.h" 1215be168c0dSopenharmony_ci- 1216be168c0dSopenharmony_ci+ 1217be168c0dSopenharmony_ci .text 1218be168c0dSopenharmony_ci .align 5 1219be168c0dSopenharmony_ci 1220be168c0dSopenharmony_ci@@ -30,24 +30,25 @@ 1221be168c0dSopenharmony_ci 1222be168c0dSopenharmony_ci asm_default_function MatVecMulFp32 1223be168c0dSopenharmony_ci sub sp, sp, #128 1224be168c0dSopenharmony_ci- st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 1225be168c0dSopenharmony_ci- st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64 1226be168c0dSopenharmony_ci+ st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp] 1227be168c0dSopenharmony_ci+ add x9, sp, #64 1228be168c0dSopenharmony_ci+ st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x9] 1229be168c0dSopenharmony_ci 1230be168c0dSopenharmony_ci mov w14, #4 // sizeof(float) 1231be168c0dSopenharmony_ci mul w8, w14, w5 // rhs depthx1 block stride 1232be168c0dSopenharmony_ci mov w14, #4 1233be168c0dSopenharmony_ci- mul w13, w8, w14 // rhs depthx4 block stride 1234be168c0dSopenharmony_ci+ mul w13, w8, w14 // rhs depthx4 block stride 1235be168c0dSopenharmony_ci 1236be168c0dSopenharmony_ci Loop: 1237be168c0dSopenharmony_ci mov x15, x0 // reload a ptr 1238be168c0dSopenharmony_ci mov x7, x1 // reload b ptr 1239be168c0dSopenharmony_ci mov w9, w5 // reload depth 1240be168c0dSopenharmony_ci cmp w6, #4 1241be168c0dSopenharmony_ci- blt Loop1x1 1242be168c0dSopenharmony_ci+ blt Loop1x1 1243be168c0dSopenharmony_ci 1244be168c0dSopenharmony_ci-Loop1x4: 1245be168c0dSopenharmony_ci- dup v10.8h, wzr 1246be168c0dSopenharmony_ci- dup v11.8h, wzr 1247be168c0dSopenharmony_ci+Loop1x4: 1248be168c0dSopenharmony_ci+ dup v10.8h, wzr 1249be168c0dSopenharmony_ci+ dup v11.8h, wzr 1250be168c0dSopenharmony_ci dup v12.8h, wzr 1251be168c0dSopenharmony_ci dup v13.8h, wzr 1252be168c0dSopenharmony_ci dup v14.8h, wzr 1253be168c0dSopenharmony_ci@@ -150,7 +151,7 @@ End1x4: 1254be168c0dSopenharmony_ci 1255be168c0dSopenharmony_ci cbz x3, Act1x4 1256be168c0dSopenharmony_ci ld1 {v15.4s}, [x3], #16 1257be168c0dSopenharmony_ci- fadd v14.4s, v14.4s, v15.4s // add bias 1258be168c0dSopenharmony_ci+ fadd v14.4s, v14.4s, v15.4s // add bias 1259be168c0dSopenharmony_ci 1260be168c0dSopenharmony_ci Act1x4: 1261be168c0dSopenharmony_ci cmp w4, #3 1262be168c0dSopenharmony_ci@@ -214,8 +215,8 @@ Depth1_1x1: 1263be168c0dSopenharmony_ci b Depth1_1x1 1264be168c0dSopenharmony_ci 1265be168c0dSopenharmony_ci End1x1: 1266be168c0dSopenharmony_ci- faddp v6.4s, v4.4s, v4.4s 1267be168c0dSopenharmony_ci- faddp v7.4s, v6.4s, v6.4s 1268be168c0dSopenharmony_ci+ faddp v6.4s, v4.4s, v4.4s 1269be168c0dSopenharmony_ci+ faddp v7.4s, v6.4s, v6.4s 1270be168c0dSopenharmony_ci fadd v7.4s, v7.4s, v5.4s 1271be168c0dSopenharmony_ci 1272be168c0dSopenharmony_ci cbz x3, Act1x1 1273be168c0dSopenharmony_ci@@ -245,7 +246,6 @@ Write1x1: 1274be168c0dSopenharmony_ci b Loop 1275be168c0dSopenharmony_ci 1276be168c0dSopenharmony_ci End: 1277be168c0dSopenharmony_ci- sub sp, sp, #128 1278be168c0dSopenharmony_ci ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 1279be168c0dSopenharmony_ci ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64 1280be168c0dSopenharmony_ci ret 1281be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatVecMulPackFp32.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatVecMulPackFp32.S 1282be168c0dSopenharmony_ciindex d485b012..b013f48a 100644 1283be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatVecMulPackFp32.S 1284be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatVecMulPackFp32.S 1285be168c0dSopenharmony_ci@@ -30,8 +30,8 @@ 1286be168c0dSopenharmony_ci 1287be168c0dSopenharmony_ci asm_default_function MatVecMulPackFp32 1288be168c0dSopenharmony_ci sub sp, sp, #16 1289be168c0dSopenharmony_ci- stp x29, x30, [sp], #16 1290be168c0dSopenharmony_ci- 1291be168c0dSopenharmony_ci+ stp x29, x30, [sp] 1292be168c0dSopenharmony_ci+ 1293be168c0dSopenharmony_ci dup v1.2d, xzr 1294be168c0dSopenharmony_ci mov w7, #6 1295be168c0dSopenharmony_ci dup v2.4s, w7 1296be168c0dSopenharmony_ci@@ -43,7 +43,7 @@ asm_default_function MatVecMulPackFp32 1297be168c0dSopenharmony_ci st1 {v24.4s, v25.4s}, [x2], #32 1298be168c0dSopenharmony_ci subs w6, w6, #8 1299be168c0dSopenharmony_ci bge Loop1x8Start 1300be168c0dSopenharmony_ci- 1301be168c0dSopenharmony_ci+ 1302be168c0dSopenharmony_ci Loop1xNStart: 1303be168c0dSopenharmony_ci add w6, w6, #8 1304be168c0dSopenharmony_ci cbz w6, End 1305be168c0dSopenharmony_ci@@ -59,7 +59,7 @@ asm_default_function MatVecMulPackFp32 1306be168c0dSopenharmony_ci beq End 1307be168c0dSopenharmony_ci st1 {v25.s}[2], [x2] 1308be168c0dSopenharmony_ci b End 1309be168c0dSopenharmony_ci- 1310be168c0dSopenharmony_ci+ 1311be168c0dSopenharmony_ci Loop1x4Start: 1312be168c0dSopenharmony_ci add w6, w6, #4 1313be168c0dSopenharmony_ci cbz w6, End 1314be168c0dSopenharmony_ci@@ -75,7 +75,7 @@ asm_default_function MatVecMulPackFp32 1315be168c0dSopenharmony_ci beq End 1316be168c0dSopenharmony_ci st1 {v24.s}[3], [x2], #4 1317be168c0dSopenharmony_ci b End 1318be168c0dSopenharmony_ci- 1319be168c0dSopenharmony_ci+ 1320be168c0dSopenharmony_ci Compute1x8Unit: 1321be168c0dSopenharmony_ci mov x7, x0 // reload a-ptr 1322be168c0dSopenharmony_ci mov w8, w5 // reset depth 1323be168c0dSopenharmony_ci@@ -140,7 +140,7 @@ asm_default_function MatVecMulPackFp32 1324be168c0dSopenharmony_ci fmax v25.4s, v25.4s, v1.4s 1325be168c0dSopenharmony_ci Return1x8: 1326be168c0dSopenharmony_ci ret 1327be168c0dSopenharmony_ci- 1328be168c0dSopenharmony_ci+ 1329be168c0dSopenharmony_ci Compute1x4Unit: 1330be168c0dSopenharmony_ci mov x7, x0 // reload a-ptr 1331be168c0dSopenharmony_ci mov w8, w5 // reset depth 1332be168c0dSopenharmony_ci@@ -191,9 +191,8 @@ asm_default_function MatVecMulPackFp32 1333be168c0dSopenharmony_ci fmax v24.4s, v24.4s, v1.4s 1334be168c0dSopenharmony_ci Return1x4: 1335be168c0dSopenharmony_ci ret 1336be168c0dSopenharmony_ci- 1337be168c0dSopenharmony_ci+ 1338be168c0dSopenharmony_ci End: 1339be168c0dSopenharmony_ci- sub sp, sp, #16 1340be168c0dSopenharmony_ci ldp x29, x30, [sp], #16 1341be168c0dSopenharmony_ci ret 1342be168c0dSopenharmony_ci #endif 1343be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32.S 1344be168c0dSopenharmony_ciindex 67d20dcc..2dedccd0 100644 1345be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32.S 1346be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32.S 1347be168c0dSopenharmony_ci@@ -34,17 +34,18 @@ 1348be168c0dSopenharmony_ci 1349be168c0dSopenharmony_ci asm_function MatmulFloatNeon64 1350be168c0dSopenharmony_ci sub sp, sp, #144 1351be168c0dSopenharmony_ci- st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1352be168c0dSopenharmony_ci- st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1353be168c0dSopenharmony_ci- stp x19, x20, [sp], #16 1354be168c0dSopenharmony_ci+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp] 1355be168c0dSopenharmony_ci+ add x9, sp, #64 1356be168c0dSopenharmony_ci+ st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9] 1357be168c0dSopenharmony_ci+ stp x19, x20, [sp, #128] 1358be168c0dSopenharmony_ci 1359be168c0dSopenharmony_ci- ldr x9, [sp, #8] 1360be168c0dSopenharmony_ci- ldr x14, [sp, #16] 1361be168c0dSopenharmony_ci+ ldr x9, [sp, #152] 1362be168c0dSopenharmony_ci+ ldr x14, [sp, #160] 1363be168c0dSopenharmony_ci 1364be168c0dSopenharmony_ci mov w19, #32 // sizeof(float) * 8 1365be168c0dSopenharmony_ci mul w15, w5, w19 // block stride of lhs/rhs: sizeof(float) * 8 * depth 1366be168c0dSopenharmony_ci mov x19, #4 1367be168c0dSopenharmony_ci- ldr x17, [sp] 1368be168c0dSopenharmony_ci+ ldr x17, [sp, #144] 1369be168c0dSopenharmony_ci cbz x14, NoWinoSteps 1370be168c0dSopenharmony_ci mul x8, x7, x17 1371be168c0dSopenharmony_ci mov x11, #8 1372be168c0dSopenharmony_ci@@ -779,7 +780,6 @@ NoDstStep: 1373be168c0dSopenharmony_ci bgt L1 1374be168c0dSopenharmony_ci 1375be168c0dSopenharmony_ci End1: 1376be168c0dSopenharmony_ci- sub sp, sp, #144 1377be168c0dSopenharmony_ci ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1378be168c0dSopenharmony_ci ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1379be168c0dSopenharmony_ci ldp x19, x20, [sp], #16 1380be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32Opt.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32Opt.S 1381be168c0dSopenharmony_ciindex 6937f4ba..51d107c8 100644 1382be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32Opt.S 1383be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32Opt.S 1384be168c0dSopenharmony_ci@@ -19,7 +19,7 @@ 1385be168c0dSopenharmony_ci .text 1386be168c0dSopenharmony_ci .align 5 1387be168c0dSopenharmony_ci 1388be168c0dSopenharmony_ci-// void MatmulFloatNeon64(const float *a, const float *b, float *c, const float *bias, int act_type, int depth 1389be168c0dSopenharmony_ci+// void MatmulFloatNeon64Opt(const float *a, const float *b, float *c, const float *bias, int act_type, int depth 1390be168c0dSopenharmony_ci // int row, int col, size_t stride, size_t writeMode) 1391be168c0dSopenharmony_ci // x0: a 1392be168c0dSopenharmony_ci // x1: b 1393be168c0dSopenharmony_ci@@ -34,13 +34,14 @@ 1394be168c0dSopenharmony_ci 1395be168c0dSopenharmony_ci asm_function MatmulFloatNeon64Opt 1396be168c0dSopenharmony_ci sub sp, sp, #160 1397be168c0dSopenharmony_ci- st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1398be168c0dSopenharmony_ci- st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1399be168c0dSopenharmony_ci- stp x19, x20, [sp], #16 1400be168c0dSopenharmony_ci- stp x21, x22, [sp], #16 1401be168c0dSopenharmony_ci+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp] 1402be168c0dSopenharmony_ci+ add x9, sp, #64 1403be168c0dSopenharmony_ci+ st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9] 1404be168c0dSopenharmony_ci+ stp x19, x20, [sp, #128] 1405be168c0dSopenharmony_ci+ stp x21, x22, [sp, #144] 1406be168c0dSopenharmony_ci 1407be168c0dSopenharmony_ci- ldr x8, [sp] 1408be168c0dSopenharmony_ci- ldr x9, [sp, #8] 1409be168c0dSopenharmony_ci+ ldr x8, [sp, #160] 1410be168c0dSopenharmony_ci+ ldr x9, [sp, #168] 1411be168c0dSopenharmony_ci 1412be168c0dSopenharmony_ci mov x21, #48 // sizeof(float) * 12 1413be168c0dSopenharmony_ci mul x17, x5, x21 // block stride of lhs/rhs: sizeof(float) * 12 * depth 1414be168c0dSopenharmony_ci@@ -1659,7 +1660,6 @@ LoopColEnd: 1415be168c0dSopenharmony_ci subs x6, x6, #12 1416be168c0dSopenharmony_ci bgt LoopRowStart 1417be168c0dSopenharmony_ci 1418be168c0dSopenharmony_ci- sub sp, sp, #160 1419be168c0dSopenharmony_ci ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1420be168c0dSopenharmony_ci ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1421be168c0dSopenharmony_ci ldp x19, x20, [sp], #16 1422be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow12.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow12.S 1423be168c0dSopenharmony_ciindex c9151a99..05465bd1 100644 1424be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow12.S 1425be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow12.S 1426be168c0dSopenharmony_ci@@ -34,13 +34,14 @@ 1427be168c0dSopenharmony_ci 1428be168c0dSopenharmony_ci asm_function MatmulFloatNeon64OptRow12 1429be168c0dSopenharmony_ci sub sp, sp, #160 1430be168c0dSopenharmony_ci- st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1431be168c0dSopenharmony_ci- st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1432be168c0dSopenharmony_ci- stp x19, x20, [sp], #16 1433be168c0dSopenharmony_ci- stp x21, x22, [sp], #16 1434be168c0dSopenharmony_ci+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp] 1435be168c0dSopenharmony_ci+ add x9, sp, #64 1436be168c0dSopenharmony_ci+ st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9] 1437be168c0dSopenharmony_ci+ stp x19, x20, [sp, #128] 1438be168c0dSopenharmony_ci+ stp x21, x22, [sp, #144] 1439be168c0dSopenharmony_ci 1440be168c0dSopenharmony_ci- ldr x8, [sp] 1441be168c0dSopenharmony_ci- ldr x9, [sp, #8] 1442be168c0dSopenharmony_ci+ ldr x8, [sp, #160] 1443be168c0dSopenharmony_ci+ ldr x9, [sp, #168] 1444be168c0dSopenharmony_ci 1445be168c0dSopenharmony_ci mov x21, #48 // sizeof(float) * 12 1446be168c0dSopenharmony_ci mul x17, x5, x21 // block stride of lhs/rhs: sizeof(float) * 12 * depth 1447be168c0dSopenharmony_ci@@ -1220,7 +1221,6 @@ LoopColEnd: 1448be168c0dSopenharmony_ci subs x6, x6, #12 1449be168c0dSopenharmony_ci bgt LoopRow 1450be168c0dSopenharmony_ci 1451be168c0dSopenharmony_ci- sub sp, sp, #160 1452be168c0dSopenharmony_ci ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1453be168c0dSopenharmony_ci ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1454be168c0dSopenharmony_ci ldp x19, x20, [sp], #16 1455be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow4.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow4.S 1456be168c0dSopenharmony_ciindex 0cc49fb9..b984c494 100644 1457be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow4.S 1458be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow4.S 1459be168c0dSopenharmony_ci@@ -19,7 +19,7 @@ 1460be168c0dSopenharmony_ci .text 1461be168c0dSopenharmony_ci .align 5 1462be168c0dSopenharmony_ci 1463be168c0dSopenharmony_ci-// void MatmulFloatNeon64(const float *a, const float *b, float *c, const float *bias, int act_type, int depth 1464be168c0dSopenharmony_ci+// void MatmulFloatNeon64OptRow4(const float *a, const float *b, float *c, const float *bias, int act_type, int depth 1465be168c0dSopenharmony_ci // int row, int col, size_t stride, size_t writeMode) 1466be168c0dSopenharmony_ci // x0: a 1467be168c0dSopenharmony_ci // x1: b 1468be168c0dSopenharmony_ci@@ -34,13 +34,14 @@ 1469be168c0dSopenharmony_ci 1470be168c0dSopenharmony_ci asm_function MatmulFloatNeon64OptRow4 1471be168c0dSopenharmony_ci sub sp, sp, #160 1472be168c0dSopenharmony_ci- st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1473be168c0dSopenharmony_ci- st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1474be168c0dSopenharmony_ci- stp x19, x20, [sp], #16 1475be168c0dSopenharmony_ci- stp x21, x22, [sp], #16 1476be168c0dSopenharmony_ci+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp] 1477be168c0dSopenharmony_ci+ add x9, sp, #64 1478be168c0dSopenharmony_ci+ st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9] 1479be168c0dSopenharmony_ci+ stp x19, x20, [sp, #128] 1480be168c0dSopenharmony_ci+ stp x21, x22, [sp, #144] 1481be168c0dSopenharmony_ci 1482be168c0dSopenharmony_ci- ldr x8, [sp] 1483be168c0dSopenharmony_ci- ldr x9, [sp, #8] 1484be168c0dSopenharmony_ci+ ldr x8, [sp, #160] 1485be168c0dSopenharmony_ci+ ldr x9, [sp, #168] 1486be168c0dSopenharmony_ci 1487be168c0dSopenharmony_ci mov x21, #48 // sizeof(float) * 12 1488be168c0dSopenharmony_ci 1489be168c0dSopenharmony_ci@@ -588,7 +589,6 @@ LoopColEnd: 1490be168c0dSopenharmony_ci subs x6, x6, #12 1491be168c0dSopenharmony_ci bgt LoopRow4 1492be168c0dSopenharmony_ci 1493be168c0dSopenharmony_ci- sub sp, sp, #160 1494be168c0dSopenharmony_ci ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1495be168c0dSopenharmony_ci ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1496be168c0dSopenharmony_ci ldp x19, x20, [sp], #16 1497be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow8.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow8.S 1498be168c0dSopenharmony_ciindex a9e42a54..c5b260c0 100644 1499be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow8.S 1500be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow8.S 1501be168c0dSopenharmony_ci@@ -34,13 +34,14 @@ 1502be168c0dSopenharmony_ci 1503be168c0dSopenharmony_ci asm_function MatmulFloatNeon64OptRow8 1504be168c0dSopenharmony_ci sub sp, sp, #160 1505be168c0dSopenharmony_ci- st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1506be168c0dSopenharmony_ci- st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1507be168c0dSopenharmony_ci- stp x19, x20, [sp], #16 1508be168c0dSopenharmony_ci- stp x21, x22, [sp], #16 1509be168c0dSopenharmony_ci+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp] 1510be168c0dSopenharmony_ci+ add x9, sp, #64 1511be168c0dSopenharmony_ci+ st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9] 1512be168c0dSopenharmony_ci+ stp x19, x20, [sp, #128] 1513be168c0dSopenharmony_ci+ stp x21, x22, [sp, #144] 1514be168c0dSopenharmony_ci 1515be168c0dSopenharmony_ci- ldr x8, [sp] 1516be168c0dSopenharmony_ci- ldr x9, [sp, #8] 1517be168c0dSopenharmony_ci+ ldr x8, [sp, #160] 1518be168c0dSopenharmony_ci+ ldr x9, [sp, #168] 1519be168c0dSopenharmony_ci 1520be168c0dSopenharmony_ci mov x21, #48 // sizeof(float) * 12 1521be168c0dSopenharmony_ci mul x17, x5, x21 // block stride of lhs/rhs: sizeof(float) * 12 * depth 1522be168c0dSopenharmony_ci@@ -902,7 +903,6 @@ LoopColEnd: 1523be168c0dSopenharmony_ci subs x6, x6, #12 1524be168c0dSopenharmony_ci bgt LoopCol8 1525be168c0dSopenharmony_ci 1526be168c0dSopenharmony_ci- sub sp, sp, #160 1527be168c0dSopenharmony_ci ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1528be168c0dSopenharmony_ci ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1529be168c0dSopenharmony_ci ldp x19, x20, [sp], #16 1530be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulInt8.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulInt8.S 1531be168c0dSopenharmony_ciindex a0e94c5f..731bac4b 100644 1532be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulInt8.S 1533be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulInt8.S 1534be168c0dSopenharmony_ci@@ -44,24 +44,25 @@ 1535be168c0dSopenharmony_ci 1536be168c0dSopenharmony_ci asm_function MatmulInt8Neon64 1537be168c0dSopenharmony_ci sub sp, sp, #208 1538be168c0dSopenharmony_ci- st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1539be168c0dSopenharmony_ci- st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1540be168c0dSopenharmony_ci- stp x19, x20, [sp], #16 1541be168c0dSopenharmony_ci- stp x21, x22, [sp], #16 1542be168c0dSopenharmony_ci- stp x23, x24, [sp], #16 1543be168c0dSopenharmony_ci- stp x25, x26, [sp], #16 1544be168c0dSopenharmony_ci- stp x27, x28, [sp], #16 1545be168c0dSopenharmony_ci- 1546be168c0dSopenharmony_ci- ldr w8, [sp] 1547be168c0dSopenharmony_ci- ldr w9, [sp, #8] 1548be168c0dSopenharmony_ci- ldr w10, [sp, #16] 1549be168c0dSopenharmony_ci- ldr x11, [sp, #24] 1550be168c0dSopenharmony_ci- ldr x12, [sp, #32] 1551be168c0dSopenharmony_ci- ldr x13, [sp, #40] 1552be168c0dSopenharmony_ci- ldr w14, [sp, #48] 1553be168c0dSopenharmony_ci- ldr w15, [sp, #56] 1554be168c0dSopenharmony_ci- ldr w24, [sp, #64] 1555be168c0dSopenharmony_ci- ldr w27, [sp, #72] 1556be168c0dSopenharmony_ci+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp] 1557be168c0dSopenharmony_ci+ add x9, sp, #64 1558be168c0dSopenharmony_ci+ st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9] 1559be168c0dSopenharmony_ci+ stp x19, x20, [sp, #128] 1560be168c0dSopenharmony_ci+ stp x21, x22, [sp, #144] 1561be168c0dSopenharmony_ci+ stp x23, x24, [sp, #160] 1562be168c0dSopenharmony_ci+ stp x25, x26, [sp, #176] 1563be168c0dSopenharmony_ci+ stp x27, x28, [sp, #192] 1564be168c0dSopenharmony_ci+ 1565be168c0dSopenharmony_ci+ ldr w8, [sp, #208] 1566be168c0dSopenharmony_ci+ ldr w9, [sp, #216] 1567be168c0dSopenharmony_ci+ ldr w10, [sp, #224] 1568be168c0dSopenharmony_ci+ ldr x11, [sp, #232] 1569be168c0dSopenharmony_ci+ ldr x12, [sp, #240] 1570be168c0dSopenharmony_ci+ ldr x13, [sp, #248] 1571be168c0dSopenharmony_ci+ ldr w14, [sp, #256] 1572be168c0dSopenharmony_ci+ ldr w15, [sp, #264] 1573be168c0dSopenharmony_ci+ ldr w24, [sp, #272] 1574be168c0dSopenharmony_ci+ ldr w27, [sp, #280] 1575be168c0dSopenharmony_ci 1576be168c0dSopenharmony_ci mov w17, #4 // sizeof(int8)*4 1577be168c0dSopenharmony_ci mul w21, w5, w17 // the stride of a/b: sizeof(int8)*4*deep16 1578be168c0dSopenharmony_ci@@ -408,7 +409,6 @@ PerTEnd2: 1579be168c0dSopenharmony_ci b L1 1580be168c0dSopenharmony_ci 1581be168c0dSopenharmony_ci End1: 1582be168c0dSopenharmony_ci- sub sp, sp, #208 1583be168c0dSopenharmony_ci ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1584be168c0dSopenharmony_ci ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1585be168c0dSopenharmony_ci ldp x19, x20, [sp], #16 1586be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulInt8Opt.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulInt8Opt.S 1587be168c0dSopenharmony_ciindex 64be8a14..a54ee5b8 100644 1588be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulInt8Opt.S 1589be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulInt8Opt.S 1590be168c0dSopenharmony_ci@@ -43,23 +43,24 @@ 1591be168c0dSopenharmony_ci 1592be168c0dSopenharmony_ci asm_function MatmulInt8Opt 1593be168c0dSopenharmony_ci sub sp, sp, #224 1594be168c0dSopenharmony_ci- st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1595be168c0dSopenharmony_ci- st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1596be168c0dSopenharmony_ci- stp x19, x20, [sp], #16 1597be168c0dSopenharmony_ci- stp x21, x22, [sp], #16 1598be168c0dSopenharmony_ci- stp x23, x24, [sp], #16 1599be168c0dSopenharmony_ci- stp x25, x26, [sp], #16 1600be168c0dSopenharmony_ci- stp x27, x28, [sp], #16 1601be168c0dSopenharmony_ci- stp x29, x30, [sp], #16 1602be168c0dSopenharmony_ci- 1603be168c0dSopenharmony_ci- ldr w8, [sp] 1604be168c0dSopenharmony_ci- ldr w9, [sp, #8] 1605be168c0dSopenharmony_ci- ldr w10, [sp, #16] 1606be168c0dSopenharmony_ci- ldr x11, [sp, #24] 1607be168c0dSopenharmony_ci- ldr x12, [sp, #32] 1608be168c0dSopenharmony_ci- ldr x13, [sp, #40] 1609be168c0dSopenharmony_ci- ldr x14, [sp, #48] 1610be168c0dSopenharmony_ci- ldr x15, [sp, #56] 1611be168c0dSopenharmony_ci+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp] 1612be168c0dSopenharmony_ci+ add x9, sp, #64 1613be168c0dSopenharmony_ci+ st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9] 1614be168c0dSopenharmony_ci+ stp x19, x20, [sp, #128] 1615be168c0dSopenharmony_ci+ stp x21, x22, [sp, #144] 1616be168c0dSopenharmony_ci+ stp x23, x24, [sp, #160] 1617be168c0dSopenharmony_ci+ stp x25, x26, [sp, #176] 1618be168c0dSopenharmony_ci+ stp x27, x28, [sp, #192] 1619be168c0dSopenharmony_ci+ stp x29, x30, [sp, #208] 1620be168c0dSopenharmony_ci+ 1621be168c0dSopenharmony_ci+ ldr w8, [sp, #224] 1622be168c0dSopenharmony_ci+ ldr w9, [sp, #232] 1623be168c0dSopenharmony_ci+ ldr w10, [sp, #240] 1624be168c0dSopenharmony_ci+ ldr x11, [sp, #248] 1625be168c0dSopenharmony_ci+ ldr x12, [sp, #256] 1626be168c0dSopenharmony_ci+ ldr x13, [sp, #264] 1627be168c0dSopenharmony_ci+ ldr x14, [sp, #272] 1628be168c0dSopenharmony_ci+ ldr x15, [sp, #280] 1629be168c0dSopenharmony_ci 1630be168c0dSopenharmony_ci mov x23, #4 1631be168c0dSopenharmony_ci mul x23, x23, x5 // lhs step 1632be168c0dSopenharmony_ci@@ -70,7 +71,7 @@ LoopRow: 1633be168c0dSopenharmony_ci mov x17, x4 // reload rhs col 1634be168c0dSopenharmony_ci mov x29, x7 // reload bias ptr 1635be168c0dSopenharmony_ci mov x27, x2 // reload dst ptr 1636be168c0dSopenharmony_ci- ldr x28, [sp, #64] // reload filter_zp 1637be168c0dSopenharmony_ci+ ldr x28, [sp, #288] // reload filter_zp 1638be168c0dSopenharmony_ci 1639be168c0dSopenharmony_ci LoopCol: 1640be168c0dSopenharmony_ci mov x25, x6 // reload a_sums ptr 1641be168c0dSopenharmony_ci@@ -334,16 +335,15 @@ LoopRow: 1642be168c0dSopenharmony_ci LoopColEnd: 1643be168c0dSopenharmony_ci subs x3, x3, #4 1644be168c0dSopenharmony_ci ble LoopRowEnd 1645be168c0dSopenharmony_ci- ldr x11, [sp, #24] 1646be168c0dSopenharmony_ci- ldr x12, [sp, #32] 1647be168c0dSopenharmony_ci- ldr x13, [sp, #40] 1648be168c0dSopenharmony_ci+ ldr x11, [sp, #248] 1649be168c0dSopenharmony_ci+ ldr x12, [sp, #256] 1650be168c0dSopenharmony_ci+ ldr x13, [sp, #264] 1651be168c0dSopenharmony_ci add x6, x6, #16 1652be168c0dSopenharmony_ci add x0, x0, x23 1653be168c0dSopenharmony_ci add x2, x2, x24 1654be168c0dSopenharmony_ci b LoopRow 1655be168c0dSopenharmony_ci 1656be168c0dSopenharmony_ci LoopRowEnd: 1657be168c0dSopenharmony_ci- sub sp, sp, #224 1658be168c0dSopenharmony_ci ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1659be168c0dSopenharmony_ci ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1660be168c0dSopenharmony_ci ldp x19, x20, [sp], #16 1661be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulR4Int8.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulR4Int8.S 1662be168c0dSopenharmony_ciindex fe5207ad..adb0a42c 100644 1663be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulR4Int8.S 1664be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulR4Int8.S 1665be168c0dSopenharmony_ci@@ -33,9 +33,10 @@ 1666be168c0dSopenharmony_ci 1667be168c0dSopenharmony_ci asm_function MatMulR4Int8Neon64 1668be168c0dSopenharmony_ci sub sp, sp, #144 1669be168c0dSopenharmony_ci- st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1670be168c0dSopenharmony_ci- st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1671be168c0dSopenharmony_ci- stp x19, x20, [sp], #16 1672be168c0dSopenharmony_ci+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp] 1673be168c0dSopenharmony_ci+ add x9, sp, #64 1674be168c0dSopenharmony_ci+ st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9] 1675be168c0dSopenharmony_ci+ stp x19, x20, [sp, #128] 1676be168c0dSopenharmony_ci 1677be168c0dSopenharmony_ci mov w15, #0 // b col index 1678be168c0dSopenharmony_ci mov w16, #0 // a row index 1679be168c0dSopenharmony_ci@@ -185,7 +186,6 @@ End2: 1680be168c0dSopenharmony_ci b L1 1681be168c0dSopenharmony_ci 1682be168c0dSopenharmony_ci End1: 1683be168c0dSopenharmony_ci- sub sp, sp, #144 1684be168c0dSopenharmony_ci ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1685be168c0dSopenharmony_ci ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1686be168c0dSopenharmony_ci ldp x19, x20, [sp], #16 1687be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulWinogradFp32.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulWinogradFp32.S 1688be168c0dSopenharmony_ciindex 0b814ce4..23032ab9 100644 1689be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulWinogradFp32.S 1690be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulWinogradFp32.S 1691be168c0dSopenharmony_ci@@ -27,9 +27,9 @@ asm_function MatrixMultiplyWinograd 1692be168c0dSopenharmony_ci // x19 ~ x29 should be also preserved 1693be168c0dSopenharmony_ci // whereas our coding style do not permit such amount of parameters 1694be168c0dSopenharmony_ci sub sp, sp, #48 1695be168c0dSopenharmony_ci- st1 {v8.4s}, [sp], #16 1696be168c0dSopenharmony_ci- stp x19, x20, [sp], #16 1697be168c0dSopenharmony_ci- stp x21, x22, [sp], #16 1698be168c0dSopenharmony_ci+ st1 {v8.4s}, [sp] 1699be168c0dSopenharmony_ci+ stp x19, x20, [sp, #16] 1700be168c0dSopenharmony_ci+ stp x21, x22, [sp, #32] 1701be168c0dSopenharmony_ci mov x8, #4 1702be168c0dSopenharmony_ci mul x10, x5, x8 1703be168c0dSopenharmony_ci mov x17, x3 // m 1704be168c0dSopenharmony_ci@@ -176,7 +176,6 @@ asm_function MatrixMultiplyWinograd 1705be168c0dSopenharmony_ci add x0, x0, x21 1706be168c0dSopenharmony_ci b LoopM 1707be168c0dSopenharmony_ci EndLoopM: 1708be168c0dSopenharmony_ci- sub sp, sp, #48 1709be168c0dSopenharmony_ci ld1 {v8.4s}, [sp], #16 1710be168c0dSopenharmony_ci ldp x19, x20, [sp], #16 1711be168c0dSopenharmony_ci ldp x21, x22, [sp], #16 1712be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/PostFuncBiasReluC8.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/PostFuncBiasReluC8.S 1713be168c0dSopenharmony_ciindex 5355d302..1392ab4a 100644 1714be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/PostFuncBiasReluC8.S 1715be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/PostFuncBiasReluC8.S 1716be168c0dSopenharmony_ci@@ -34,8 +34,9 @@ 1717be168c0dSopenharmony_ci 1718be168c0dSopenharmony_ci asm_function PostFuncBiasReluC8 1719be168c0dSopenharmony_ci sub sp, sp, #128 1720be168c0dSopenharmony_ci- st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1721be168c0dSopenharmony_ci- st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1722be168c0dSopenharmony_ci+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp] 1723be168c0dSopenharmony_ci+ add x9, sp, #64 1724be168c0dSopenharmony_ci+ st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9] 1725be168c0dSopenharmony_ci 1726be168c0dSopenharmony_ci movi v26.4s, #6 1727be168c0dSopenharmony_ci scvtf v26.4s, v26.4s 1728be168c0dSopenharmony_ci@@ -546,7 +547,6 @@ Loop_C1_7_Write: 1729be168c0dSopenharmony_ci b Loop_C1_7_Write 1730be168c0dSopenharmony_ci 1731be168c0dSopenharmony_ci End: 1732be168c0dSopenharmony_ci- sub sp, sp, #128 1733be168c0dSopenharmony_ci ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1734be168c0dSopenharmony_ci ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1735be168c0dSopenharmony_ci ret 1736be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/PostFuncInt8C4Neon64.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/PostFuncInt8C4Neon64.S 1737be168c0dSopenharmony_ciindex 0818d74e..a240b64d 100644 1738be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/PostFuncInt8C4Neon64.S 1739be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/PostFuncInt8C4Neon64.S 1740be168c0dSopenharmony_ci@@ -54,14 +54,14 @@ 1741be168c0dSopenharmony_ci 1742be168c0dSopenharmony_ci asm_function PostFuncInt8C4Neon64 1743be168c0dSopenharmony_ci sub sp, sp, #16 1744be168c0dSopenharmony_ci- stp x24, x25, [sp], #16 1745be168c0dSopenharmony_ci+ stp x24, x25, [sp] 1746be168c0dSopenharmony_ci 1747be168c0dSopenharmony_ci- ldr w8, [sp] 1748be168c0dSopenharmony_ci- ldr w9, [sp, #8] 1749be168c0dSopenharmony_ci- ldr w10, [sp, #16] 1750be168c0dSopenharmony_ci- ldr w11, [sp, #24] 1751be168c0dSopenharmony_ci- ldr w12, [sp, #32] 1752be168c0dSopenharmony_ci- ldr w13, [sp, #40] 1753be168c0dSopenharmony_ci+ ldr w8, [sp, #16] 1754be168c0dSopenharmony_ci+ ldr w9, [sp, #24] 1755be168c0dSopenharmony_ci+ ldr w10, [sp, #32] 1756be168c0dSopenharmony_ci+ ldr w11, [sp, #40] 1757be168c0dSopenharmony_ci+ ldr w12, [sp, #48] 1758be168c0dSopenharmony_ci+ ldr w13, [sp, #56] 1759be168c0dSopenharmony_ci 1760be168c0dSopenharmony_ci dup v26.4s, w7 1761be168c0dSopenharmony_ci dup v27.4s, w8 1762be168c0dSopenharmony_ci@@ -254,7 +254,6 @@ Loop_C1_3: 1763be168c0dSopenharmony_ci 1764be168c0dSopenharmony_ci 1765be168c0dSopenharmony_ci End: 1766be168c0dSopenharmony_ci- sub sp, sp, #16 1767be168c0dSopenharmony_ci ldp x24, x25, [sp], #16 1768be168c0dSopenharmony_ci ret 1769be168c0dSopenharmony_ci #endif 1770be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/SPMM8x8Fp32.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/SPMM8x8Fp32.S 1771be168c0dSopenharmony_ciindex cfa9bdf8..614d83f8 100644 1772be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/SPMM8x8Fp32.S 1773be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/SPMM8x8Fp32.S 1774be168c0dSopenharmony_ci@@ -55,9 +55,10 @@ 1775be168c0dSopenharmony_ci 1776be168c0dSopenharmony_ci asm_function SPMM8x8Fp32 1777be168c0dSopenharmony_ci sub sp, sp, #144 1778be168c0dSopenharmony_ci- st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1779be168c0dSopenharmony_ci- st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1780be168c0dSopenharmony_ci- stp x19, x20, [sp], #16 1781be168c0dSopenharmony_ci+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp] 1782be168c0dSopenharmony_ci+ add x9, sp, #64 1783be168c0dSopenharmony_ci+ st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9] 1784be168c0dSopenharmony_ci+ stp x19, x20, [sp, #128] 1785be168c0dSopenharmony_ci 1786be168c0dSopenharmony_ci // init output with bias 1787be168c0dSopenharmony_ci ldr w8, [x5], #4 1788be168c0dSopenharmony_ci@@ -286,7 +287,6 @@ WRITE_OUT: 1789be168c0dSopenharmony_ci st1 {v14.4s, v15.4s}, [x4] 1790be168c0dSopenharmony_ci 1791be168c0dSopenharmony_ci End: 1792be168c0dSopenharmony_ci- sub sp, sp, #144 1793be168c0dSopenharmony_ci ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1794be168c0dSopenharmony_ci ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1795be168c0dSopenharmony_ci ldp x19, x20, [sp], #16 1796be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/TiledC4MatmulFp32.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/TiledC4MatmulFp32.S 1797be168c0dSopenharmony_ciindex 5987e68a..e0efc7b2 100644 1798be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/TiledC4MatmulFp32.S 1799be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/TiledC4MatmulFp32.S 1800be168c0dSopenharmony_ci@@ -29,8 +29,9 @@ asm_function TiledC4MatmulFp32 1801be168c0dSopenharmony_ci //x5: oc4 1802be168c0dSopenharmony_ci 1803be168c0dSopenharmony_ci sub sp, sp, #128 1804be168c0dSopenharmony_ci-st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1805be168c0dSopenharmony_ci-st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1806be168c0dSopenharmony_ci+st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp] 1807be168c0dSopenharmony_ci+add x9, sp, #64 1808be168c0dSopenharmony_ci+st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9] 1809be168c0dSopenharmony_ci 1810be168c0dSopenharmony_ci mov x7, #4 //sizeof(float) 1811be168c0dSopenharmony_ci mul x3, x3, x7 1812be168c0dSopenharmony_ci@@ -272,7 +273,6 @@ LoopOcHalf: 1813be168c0dSopenharmony_ci st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x0], #64 1814be168c0dSopenharmony_ci 1815be168c0dSopenharmony_ci LoopOcEnd: 1816be168c0dSopenharmony_ci- sub sp, sp, #128 1817be168c0dSopenharmony_ci ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1818be168c0dSopenharmony_ci ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1819be168c0dSopenharmony_ci ret 1820be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/WinogradTransLeft.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/WinogradTransLeft.S 1821be168c0dSopenharmony_ciindex 4a26b251..243b19de 100644 1822be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/WinogradTransLeft.S 1823be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/WinogradTransLeft.S 1824be168c0dSopenharmony_ci@@ -30,7 +30,7 @@ asm_function WinogradTransLeft 1825be168c0dSopenharmony_ci //x6:length 1826be168c0dSopenharmony_ci 1827be168c0dSopenharmony_ci sub sp, sp, #32 1828be168c0dSopenharmony_ci-stp x19, x20, [sp], #32 1829be168c0dSopenharmony_ci+stp x19, x20, [sp] 1830be168c0dSopenharmony_ci 1831be168c0dSopenharmony_ci mov x8, #16 // 4 * sizeof(float) 1832be168c0dSopenharmony_ci mul x8, x6, x8 1833be168c0dSopenharmony_ci@@ -152,7 +152,6 @@ LoopH: 1834be168c0dSopenharmony_ci subs x4, x4, #1 1835be168c0dSopenharmony_ci bne LoopH 1836be168c0dSopenharmony_ci 1837be168c0dSopenharmony_ci- sub sp, sp, #32 1838be168c0dSopenharmony_ci ldp x19, x20, [sp], #32 1839be168c0dSopenharmony_ci ret 1840be168c0dSopenharmony_ci 1841be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/WinogradTransRight.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/WinogradTransRight.S 1842be168c0dSopenharmony_ciindex 931fa016..95ee50a5 100644 1843be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/WinogradTransRight.S 1844be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/WinogradTransRight.S 1845be168c0dSopenharmony_ci@@ -30,7 +30,7 @@ asm_function WinogradTransRight 1846be168c0dSopenharmony_ci //x6: length 1847be168c0dSopenharmony_ci 1848be168c0dSopenharmony_ci sub sp, sp, #16 1849be168c0dSopenharmony_ci-stp x19, x20, [sp], #16 1850be168c0dSopenharmony_ci+stp x19, x20, [sp] 1851be168c0dSopenharmony_ci 1852be168c0dSopenharmony_ci mov x8, #16 // 4 * sizeof(float) 1853be168c0dSopenharmony_ci mul x8, x6, x8 1854be168c0dSopenharmony_ci@@ -155,7 +155,6 @@ LoopH: 1855be168c0dSopenharmony_ci subs x4, x4, #1 1856be168c0dSopenharmony_ci bne LoopH 1857be168c0dSopenharmony_ci 1858be168c0dSopenharmony_ci- sub sp, sp, #16 1859be168c0dSopenharmony_ci ldp x19, x20, [sp], #16 1860be168c0dSopenharmony_ci ret 1861be168c0dSopenharmony_ci #endif 1862be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/ConvDwFp16Center.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/ConvDwFp16Center.S 1863be168c0dSopenharmony_ciindex 221a1609..56f03dbd 100644 1864be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/ConvDwFp16Center.S 1865be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/ConvDwFp16Center.S 1866be168c0dSopenharmony_ci@@ -31,21 +31,22 @@ asm_function ConvDwFp16Center 1867be168c0dSopenharmony_ci // x19 ~ x29 should be also preserved 1868be168c0dSopenharmony_ci // whereas our coding style do not permit such amount of parameters 1869be168c0dSopenharmony_ci sub sp, sp, #192 1870be168c0dSopenharmony_ci- st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1871be168c0dSopenharmony_ci- st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1872be168c0dSopenharmony_ci- stp x19, x20, [sp], #16 1873be168c0dSopenharmony_ci- stp x21, x22, [sp], #16 1874be168c0dSopenharmony_ci- stp x23, x24, [sp], #16 1875be168c0dSopenharmony_ci- stp x25, x26, [sp], #16 1876be168c0dSopenharmony_ci+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp] 1877be168c0dSopenharmony_ci+ add x9, sp, #64 1878be168c0dSopenharmony_ci+ st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9] 1879be168c0dSopenharmony_ci+ stp x19, x20, [sp, #128] 1880be168c0dSopenharmony_ci+ stp x21, x22, [sp, #144] 1881be168c0dSopenharmony_ci+ stp x23, x24, [sp, #160] 1882be168c0dSopenharmony_ci+ stp x25, x26, [sp, #176] 1883be168c0dSopenharmony_ci 1884be168c0dSopenharmony_ci- ldr x8, [sp] 1885be168c0dSopenharmony_ci- ldr x9, [sp, #8] 1886be168c0dSopenharmony_ci- ldr x10, [sp, #16] 1887be168c0dSopenharmony_ci- ldr x11, [sp, #24] 1888be168c0dSopenharmony_ci- ldr x12, [sp, #32] 1889be168c0dSopenharmony_ci- ldr x13, [sp, #40] 1890be168c0dSopenharmony_ci- ldr x14, [sp, #48] 1891be168c0dSopenharmony_ci- ldr x15, [sp, #56] 1892be168c0dSopenharmony_ci+ ldr x8, [sp, #192] 1893be168c0dSopenharmony_ci+ ldr x9, [sp, #200] 1894be168c0dSopenharmony_ci+ ldr x10, [sp, #208] 1895be168c0dSopenharmony_ci+ ldr x11, [sp, #216] 1896be168c0dSopenharmony_ci+ ldr x12, [sp, #224] 1897be168c0dSopenharmony_ci+ ldr x13, [sp, #232] 1898be168c0dSopenharmony_ci+ ldr x14, [sp, #240] 1899be168c0dSopenharmony_ci+ ldr x15, [sp, #248] 1900be168c0dSopenharmony_ci 1901be168c0dSopenharmony_ci ld1 {v24.8h}, [x3] 1902be168c0dSopenharmony_ci movi v26.8h, #0x46, lsl #8 1903be168c0dSopenharmony_ci@@ -301,7 +302,6 @@ asm_function ConvDwFp16Center 1904be168c0dSopenharmony_ci subs x4, x4, #1 1905be168c0dSopenharmony_ci bne LoopH 1906be168c0dSopenharmony_ci 1907be168c0dSopenharmony_ci- sub sp, sp, #192 1908be168c0dSopenharmony_ci ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 1909be168c0dSopenharmony_ci ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 1910be168c0dSopenharmony_ci ldp x19, x20, [sp], #16 1911be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/DeconvDwFp16Center.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/DeconvDwFp16Center.S 1912be168c0dSopenharmony_ciindex 1266b160..bb37a913 100644 1913be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/DeconvDwFp16Center.S 1914be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/DeconvDwFp16Center.S 1915be168c0dSopenharmony_ci@@ -30,14 +30,14 @@ asm_function DeconvDwFp16Center 1916be168c0dSopenharmony_ci // x19 ~ x29 should be also preserved 1917be168c0dSopenharmony_ci // whereas our coding style do not permit such amount of parameters 1918be168c0dSopenharmony_ci sub sp, sp, #32 1919be168c0dSopenharmony_ci- stp x19, x20, [sp], #16 1920be168c0dSopenharmony_ci- stp x21, x22, [sp], #16 1921be168c0dSopenharmony_ci+ stp x19, x20, [sp] 1922be168c0dSopenharmony_ci+ stp x21, x22, [sp, #16] 1923be168c0dSopenharmony_ci 1924be168c0dSopenharmony_ci- ldr x8, [sp] 1925be168c0dSopenharmony_ci- ldr x9, [sp, #8] 1926be168c0dSopenharmony_ci- ldr x10, [sp, #16] 1927be168c0dSopenharmony_ci- ldr x11, [sp, #24] 1928be168c0dSopenharmony_ci- ldr x12, [sp, #32] 1929be168c0dSopenharmony_ci+ ldr x8, [sp, #32] 1930be168c0dSopenharmony_ci+ ldr x9, [sp, #40] 1931be168c0dSopenharmony_ci+ ldr x10, [sp, #48] 1932be168c0dSopenharmony_ci+ ldr x11, [sp, #56] 1933be168c0dSopenharmony_ci+ ldr x12, [sp, #64] 1934be168c0dSopenharmony_ci 1935be168c0dSopenharmony_ci LoopH: 1936be168c0dSopenharmony_ci mov x15, x0 1937be168c0dSopenharmony_ci@@ -69,7 +69,6 @@ asm_function DeconvDwFp16Center 1938be168c0dSopenharmony_ci subs x3, x3, #1 1939be168c0dSopenharmony_ci bne LoopH 1940be168c0dSopenharmony_ci 1941be168c0dSopenharmony_ci- sub sp, sp, #32 1942be168c0dSopenharmony_ci ldp x19, x20, [sp], #16 1943be168c0dSopenharmony_ci ldp x21, x22, [sp], #16 1944be168c0dSopenharmony_ci ret 1945be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatVecMulFp16.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatVecMulFp16.S 1946be168c0dSopenharmony_ciindex 80a55b75..4f5441a3 100644 1947be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatVecMulFp16.S 1948be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatVecMulFp16.S 1949be168c0dSopenharmony_ci@@ -30,8 +30,9 @@ 1950be168c0dSopenharmony_ci 1951be168c0dSopenharmony_ci asm_function MatVecMulFp16Neon64 1952be168c0dSopenharmony_ci sub sp, sp, #128 1953be168c0dSopenharmony_ci- st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 1954be168c0dSopenharmony_ci- st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64 1955be168c0dSopenharmony_ci+ st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp] 1956be168c0dSopenharmony_ci+ add x9, sp, #64 1957be168c0dSopenharmony_ci+ st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x9] 1958be168c0dSopenharmony_ci 1959be168c0dSopenharmony_ci mov w14, #2 // sizeof(float16) 1960be168c0dSopenharmony_ci mul w8, w14, w5 // rhs depthx1 block stride 1961be168c0dSopenharmony_ci@@ -184,7 +185,6 @@ Write1x1: 1962be168c0dSopenharmony_ci b Loop 1963be168c0dSopenharmony_ci 1964be168c0dSopenharmony_ci End: 1965be168c0dSopenharmony_ci- sub sp, sp, #128 1966be168c0dSopenharmony_ci ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 1967be168c0dSopenharmony_ci ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64 1968be168c0dSopenharmony_ci ret 1969be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/Matmul12X16Fp16.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/Matmul12X16Fp16.S 1970be168c0dSopenharmony_ciindex a0e28b74..9f804fd3 100644 1971be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/Matmul12X16Fp16.S 1972be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/Matmul12X16Fp16.S 1973be168c0dSopenharmony_ci@@ -36,13 +36,14 @@ 1974be168c0dSopenharmony_ci 1975be168c0dSopenharmony_ci asm_function MatMul12x16Fp16Opt 1976be168c0dSopenharmony_ci sub sp, sp, #160 1977be168c0dSopenharmony_ci- st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 1978be168c0dSopenharmony_ci- st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64 1979be168c0dSopenharmony_ci- stp x19, x20, [sp], #16 1980be168c0dSopenharmony_ci- stp x21, x22, [sp], #16 1981be168c0dSopenharmony_ci+ st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp] 1982be168c0dSopenharmony_ci+ add x9, sp, #64 1983be168c0dSopenharmony_ci+ st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x9] 1984be168c0dSopenharmony_ci+ stp x19, x20, [sp, #128] 1985be168c0dSopenharmony_ci+ stp x21, x22, [sp, #144] 1986be168c0dSopenharmony_ci 1987be168c0dSopenharmony_ci- ldr x8, [sp] 1988be168c0dSopenharmony_ci- ldr x9, [sp, #8] 1989be168c0dSopenharmony_ci+ ldr x8, [sp, #160] 1990be168c0dSopenharmony_ci+ ldr x9, [sp, #168] 1991be168c0dSopenharmony_ci 1992be168c0dSopenharmony_ci .macro CLEAR_OUTPUT_V8_V9 1993be168c0dSopenharmony_ci dup v8.4s, wzr 1994be168c0dSopenharmony_ci@@ -1694,7 +1695,6 @@ LoopColEnd: 1995be168c0dSopenharmony_ci subs x6, x6, #12 1996be168c0dSopenharmony_ci bgt LoopRowStart 1997be168c0dSopenharmony_ci 1998be168c0dSopenharmony_ci- sub sp, sp, #160 1999be168c0dSopenharmony_ci ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 2000be168c0dSopenharmony_ci ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64 2001be168c0dSopenharmony_ci ldp x19, x20, [sp], #16 2002be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulBaseFp16Neon.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulBaseFp16Neon.S 2003be168c0dSopenharmony_ciindex 79fa12bc..31f1adbd 100644 2004be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulBaseFp16Neon.S 2005be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulBaseFp16Neon.S 2006be168c0dSopenharmony_ci@@ -34,13 +34,14 @@ 2007be168c0dSopenharmony_ci 2008be168c0dSopenharmony_ci asm_function MatmulBaseFp16Neon 2009be168c0dSopenharmony_ci sub sp, sp, #160 2010be168c0dSopenharmony_ci- st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 2011be168c0dSopenharmony_ci- st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64 2012be168c0dSopenharmony_ci- stp x19, x20, [sp], #16 2013be168c0dSopenharmony_ci- stp x21, x22, [sp], #16 2014be168c0dSopenharmony_ci+ st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp] 2015be168c0dSopenharmony_ci+ add x9, sp, #64 2016be168c0dSopenharmony_ci+ st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x9] 2017be168c0dSopenharmony_ci+ stp x19, x20, [sp, #128] 2018be168c0dSopenharmony_ci+ stp x21, x22, [sp, #144] 2019be168c0dSopenharmony_ci 2020be168c0dSopenharmony_ci- ldr x8, [sp] 2021be168c0dSopenharmony_ci- ldr x9, [sp, #8] // act 2022be168c0dSopenharmony_ci+ ldr x8, [sp, #160] 2023be168c0dSopenharmony_ci+ ldr x9, [sp, #168] // act 2024be168c0dSopenharmony_ci add x8, x8, x8 // stride * sizeof(float16_t) 2025be168c0dSopenharmony_ci 2026be168c0dSopenharmony_ci add x16, x7, x7 // col * sizeof(float16_t) 2027be168c0dSopenharmony_ci@@ -951,7 +952,6 @@ LoopColEnd: 2028be168c0dSopenharmony_ci add x0, x0, x15 2029be168c0dSopenharmony_ci bgt LoopRowStart 2030be168c0dSopenharmony_ci 2031be168c0dSopenharmony_ci- sub sp, sp, #160 2032be168c0dSopenharmony_ci ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 2033be168c0dSopenharmony_ci ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64 2034be168c0dSopenharmony_ci ldp x19, x20, [sp], #16 2035be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulFp16.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulFp16.S 2036be168c0dSopenharmony_ciindex 6bb93f99..1d6b69a6 100644 2037be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulFp16.S 2038be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulFp16.S 2039be168c0dSopenharmony_ci@@ -34,15 +34,16 @@ 2040be168c0dSopenharmony_ci 2041be168c0dSopenharmony_ci asm_function MatmulFp16Neon64 2042be168c0dSopenharmony_ci sub sp, sp, #144 2043be168c0dSopenharmony_ci- st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 2044be168c0dSopenharmony_ci- st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64 2045be168c0dSopenharmony_ci- stp x19, x20, [sp], #16 2046be168c0dSopenharmony_ci+ st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp] 2047be168c0dSopenharmony_ci+ add x9, sp, #64 2048be168c0dSopenharmony_ci+ st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x9] 2049be168c0dSopenharmony_ci+ stp x19, x20, [sp, #128] 2050be168c0dSopenharmony_ci 2051be168c0dSopenharmony_ci mov w18, #16 // sizeof(float16) * 8 2052be168c0dSopenharmony_ci mul w15, w5, w18 // block stride of lhs/rhs: sizeof(float16) * 8 * depth 2053be168c0dSopenharmony_ci mov x11, x3 // bias flag 2054be168c0dSopenharmony_ci mov x19, #2 2055be168c0dSopenharmony_ci- ldr x17, [sp] 2056be168c0dSopenharmony_ci+ ldr x17, [sp, #144] 2057be168c0dSopenharmony_ci mul x17, x17, x19 2058be168c0dSopenharmony_ci 2059be168c0dSopenharmony_ci L1: 2060be168c0dSopenharmony_ci@@ -308,7 +309,7 @@ Relu: 2061be168c0dSopenharmony_ci fmax v31.8h, v31.8h, v14.8h 2062be168c0dSopenharmony_ci 2063be168c0dSopenharmony_ci Write: 2064be168c0dSopenharmony_ci- ldrb w13, [sp, #8] 2065be168c0dSopenharmony_ci+ ldrb w13, [sp, #152] 2066be168c0dSopenharmony_ci cbz w13, WriteC8 2067be168c0dSopenharmony_ci cmp w7, #1 2068be168c0dSopenharmony_ci beq Write1 2069be168c0dSopenharmony_ci@@ -877,14 +878,13 @@ End2: 2070be168c0dSopenharmony_ci subs w7, w7, #8 // rhs col - 8 2071be168c0dSopenharmony_ci add x1, x1, x15 // rhs ptr + stride 2072be168c0dSopenharmony_ci add x3, x3, #16 // bias ptr + stride 2073be168c0dSopenharmony_ci- ldrb w13, [sp, #8] 2074be168c0dSopenharmony_ci+ ldrb w13, [sp, #152] 2075be168c0dSopenharmony_ci cbz w13, NoDstStep 2076be168c0dSopenharmony_ci add x2, x2, #16 // dst ptr + stride 2077be168c0dSopenharmony_ci NoDstStep: 2078be168c0dSopenharmony_ci bgt L1 2079be168c0dSopenharmony_ci 2080be168c0dSopenharmony_ci End1: 2081be168c0dSopenharmony_ci- sub sp, sp, #144 2082be168c0dSopenharmony_ci ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 2083be168c0dSopenharmony_ci ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64 2084be168c0dSopenharmony_ci ldp x19, x20, [sp], #16 2085be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulFp16Opt.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulFp16Opt.S 2086be168c0dSopenharmony_ciindex 4a111066..21348f80 100644 2087be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulFp16Opt.S 2088be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulFp16Opt.S 2089be168c0dSopenharmony_ci@@ -34,12 +34,12 @@ 2090be168c0dSopenharmony_ci 2091be168c0dSopenharmony_ci asm_function MatmulFp16Neon64Opt 2092be168c0dSopenharmony_ci sub sp, sp, #96 2093be168c0dSopenharmony_ci- st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 2094be168c0dSopenharmony_ci- stp x19, x20, [sp], #16 2095be168c0dSopenharmony_ci- stp x21, x22, [sp], #16 2096be168c0dSopenharmony_ci+ st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp] 2097be168c0dSopenharmony_ci+ stp x19, x20, [sp, #64] 2098be168c0dSopenharmony_ci+ stp x21, x22, [sp, #80] 2099be168c0dSopenharmony_ci 2100be168c0dSopenharmony_ci- ldr x8, [sp] 2101be168c0dSopenharmony_ci- ldr x9, [sp, #8] 2102be168c0dSopenharmony_ci+ ldr x8, [sp, #96] 2103be168c0dSopenharmony_ci+ ldr x9, [sp, #104] 2104be168c0dSopenharmony_ci 2105be168c0dSopenharmony_ci mov x21, #32 // sizeof(float16_t) * 16 2106be168c0dSopenharmony_ci mul x17, x5, x21 // block stride of lhs/rhs: sizeof(float16_t) * 16 * depth 2107be168c0dSopenharmony_ci@@ -1178,7 +1178,6 @@ LoopColEnd: 2108be168c0dSopenharmony_ci subs x6, x6, #16 2109be168c0dSopenharmony_ci bgt LoopRowStart 2110be168c0dSopenharmony_ci 2111be168c0dSopenharmony_ci- sub sp, sp, #96 2112be168c0dSopenharmony_ci ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 2113be168c0dSopenharmony_ci ldp x19, x20, [sp], #16 2114be168c0dSopenharmony_ci ldp x21, x22, [sp], #16 2115be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulFp16OptV2.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulFp16OptV2.S 2116be168c0dSopenharmony_ciindex 2d901a3d..40b788c9 100644 2117be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulFp16OptV2.S 2118be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulFp16OptV2.S 2119be168c0dSopenharmony_ci@@ -34,15 +34,16 @@ 2120be168c0dSopenharmony_ci 2121be168c0dSopenharmony_ci asm_function MatmulFp16OptV2 2122be168c0dSopenharmony_ci sub sp, sp, #192 2123be168c0dSopenharmony_ci- st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 2124be168c0dSopenharmony_ci- st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64 2125be168c0dSopenharmony_ci- stp x19, x20, [sp], #16 2126be168c0dSopenharmony_ci- stp x21, x22, [sp], #16 2127be168c0dSopenharmony_ci- stp x23, x24, [sp], #16 2128be168c0dSopenharmony_ci- stp x29, x30, [sp], #16 2129be168c0dSopenharmony_ci- 2130be168c0dSopenharmony_ci- ldr x8, [sp] 2131be168c0dSopenharmony_ci- ldr x9, [sp, #8] // writeMode 2132be168c0dSopenharmony_ci+ st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp] 2133be168c0dSopenharmony_ci+ add x9, sp, #64 2134be168c0dSopenharmony_ci+ st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x9] 2135be168c0dSopenharmony_ci+ stp x19, x20, [sp, #128] 2136be168c0dSopenharmony_ci+ stp x21, x22, [sp, #144] 2137be168c0dSopenharmony_ci+ stp x23, x24, [sp, #160] 2138be168c0dSopenharmony_ci+ stp x29, x30, [sp, #176] 2139be168c0dSopenharmony_ci+ 2140be168c0dSopenharmony_ci+ ldr x8, [sp, #192] 2141be168c0dSopenharmony_ci+ ldr x9, [sp, #200] // writeMode 2142be168c0dSopenharmony_ci lsl x8, x8, #1 // stride * sizeof(float16_t) 2143be168c0dSopenharmony_ci 2144be168c0dSopenharmony_ci lsl x15, x7, #1 // col * sizeof(float16_t) 2145be168c0dSopenharmony_ci@@ -2955,7 +2956,6 @@ Compute1x4Unit: 2146be168c0dSopenharmony_ci ret 2147be168c0dSopenharmony_ci 2148be168c0dSopenharmony_ci End: 2149be168c0dSopenharmony_ci- sub sp, sp, #192 2150be168c0dSopenharmony_ci ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 2151be168c0dSopenharmony_ci ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 2152be168c0dSopenharmony_ci ldp x19, x20, [sp], #16 2153be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulWinogradFp16.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulWinogradFp16.S 2154be168c0dSopenharmony_ciindex 9ee3c4d5..ca0542da 100644 2155be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulWinogradFp16.S 2156be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulWinogradFp16.S 2157be168c0dSopenharmony_ci@@ -27,9 +27,9 @@ asm_function MatrixMultiplyWinogradFp16 2158be168c0dSopenharmony_ci // x19 ~ x29 should be also preserved 2159be168c0dSopenharmony_ci // whereas our coding style do not permit such amount of parameters 2160be168c0dSopenharmony_ci sub sp, sp, #48 2161be168c0dSopenharmony_ci- st1 {v8.8h}, [sp], #16 2162be168c0dSopenharmony_ci- stp x19, x20, [sp], #16 2163be168c0dSopenharmony_ci- stp x21, x22, [sp], #16 2164be168c0dSopenharmony_ci+ st1 {v8.8h}, [sp] 2165be168c0dSopenharmony_ci+ stp x19, x20, [sp, #16] 2166be168c0dSopenharmony_ci+ stp x21, x22, [sp, #32] 2167be168c0dSopenharmony_ci 2168be168c0dSopenharmony_ci mov x8, #2 2169be168c0dSopenharmony_ci mul x10, x5, x8 // n * 2 2170be168c0dSopenharmony_ci@@ -210,7 +210,6 @@ asm_function MatrixMultiplyWinogradFp16 2171be168c0dSopenharmony_ci b LoopM 2172be168c0dSopenharmony_ci 2173be168c0dSopenharmony_ci EndLoopM: 2174be168c0dSopenharmony_ci- sub sp, sp, #48 2175be168c0dSopenharmony_ci ld1 {v8.8h}, [sp], #16 2176be168c0dSopenharmony_ci ldp x19, x20, [sp], #16 2177be168c0dSopenharmony_ci ldp x21, x22, [sp], #16 2178be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/TiledC4MatmulFp16.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/TiledC4MatmulFp16.S 2179be168c0dSopenharmony_ciindex d7570d18..5b616ae7 100644 2180be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/TiledC4MatmulFp16.S 2181be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/TiledC4MatmulFp16.S 2182be168c0dSopenharmony_ci@@ -22,8 +22,9 @@ 2183be168c0dSopenharmony_ci asm_function TiledC4MatmulFp16 2184be168c0dSopenharmony_ci 2185be168c0dSopenharmony_ci sub sp, sp, #128 2186be168c0dSopenharmony_ci-st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 2187be168c0dSopenharmony_ci-st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 2188be168c0dSopenharmony_ci+st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp] 2189be168c0dSopenharmony_ci+add x9, sp, #64 2190be168c0dSopenharmony_ci+st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9] 2191be168c0dSopenharmony_ci 2192be168c0dSopenharmony_ci mov x7, #2 //sizeof(float) 2193be168c0dSopenharmony_ci mul x3, x3, x7 2194be168c0dSopenharmony_ci@@ -265,7 +266,6 @@ LoopOcHalf: 2195be168c0dSopenharmony_ci st1 {v20.4h, v21.4h, v22.4h, v23.4h}, [x0], #32 2196be168c0dSopenharmony_ci 2197be168c0dSopenharmony_ci LoopOcEnd: 2198be168c0dSopenharmony_ci- sub sp, sp, #128 2199be168c0dSopenharmony_ci ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 2200be168c0dSopenharmony_ci ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 2201be168c0dSopenharmony_ci ret 2202be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/VecMatmulFp16.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/VecMatmulFp16.S 2203be168c0dSopenharmony_ciindex d11dd472..0df891d3 100644 2204be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/VecMatmulFp16.S 2205be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/VecMatmulFp16.S 2206be168c0dSopenharmony_ci@@ -31,8 +31,9 @@ 2207be168c0dSopenharmony_ci 2208be168c0dSopenharmony_ci asm_function VecMatmulFp16Neon64_2 2209be168c0dSopenharmony_ci sub sp, sp, #128 2210be168c0dSopenharmony_ci- st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 2211be168c0dSopenharmony_ci- st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64 2212be168c0dSopenharmony_ci+ st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp] 2213be168c0dSopenharmony_ci+ add x9, sp, #64 2214be168c0dSopenharmony_ci+ st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x9] 2215be168c0dSopenharmony_ci 2216be168c0dSopenharmony_ci LoopCol: 2217be168c0dSopenharmony_ci mov x15, x0 // reload a ptr 2218be168c0dSopenharmony_ci@@ -174,7 +175,6 @@ Write7: 2219be168c0dSopenharmony_ci b End 2220be168c0dSopenharmony_ci 2221be168c0dSopenharmony_ci End: 2222be168c0dSopenharmony_ci- sub sp, sp, #128 2223be168c0dSopenharmony_ci ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 2224be168c0dSopenharmony_ci ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64 2225be168c0dSopenharmony_ci ret 2226be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/WinogradTransLeftFp16.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/WinogradTransLeftFp16.S 2227be168c0dSopenharmony_ciindex 1970c16a..c9b4104e 100644 2228be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/WinogradTransLeftFp16.S 2229be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/WinogradTransLeftFp16.S 2230be168c0dSopenharmony_ci@@ -22,7 +22,7 @@ 2231be168c0dSopenharmony_ci asm_function WinogradTransLeftFp16 2232be168c0dSopenharmony_ci 2233be168c0dSopenharmony_ci sub sp, sp, #16 2234be168c0dSopenharmony_ci-stp x19, x20, [sp], #16 2235be168c0dSopenharmony_ci+stp x19, x20, [sp] 2236be168c0dSopenharmony_ci 2237be168c0dSopenharmony_ci mov x8, #8 // 4 * sizeof(float16) 2238be168c0dSopenharmony_ci mul x8, x6, x8 2239be168c0dSopenharmony_ci@@ -144,7 +144,6 @@ LoopH: 2240be168c0dSopenharmony_ci subs x4, x4, #1 2241be168c0dSopenharmony_ci bne LoopH 2242be168c0dSopenharmony_ci 2243be168c0dSopenharmony_ci- sub sp, sp, #16 2244be168c0dSopenharmony_ci ldp x19, x20, [sp], #16 2245be168c0dSopenharmony_ci ret 2246be168c0dSopenharmony_ci 2247be168c0dSopenharmony_cidiff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/WinogradTransRightFp16.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/WinogradTransRightFp16.S 2248be168c0dSopenharmony_ciindex c575f504..46c3cd84 100644 2249be168c0dSopenharmony_ci--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/WinogradTransRightFp16.S 2250be168c0dSopenharmony_ci+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/WinogradTransRightFp16.S 2251be168c0dSopenharmony_ci@@ -22,7 +22,7 @@ 2252be168c0dSopenharmony_ci asm_function WinogradTransRightFp16 2253be168c0dSopenharmony_ci 2254be168c0dSopenharmony_ci sub sp, sp, #16 2255be168c0dSopenharmony_ci-stp x19, x20, [sp], #16 2256be168c0dSopenharmony_ci+stp x19, x20, [sp] 2257be168c0dSopenharmony_ci 2258be168c0dSopenharmony_ci mov x8, #8 // 4 * sizeof(float16) 2259be168c0dSopenharmony_ci mul x8, x6, x8 2260be168c0dSopenharmony_ci@@ -147,7 +147,6 @@ LoopH: 2261be168c0dSopenharmony_ci subs x4, x4, #1 2262be168c0dSopenharmony_ci bne LoopH 2263be168c0dSopenharmony_ci 2264be168c0dSopenharmony_ci- sub sp, sp, #16 2265be168c0dSopenharmony_ci ldp x19, x20, [sp], #16 2266be168c0dSopenharmony_ci 2267be168c0dSopenharmony_ci ret 2268be168c0dSopenharmony_ci-- 2269be168c0dSopenharmony_ci2.17.1 2270be168c0dSopenharmony_ci 2271