/*
 * ChaCha/XChaCha NEON helper functions
 *
 * Copyright (C) 2016-2018 Linaro, Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Originally based on:
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>

	.text
	.align		6

/*
 * chacha_permute - permute one block
 *
 * Permute one 64-byte block where the state matrix is stored in the four NEON
 * registers v0-v3. It performs matrix operations on four words in parallel,
 * but requires shuffling to rearrange the words after each round.
 *
 * The round count is given in w3. Each loop iteration performs one double
 * round (a column round followed by a diagonal round) and subtracts 2 from
 * w3; the loop exits only when w3 reaches exactly zero, so the count is
 * expected to be even and non-zero (an odd count never terminates).
 *
 * In:       v0-v3 = state rows 0-3, w3 = round count
 * Out:      v0-v3 = permuted state rows 0-3
 * Clobbers: w3, x10, v4, v12
 */
SYM_FUNC_START_LOCAL(chacha_permute)

	// v12 = byte-permutation table used by tbl to rotate each word by 8
	adr_l		x10, ROT8
	ld1		{v12.4s}, [x10]

.Ldoubleround:
	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	rev32		v3.8h, v3.8h

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #12
	sri		v1.4s, v4.4s, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	tbl		v3.16b, {v3.16b}, v12.16b

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #7
	sri		v1.4s, v4.4s, #25

	// x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	ext		v1.16b, v1.16b, v1.16b, #4
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	ext		v2.16b, v2.16b, v2.16b, #8
	// x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	ext		v3.16b, v3.16b, v3.16b, #12

	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	rev32		v3.8h, v3.8h

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #12
	sri		v1.4s, v4.4s, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	tbl		v3.16b, {v3.16b}, v12.16b

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #7
	sri		v1.4s, v4.4s, #25

	// undo the row rotation applied before the diagonal round
	// x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	ext		v1.16b, v1.16b, v1.16b, #12
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	ext		v2.16b, v2.16b, v2.16b, #8
	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	ext		v3.16b, v3.16b, v3.16b, #4

	subs		w3, w3, #2
	b.ne		.Ldoubleround

	ret
SYM_FUNC_END(chacha_permute)

/*
 * chacha_block_xor_neon - XOR one 64-byte block with one keystream block
 *
 * Loads the state from [x0] twice (working copy in v0-v3, untouched copy in
 * v8-v11), runs chacha_permute on the working copy, adds the untouched copy
 * back in, XORs the result with the 64 bytes at [x2] and stores 64 bytes to
 * [x1]. The state at [x0] is only read.
 */
SYM_FUNC_START(chacha_block_xor_neon)
	// x0: Input state matrix, s
	// x1: 1 data block output, o
	// x2: 1 data block input, i
	// w3: nrounds

	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	// x0..3 = s0..3
	ld1		{v0.4s-v3.4s}, [x0]
	ld1		{v8.4s-v11.4s}, [x0]

	bl		chacha_permute

	// the input is loaded only after the call: chacha_permute clobbers v4
	ld1		{v4.16b-v7.16b}, [x2]

	// o0 = i0 ^ (x0 + s0)
	add		v0.4s, v0.4s, v8.4s
	eor		v0.16b, v0.16b, v4.16b

	// o1 = i1 ^ (x1 + s1)
	add		v1.4s, v1.4s, v9.4s
	eor		v1.16b, v1.16b, v5.16b

	// o2 = i2 ^ (x2 + s2)
	add		v2.4s, v2.4s, v10.4s
	eor		v2.16b, v2.16b, v6.16b

	// o3 = i3 ^ (x3 + s3)
	add		v3.4s, v3.4s, v11.4s
	eor		v3.16b, v3.16b, v7.16b

	st1		{v0.16b-v3.16b}, [x1]

	ldp		x29, x30, [sp], #16
	ret
SYM_FUNC_END(chacha_block_xor_neon)

/*
 * hchacha_block_neon - permute a state and emit rows 0 and 3
 *
 * Loads the state from [x0] into v0-v3, runs chacha_permute with the round
 * count from w2, then stores v0 followed by v3 (8 32-bit words in total) to
 * [x1]. Unlike chacha_block_xor_neon, the original state is not added back
 * after the permutation.
 */
SYM_FUNC_START(hchacha_block_neon)
	// x0: Input state matrix, s
	// x1: output (8 32-bit words)
	// w2: nrounds

	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	ld1		{v0.4s-v3.4s}, [x0]

	// chacha_permute takes its round count in w3
	mov		w3, w2
	bl		chacha_permute

	st1		{v0.4s}, [x1], #16
	st1		{v3.4s}, [x1]

	ldp		x29, x30, [sp], #16
	ret
SYM_FUNC_END(hchacha_block_neon)

	// scalar register aliases a0-a15 (note that w18 is skipped)
	a0		.req	w12
	a1		.req	w13
	a2		.req	w14
	a3		.req	w15
	a4		.req	w16
	a5		.req	w17
	a6		.req	w19
	a7		.req	w20
	a8		.req	w21
	a9		.req	w22
	a10		.req	w23
	a11		.req	w24
	a12		.req	w25
	a13		.req	w26
	a14		.req	w27
	a15		.req	w28

	.align		6
SYM_FUNC_START(chacha_4block_xor_neon)
	frame_push	10

	// x0: Input state matrix, s
	// x1: 4 data blocks output, o
	// x2: 4 data blocks input, i
	// w3: nrounds
	// x4: byte count

	// x10 = .Lpermute + (byte count % 64), x11 = x10 + 64
	adr_l		x10, .Lpermute
	and		x5, x4, #63
	add		x10, x10, x5
	add		x11, x10, #64

	//
	// This function encrypts four consecutive ChaCha blocks by loading
2028c2ecf20Sopenharmony_ci // the state matrix in NEON registers four times. The algorithm performs 2038c2ecf20Sopenharmony_ci // each operation on the corresponding word of each state matrix, hence 2048c2ecf20Sopenharmony_ci // requires no word shuffling. For final XORing step we transpose the 2058c2ecf20Sopenharmony_ci // matrix by interleaving 32- and then 64-bit words, which allows us to 2068c2ecf20Sopenharmony_ci // do XOR in NEON registers. 2078c2ecf20Sopenharmony_ci // 2088c2ecf20Sopenharmony_ci // At the same time, a fifth block is encrypted in parallel using 2098c2ecf20Sopenharmony_ci // scalar registers 2108c2ecf20Sopenharmony_ci // 2118c2ecf20Sopenharmony_ci adr_l x9, CTRINC // ... and ROT8 2128c2ecf20Sopenharmony_ci ld1 {v30.4s-v31.4s}, [x9] 2138c2ecf20Sopenharmony_ci 2148c2ecf20Sopenharmony_ci // x0..15[0-3] = s0..3[0..3] 2158c2ecf20Sopenharmony_ci add x8, x0, #16 2168c2ecf20Sopenharmony_ci ld4r { v0.4s- v3.4s}, [x0] 2178c2ecf20Sopenharmony_ci ld4r { v4.4s- v7.4s}, [x8], #16 2188c2ecf20Sopenharmony_ci ld4r { v8.4s-v11.4s}, [x8], #16 2198c2ecf20Sopenharmony_ci ld4r {v12.4s-v15.4s}, [x8] 2208c2ecf20Sopenharmony_ci 2218c2ecf20Sopenharmony_ci mov a0, v0.s[0] 2228c2ecf20Sopenharmony_ci mov a1, v1.s[0] 2238c2ecf20Sopenharmony_ci mov a2, v2.s[0] 2248c2ecf20Sopenharmony_ci mov a3, v3.s[0] 2258c2ecf20Sopenharmony_ci mov a4, v4.s[0] 2268c2ecf20Sopenharmony_ci mov a5, v5.s[0] 2278c2ecf20Sopenharmony_ci mov a6, v6.s[0] 2288c2ecf20Sopenharmony_ci mov a7, v7.s[0] 2298c2ecf20Sopenharmony_ci mov a8, v8.s[0] 2308c2ecf20Sopenharmony_ci mov a9, v9.s[0] 2318c2ecf20Sopenharmony_ci mov a10, v10.s[0] 2328c2ecf20Sopenharmony_ci mov a11, v11.s[0] 2338c2ecf20Sopenharmony_ci mov a12, v12.s[0] 2348c2ecf20Sopenharmony_ci mov a13, v13.s[0] 2358c2ecf20Sopenharmony_ci mov a14, v14.s[0] 2368c2ecf20Sopenharmony_ci mov a15, v15.s[0] 2378c2ecf20Sopenharmony_ci 2388c2ecf20Sopenharmony_ci // x12 += counter values 1-4 2398c2ecf20Sopenharmony_ci add v12.4s, v12.4s, v30.4s 
2408c2ecf20Sopenharmony_ci 2418c2ecf20Sopenharmony_ci.Ldoubleround4: 2428c2ecf20Sopenharmony_ci // x0 += x4, x12 = rotl32(x12 ^ x0, 16) 2438c2ecf20Sopenharmony_ci // x1 += x5, x13 = rotl32(x13 ^ x1, 16) 2448c2ecf20Sopenharmony_ci // x2 += x6, x14 = rotl32(x14 ^ x2, 16) 2458c2ecf20Sopenharmony_ci // x3 += x7, x15 = rotl32(x15 ^ x3, 16) 2468c2ecf20Sopenharmony_ci add v0.4s, v0.4s, v4.4s 2478c2ecf20Sopenharmony_ci add a0, a0, a4 2488c2ecf20Sopenharmony_ci add v1.4s, v1.4s, v5.4s 2498c2ecf20Sopenharmony_ci add a1, a1, a5 2508c2ecf20Sopenharmony_ci add v2.4s, v2.4s, v6.4s 2518c2ecf20Sopenharmony_ci add a2, a2, a6 2528c2ecf20Sopenharmony_ci add v3.4s, v3.4s, v7.4s 2538c2ecf20Sopenharmony_ci add a3, a3, a7 2548c2ecf20Sopenharmony_ci 2558c2ecf20Sopenharmony_ci eor v12.16b, v12.16b, v0.16b 2568c2ecf20Sopenharmony_ci eor a12, a12, a0 2578c2ecf20Sopenharmony_ci eor v13.16b, v13.16b, v1.16b 2588c2ecf20Sopenharmony_ci eor a13, a13, a1 2598c2ecf20Sopenharmony_ci eor v14.16b, v14.16b, v2.16b 2608c2ecf20Sopenharmony_ci eor a14, a14, a2 2618c2ecf20Sopenharmony_ci eor v15.16b, v15.16b, v3.16b 2628c2ecf20Sopenharmony_ci eor a15, a15, a3 2638c2ecf20Sopenharmony_ci 2648c2ecf20Sopenharmony_ci rev32 v12.8h, v12.8h 2658c2ecf20Sopenharmony_ci ror a12, a12, #16 2668c2ecf20Sopenharmony_ci rev32 v13.8h, v13.8h 2678c2ecf20Sopenharmony_ci ror a13, a13, #16 2688c2ecf20Sopenharmony_ci rev32 v14.8h, v14.8h 2698c2ecf20Sopenharmony_ci ror a14, a14, #16 2708c2ecf20Sopenharmony_ci rev32 v15.8h, v15.8h 2718c2ecf20Sopenharmony_ci ror a15, a15, #16 2728c2ecf20Sopenharmony_ci 2738c2ecf20Sopenharmony_ci // x8 += x12, x4 = rotl32(x4 ^ x8, 12) 2748c2ecf20Sopenharmony_ci // x9 += x13, x5 = rotl32(x5 ^ x9, 12) 2758c2ecf20Sopenharmony_ci // x10 += x14, x6 = rotl32(x6 ^ x10, 12) 2768c2ecf20Sopenharmony_ci // x11 += x15, x7 = rotl32(x7 ^ x11, 12) 2778c2ecf20Sopenharmony_ci add v8.4s, v8.4s, v12.4s 2788c2ecf20Sopenharmony_ci add a8, a8, a12 2798c2ecf20Sopenharmony_ci add v9.4s, v9.4s, v13.4s 
2808c2ecf20Sopenharmony_ci add a9, a9, a13 2818c2ecf20Sopenharmony_ci add v10.4s, v10.4s, v14.4s 2828c2ecf20Sopenharmony_ci add a10, a10, a14 2838c2ecf20Sopenharmony_ci add v11.4s, v11.4s, v15.4s 2848c2ecf20Sopenharmony_ci add a11, a11, a15 2858c2ecf20Sopenharmony_ci 2868c2ecf20Sopenharmony_ci eor v16.16b, v4.16b, v8.16b 2878c2ecf20Sopenharmony_ci eor a4, a4, a8 2888c2ecf20Sopenharmony_ci eor v17.16b, v5.16b, v9.16b 2898c2ecf20Sopenharmony_ci eor a5, a5, a9 2908c2ecf20Sopenharmony_ci eor v18.16b, v6.16b, v10.16b 2918c2ecf20Sopenharmony_ci eor a6, a6, a10 2928c2ecf20Sopenharmony_ci eor v19.16b, v7.16b, v11.16b 2938c2ecf20Sopenharmony_ci eor a7, a7, a11 2948c2ecf20Sopenharmony_ci 2958c2ecf20Sopenharmony_ci shl v4.4s, v16.4s, #12 2968c2ecf20Sopenharmony_ci shl v5.4s, v17.4s, #12 2978c2ecf20Sopenharmony_ci shl v6.4s, v18.4s, #12 2988c2ecf20Sopenharmony_ci shl v7.4s, v19.4s, #12 2998c2ecf20Sopenharmony_ci 3008c2ecf20Sopenharmony_ci sri v4.4s, v16.4s, #20 3018c2ecf20Sopenharmony_ci ror a4, a4, #20 3028c2ecf20Sopenharmony_ci sri v5.4s, v17.4s, #20 3038c2ecf20Sopenharmony_ci ror a5, a5, #20 3048c2ecf20Sopenharmony_ci sri v6.4s, v18.4s, #20 3058c2ecf20Sopenharmony_ci ror a6, a6, #20 3068c2ecf20Sopenharmony_ci sri v7.4s, v19.4s, #20 3078c2ecf20Sopenharmony_ci ror a7, a7, #20 3088c2ecf20Sopenharmony_ci 3098c2ecf20Sopenharmony_ci // x0 += x4, x12 = rotl32(x12 ^ x0, 8) 3108c2ecf20Sopenharmony_ci // x1 += x5, x13 = rotl32(x13 ^ x1, 8) 3118c2ecf20Sopenharmony_ci // x2 += x6, x14 = rotl32(x14 ^ x2, 8) 3128c2ecf20Sopenharmony_ci // x3 += x7, x15 = rotl32(x15 ^ x3, 8) 3138c2ecf20Sopenharmony_ci add v0.4s, v0.4s, v4.4s 3148c2ecf20Sopenharmony_ci add a0, a0, a4 3158c2ecf20Sopenharmony_ci add v1.4s, v1.4s, v5.4s 3168c2ecf20Sopenharmony_ci add a1, a1, a5 3178c2ecf20Sopenharmony_ci add v2.4s, v2.4s, v6.4s 3188c2ecf20Sopenharmony_ci add a2, a2, a6 3198c2ecf20Sopenharmony_ci add v3.4s, v3.4s, v7.4s 3208c2ecf20Sopenharmony_ci add a3, a3, a7 3218c2ecf20Sopenharmony_ci 
3228c2ecf20Sopenharmony_ci eor v12.16b, v12.16b, v0.16b 3238c2ecf20Sopenharmony_ci eor a12, a12, a0 3248c2ecf20Sopenharmony_ci eor v13.16b, v13.16b, v1.16b 3258c2ecf20Sopenharmony_ci eor a13, a13, a1 3268c2ecf20Sopenharmony_ci eor v14.16b, v14.16b, v2.16b 3278c2ecf20Sopenharmony_ci eor a14, a14, a2 3288c2ecf20Sopenharmony_ci eor v15.16b, v15.16b, v3.16b 3298c2ecf20Sopenharmony_ci eor a15, a15, a3 3308c2ecf20Sopenharmony_ci 3318c2ecf20Sopenharmony_ci tbl v12.16b, {v12.16b}, v31.16b 3328c2ecf20Sopenharmony_ci ror a12, a12, #24 3338c2ecf20Sopenharmony_ci tbl v13.16b, {v13.16b}, v31.16b 3348c2ecf20Sopenharmony_ci ror a13, a13, #24 3358c2ecf20Sopenharmony_ci tbl v14.16b, {v14.16b}, v31.16b 3368c2ecf20Sopenharmony_ci ror a14, a14, #24 3378c2ecf20Sopenharmony_ci tbl v15.16b, {v15.16b}, v31.16b 3388c2ecf20Sopenharmony_ci ror a15, a15, #24 3398c2ecf20Sopenharmony_ci 3408c2ecf20Sopenharmony_ci // x8 += x12, x4 = rotl32(x4 ^ x8, 7) 3418c2ecf20Sopenharmony_ci // x9 += x13, x5 = rotl32(x5 ^ x9, 7) 3428c2ecf20Sopenharmony_ci // x10 += x14, x6 = rotl32(x6 ^ x10, 7) 3438c2ecf20Sopenharmony_ci // x11 += x15, x7 = rotl32(x7 ^ x11, 7) 3448c2ecf20Sopenharmony_ci add v8.4s, v8.4s, v12.4s 3458c2ecf20Sopenharmony_ci add a8, a8, a12 3468c2ecf20Sopenharmony_ci add v9.4s, v9.4s, v13.4s 3478c2ecf20Sopenharmony_ci add a9, a9, a13 3488c2ecf20Sopenharmony_ci add v10.4s, v10.4s, v14.4s 3498c2ecf20Sopenharmony_ci add a10, a10, a14 3508c2ecf20Sopenharmony_ci add v11.4s, v11.4s, v15.4s 3518c2ecf20Sopenharmony_ci add a11, a11, a15 3528c2ecf20Sopenharmony_ci 3538c2ecf20Sopenharmony_ci eor v16.16b, v4.16b, v8.16b 3548c2ecf20Sopenharmony_ci eor a4, a4, a8 3558c2ecf20Sopenharmony_ci eor v17.16b, v5.16b, v9.16b 3568c2ecf20Sopenharmony_ci eor a5, a5, a9 3578c2ecf20Sopenharmony_ci eor v18.16b, v6.16b, v10.16b 3588c2ecf20Sopenharmony_ci eor a6, a6, a10 3598c2ecf20Sopenharmony_ci eor v19.16b, v7.16b, v11.16b 3608c2ecf20Sopenharmony_ci eor a7, a7, a11 3618c2ecf20Sopenharmony_ci 3628c2ecf20Sopenharmony_ci shl 
v4.4s, v16.4s, #7 3638c2ecf20Sopenharmony_ci shl v5.4s, v17.4s, #7 3648c2ecf20Sopenharmony_ci shl v6.4s, v18.4s, #7 3658c2ecf20Sopenharmony_ci shl v7.4s, v19.4s, #7 3668c2ecf20Sopenharmony_ci 3678c2ecf20Sopenharmony_ci sri v4.4s, v16.4s, #25 3688c2ecf20Sopenharmony_ci ror a4, a4, #25 3698c2ecf20Sopenharmony_ci sri v5.4s, v17.4s, #25 3708c2ecf20Sopenharmony_ci ror a5, a5, #25 3718c2ecf20Sopenharmony_ci sri v6.4s, v18.4s, #25 3728c2ecf20Sopenharmony_ci ror a6, a6, #25 3738c2ecf20Sopenharmony_ci sri v7.4s, v19.4s, #25 3748c2ecf20Sopenharmony_ci ror a7, a7, #25 3758c2ecf20Sopenharmony_ci 3768c2ecf20Sopenharmony_ci // x0 += x5, x15 = rotl32(x15 ^ x0, 16) 3778c2ecf20Sopenharmony_ci // x1 += x6, x12 = rotl32(x12 ^ x1, 16) 3788c2ecf20Sopenharmony_ci // x2 += x7, x13 = rotl32(x13 ^ x2, 16) 3798c2ecf20Sopenharmony_ci // x3 += x4, x14 = rotl32(x14 ^ x3, 16) 3808c2ecf20Sopenharmony_ci add v0.4s, v0.4s, v5.4s 3818c2ecf20Sopenharmony_ci add a0, a0, a5 3828c2ecf20Sopenharmony_ci add v1.4s, v1.4s, v6.4s 3838c2ecf20Sopenharmony_ci add a1, a1, a6 3848c2ecf20Sopenharmony_ci add v2.4s, v2.4s, v7.4s 3858c2ecf20Sopenharmony_ci add a2, a2, a7 3868c2ecf20Sopenharmony_ci add v3.4s, v3.4s, v4.4s 3878c2ecf20Sopenharmony_ci add a3, a3, a4 3888c2ecf20Sopenharmony_ci 3898c2ecf20Sopenharmony_ci eor v15.16b, v15.16b, v0.16b 3908c2ecf20Sopenharmony_ci eor a15, a15, a0 3918c2ecf20Sopenharmony_ci eor v12.16b, v12.16b, v1.16b 3928c2ecf20Sopenharmony_ci eor a12, a12, a1 3938c2ecf20Sopenharmony_ci eor v13.16b, v13.16b, v2.16b 3948c2ecf20Sopenharmony_ci eor a13, a13, a2 3958c2ecf20Sopenharmony_ci eor v14.16b, v14.16b, v3.16b 3968c2ecf20Sopenharmony_ci eor a14, a14, a3 3978c2ecf20Sopenharmony_ci 3988c2ecf20Sopenharmony_ci rev32 v15.8h, v15.8h 3998c2ecf20Sopenharmony_ci ror a15, a15, #16 4008c2ecf20Sopenharmony_ci rev32 v12.8h, v12.8h 4018c2ecf20Sopenharmony_ci ror a12, a12, #16 4028c2ecf20Sopenharmony_ci rev32 v13.8h, v13.8h 4038c2ecf20Sopenharmony_ci ror a13, a13, #16 4048c2ecf20Sopenharmony_ci rev32 
v14.8h, v14.8h 4058c2ecf20Sopenharmony_ci ror a14, a14, #16 4068c2ecf20Sopenharmony_ci 4078c2ecf20Sopenharmony_ci // x10 += x15, x5 = rotl32(x5 ^ x10, 12) 4088c2ecf20Sopenharmony_ci // x11 += x12, x6 = rotl32(x6 ^ x11, 12) 4098c2ecf20Sopenharmony_ci // x8 += x13, x7 = rotl32(x7 ^ x8, 12) 4108c2ecf20Sopenharmony_ci // x9 += x14, x4 = rotl32(x4 ^ x9, 12) 4118c2ecf20Sopenharmony_ci add v10.4s, v10.4s, v15.4s 4128c2ecf20Sopenharmony_ci add a10, a10, a15 4138c2ecf20Sopenharmony_ci add v11.4s, v11.4s, v12.4s 4148c2ecf20Sopenharmony_ci add a11, a11, a12 4158c2ecf20Sopenharmony_ci add v8.4s, v8.4s, v13.4s 4168c2ecf20Sopenharmony_ci add a8, a8, a13 4178c2ecf20Sopenharmony_ci add v9.4s, v9.4s, v14.4s 4188c2ecf20Sopenharmony_ci add a9, a9, a14 4198c2ecf20Sopenharmony_ci 4208c2ecf20Sopenharmony_ci eor v16.16b, v5.16b, v10.16b 4218c2ecf20Sopenharmony_ci eor a5, a5, a10 4228c2ecf20Sopenharmony_ci eor v17.16b, v6.16b, v11.16b 4238c2ecf20Sopenharmony_ci eor a6, a6, a11 4248c2ecf20Sopenharmony_ci eor v18.16b, v7.16b, v8.16b 4258c2ecf20Sopenharmony_ci eor a7, a7, a8 4268c2ecf20Sopenharmony_ci eor v19.16b, v4.16b, v9.16b 4278c2ecf20Sopenharmony_ci eor a4, a4, a9 4288c2ecf20Sopenharmony_ci 4298c2ecf20Sopenharmony_ci shl v5.4s, v16.4s, #12 4308c2ecf20Sopenharmony_ci shl v6.4s, v17.4s, #12 4318c2ecf20Sopenharmony_ci shl v7.4s, v18.4s, #12 4328c2ecf20Sopenharmony_ci shl v4.4s, v19.4s, #12 4338c2ecf20Sopenharmony_ci 4348c2ecf20Sopenharmony_ci sri v5.4s, v16.4s, #20 4358c2ecf20Sopenharmony_ci ror a5, a5, #20 4368c2ecf20Sopenharmony_ci sri v6.4s, v17.4s, #20 4378c2ecf20Sopenharmony_ci ror a6, a6, #20 4388c2ecf20Sopenharmony_ci sri v7.4s, v18.4s, #20 4398c2ecf20Sopenharmony_ci ror a7, a7, #20 4408c2ecf20Sopenharmony_ci sri v4.4s, v19.4s, #20 4418c2ecf20Sopenharmony_ci ror a4, a4, #20 4428c2ecf20Sopenharmony_ci 4438c2ecf20Sopenharmony_ci // x0 += x5, x15 = rotl32(x15 ^ x0, 8) 4448c2ecf20Sopenharmony_ci // x1 += x6, x12 = rotl32(x12 ^ x1, 8) 4458c2ecf20Sopenharmony_ci // x2 += x7, x13 = 
rotl32(x13 ^ x2, 8) 4468c2ecf20Sopenharmony_ci // x3 += x4, x14 = rotl32(x14 ^ x3, 8) 4478c2ecf20Sopenharmony_ci add v0.4s, v0.4s, v5.4s 4488c2ecf20Sopenharmony_ci add a0, a0, a5 4498c2ecf20Sopenharmony_ci add v1.4s, v1.4s, v6.4s 4508c2ecf20Sopenharmony_ci add a1, a1, a6 4518c2ecf20Sopenharmony_ci add v2.4s, v2.4s, v7.4s 4528c2ecf20Sopenharmony_ci add a2, a2, a7 4538c2ecf20Sopenharmony_ci add v3.4s, v3.4s, v4.4s 4548c2ecf20Sopenharmony_ci add a3, a3, a4 4558c2ecf20Sopenharmony_ci 4568c2ecf20Sopenharmony_ci eor v15.16b, v15.16b, v0.16b 4578c2ecf20Sopenharmony_ci eor a15, a15, a0 4588c2ecf20Sopenharmony_ci eor v12.16b, v12.16b, v1.16b 4598c2ecf20Sopenharmony_ci eor a12, a12, a1 4608c2ecf20Sopenharmony_ci eor v13.16b, v13.16b, v2.16b 4618c2ecf20Sopenharmony_ci eor a13, a13, a2 4628c2ecf20Sopenharmony_ci eor v14.16b, v14.16b, v3.16b 4638c2ecf20Sopenharmony_ci eor a14, a14, a3 4648c2ecf20Sopenharmony_ci 4658c2ecf20Sopenharmony_ci tbl v15.16b, {v15.16b}, v31.16b 4668c2ecf20Sopenharmony_ci ror a15, a15, #24 4678c2ecf20Sopenharmony_ci tbl v12.16b, {v12.16b}, v31.16b 4688c2ecf20Sopenharmony_ci ror a12, a12, #24 4698c2ecf20Sopenharmony_ci tbl v13.16b, {v13.16b}, v31.16b 4708c2ecf20Sopenharmony_ci ror a13, a13, #24 4718c2ecf20Sopenharmony_ci tbl v14.16b, {v14.16b}, v31.16b 4728c2ecf20Sopenharmony_ci ror a14, a14, #24 4738c2ecf20Sopenharmony_ci 4748c2ecf20Sopenharmony_ci // x10 += x15, x5 = rotl32(x5 ^ x10, 7) 4758c2ecf20Sopenharmony_ci // x11 += x12, x6 = rotl32(x6 ^ x11, 7) 4768c2ecf20Sopenharmony_ci // x8 += x13, x7 = rotl32(x7 ^ x8, 7) 4778c2ecf20Sopenharmony_ci // x9 += x14, x4 = rotl32(x4 ^ x9, 7) 4788c2ecf20Sopenharmony_ci add v10.4s, v10.4s, v15.4s 4798c2ecf20Sopenharmony_ci add a10, a10, a15 4808c2ecf20Sopenharmony_ci add v11.4s, v11.4s, v12.4s 4818c2ecf20Sopenharmony_ci add a11, a11, a12 4828c2ecf20Sopenharmony_ci add v8.4s, v8.4s, v13.4s 4838c2ecf20Sopenharmony_ci add a8, a8, a13 4848c2ecf20Sopenharmony_ci add v9.4s, v9.4s, v14.4s 4858c2ecf20Sopenharmony_ci add a9, 
a9, a14 4868c2ecf20Sopenharmony_ci 4878c2ecf20Sopenharmony_ci eor v16.16b, v5.16b, v10.16b 4888c2ecf20Sopenharmony_ci eor a5, a5, a10 4898c2ecf20Sopenharmony_ci eor v17.16b, v6.16b, v11.16b 4908c2ecf20Sopenharmony_ci eor a6, a6, a11 4918c2ecf20Sopenharmony_ci eor v18.16b, v7.16b, v8.16b 4928c2ecf20Sopenharmony_ci eor a7, a7, a8 4938c2ecf20Sopenharmony_ci eor v19.16b, v4.16b, v9.16b 4948c2ecf20Sopenharmony_ci eor a4, a4, a9 4958c2ecf20Sopenharmony_ci 4968c2ecf20Sopenharmony_ci shl v5.4s, v16.4s, #7 4978c2ecf20Sopenharmony_ci shl v6.4s, v17.4s, #7 4988c2ecf20Sopenharmony_ci shl v7.4s, v18.4s, #7 4998c2ecf20Sopenharmony_ci shl v4.4s, v19.4s, #7 5008c2ecf20Sopenharmony_ci 5018c2ecf20Sopenharmony_ci sri v5.4s, v16.4s, #25 5028c2ecf20Sopenharmony_ci ror a5, a5, #25 5038c2ecf20Sopenharmony_ci sri v6.4s, v17.4s, #25 5048c2ecf20Sopenharmony_ci ror a6, a6, #25 5058c2ecf20Sopenharmony_ci sri v7.4s, v18.4s, #25 5068c2ecf20Sopenharmony_ci ror a7, a7, #25 5078c2ecf20Sopenharmony_ci sri v4.4s, v19.4s, #25 5088c2ecf20Sopenharmony_ci ror a4, a4, #25 5098c2ecf20Sopenharmony_ci 5108c2ecf20Sopenharmony_ci subs w3, w3, #2 5118c2ecf20Sopenharmony_ci b.ne .Ldoubleround4 5128c2ecf20Sopenharmony_ci 5138c2ecf20Sopenharmony_ci ld4r {v16.4s-v19.4s}, [x0], #16 5148c2ecf20Sopenharmony_ci ld4r {v20.4s-v23.4s}, [x0], #16 5158c2ecf20Sopenharmony_ci 5168c2ecf20Sopenharmony_ci // x12 += counter values 0-3 5178c2ecf20Sopenharmony_ci add v12.4s, v12.4s, v30.4s 5188c2ecf20Sopenharmony_ci 5198c2ecf20Sopenharmony_ci // x0[0-3] += s0[0] 5208c2ecf20Sopenharmony_ci // x1[0-3] += s0[1] 5218c2ecf20Sopenharmony_ci // x2[0-3] += s0[2] 5228c2ecf20Sopenharmony_ci // x3[0-3] += s0[3] 5238c2ecf20Sopenharmony_ci add v0.4s, v0.4s, v16.4s 5248c2ecf20Sopenharmony_ci mov w6, v16.s[0] 5258c2ecf20Sopenharmony_ci mov w7, v17.s[0] 5268c2ecf20Sopenharmony_ci add v1.4s, v1.4s, v17.4s 5278c2ecf20Sopenharmony_ci mov w8, v18.s[0] 5288c2ecf20Sopenharmony_ci mov w9, v19.s[0] 5298c2ecf20Sopenharmony_ci add v2.4s, v2.4s, v18.4s 
5308c2ecf20Sopenharmony_ci add a0, a0, w6 5318c2ecf20Sopenharmony_ci add a1, a1, w7 5328c2ecf20Sopenharmony_ci add v3.4s, v3.4s, v19.4s 5338c2ecf20Sopenharmony_ci add a2, a2, w8 5348c2ecf20Sopenharmony_ci add a3, a3, w9 5358c2ecf20Sopenharmony_ciCPU_BE( rev a0, a0 ) 5368c2ecf20Sopenharmony_ciCPU_BE( rev a1, a1 ) 5378c2ecf20Sopenharmony_ciCPU_BE( rev a2, a2 ) 5388c2ecf20Sopenharmony_ciCPU_BE( rev a3, a3 ) 5398c2ecf20Sopenharmony_ci 5408c2ecf20Sopenharmony_ci ld4r {v24.4s-v27.4s}, [x0], #16 5418c2ecf20Sopenharmony_ci ld4r {v28.4s-v31.4s}, [x0] 5428c2ecf20Sopenharmony_ci 5438c2ecf20Sopenharmony_ci // x4[0-3] += s1[0] 5448c2ecf20Sopenharmony_ci // x5[0-3] += s1[1] 5458c2ecf20Sopenharmony_ci // x6[0-3] += s1[2] 5468c2ecf20Sopenharmony_ci // x7[0-3] += s1[3] 5478c2ecf20Sopenharmony_ci add v4.4s, v4.4s, v20.4s 5488c2ecf20Sopenharmony_ci mov w6, v20.s[0] 5498c2ecf20Sopenharmony_ci mov w7, v21.s[0] 5508c2ecf20Sopenharmony_ci add v5.4s, v5.4s, v21.4s 5518c2ecf20Sopenharmony_ci mov w8, v22.s[0] 5528c2ecf20Sopenharmony_ci mov w9, v23.s[0] 5538c2ecf20Sopenharmony_ci add v6.4s, v6.4s, v22.4s 5548c2ecf20Sopenharmony_ci add a4, a4, w6 5558c2ecf20Sopenharmony_ci add a5, a5, w7 5568c2ecf20Sopenharmony_ci add v7.4s, v7.4s, v23.4s 5578c2ecf20Sopenharmony_ci add a6, a6, w8 5588c2ecf20Sopenharmony_ci add a7, a7, w9 5598c2ecf20Sopenharmony_ciCPU_BE( rev a4, a4 ) 5608c2ecf20Sopenharmony_ciCPU_BE( rev a5, a5 ) 5618c2ecf20Sopenharmony_ciCPU_BE( rev a6, a6 ) 5628c2ecf20Sopenharmony_ciCPU_BE( rev a7, a7 ) 5638c2ecf20Sopenharmony_ci 5648c2ecf20Sopenharmony_ci // x8[0-3] += s2[0] 5658c2ecf20Sopenharmony_ci // x9[0-3] += s2[1] 5668c2ecf20Sopenharmony_ci // x10[0-3] += s2[2] 5678c2ecf20Sopenharmony_ci // x11[0-3] += s2[3] 5688c2ecf20Sopenharmony_ci add v8.4s, v8.4s, v24.4s 5698c2ecf20Sopenharmony_ci mov w6, v24.s[0] 5708c2ecf20Sopenharmony_ci mov w7, v25.s[0] 5718c2ecf20Sopenharmony_ci add v9.4s, v9.4s, v25.4s 5728c2ecf20Sopenharmony_ci mov w8, v26.s[0] 5738c2ecf20Sopenharmony_ci mov w9, v27.s[0] 
5748c2ecf20Sopenharmony_ci add v10.4s, v10.4s, v26.4s 5758c2ecf20Sopenharmony_ci add a8, a8, w6 5768c2ecf20Sopenharmony_ci add a9, a9, w7 5778c2ecf20Sopenharmony_ci add v11.4s, v11.4s, v27.4s 5788c2ecf20Sopenharmony_ci add a10, a10, w8 5798c2ecf20Sopenharmony_ci add a11, a11, w9 5808c2ecf20Sopenharmony_ciCPU_BE( rev a8, a8 ) 5818c2ecf20Sopenharmony_ciCPU_BE( rev a9, a9 ) 5828c2ecf20Sopenharmony_ciCPU_BE( rev a10, a10 ) 5838c2ecf20Sopenharmony_ciCPU_BE( rev a11, a11 ) 5848c2ecf20Sopenharmony_ci 5858c2ecf20Sopenharmony_ci // x12[0-3] += s3[0] 5868c2ecf20Sopenharmony_ci // x13[0-3] += s3[1] 5878c2ecf20Sopenharmony_ci // x14[0-3] += s3[2] 5888c2ecf20Sopenharmony_ci // x15[0-3] += s3[3] 5898c2ecf20Sopenharmony_ci add v12.4s, v12.4s, v28.4s 5908c2ecf20Sopenharmony_ci mov w6, v28.s[0] 5918c2ecf20Sopenharmony_ci mov w7, v29.s[0] 5928c2ecf20Sopenharmony_ci add v13.4s, v13.4s, v29.4s 5938c2ecf20Sopenharmony_ci mov w8, v30.s[0] 5948c2ecf20Sopenharmony_ci mov w9, v31.s[0] 5958c2ecf20Sopenharmony_ci add v14.4s, v14.4s, v30.4s 5968c2ecf20Sopenharmony_ci add a12, a12, w6 5978c2ecf20Sopenharmony_ci add a13, a13, w7 5988c2ecf20Sopenharmony_ci add v15.4s, v15.4s, v31.4s 5998c2ecf20Sopenharmony_ci add a14, a14, w8 6008c2ecf20Sopenharmony_ci add a15, a15, w9 6018c2ecf20Sopenharmony_ciCPU_BE( rev a12, a12 ) 6028c2ecf20Sopenharmony_ciCPU_BE( rev a13, a13 ) 6038c2ecf20Sopenharmony_ciCPU_BE( rev a14, a14 ) 6048c2ecf20Sopenharmony_ciCPU_BE( rev a15, a15 ) 6058c2ecf20Sopenharmony_ci 6068c2ecf20Sopenharmony_ci // interleave 32-bit words in state n, n+1 6078c2ecf20Sopenharmony_ci ldp w6, w7, [x2], #64 6088c2ecf20Sopenharmony_ci zip1 v16.4s, v0.4s, v1.4s 6098c2ecf20Sopenharmony_ci ldp w8, w9, [x2, #-56] 6108c2ecf20Sopenharmony_ci eor a0, a0, w6 6118c2ecf20Sopenharmony_ci zip2 v17.4s, v0.4s, v1.4s 6128c2ecf20Sopenharmony_ci eor a1, a1, w7 6138c2ecf20Sopenharmony_ci zip1 v18.4s, v2.4s, v3.4s 6148c2ecf20Sopenharmony_ci eor a2, a2, w8 6158c2ecf20Sopenharmony_ci zip2 v19.4s, v2.4s, v3.4s 
6168c2ecf20Sopenharmony_ci eor a3, a3, w9 6178c2ecf20Sopenharmony_ci ldp w6, w7, [x2, #-48] 6188c2ecf20Sopenharmony_ci zip1 v20.4s, v4.4s, v5.4s 6198c2ecf20Sopenharmony_ci ldp w8, w9, [x2, #-40] 6208c2ecf20Sopenharmony_ci eor a4, a4, w6 6218c2ecf20Sopenharmony_ci zip2 v21.4s, v4.4s, v5.4s 6228c2ecf20Sopenharmony_ci eor a5, a5, w7 6238c2ecf20Sopenharmony_ci zip1 v22.4s, v6.4s, v7.4s 6248c2ecf20Sopenharmony_ci eor a6, a6, w8 6258c2ecf20Sopenharmony_ci zip2 v23.4s, v6.4s, v7.4s 6268c2ecf20Sopenharmony_ci eor a7, a7, w9 6278c2ecf20Sopenharmony_ci ldp w6, w7, [x2, #-32] 6288c2ecf20Sopenharmony_ci zip1 v24.4s, v8.4s, v9.4s 6298c2ecf20Sopenharmony_ci ldp w8, w9, [x2, #-24] 6308c2ecf20Sopenharmony_ci eor a8, a8, w6 6318c2ecf20Sopenharmony_ci zip2 v25.4s, v8.4s, v9.4s 6328c2ecf20Sopenharmony_ci eor a9, a9, w7 6338c2ecf20Sopenharmony_ci zip1 v26.4s, v10.4s, v11.4s 6348c2ecf20Sopenharmony_ci eor a10, a10, w8 6358c2ecf20Sopenharmony_ci zip2 v27.4s, v10.4s, v11.4s 6368c2ecf20Sopenharmony_ci eor a11, a11, w9 6378c2ecf20Sopenharmony_ci ldp w6, w7, [x2, #-16] 6388c2ecf20Sopenharmony_ci zip1 v28.4s, v12.4s, v13.4s 6398c2ecf20Sopenharmony_ci ldp w8, w9, [x2, #-8] 6408c2ecf20Sopenharmony_ci eor a12, a12, w6 6418c2ecf20Sopenharmony_ci zip2 v29.4s, v12.4s, v13.4s 6428c2ecf20Sopenharmony_ci eor a13, a13, w7 6438c2ecf20Sopenharmony_ci zip1 v30.4s, v14.4s, v15.4s 6448c2ecf20Sopenharmony_ci eor a14, a14, w8 6458c2ecf20Sopenharmony_ci zip2 v31.4s, v14.4s, v15.4s 6468c2ecf20Sopenharmony_ci eor a15, a15, w9 6478c2ecf20Sopenharmony_ci 6488c2ecf20Sopenharmony_ci mov x3, #64 6498c2ecf20Sopenharmony_ci subs x5, x4, #128 6508c2ecf20Sopenharmony_ci add x6, x5, x2 6518c2ecf20Sopenharmony_ci csel x3, x3, xzr, ge 6528c2ecf20Sopenharmony_ci csel x2, x2, x6, ge 6538c2ecf20Sopenharmony_ci 6548c2ecf20Sopenharmony_ci // interleave 64-bit words in state n, n+2 6558c2ecf20Sopenharmony_ci zip1 v0.2d, v16.2d, v18.2d 6568c2ecf20Sopenharmony_ci zip2 v4.2d, v16.2d, v18.2d 6578c2ecf20Sopenharmony_ci stp a0, a1, 
[x1], #64 6588c2ecf20Sopenharmony_ci zip1 v8.2d, v17.2d, v19.2d 6598c2ecf20Sopenharmony_ci zip2 v12.2d, v17.2d, v19.2d 6608c2ecf20Sopenharmony_ci stp a2, a3, [x1, #-56] 6618c2ecf20Sopenharmony_ci ld1 {v16.16b-v19.16b}, [x2], x3 6628c2ecf20Sopenharmony_ci 6638c2ecf20Sopenharmony_ci subs x6, x4, #192 6648c2ecf20Sopenharmony_ci ccmp x3, xzr, #4, lt 6658c2ecf20Sopenharmony_ci add x7, x6, x2 6668c2ecf20Sopenharmony_ci csel x3, x3, xzr, eq 6678c2ecf20Sopenharmony_ci csel x2, x2, x7, eq 6688c2ecf20Sopenharmony_ci 6698c2ecf20Sopenharmony_ci zip1 v1.2d, v20.2d, v22.2d 6708c2ecf20Sopenharmony_ci zip2 v5.2d, v20.2d, v22.2d 6718c2ecf20Sopenharmony_ci stp a4, a5, [x1, #-48] 6728c2ecf20Sopenharmony_ci zip1 v9.2d, v21.2d, v23.2d 6738c2ecf20Sopenharmony_ci zip2 v13.2d, v21.2d, v23.2d 6748c2ecf20Sopenharmony_ci stp a6, a7, [x1, #-40] 6758c2ecf20Sopenharmony_ci ld1 {v20.16b-v23.16b}, [x2], x3 6768c2ecf20Sopenharmony_ci 6778c2ecf20Sopenharmony_ci subs x7, x4, #256 6788c2ecf20Sopenharmony_ci ccmp x3, xzr, #4, lt 6798c2ecf20Sopenharmony_ci add x8, x7, x2 6808c2ecf20Sopenharmony_ci csel x3, x3, xzr, eq 6818c2ecf20Sopenharmony_ci csel x2, x2, x8, eq 6828c2ecf20Sopenharmony_ci 6838c2ecf20Sopenharmony_ci zip1 v2.2d, v24.2d, v26.2d 6848c2ecf20Sopenharmony_ci zip2 v6.2d, v24.2d, v26.2d 6858c2ecf20Sopenharmony_ci stp a8, a9, [x1, #-32] 6868c2ecf20Sopenharmony_ci zip1 v10.2d, v25.2d, v27.2d 6878c2ecf20Sopenharmony_ci zip2 v14.2d, v25.2d, v27.2d 6888c2ecf20Sopenharmony_ci stp a10, a11, [x1, #-24] 6898c2ecf20Sopenharmony_ci ld1 {v24.16b-v27.16b}, [x2], x3 6908c2ecf20Sopenharmony_ci 6918c2ecf20Sopenharmony_ci subs x8, x4, #320 6928c2ecf20Sopenharmony_ci ccmp x3, xzr, #4, lt 6938c2ecf20Sopenharmony_ci add x9, x8, x2 6948c2ecf20Sopenharmony_ci csel x2, x2, x9, eq 6958c2ecf20Sopenharmony_ci 6968c2ecf20Sopenharmony_ci zip1 v3.2d, v28.2d, v30.2d 6978c2ecf20Sopenharmony_ci zip2 v7.2d, v28.2d, v30.2d 6988c2ecf20Sopenharmony_ci stp a12, a13, [x1, #-16] 6998c2ecf20Sopenharmony_ci zip1 v11.2d, v29.2d, 
v31.2d 7008c2ecf20Sopenharmony_ci zip2 v15.2d, v29.2d, v31.2d 7018c2ecf20Sopenharmony_ci stp a14, a15, [x1, #-8] 7028c2ecf20Sopenharmony_ci ld1 {v28.16b-v31.16b}, [x2] 7038c2ecf20Sopenharmony_ci 7048c2ecf20Sopenharmony_ci // xor with corresponding input, write to output 7058c2ecf20Sopenharmony_ci tbnz x5, #63, 0f 7068c2ecf20Sopenharmony_ci eor v16.16b, v16.16b, v0.16b 7078c2ecf20Sopenharmony_ci eor v17.16b, v17.16b, v1.16b 7088c2ecf20Sopenharmony_ci eor v18.16b, v18.16b, v2.16b 7098c2ecf20Sopenharmony_ci eor v19.16b, v19.16b, v3.16b 7108c2ecf20Sopenharmony_ci st1 {v16.16b-v19.16b}, [x1], #64 7118c2ecf20Sopenharmony_ci cbz x5, .Lout 7128c2ecf20Sopenharmony_ci 7138c2ecf20Sopenharmony_ci tbnz x6, #63, 1f 7148c2ecf20Sopenharmony_ci eor v20.16b, v20.16b, v4.16b 7158c2ecf20Sopenharmony_ci eor v21.16b, v21.16b, v5.16b 7168c2ecf20Sopenharmony_ci eor v22.16b, v22.16b, v6.16b 7178c2ecf20Sopenharmony_ci eor v23.16b, v23.16b, v7.16b 7188c2ecf20Sopenharmony_ci st1 {v20.16b-v23.16b}, [x1], #64 7198c2ecf20Sopenharmony_ci cbz x6, .Lout 7208c2ecf20Sopenharmony_ci 7218c2ecf20Sopenharmony_ci tbnz x7, #63, 2f 7228c2ecf20Sopenharmony_ci eor v24.16b, v24.16b, v8.16b 7238c2ecf20Sopenharmony_ci eor v25.16b, v25.16b, v9.16b 7248c2ecf20Sopenharmony_ci eor v26.16b, v26.16b, v10.16b 7258c2ecf20Sopenharmony_ci eor v27.16b, v27.16b, v11.16b 7268c2ecf20Sopenharmony_ci st1 {v24.16b-v27.16b}, [x1], #64 7278c2ecf20Sopenharmony_ci cbz x7, .Lout 7288c2ecf20Sopenharmony_ci 7298c2ecf20Sopenharmony_ci tbnz x8, #63, 3f 7308c2ecf20Sopenharmony_ci eor v28.16b, v28.16b, v12.16b 7318c2ecf20Sopenharmony_ci eor v29.16b, v29.16b, v13.16b 7328c2ecf20Sopenharmony_ci eor v30.16b, v30.16b, v14.16b 7338c2ecf20Sopenharmony_ci eor v31.16b, v31.16b, v15.16b 7348c2ecf20Sopenharmony_ci st1 {v28.16b-v31.16b}, [x1] 7358c2ecf20Sopenharmony_ci 7368c2ecf20Sopenharmony_ci.Lout: frame_pop 7378c2ecf20Sopenharmony_ci ret 7388c2ecf20Sopenharmony_ci 7398c2ecf20Sopenharmony_ci // fewer than 128 bytes of in/output 
7408c2ecf20Sopenharmony_ci0: ld1 {v8.16b}, [x10] 7418c2ecf20Sopenharmony_ci ld1 {v9.16b}, [x11] 7428c2ecf20Sopenharmony_ci movi v10.16b, #16 7438c2ecf20Sopenharmony_ci sub x2, x1, #64 7448c2ecf20Sopenharmony_ci add x1, x1, x5 7458c2ecf20Sopenharmony_ci ld1 {v16.16b-v19.16b}, [x2] 7468c2ecf20Sopenharmony_ci tbl v4.16b, {v0.16b-v3.16b}, v8.16b 7478c2ecf20Sopenharmony_ci tbx v20.16b, {v16.16b-v19.16b}, v9.16b 7488c2ecf20Sopenharmony_ci add v8.16b, v8.16b, v10.16b 7498c2ecf20Sopenharmony_ci add v9.16b, v9.16b, v10.16b 7508c2ecf20Sopenharmony_ci tbl v5.16b, {v0.16b-v3.16b}, v8.16b 7518c2ecf20Sopenharmony_ci tbx v21.16b, {v16.16b-v19.16b}, v9.16b 7528c2ecf20Sopenharmony_ci add v8.16b, v8.16b, v10.16b 7538c2ecf20Sopenharmony_ci add v9.16b, v9.16b, v10.16b 7548c2ecf20Sopenharmony_ci tbl v6.16b, {v0.16b-v3.16b}, v8.16b 7558c2ecf20Sopenharmony_ci tbx v22.16b, {v16.16b-v19.16b}, v9.16b 7568c2ecf20Sopenharmony_ci add v8.16b, v8.16b, v10.16b 7578c2ecf20Sopenharmony_ci add v9.16b, v9.16b, v10.16b 7588c2ecf20Sopenharmony_ci tbl v7.16b, {v0.16b-v3.16b}, v8.16b 7598c2ecf20Sopenharmony_ci tbx v23.16b, {v16.16b-v19.16b}, v9.16b 7608c2ecf20Sopenharmony_ci 7618c2ecf20Sopenharmony_ci eor v20.16b, v20.16b, v4.16b 7628c2ecf20Sopenharmony_ci eor v21.16b, v21.16b, v5.16b 7638c2ecf20Sopenharmony_ci eor v22.16b, v22.16b, v6.16b 7648c2ecf20Sopenharmony_ci eor v23.16b, v23.16b, v7.16b 7658c2ecf20Sopenharmony_ci st1 {v20.16b-v23.16b}, [x1] 7668c2ecf20Sopenharmony_ci b .Lout 7678c2ecf20Sopenharmony_ci 7688c2ecf20Sopenharmony_ci // fewer than 192 bytes of in/output 7698c2ecf20Sopenharmony_ci1: ld1 {v8.16b}, [x10] 7708c2ecf20Sopenharmony_ci ld1 {v9.16b}, [x11] 7718c2ecf20Sopenharmony_ci movi v10.16b, #16 7728c2ecf20Sopenharmony_ci add x1, x1, x6 7738c2ecf20Sopenharmony_ci tbl v0.16b, {v4.16b-v7.16b}, v8.16b 7748c2ecf20Sopenharmony_ci tbx v20.16b, {v16.16b-v19.16b}, v9.16b 7758c2ecf20Sopenharmony_ci add v8.16b, v8.16b, v10.16b 7768c2ecf20Sopenharmony_ci add v9.16b, v9.16b, v10.16b 
7778c2ecf20Sopenharmony_ci tbl v1.16b, {v4.16b-v7.16b}, v8.16b 7788c2ecf20Sopenharmony_ci tbx v21.16b, {v16.16b-v19.16b}, v9.16b 7798c2ecf20Sopenharmony_ci add v8.16b, v8.16b, v10.16b 7808c2ecf20Sopenharmony_ci add v9.16b, v9.16b, v10.16b 7818c2ecf20Sopenharmony_ci tbl v2.16b, {v4.16b-v7.16b}, v8.16b 7828c2ecf20Sopenharmony_ci tbx v22.16b, {v16.16b-v19.16b}, v9.16b 7838c2ecf20Sopenharmony_ci add v8.16b, v8.16b, v10.16b 7848c2ecf20Sopenharmony_ci add v9.16b, v9.16b, v10.16b 7858c2ecf20Sopenharmony_ci tbl v3.16b, {v4.16b-v7.16b}, v8.16b 7868c2ecf20Sopenharmony_ci tbx v23.16b, {v16.16b-v19.16b}, v9.16b 7878c2ecf20Sopenharmony_ci 7888c2ecf20Sopenharmony_ci eor v20.16b, v20.16b, v0.16b 7898c2ecf20Sopenharmony_ci eor v21.16b, v21.16b, v1.16b 7908c2ecf20Sopenharmony_ci eor v22.16b, v22.16b, v2.16b 7918c2ecf20Sopenharmony_ci eor v23.16b, v23.16b, v3.16b 7928c2ecf20Sopenharmony_ci st1 {v20.16b-v23.16b}, [x1] 7938c2ecf20Sopenharmony_ci b .Lout 7948c2ecf20Sopenharmony_ci 7958c2ecf20Sopenharmony_ci // fewer than 256 bytes of in/output 7968c2ecf20Sopenharmony_ci2: ld1 {v4.16b}, [x10] 7978c2ecf20Sopenharmony_ci ld1 {v5.16b}, [x11] 7988c2ecf20Sopenharmony_ci movi v6.16b, #16 7998c2ecf20Sopenharmony_ci add x1, x1, x7 8008c2ecf20Sopenharmony_ci tbl v0.16b, {v8.16b-v11.16b}, v4.16b 8018c2ecf20Sopenharmony_ci tbx v24.16b, {v20.16b-v23.16b}, v5.16b 8028c2ecf20Sopenharmony_ci add v4.16b, v4.16b, v6.16b 8038c2ecf20Sopenharmony_ci add v5.16b, v5.16b, v6.16b 8048c2ecf20Sopenharmony_ci tbl v1.16b, {v8.16b-v11.16b}, v4.16b 8058c2ecf20Sopenharmony_ci tbx v25.16b, {v20.16b-v23.16b}, v5.16b 8068c2ecf20Sopenharmony_ci add v4.16b, v4.16b, v6.16b 8078c2ecf20Sopenharmony_ci add v5.16b, v5.16b, v6.16b 8088c2ecf20Sopenharmony_ci tbl v2.16b, {v8.16b-v11.16b}, v4.16b 8098c2ecf20Sopenharmony_ci tbx v26.16b, {v20.16b-v23.16b}, v5.16b 8108c2ecf20Sopenharmony_ci add v4.16b, v4.16b, v6.16b 8118c2ecf20Sopenharmony_ci add v5.16b, v5.16b, v6.16b 8128c2ecf20Sopenharmony_ci tbl v3.16b, {v8.16b-v11.16b}, v4.16b 
8138c2ecf20Sopenharmony_ci tbx v27.16b, {v20.16b-v23.16b}, v5.16b 8148c2ecf20Sopenharmony_ci 8158c2ecf20Sopenharmony_ci eor v24.16b, v24.16b, v0.16b 8168c2ecf20Sopenharmony_ci eor v25.16b, v25.16b, v1.16b 8178c2ecf20Sopenharmony_ci eor v26.16b, v26.16b, v2.16b 8188c2ecf20Sopenharmony_ci eor v27.16b, v27.16b, v3.16b 8198c2ecf20Sopenharmony_ci st1 {v24.16b-v27.16b}, [x1] 8208c2ecf20Sopenharmony_ci b .Lout 8218c2ecf20Sopenharmony_ci 8228c2ecf20Sopenharmony_ci // fewer than 320 bytes of in/output 8238c2ecf20Sopenharmony_ci3: ld1 {v4.16b}, [x10] 8248c2ecf20Sopenharmony_ci ld1 {v5.16b}, [x11] 8258c2ecf20Sopenharmony_ci movi v6.16b, #16 8268c2ecf20Sopenharmony_ci add x1, x1, x8 8278c2ecf20Sopenharmony_ci tbl v0.16b, {v12.16b-v15.16b}, v4.16b 8288c2ecf20Sopenharmony_ci tbx v28.16b, {v24.16b-v27.16b}, v5.16b 8298c2ecf20Sopenharmony_ci add v4.16b, v4.16b, v6.16b 8308c2ecf20Sopenharmony_ci add v5.16b, v5.16b, v6.16b 8318c2ecf20Sopenharmony_ci tbl v1.16b, {v12.16b-v15.16b}, v4.16b 8328c2ecf20Sopenharmony_ci tbx v29.16b, {v24.16b-v27.16b}, v5.16b 8338c2ecf20Sopenharmony_ci add v4.16b, v4.16b, v6.16b 8348c2ecf20Sopenharmony_ci add v5.16b, v5.16b, v6.16b 8358c2ecf20Sopenharmony_ci tbl v2.16b, {v12.16b-v15.16b}, v4.16b 8368c2ecf20Sopenharmony_ci tbx v30.16b, {v24.16b-v27.16b}, v5.16b 8378c2ecf20Sopenharmony_ci add v4.16b, v4.16b, v6.16b 8388c2ecf20Sopenharmony_ci add v5.16b, v5.16b, v6.16b 8398c2ecf20Sopenharmony_ci tbl v3.16b, {v12.16b-v15.16b}, v4.16b 8408c2ecf20Sopenharmony_ci tbx v31.16b, {v24.16b-v27.16b}, v5.16b 8418c2ecf20Sopenharmony_ci 8428c2ecf20Sopenharmony_ci eor v28.16b, v28.16b, v0.16b 8438c2ecf20Sopenharmony_ci eor v29.16b, v29.16b, v1.16b 8448c2ecf20Sopenharmony_ci eor v30.16b, v30.16b, v2.16b 8458c2ecf20Sopenharmony_ci eor v31.16b, v31.16b, v3.16b 8468c2ecf20Sopenharmony_ci st1 {v28.16b-v31.16b}, [x1] 8478c2ecf20Sopenharmony_ci b .Lout 8488c2ecf20Sopenharmony_ciSYM_FUNC_END(chacha_4block_xor_neon) 8498c2ecf20Sopenharmony_ci 8508c2ecf20Sopenharmony_ci .section 
".rodata", "a", %progbits 8518c2ecf20Sopenharmony_ci .align L1_CACHE_SHIFT 8528c2ecf20Sopenharmony_ci.Lpermute: 8538c2ecf20Sopenharmony_ci .set .Li, 0 8548c2ecf20Sopenharmony_ci .rept 192 8558c2ecf20Sopenharmony_ci .byte (.Li - 64) 8568c2ecf20Sopenharmony_ci .set .Li, .Li + 1 8578c2ecf20Sopenharmony_ci .endr 8588c2ecf20Sopenharmony_ci 8598c2ecf20Sopenharmony_ciCTRINC: .word 1, 2, 3, 4 8608c2ecf20Sopenharmony_ciROT8: .word 0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f 861
/*
 * Read-only constant tables defined on the line above.
 *
 * .Lpermute - 192 single-byte entries emitted by the .rept loop. .Li counts
 *             0..191 and entry .Li holds (.Li - 64), so the table runs from
 *             -64 up to 127 (0xc0..0xff followed by 0x00..0x7f when read as
 *             unsigned bytes). It is placed with .align L1_CACHE_SHIFT.
 *             NOTE(review): presumably the source of the tbl/tbx index
 *             vectors that the partial-block paths of chacha_4block_xor_neon
 *             load through x10 and x11; the code that sets up x10/x11 is not
 *             part of this chunk - confirm.
 *
 * CTRINC    - four 32-bit words: 1, 2, 3, 4.
 *             NOTE(review): presumably per-block counter increments for the
 *             four-block routine; no use is visible in this chunk - confirm.
 *
 * ROT8      - four 32-bit words forming a 16-byte tbl index vector.
 *             chacha_permute loads it into v12 with ld1 {v12.4s} and applies
 *             it with tbl to perform rotl32(x, 8) on every 32-bit lane: in
 *             each index word the least significant byte selects the lane's
 *             most significant source byte, and the remaining three bytes
 *             select the source byte one position lower.
 *
 * NOTE(review): the "NNN8c2ecf20Sopenharmony_ci" tokens interleaved with the
 * directives look like line-number residue from the page this text was
 * captured from rather than assembler input, and the .section keyword that
 * these operands belong to is the last token of the previous line. Verify
 * against a pristine copy of the file before assembling.
 */