/*
 * ChaCha/XChaCha NEON helper functions
 *
 * Copyright (C) 2016-2018 Linaro, Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Originally based on:
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>

	.text
	.align		6

/*
 * chacha_permute - permute one block
 *
 * Permute one 64-byte block where the state matrix is stored in the four NEON
 * registers v0-v3.  It performs matrix operations on four words in parallel,
 * but requires shuffling to rearrange the words after each round.
 *
 * The round count is given in w3.  Each loop iteration performs a double
 * round (two rounds) and subtracts 2 from w3; the loop only exits when w3
 * reaches exactly zero, so w3 must be even and non-zero on entry.
 *
 * Rotations are implemented as follows:
 *   rotl 16:    rev32 on 16-bit lanes (swaps the halfwords of each word)
 *   rotl 12/7:  shl into the destination, then sri to insert the bits that
 *               were shifted out (v4 holds the unrotated value)
 *   rotl 8:     tbl byte permutation using the table loaded from ROT8
 *               (ROT8 is defined elsewhere in this file)
 *
 * Clobbers: w3, x10, v4, v12, condition flags
 */
SYM_FUNC_START_LOCAL(chacha_permute)

	adr_l		x10, ROT8
	ld1		{v12.4s}, [x10]

.Ldoubleround:
	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	rev32		v3.8h, v3.8h

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #12
	sri		v1.4s, v4.4s, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	tbl		v3.16b, {v3.16b}, v12.16b

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #7
	sri		v1.4s, v4.4s, #25

	// Rotate rows 1-3 by 1, 2 and 3 words respectively so that the next
	// four column operations act on what were the diagonals.

	// x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	ext		v1.16b, v1.16b, v1.16b, #4
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	ext		v2.16b, v2.16b, v2.16b, #8
	// x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	ext		v3.16b, v3.16b, v3.16b, #12

	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	rev32		v3.8h, v3.8h

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #12
	sri		v1.4s, v4.4s, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	tbl		v3.16b, {v3.16b}, v12.16b

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #7
	sri		v1.4s, v4.4s, #25

	// Undo the row rotation applied above (3, 2 and 1 words).

	// x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	ext		v1.16b, v1.16b, v1.16b, #12
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	ext		v2.16b, v2.16b, v2.16b, #8
	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	ext		v3.16b, v3.16b, v3.16b, #4

	subs		w3, w3, #2
	b.ne		.Ldoubleround

	ret
SYM_FUNC_END(chacha_permute)

/*
 * chacha_block_xor_neon - generate one keystream block and XOR it with input
 *
 * Loads the 64-byte state from [x0], runs chacha_permute on it, adds the
 * original state back in, XORs the result with 64 bytes read from [x2] and
 * stores the 64-byte result at [x1].
 *
 * Clobbers: w3, x10, v0-v12, condition flags
 */
SYM_FUNC_START(chacha_block_xor_neon)
	// x0: Input state matrix, s
	// x1: 1 data block output, o
	// x2: 1 data block input, i
	// w3: nrounds

	// Non-leaf: the bl below overwrites x30, so set up a frame record.
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	// x0..3 = s0..3
	// v0-v3 is the working copy, v8-v11 keeps the original state for the
	// final addition.
	ld1		{v0.4s-v3.4s}, [x0]
	ld1		{v8.4s-v11.4s}, [x0]

	bl		chacha_permute

	// Input is loaded only after the call: chacha_permute clobbers v4.
	ld1		{v4.16b-v7.16b}, [x2]

	// o0 = i0 ^ (x0 + s0)
	add		v0.4s, v0.4s, v8.4s
	eor		v0.16b, v0.16b, v4.16b

	// o1 = i1 ^ (x1 + s1)
	add		v1.4s, v1.4s, v9.4s
	eor		v1.16b, v1.16b, v5.16b

	// o2 = i2 ^ (x2 + s2)
	add		v2.4s, v2.4s, v10.4s
	eor		v2.16b, v2.16b, v6.16b

	// o3 = i3 ^ (x3 + s3)
	add		v3.4s, v3.4s, v11.4s
	eor		v3.16b, v3.16b, v7.16b

	st1		{v0.16b-v3.16b}, [x1]

	ldp		x29, x30, [sp], #16
	ret
SYM_FUNC_END(chacha_block_xor_neon)

/*
 * hchacha_block_neon - permute a state and output rows 0 and 3
 *
 * Loads the 64-byte state from [x0], runs chacha_permute on it and stores
 * v0 followed by v3 (state words 0-3 and 12-15, 32 bytes in total) at [x1].
 * Unlike chacha_block_xor_neon, the original state is not added back in.
 *
 * Clobbers: x1 (post-incremented), w3, x10, v0-v4, v12, condition flags
 */
SYM_FUNC_START(hchacha_block_neon)
	// x0: Input state matrix, s
	// x1: output (8 32-bit words)
	// w2: nrounds

	// Non-leaf: the bl below overwrites x30, so set up a frame record.
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	ld1		{v0.4s-v3.4s}, [x0]

	// chacha_permute takes its round count in w3
	mov		w3, w2
	bl		chacha_permute

	st1		{v0.4s}, [x1], #16
	st1		{v3.4s}, [x1]

	ldp		x29, x30, [sp], #16
	ret
SYM_FUNC_END(hchacha_block_neon)

	// Scalar register aliases for the 16 state words used by
	// chacha_4block_xor_neon below.  Note that w18 is skipped
	// (presumably because x18 is the platform register).
	a0		.req	w12
	a1		.req	w13
	a2		.req	w14
	a3		.req	w15
	a4		.req	w16
	a5		.req	w17
	a6		.req	w19
	a7		.req	w20
	a8		.req	w21
	a9		.req	w22
	a10		.req	w23
	a11		.req	w24
	a12		.req	w25
	a13		.req	w26
	a14		.req	w27
	a15		.req	w28

	.align		6
SYM_FUNC_START(chacha_4block_xor_neon)
	frame_push	10

	// x0: Input state matrix, s
	// x1: 4 data blocks output, o
	// x2: 4 data blocks input, i
	// w3: nrounds
	// x4: byte count

	adr_l		x10, .Lpermute
	and		x5, x4, #63
	add		x10, x10, x5

	//
	// This function encrypts four consecutive ChaCha blocks by loading
	// the state matrix in NEON
registers four times. The algorithm performs 20262306a36Sopenharmony_ci // each operation on the corresponding word of each state matrix, hence 20362306a36Sopenharmony_ci // requires no word shuffling. For final XORing step we transpose the 20462306a36Sopenharmony_ci // matrix by interleaving 32- and then 64-bit words, which allows us to 20562306a36Sopenharmony_ci // do XOR in NEON registers. 20662306a36Sopenharmony_ci // 20762306a36Sopenharmony_ci // At the same time, a fifth block is encrypted in parallel using 20862306a36Sopenharmony_ci // scalar registers 20962306a36Sopenharmony_ci // 21062306a36Sopenharmony_ci adr_l x9, CTRINC // ... and ROT8 21162306a36Sopenharmony_ci ld1 {v30.4s-v31.4s}, [x9] 21262306a36Sopenharmony_ci 21362306a36Sopenharmony_ci // x0..15[0-3] = s0..3[0..3] 21462306a36Sopenharmony_ci add x8, x0, #16 21562306a36Sopenharmony_ci ld4r { v0.4s- v3.4s}, [x0] 21662306a36Sopenharmony_ci ld4r { v4.4s- v7.4s}, [x8], #16 21762306a36Sopenharmony_ci ld4r { v8.4s-v11.4s}, [x8], #16 21862306a36Sopenharmony_ci ld4r {v12.4s-v15.4s}, [x8] 21962306a36Sopenharmony_ci 22062306a36Sopenharmony_ci mov a0, v0.s[0] 22162306a36Sopenharmony_ci mov a1, v1.s[0] 22262306a36Sopenharmony_ci mov a2, v2.s[0] 22362306a36Sopenharmony_ci mov a3, v3.s[0] 22462306a36Sopenharmony_ci mov a4, v4.s[0] 22562306a36Sopenharmony_ci mov a5, v5.s[0] 22662306a36Sopenharmony_ci mov a6, v6.s[0] 22762306a36Sopenharmony_ci mov a7, v7.s[0] 22862306a36Sopenharmony_ci mov a8, v8.s[0] 22962306a36Sopenharmony_ci mov a9, v9.s[0] 23062306a36Sopenharmony_ci mov a10, v10.s[0] 23162306a36Sopenharmony_ci mov a11, v11.s[0] 23262306a36Sopenharmony_ci mov a12, v12.s[0] 23362306a36Sopenharmony_ci mov a13, v13.s[0] 23462306a36Sopenharmony_ci mov a14, v14.s[0] 23562306a36Sopenharmony_ci mov a15, v15.s[0] 23662306a36Sopenharmony_ci 23762306a36Sopenharmony_ci // x12 += counter values 1-4 23862306a36Sopenharmony_ci add v12.4s, v12.4s, v30.4s 23962306a36Sopenharmony_ci 24062306a36Sopenharmony_ci.Ldoubleround4: 
24162306a36Sopenharmony_ci // x0 += x4, x12 = rotl32(x12 ^ x0, 16) 24262306a36Sopenharmony_ci // x1 += x5, x13 = rotl32(x13 ^ x1, 16) 24362306a36Sopenharmony_ci // x2 += x6, x14 = rotl32(x14 ^ x2, 16) 24462306a36Sopenharmony_ci // x3 += x7, x15 = rotl32(x15 ^ x3, 16) 24562306a36Sopenharmony_ci add v0.4s, v0.4s, v4.4s 24662306a36Sopenharmony_ci add a0, a0, a4 24762306a36Sopenharmony_ci add v1.4s, v1.4s, v5.4s 24862306a36Sopenharmony_ci add a1, a1, a5 24962306a36Sopenharmony_ci add v2.4s, v2.4s, v6.4s 25062306a36Sopenharmony_ci add a2, a2, a6 25162306a36Sopenharmony_ci add v3.4s, v3.4s, v7.4s 25262306a36Sopenharmony_ci add a3, a3, a7 25362306a36Sopenharmony_ci 25462306a36Sopenharmony_ci eor v12.16b, v12.16b, v0.16b 25562306a36Sopenharmony_ci eor a12, a12, a0 25662306a36Sopenharmony_ci eor v13.16b, v13.16b, v1.16b 25762306a36Sopenharmony_ci eor a13, a13, a1 25862306a36Sopenharmony_ci eor v14.16b, v14.16b, v2.16b 25962306a36Sopenharmony_ci eor a14, a14, a2 26062306a36Sopenharmony_ci eor v15.16b, v15.16b, v3.16b 26162306a36Sopenharmony_ci eor a15, a15, a3 26262306a36Sopenharmony_ci 26362306a36Sopenharmony_ci rev32 v12.8h, v12.8h 26462306a36Sopenharmony_ci ror a12, a12, #16 26562306a36Sopenharmony_ci rev32 v13.8h, v13.8h 26662306a36Sopenharmony_ci ror a13, a13, #16 26762306a36Sopenharmony_ci rev32 v14.8h, v14.8h 26862306a36Sopenharmony_ci ror a14, a14, #16 26962306a36Sopenharmony_ci rev32 v15.8h, v15.8h 27062306a36Sopenharmony_ci ror a15, a15, #16 27162306a36Sopenharmony_ci 27262306a36Sopenharmony_ci // x8 += x12, x4 = rotl32(x4 ^ x8, 12) 27362306a36Sopenharmony_ci // x9 += x13, x5 = rotl32(x5 ^ x9, 12) 27462306a36Sopenharmony_ci // x10 += x14, x6 = rotl32(x6 ^ x10, 12) 27562306a36Sopenharmony_ci // x11 += x15, x7 = rotl32(x7 ^ x11, 12) 27662306a36Sopenharmony_ci add v8.4s, v8.4s, v12.4s 27762306a36Sopenharmony_ci add a8, a8, a12 27862306a36Sopenharmony_ci add v9.4s, v9.4s, v13.4s 27962306a36Sopenharmony_ci add a9, a9, a13 28062306a36Sopenharmony_ci add v10.4s, v10.4s, 
v14.4s 28162306a36Sopenharmony_ci add a10, a10, a14 28262306a36Sopenharmony_ci add v11.4s, v11.4s, v15.4s 28362306a36Sopenharmony_ci add a11, a11, a15 28462306a36Sopenharmony_ci 28562306a36Sopenharmony_ci eor v16.16b, v4.16b, v8.16b 28662306a36Sopenharmony_ci eor a4, a4, a8 28762306a36Sopenharmony_ci eor v17.16b, v5.16b, v9.16b 28862306a36Sopenharmony_ci eor a5, a5, a9 28962306a36Sopenharmony_ci eor v18.16b, v6.16b, v10.16b 29062306a36Sopenharmony_ci eor a6, a6, a10 29162306a36Sopenharmony_ci eor v19.16b, v7.16b, v11.16b 29262306a36Sopenharmony_ci eor a7, a7, a11 29362306a36Sopenharmony_ci 29462306a36Sopenharmony_ci shl v4.4s, v16.4s, #12 29562306a36Sopenharmony_ci shl v5.4s, v17.4s, #12 29662306a36Sopenharmony_ci shl v6.4s, v18.4s, #12 29762306a36Sopenharmony_ci shl v7.4s, v19.4s, #12 29862306a36Sopenharmony_ci 29962306a36Sopenharmony_ci sri v4.4s, v16.4s, #20 30062306a36Sopenharmony_ci ror a4, a4, #20 30162306a36Sopenharmony_ci sri v5.4s, v17.4s, #20 30262306a36Sopenharmony_ci ror a5, a5, #20 30362306a36Sopenharmony_ci sri v6.4s, v18.4s, #20 30462306a36Sopenharmony_ci ror a6, a6, #20 30562306a36Sopenharmony_ci sri v7.4s, v19.4s, #20 30662306a36Sopenharmony_ci ror a7, a7, #20 30762306a36Sopenharmony_ci 30862306a36Sopenharmony_ci // x0 += x4, x12 = rotl32(x12 ^ x0, 8) 30962306a36Sopenharmony_ci // x1 += x5, x13 = rotl32(x13 ^ x1, 8) 31062306a36Sopenharmony_ci // x2 += x6, x14 = rotl32(x14 ^ x2, 8) 31162306a36Sopenharmony_ci // x3 += x7, x15 = rotl32(x15 ^ x3, 8) 31262306a36Sopenharmony_ci add v0.4s, v0.4s, v4.4s 31362306a36Sopenharmony_ci add a0, a0, a4 31462306a36Sopenharmony_ci add v1.4s, v1.4s, v5.4s 31562306a36Sopenharmony_ci add a1, a1, a5 31662306a36Sopenharmony_ci add v2.4s, v2.4s, v6.4s 31762306a36Sopenharmony_ci add a2, a2, a6 31862306a36Sopenharmony_ci add v3.4s, v3.4s, v7.4s 31962306a36Sopenharmony_ci add a3, a3, a7 32062306a36Sopenharmony_ci 32162306a36Sopenharmony_ci eor v12.16b, v12.16b, v0.16b 32262306a36Sopenharmony_ci eor a12, a12, a0 
32362306a36Sopenharmony_ci eor v13.16b, v13.16b, v1.16b 32462306a36Sopenharmony_ci eor a13, a13, a1 32562306a36Sopenharmony_ci eor v14.16b, v14.16b, v2.16b 32662306a36Sopenharmony_ci eor a14, a14, a2 32762306a36Sopenharmony_ci eor v15.16b, v15.16b, v3.16b 32862306a36Sopenharmony_ci eor a15, a15, a3 32962306a36Sopenharmony_ci 33062306a36Sopenharmony_ci tbl v12.16b, {v12.16b}, v31.16b 33162306a36Sopenharmony_ci ror a12, a12, #24 33262306a36Sopenharmony_ci tbl v13.16b, {v13.16b}, v31.16b 33362306a36Sopenharmony_ci ror a13, a13, #24 33462306a36Sopenharmony_ci tbl v14.16b, {v14.16b}, v31.16b 33562306a36Sopenharmony_ci ror a14, a14, #24 33662306a36Sopenharmony_ci tbl v15.16b, {v15.16b}, v31.16b 33762306a36Sopenharmony_ci ror a15, a15, #24 33862306a36Sopenharmony_ci 33962306a36Sopenharmony_ci // x8 += x12, x4 = rotl32(x4 ^ x8, 7) 34062306a36Sopenharmony_ci // x9 += x13, x5 = rotl32(x5 ^ x9, 7) 34162306a36Sopenharmony_ci // x10 += x14, x6 = rotl32(x6 ^ x10, 7) 34262306a36Sopenharmony_ci // x11 += x15, x7 = rotl32(x7 ^ x11, 7) 34362306a36Sopenharmony_ci add v8.4s, v8.4s, v12.4s 34462306a36Sopenharmony_ci add a8, a8, a12 34562306a36Sopenharmony_ci add v9.4s, v9.4s, v13.4s 34662306a36Sopenharmony_ci add a9, a9, a13 34762306a36Sopenharmony_ci add v10.4s, v10.4s, v14.4s 34862306a36Sopenharmony_ci add a10, a10, a14 34962306a36Sopenharmony_ci add v11.4s, v11.4s, v15.4s 35062306a36Sopenharmony_ci add a11, a11, a15 35162306a36Sopenharmony_ci 35262306a36Sopenharmony_ci eor v16.16b, v4.16b, v8.16b 35362306a36Sopenharmony_ci eor a4, a4, a8 35462306a36Sopenharmony_ci eor v17.16b, v5.16b, v9.16b 35562306a36Sopenharmony_ci eor a5, a5, a9 35662306a36Sopenharmony_ci eor v18.16b, v6.16b, v10.16b 35762306a36Sopenharmony_ci eor a6, a6, a10 35862306a36Sopenharmony_ci eor v19.16b, v7.16b, v11.16b 35962306a36Sopenharmony_ci eor a7, a7, a11 36062306a36Sopenharmony_ci 36162306a36Sopenharmony_ci shl v4.4s, v16.4s, #7 36262306a36Sopenharmony_ci shl v5.4s, v17.4s, #7 36362306a36Sopenharmony_ci shl 
v6.4s, v18.4s, #7 36462306a36Sopenharmony_ci shl v7.4s, v19.4s, #7 36562306a36Sopenharmony_ci 36662306a36Sopenharmony_ci sri v4.4s, v16.4s, #25 36762306a36Sopenharmony_ci ror a4, a4, #25 36862306a36Sopenharmony_ci sri v5.4s, v17.4s, #25 36962306a36Sopenharmony_ci ror a5, a5, #25 37062306a36Sopenharmony_ci sri v6.4s, v18.4s, #25 37162306a36Sopenharmony_ci ror a6, a6, #25 37262306a36Sopenharmony_ci sri v7.4s, v19.4s, #25 37362306a36Sopenharmony_ci ror a7, a7, #25 37462306a36Sopenharmony_ci 37562306a36Sopenharmony_ci // x0 += x5, x15 = rotl32(x15 ^ x0, 16) 37662306a36Sopenharmony_ci // x1 += x6, x12 = rotl32(x12 ^ x1, 16) 37762306a36Sopenharmony_ci // x2 += x7, x13 = rotl32(x13 ^ x2, 16) 37862306a36Sopenharmony_ci // x3 += x4, x14 = rotl32(x14 ^ x3, 16) 37962306a36Sopenharmony_ci add v0.4s, v0.4s, v5.4s 38062306a36Sopenharmony_ci add a0, a0, a5 38162306a36Sopenharmony_ci add v1.4s, v1.4s, v6.4s 38262306a36Sopenharmony_ci add a1, a1, a6 38362306a36Sopenharmony_ci add v2.4s, v2.4s, v7.4s 38462306a36Sopenharmony_ci add a2, a2, a7 38562306a36Sopenharmony_ci add v3.4s, v3.4s, v4.4s 38662306a36Sopenharmony_ci add a3, a3, a4 38762306a36Sopenharmony_ci 38862306a36Sopenharmony_ci eor v15.16b, v15.16b, v0.16b 38962306a36Sopenharmony_ci eor a15, a15, a0 39062306a36Sopenharmony_ci eor v12.16b, v12.16b, v1.16b 39162306a36Sopenharmony_ci eor a12, a12, a1 39262306a36Sopenharmony_ci eor v13.16b, v13.16b, v2.16b 39362306a36Sopenharmony_ci eor a13, a13, a2 39462306a36Sopenharmony_ci eor v14.16b, v14.16b, v3.16b 39562306a36Sopenharmony_ci eor a14, a14, a3 39662306a36Sopenharmony_ci 39762306a36Sopenharmony_ci rev32 v15.8h, v15.8h 39862306a36Sopenharmony_ci ror a15, a15, #16 39962306a36Sopenharmony_ci rev32 v12.8h, v12.8h 40062306a36Sopenharmony_ci ror a12, a12, #16 40162306a36Sopenharmony_ci rev32 v13.8h, v13.8h 40262306a36Sopenharmony_ci ror a13, a13, #16 40362306a36Sopenharmony_ci rev32 v14.8h, v14.8h 40462306a36Sopenharmony_ci ror a14, a14, #16 40562306a36Sopenharmony_ci 
40662306a36Sopenharmony_ci // x10 += x15, x5 = rotl32(x5 ^ x10, 12) 40762306a36Sopenharmony_ci // x11 += x12, x6 = rotl32(x6 ^ x11, 12) 40862306a36Sopenharmony_ci // x8 += x13, x7 = rotl32(x7 ^ x8, 12) 40962306a36Sopenharmony_ci // x9 += x14, x4 = rotl32(x4 ^ x9, 12) 41062306a36Sopenharmony_ci add v10.4s, v10.4s, v15.4s 41162306a36Sopenharmony_ci add a10, a10, a15 41262306a36Sopenharmony_ci add v11.4s, v11.4s, v12.4s 41362306a36Sopenharmony_ci add a11, a11, a12 41462306a36Sopenharmony_ci add v8.4s, v8.4s, v13.4s 41562306a36Sopenharmony_ci add a8, a8, a13 41662306a36Sopenharmony_ci add v9.4s, v9.4s, v14.4s 41762306a36Sopenharmony_ci add a9, a9, a14 41862306a36Sopenharmony_ci 41962306a36Sopenharmony_ci eor v16.16b, v5.16b, v10.16b 42062306a36Sopenharmony_ci eor a5, a5, a10 42162306a36Sopenharmony_ci eor v17.16b, v6.16b, v11.16b 42262306a36Sopenharmony_ci eor a6, a6, a11 42362306a36Sopenharmony_ci eor v18.16b, v7.16b, v8.16b 42462306a36Sopenharmony_ci eor a7, a7, a8 42562306a36Sopenharmony_ci eor v19.16b, v4.16b, v9.16b 42662306a36Sopenharmony_ci eor a4, a4, a9 42762306a36Sopenharmony_ci 42862306a36Sopenharmony_ci shl v5.4s, v16.4s, #12 42962306a36Sopenharmony_ci shl v6.4s, v17.4s, #12 43062306a36Sopenharmony_ci shl v7.4s, v18.4s, #12 43162306a36Sopenharmony_ci shl v4.4s, v19.4s, #12 43262306a36Sopenharmony_ci 43362306a36Sopenharmony_ci sri v5.4s, v16.4s, #20 43462306a36Sopenharmony_ci ror a5, a5, #20 43562306a36Sopenharmony_ci sri v6.4s, v17.4s, #20 43662306a36Sopenharmony_ci ror a6, a6, #20 43762306a36Sopenharmony_ci sri v7.4s, v18.4s, #20 43862306a36Sopenharmony_ci ror a7, a7, #20 43962306a36Sopenharmony_ci sri v4.4s, v19.4s, #20 44062306a36Sopenharmony_ci ror a4, a4, #20 44162306a36Sopenharmony_ci 44262306a36Sopenharmony_ci // x0 += x5, x15 = rotl32(x15 ^ x0, 8) 44362306a36Sopenharmony_ci // x1 += x6, x12 = rotl32(x12 ^ x1, 8) 44462306a36Sopenharmony_ci // x2 += x7, x13 = rotl32(x13 ^ x2, 8) 44562306a36Sopenharmony_ci // x3 += x4, x14 = rotl32(x14 ^ x3, 8) 
44662306a36Sopenharmony_ci add v0.4s, v0.4s, v5.4s 44762306a36Sopenharmony_ci add a0, a0, a5 44862306a36Sopenharmony_ci add v1.4s, v1.4s, v6.4s 44962306a36Sopenharmony_ci add a1, a1, a6 45062306a36Sopenharmony_ci add v2.4s, v2.4s, v7.4s 45162306a36Sopenharmony_ci add a2, a2, a7 45262306a36Sopenharmony_ci add v3.4s, v3.4s, v4.4s 45362306a36Sopenharmony_ci add a3, a3, a4 45462306a36Sopenharmony_ci 45562306a36Sopenharmony_ci eor v15.16b, v15.16b, v0.16b 45662306a36Sopenharmony_ci eor a15, a15, a0 45762306a36Sopenharmony_ci eor v12.16b, v12.16b, v1.16b 45862306a36Sopenharmony_ci eor a12, a12, a1 45962306a36Sopenharmony_ci eor v13.16b, v13.16b, v2.16b 46062306a36Sopenharmony_ci eor a13, a13, a2 46162306a36Sopenharmony_ci eor v14.16b, v14.16b, v3.16b 46262306a36Sopenharmony_ci eor a14, a14, a3 46362306a36Sopenharmony_ci 46462306a36Sopenharmony_ci tbl v15.16b, {v15.16b}, v31.16b 46562306a36Sopenharmony_ci ror a15, a15, #24 46662306a36Sopenharmony_ci tbl v12.16b, {v12.16b}, v31.16b 46762306a36Sopenharmony_ci ror a12, a12, #24 46862306a36Sopenharmony_ci tbl v13.16b, {v13.16b}, v31.16b 46962306a36Sopenharmony_ci ror a13, a13, #24 47062306a36Sopenharmony_ci tbl v14.16b, {v14.16b}, v31.16b 47162306a36Sopenharmony_ci ror a14, a14, #24 47262306a36Sopenharmony_ci 47362306a36Sopenharmony_ci // x10 += x15, x5 = rotl32(x5 ^ x10, 7) 47462306a36Sopenharmony_ci // x11 += x12, x6 = rotl32(x6 ^ x11, 7) 47562306a36Sopenharmony_ci // x8 += x13, x7 = rotl32(x7 ^ x8, 7) 47662306a36Sopenharmony_ci // x9 += x14, x4 = rotl32(x4 ^ x9, 7) 47762306a36Sopenharmony_ci add v10.4s, v10.4s, v15.4s 47862306a36Sopenharmony_ci add a10, a10, a15 47962306a36Sopenharmony_ci add v11.4s, v11.4s, v12.4s 48062306a36Sopenharmony_ci add a11, a11, a12 48162306a36Sopenharmony_ci add v8.4s, v8.4s, v13.4s 48262306a36Sopenharmony_ci add a8, a8, a13 48362306a36Sopenharmony_ci add v9.4s, v9.4s, v14.4s 48462306a36Sopenharmony_ci add a9, a9, a14 48562306a36Sopenharmony_ci 48662306a36Sopenharmony_ci eor v16.16b, v5.16b, 
v10.16b 48762306a36Sopenharmony_ci eor a5, a5, a10 48862306a36Sopenharmony_ci eor v17.16b, v6.16b, v11.16b 48962306a36Sopenharmony_ci eor a6, a6, a11 49062306a36Sopenharmony_ci eor v18.16b, v7.16b, v8.16b 49162306a36Sopenharmony_ci eor a7, a7, a8 49262306a36Sopenharmony_ci eor v19.16b, v4.16b, v9.16b 49362306a36Sopenharmony_ci eor a4, a4, a9 49462306a36Sopenharmony_ci 49562306a36Sopenharmony_ci shl v5.4s, v16.4s, #7 49662306a36Sopenharmony_ci shl v6.4s, v17.4s, #7 49762306a36Sopenharmony_ci shl v7.4s, v18.4s, #7 49862306a36Sopenharmony_ci shl v4.4s, v19.4s, #7 49962306a36Sopenharmony_ci 50062306a36Sopenharmony_ci sri v5.4s, v16.4s, #25 50162306a36Sopenharmony_ci ror a5, a5, #25 50262306a36Sopenharmony_ci sri v6.4s, v17.4s, #25 50362306a36Sopenharmony_ci ror a6, a6, #25 50462306a36Sopenharmony_ci sri v7.4s, v18.4s, #25 50562306a36Sopenharmony_ci ror a7, a7, #25 50662306a36Sopenharmony_ci sri v4.4s, v19.4s, #25 50762306a36Sopenharmony_ci ror a4, a4, #25 50862306a36Sopenharmony_ci 50962306a36Sopenharmony_ci subs w3, w3, #2 51062306a36Sopenharmony_ci b.ne .Ldoubleround4 51162306a36Sopenharmony_ci 51262306a36Sopenharmony_ci ld4r {v16.4s-v19.4s}, [x0], #16 51362306a36Sopenharmony_ci ld4r {v20.4s-v23.4s}, [x0], #16 51462306a36Sopenharmony_ci 51562306a36Sopenharmony_ci // x12 += counter values 0-3 51662306a36Sopenharmony_ci add v12.4s, v12.4s, v30.4s 51762306a36Sopenharmony_ci 51862306a36Sopenharmony_ci // x0[0-3] += s0[0] 51962306a36Sopenharmony_ci // x1[0-3] += s0[1] 52062306a36Sopenharmony_ci // x2[0-3] += s0[2] 52162306a36Sopenharmony_ci // x3[0-3] += s0[3] 52262306a36Sopenharmony_ci add v0.4s, v0.4s, v16.4s 52362306a36Sopenharmony_ci mov w6, v16.s[0] 52462306a36Sopenharmony_ci mov w7, v17.s[0] 52562306a36Sopenharmony_ci add v1.4s, v1.4s, v17.4s 52662306a36Sopenharmony_ci mov w8, v18.s[0] 52762306a36Sopenharmony_ci mov w9, v19.s[0] 52862306a36Sopenharmony_ci add v2.4s, v2.4s, v18.4s 52962306a36Sopenharmony_ci add a0, a0, w6 53062306a36Sopenharmony_ci add a1, a1, w7 
53162306a36Sopenharmony_ci add v3.4s, v3.4s, v19.4s 53262306a36Sopenharmony_ci add a2, a2, w8 53362306a36Sopenharmony_ci add a3, a3, w9 53462306a36Sopenharmony_ciCPU_BE( rev a0, a0 ) 53562306a36Sopenharmony_ciCPU_BE( rev a1, a1 ) 53662306a36Sopenharmony_ciCPU_BE( rev a2, a2 ) 53762306a36Sopenharmony_ciCPU_BE( rev a3, a3 ) 53862306a36Sopenharmony_ci 53962306a36Sopenharmony_ci ld4r {v24.4s-v27.4s}, [x0], #16 54062306a36Sopenharmony_ci ld4r {v28.4s-v31.4s}, [x0] 54162306a36Sopenharmony_ci 54262306a36Sopenharmony_ci // x4[0-3] += s1[0] 54362306a36Sopenharmony_ci // x5[0-3] += s1[1] 54462306a36Sopenharmony_ci // x6[0-3] += s1[2] 54562306a36Sopenharmony_ci // x7[0-3] += s1[3] 54662306a36Sopenharmony_ci add v4.4s, v4.4s, v20.4s 54762306a36Sopenharmony_ci mov w6, v20.s[0] 54862306a36Sopenharmony_ci mov w7, v21.s[0] 54962306a36Sopenharmony_ci add v5.4s, v5.4s, v21.4s 55062306a36Sopenharmony_ci mov w8, v22.s[0] 55162306a36Sopenharmony_ci mov w9, v23.s[0] 55262306a36Sopenharmony_ci add v6.4s, v6.4s, v22.4s 55362306a36Sopenharmony_ci add a4, a4, w6 55462306a36Sopenharmony_ci add a5, a5, w7 55562306a36Sopenharmony_ci add v7.4s, v7.4s, v23.4s 55662306a36Sopenharmony_ci add a6, a6, w8 55762306a36Sopenharmony_ci add a7, a7, w9 55862306a36Sopenharmony_ciCPU_BE( rev a4, a4 ) 55962306a36Sopenharmony_ciCPU_BE( rev a5, a5 ) 56062306a36Sopenharmony_ciCPU_BE( rev a6, a6 ) 56162306a36Sopenharmony_ciCPU_BE( rev a7, a7 ) 56262306a36Sopenharmony_ci 56362306a36Sopenharmony_ci // x8[0-3] += s2[0] 56462306a36Sopenharmony_ci // x9[0-3] += s2[1] 56562306a36Sopenharmony_ci // x10[0-3] += s2[2] 56662306a36Sopenharmony_ci // x11[0-3] += s2[3] 56762306a36Sopenharmony_ci add v8.4s, v8.4s, v24.4s 56862306a36Sopenharmony_ci mov w6, v24.s[0] 56962306a36Sopenharmony_ci mov w7, v25.s[0] 57062306a36Sopenharmony_ci add v9.4s, v9.4s, v25.4s 57162306a36Sopenharmony_ci mov w8, v26.s[0] 57262306a36Sopenharmony_ci mov w9, v27.s[0] 57362306a36Sopenharmony_ci add v10.4s, v10.4s, v26.4s 57462306a36Sopenharmony_ci 
add a8, a8, w6 57562306a36Sopenharmony_ci add a9, a9, w7 57662306a36Sopenharmony_ci add v11.4s, v11.4s, v27.4s 57762306a36Sopenharmony_ci add a10, a10, w8 57862306a36Sopenharmony_ci add a11, a11, w9 57962306a36Sopenharmony_ciCPU_BE( rev a8, a8 ) 58062306a36Sopenharmony_ciCPU_BE( rev a9, a9 ) 58162306a36Sopenharmony_ciCPU_BE( rev a10, a10 ) 58262306a36Sopenharmony_ciCPU_BE( rev a11, a11 ) 58362306a36Sopenharmony_ci 58462306a36Sopenharmony_ci // x12[0-3] += s3[0] 58562306a36Sopenharmony_ci // x13[0-3] += s3[1] 58662306a36Sopenharmony_ci // x14[0-3] += s3[2] 58762306a36Sopenharmony_ci // x15[0-3] += s3[3] 58862306a36Sopenharmony_ci add v12.4s, v12.4s, v28.4s 58962306a36Sopenharmony_ci mov w6, v28.s[0] 59062306a36Sopenharmony_ci mov w7, v29.s[0] 59162306a36Sopenharmony_ci add v13.4s, v13.4s, v29.4s 59262306a36Sopenharmony_ci mov w8, v30.s[0] 59362306a36Sopenharmony_ci mov w9, v31.s[0] 59462306a36Sopenharmony_ci add v14.4s, v14.4s, v30.4s 59562306a36Sopenharmony_ci add a12, a12, w6 59662306a36Sopenharmony_ci add a13, a13, w7 59762306a36Sopenharmony_ci add v15.4s, v15.4s, v31.4s 59862306a36Sopenharmony_ci add a14, a14, w8 59962306a36Sopenharmony_ci add a15, a15, w9 60062306a36Sopenharmony_ciCPU_BE( rev a12, a12 ) 60162306a36Sopenharmony_ciCPU_BE( rev a13, a13 ) 60262306a36Sopenharmony_ciCPU_BE( rev a14, a14 ) 60362306a36Sopenharmony_ciCPU_BE( rev a15, a15 ) 60462306a36Sopenharmony_ci 60562306a36Sopenharmony_ci // interleave 32-bit words in state n, n+1 60662306a36Sopenharmony_ci ldp w6, w7, [x2], #64 60762306a36Sopenharmony_ci zip1 v16.4s, v0.4s, v1.4s 60862306a36Sopenharmony_ci ldp w8, w9, [x2, #-56] 60962306a36Sopenharmony_ci eor a0, a0, w6 61062306a36Sopenharmony_ci zip2 v17.4s, v0.4s, v1.4s 61162306a36Sopenharmony_ci eor a1, a1, w7 61262306a36Sopenharmony_ci zip1 v18.4s, v2.4s, v3.4s 61362306a36Sopenharmony_ci eor a2, a2, w8 61462306a36Sopenharmony_ci zip2 v19.4s, v2.4s, v3.4s 61562306a36Sopenharmony_ci eor a3, a3, w9 61662306a36Sopenharmony_ci ldp w6, w7, [x2, #-48] 
61762306a36Sopenharmony_ci zip1 v20.4s, v4.4s, v5.4s 61862306a36Sopenharmony_ci ldp w8, w9, [x2, #-40] 61962306a36Sopenharmony_ci eor a4, a4, w6 62062306a36Sopenharmony_ci zip2 v21.4s, v4.4s, v5.4s 62162306a36Sopenharmony_ci eor a5, a5, w7 62262306a36Sopenharmony_ci zip1 v22.4s, v6.4s, v7.4s 62362306a36Sopenharmony_ci eor a6, a6, w8 62462306a36Sopenharmony_ci zip2 v23.4s, v6.4s, v7.4s 62562306a36Sopenharmony_ci eor a7, a7, w9 62662306a36Sopenharmony_ci ldp w6, w7, [x2, #-32] 62762306a36Sopenharmony_ci zip1 v24.4s, v8.4s, v9.4s 62862306a36Sopenharmony_ci ldp w8, w9, [x2, #-24] 62962306a36Sopenharmony_ci eor a8, a8, w6 63062306a36Sopenharmony_ci zip2 v25.4s, v8.4s, v9.4s 63162306a36Sopenharmony_ci eor a9, a9, w7 63262306a36Sopenharmony_ci zip1 v26.4s, v10.4s, v11.4s 63362306a36Sopenharmony_ci eor a10, a10, w8 63462306a36Sopenharmony_ci zip2 v27.4s, v10.4s, v11.4s 63562306a36Sopenharmony_ci eor a11, a11, w9 63662306a36Sopenharmony_ci ldp w6, w7, [x2, #-16] 63762306a36Sopenharmony_ci zip1 v28.4s, v12.4s, v13.4s 63862306a36Sopenharmony_ci ldp w8, w9, [x2, #-8] 63962306a36Sopenharmony_ci eor a12, a12, w6 64062306a36Sopenharmony_ci zip2 v29.4s, v12.4s, v13.4s 64162306a36Sopenharmony_ci eor a13, a13, w7 64262306a36Sopenharmony_ci zip1 v30.4s, v14.4s, v15.4s 64362306a36Sopenharmony_ci eor a14, a14, w8 64462306a36Sopenharmony_ci zip2 v31.4s, v14.4s, v15.4s 64562306a36Sopenharmony_ci eor a15, a15, w9 64662306a36Sopenharmony_ci 64762306a36Sopenharmony_ci add x3, x2, x4 64862306a36Sopenharmony_ci sub x3, x3, #128 // start of last block 64962306a36Sopenharmony_ci 65062306a36Sopenharmony_ci subs x5, x4, #128 65162306a36Sopenharmony_ci csel x2, x2, x3, ge 65262306a36Sopenharmony_ci 65362306a36Sopenharmony_ci // interleave 64-bit words in state n, n+2 65462306a36Sopenharmony_ci zip1 v0.2d, v16.2d, v18.2d 65562306a36Sopenharmony_ci zip2 v4.2d, v16.2d, v18.2d 65662306a36Sopenharmony_ci stp a0, a1, [x1], #64 65762306a36Sopenharmony_ci zip1 v8.2d, v17.2d, v19.2d 
65862306a36Sopenharmony_ci zip2 v12.2d, v17.2d, v19.2d 65962306a36Sopenharmony_ci stp a2, a3, [x1, #-56] 66062306a36Sopenharmony_ci 66162306a36Sopenharmony_ci subs x6, x4, #192 66262306a36Sopenharmony_ci ld1 {v16.16b-v19.16b}, [x2], #64 66362306a36Sopenharmony_ci csel x2, x2, x3, ge 66462306a36Sopenharmony_ci 66562306a36Sopenharmony_ci zip1 v1.2d, v20.2d, v22.2d 66662306a36Sopenharmony_ci zip2 v5.2d, v20.2d, v22.2d 66762306a36Sopenharmony_ci stp a4, a5, [x1, #-48] 66862306a36Sopenharmony_ci zip1 v9.2d, v21.2d, v23.2d 66962306a36Sopenharmony_ci zip2 v13.2d, v21.2d, v23.2d 67062306a36Sopenharmony_ci stp a6, a7, [x1, #-40] 67162306a36Sopenharmony_ci 67262306a36Sopenharmony_ci subs x7, x4, #256 67362306a36Sopenharmony_ci ld1 {v20.16b-v23.16b}, [x2], #64 67462306a36Sopenharmony_ci csel x2, x2, x3, ge 67562306a36Sopenharmony_ci 67662306a36Sopenharmony_ci zip1 v2.2d, v24.2d, v26.2d 67762306a36Sopenharmony_ci zip2 v6.2d, v24.2d, v26.2d 67862306a36Sopenharmony_ci stp a8, a9, [x1, #-32] 67962306a36Sopenharmony_ci zip1 v10.2d, v25.2d, v27.2d 68062306a36Sopenharmony_ci zip2 v14.2d, v25.2d, v27.2d 68162306a36Sopenharmony_ci stp a10, a11, [x1, #-24] 68262306a36Sopenharmony_ci 68362306a36Sopenharmony_ci subs x8, x4, #320 68462306a36Sopenharmony_ci ld1 {v24.16b-v27.16b}, [x2], #64 68562306a36Sopenharmony_ci csel x2, x2, x3, ge 68662306a36Sopenharmony_ci 68762306a36Sopenharmony_ci zip1 v3.2d, v28.2d, v30.2d 68862306a36Sopenharmony_ci zip2 v7.2d, v28.2d, v30.2d 68962306a36Sopenharmony_ci stp a12, a13, [x1, #-16] 69062306a36Sopenharmony_ci zip1 v11.2d, v29.2d, v31.2d 69162306a36Sopenharmony_ci zip2 v15.2d, v29.2d, v31.2d 69262306a36Sopenharmony_ci stp a14, a15, [x1, #-8] 69362306a36Sopenharmony_ci 69462306a36Sopenharmony_ci tbnz x5, #63, .Lt128 69562306a36Sopenharmony_ci ld1 {v28.16b-v31.16b}, [x2] 69662306a36Sopenharmony_ci 69762306a36Sopenharmony_ci // xor with corresponding input, write to output 69862306a36Sopenharmony_ci eor v16.16b, v16.16b, v0.16b 69962306a36Sopenharmony_ci 
eor v17.16b, v17.16b, v1.16b 70062306a36Sopenharmony_ci eor v18.16b, v18.16b, v2.16b 70162306a36Sopenharmony_ci eor v19.16b, v19.16b, v3.16b 70262306a36Sopenharmony_ci 70362306a36Sopenharmony_ci tbnz x6, #63, .Lt192 70462306a36Sopenharmony_ci 70562306a36Sopenharmony_ci eor v20.16b, v20.16b, v4.16b 70662306a36Sopenharmony_ci eor v21.16b, v21.16b, v5.16b 70762306a36Sopenharmony_ci eor v22.16b, v22.16b, v6.16b 70862306a36Sopenharmony_ci eor v23.16b, v23.16b, v7.16b 70962306a36Sopenharmony_ci 71062306a36Sopenharmony_ci st1 {v16.16b-v19.16b}, [x1], #64 71162306a36Sopenharmony_ci tbnz x7, #63, .Lt256 71262306a36Sopenharmony_ci 71362306a36Sopenharmony_ci eor v24.16b, v24.16b, v8.16b 71462306a36Sopenharmony_ci eor v25.16b, v25.16b, v9.16b 71562306a36Sopenharmony_ci eor v26.16b, v26.16b, v10.16b 71662306a36Sopenharmony_ci eor v27.16b, v27.16b, v11.16b 71762306a36Sopenharmony_ci 71862306a36Sopenharmony_ci st1 {v20.16b-v23.16b}, [x1], #64 71962306a36Sopenharmony_ci tbnz x8, #63, .Lt320 72062306a36Sopenharmony_ci 72162306a36Sopenharmony_ci eor v28.16b, v28.16b, v12.16b 72262306a36Sopenharmony_ci eor v29.16b, v29.16b, v13.16b 72362306a36Sopenharmony_ci eor v30.16b, v30.16b, v14.16b 72462306a36Sopenharmony_ci eor v31.16b, v31.16b, v15.16b 72562306a36Sopenharmony_ci 72662306a36Sopenharmony_ci st1 {v24.16b-v27.16b}, [x1], #64 72762306a36Sopenharmony_ci st1 {v28.16b-v31.16b}, [x1] 72862306a36Sopenharmony_ci 72962306a36Sopenharmony_ci.Lout: frame_pop 73062306a36Sopenharmony_ci ret 73162306a36Sopenharmony_ci 73262306a36Sopenharmony_ci // fewer than 192 bytes of in/output 73362306a36Sopenharmony_ci.Lt192: cbz x5, 1f // exactly 128 bytes? 
73462306a36Sopenharmony_ci ld1 {v28.16b-v31.16b}, [x10] 73562306a36Sopenharmony_ci add x5, x5, x1 73662306a36Sopenharmony_ci tbl v28.16b, {v4.16b-v7.16b}, v28.16b 73762306a36Sopenharmony_ci tbl v29.16b, {v4.16b-v7.16b}, v29.16b 73862306a36Sopenharmony_ci tbl v30.16b, {v4.16b-v7.16b}, v30.16b 73962306a36Sopenharmony_ci tbl v31.16b, {v4.16b-v7.16b}, v31.16b 74062306a36Sopenharmony_ci 74162306a36Sopenharmony_ci0: eor v20.16b, v20.16b, v28.16b 74262306a36Sopenharmony_ci eor v21.16b, v21.16b, v29.16b 74362306a36Sopenharmony_ci eor v22.16b, v22.16b, v30.16b 74462306a36Sopenharmony_ci eor v23.16b, v23.16b, v31.16b 74562306a36Sopenharmony_ci st1 {v20.16b-v23.16b}, [x5] // overlapping stores 74662306a36Sopenharmony_ci1: st1 {v16.16b-v19.16b}, [x1] 74762306a36Sopenharmony_ci b .Lout 74862306a36Sopenharmony_ci 74962306a36Sopenharmony_ci // fewer than 128 bytes of in/output 75062306a36Sopenharmony_ci.Lt128: ld1 {v28.16b-v31.16b}, [x10] 75162306a36Sopenharmony_ci add x5, x5, x1 75262306a36Sopenharmony_ci sub x1, x1, #64 75362306a36Sopenharmony_ci tbl v28.16b, {v0.16b-v3.16b}, v28.16b 75462306a36Sopenharmony_ci tbl v29.16b, {v0.16b-v3.16b}, v29.16b 75562306a36Sopenharmony_ci tbl v30.16b, {v0.16b-v3.16b}, v30.16b 75662306a36Sopenharmony_ci tbl v31.16b, {v0.16b-v3.16b}, v31.16b 75762306a36Sopenharmony_ci ld1 {v16.16b-v19.16b}, [x1] // reload first output block 75862306a36Sopenharmony_ci b 0b 75962306a36Sopenharmony_ci 76062306a36Sopenharmony_ci // fewer than 256 bytes of in/output 76162306a36Sopenharmony_ci.Lt256: cbz x6, 2f // exactly 192 bytes? 
76262306a36Sopenharmony_ci ld1 {v4.16b-v7.16b}, [x10] 76362306a36Sopenharmony_ci add x6, x6, x1 76462306a36Sopenharmony_ci tbl v0.16b, {v8.16b-v11.16b}, v4.16b 76562306a36Sopenharmony_ci tbl v1.16b, {v8.16b-v11.16b}, v5.16b 76662306a36Sopenharmony_ci tbl v2.16b, {v8.16b-v11.16b}, v6.16b 76762306a36Sopenharmony_ci tbl v3.16b, {v8.16b-v11.16b}, v7.16b 76862306a36Sopenharmony_ci 76962306a36Sopenharmony_ci eor v28.16b, v28.16b, v0.16b 77062306a36Sopenharmony_ci eor v29.16b, v29.16b, v1.16b 77162306a36Sopenharmony_ci eor v30.16b, v30.16b, v2.16b 77262306a36Sopenharmony_ci eor v31.16b, v31.16b, v3.16b 77362306a36Sopenharmony_ci st1 {v28.16b-v31.16b}, [x6] // overlapping stores 77462306a36Sopenharmony_ci2: st1 {v20.16b-v23.16b}, [x1] 77562306a36Sopenharmony_ci b .Lout 77662306a36Sopenharmony_ci 77762306a36Sopenharmony_ci // fewer than 320 bytes of in/output 77862306a36Sopenharmony_ci.Lt320: cbz x7, 3f // exactly 256 bytes? 77962306a36Sopenharmony_ci ld1 {v4.16b-v7.16b}, [x10] 78062306a36Sopenharmony_ci add x7, x7, x1 78162306a36Sopenharmony_ci tbl v0.16b, {v12.16b-v15.16b}, v4.16b 78262306a36Sopenharmony_ci tbl v1.16b, {v12.16b-v15.16b}, v5.16b 78362306a36Sopenharmony_ci tbl v2.16b, {v12.16b-v15.16b}, v6.16b 78462306a36Sopenharmony_ci tbl v3.16b, {v12.16b-v15.16b}, v7.16b 78562306a36Sopenharmony_ci 78662306a36Sopenharmony_ci eor v28.16b, v28.16b, v0.16b 78762306a36Sopenharmony_ci eor v29.16b, v29.16b, v1.16b 78862306a36Sopenharmony_ci eor v30.16b, v30.16b, v2.16b 78962306a36Sopenharmony_ci eor v31.16b, v31.16b, v3.16b 79062306a36Sopenharmony_ci st1 {v28.16b-v31.16b}, [x7] // overlapping stores 79162306a36Sopenharmony_ci3: st1 {v24.16b-v27.16b}, [x1] 79262306a36Sopenharmony_ci b .Lout 79362306a36Sopenharmony_ciSYM_FUNC_END(chacha_4block_xor_neon) 79462306a36Sopenharmony_ci 79562306a36Sopenharmony_ci .section ".rodata", "a", %progbits 79662306a36Sopenharmony_ci .align L1_CACHE_SHIFT 79762306a36Sopenharmony_ci.Lpermute: 79862306a36Sopenharmony_ci .set .Li, 0 
/*
 * Body of the .Lpermute table.
 *
 * Emits 128 consecutive bytes holding (.Li - 64) for .Li = 0 .. 127, which
 * after truncation to 8 bits is 0xc0 .. 0xff followed by 0x00 .. 0x3f.
 * .Li is an assembly-time counter; it has to be 0 when the loop starts
 * (it is reset by the .set that directly follows the .Lpermute label).
 *
 * NOTE(review): the partial-block paths of chacha_4block_xor_neon load 64
 * bytes of TBL indices through x10 and feed them to four-register TBL
 * lookups.  x10 is set up outside this part of the file; presumably it
 * points into this table at an offset derived from the byte count, so that
 * the keystream is shifted and the out-of-range indices (>= 64, which TBL
 * turns into zero bytes) blank the rest - confirm against the code that
 * initialises x10.
 */
	.rept		128
	.byte		(.Li - 64)
	.set		.Li, .Li + 1
	.endr

/*
 * Four 32-bit words 1, 2, 3, 4.
 *
 * NOTE(review): the consumer is not visible in this part of the file; going
 * by the name these are presumably the per-block counter increments used
 * by the multi-block routine - confirm at the load site.
 */
CTRINC:	.word		1, 2, 3, 4

/*
 * TBL index vector implementing a rotate-left by 8 bits of every 32-bit
 * lane.
 *
 * Within each lane the index bytes, from least to most significant, are
 * 3, 0, 1, 2 (plus 4 * lane number): the result's low byte is the source's
 * top byte and the other three bytes move up by one position.
 * chacha_permute loads this constant into v12 and applies it with a
 * single-register TBL for its rotl32(x, 8) steps.
 */
ROT8:	.word		0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f