/*
 * ChaCha/XChaCha NEON helper functions
 *
 * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Based on:
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

/*
 * NEON doesn't have a rotate instruction.  The alternatives are, more or less:
 *
 * (a)  vshl.u32 + vsri.u32		(needs temporary register)
 * (b)  vshl.u32 + vshr.u32 + vorr	(needs temporary register)
 * (c)  vrev32.16			(16-bit rotations only)
 * (d)  vtbl.8 + vtbl.8			(multiple of 8 bits rotations only,
 *					 needs index vector)
 *
 * ChaCha has 16, 12, 8, and 7-bit rotations.  For the 12 and 7-bit rotations,
 * the only choices are (a) and (b).  We use (a) since it takes two-thirds the
 * cycles of (b) on both Cortex-A7 and Cortex-A53.
 *
 * For the 16-bit rotation, we use vrev32.16 since it's consistently fastest
 * and doesn't need a temporary register.
 *
 * For the 8-bit rotation, we use vtbl.8 + vtbl.8.  On Cortex-A7, this sequence
 * is twice as fast as (a), even when doing (a) on multiple registers
 * simultaneously to eliminate the stall between vshl and vsri.  Also, it
 * parallelizes better when temporary registers are scarce.
 *
 * A disadvantage is that on Cortex-A53, the vtbl sequence is the same speed as
 * (a), so the need to load the rotation table actually makes the vtbl method
 * slightly slower overall on that CPU (~1.3% slower ChaCha20).  Still, it
 * seems to be a good compromise to get a more significant speed boost on some
 * CPUs, e.g. ~4.8% faster ChaCha20 on Cortex-A7.
 */

#include <linux/linkage.h>
#include <asm/cache.h>

	.text
	.fpu		neon
	.align		5

/*
 * chacha_permute - permute one block
 *
 * Permute one 64-byte block where the state matrix is stored in the four NEON
 * registers q0-q3.  It performs matrix operations on four words in parallel,
 * but requires shuffling to rearrange the words after each round.
 *
 * The round count is given in r3.  Each loop iteration performs one double
 * round and subtracts 2 from r3; the loop only exits when r3 reaches exactly
 * zero, so r3 must be a non-zero even number on entry.
 *
 * Clobbers: r3, ip, q4-q5
 */
chacha_permute:

	// d10 = byte-index vector used by vtbl.8 for the 8-bit rotations
	adr		ip, .Lrol8_table
	vld1.8		{d10}, [ip, :64]

.Ldoubleround:
	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vadd.i32	q0, q0, q1
	veor		q3, q3, q0
	vrev32.16	q3, q3

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vadd.i32	q2, q2, q3
	veor		q4, q1, q2
	vshl.u32	q1, q4, #12
	vsri.u32	q1, q4, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vadd.i32	q0, q0, q1
	veor		q3, q3, q0
	vtbl.8		d6, {d6}, d10
	vtbl.8		d7, {d7}, d10

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vadd.i32	q2, q2, q3
	veor		q4, q1, q2
	vshl.u32	q1, q4, #7
	vsri.u32	q1, q4, #25

	// x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	vext.8		q1, q1, q1, #4
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vext.8		q2, q2, q2, #8
	// x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	vext.8		q3, q3, q3, #12

	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vadd.i32	q0, q0, q1
	veor		q3, q3, q0
	vrev32.16	q3, q3

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vadd.i32	q2, q2, q3
	veor		q4, q1, q2
	vshl.u32	q1, q4, #12
	vsri.u32	q1, q4, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vadd.i32	q0, q0, q1
	veor		q3, q3, q0
	vtbl.8		d6, {d6}, d10
	vtbl.8		d7, {d7}, d10

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vadd.i32	q2, q2, q3
	veor		q4, q1, q2
	vshl.u32	q1, q4, #7
	vsri.u32	q1, q4, #25

	// x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	vext.8		q1, q1, q1, #12
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vext.8		q2, q2, q2, #8
	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	vext.8		q3, q3, q3, #4

	subs		r3, r3, #2
	bne		.Ldoubleround

	bx		lr
ENDPROC(chacha_permute)

/*
 * chacha_block_xor_neon - XOR one 64-byte data block with one keystream block
 *
 * Loads the 64-byte state from [r0], keeps a copy of it in q8-q11, runs
 * chacha_permute on q0-q3, adds the saved copy back, XORs the result with the
 * 64 bytes at [r2] and stores the 64 resulting bytes at [r1].
 *
 * Uses q0-q11, r3 and ip as scratch; lr is saved and restored on the stack.
 */
ENTRY(chacha_block_xor_neon)
	// r0: Input state matrix, s
	// r1: 1 data block output, o
	// r2: 1 data block input, i
	// r3: nrounds
	push		{lr}

	// x0..3 = s0..3
	add		ip, r0, #0x20
	vld1.32		{q0-q1}, [r0]
	vld1.32		{q2-q3}, [ip]

	// keep the original state; chacha_permute does not touch q8-q11
	vmov		q8, q0
	vmov		q9, q1
	vmov		q10, q2
	vmov		q11, q3

	bl		chacha_permute

	add		ip, r2, #0x20
	vld1.8		{q4-q5}, [r2]
	vld1.8		{q6-q7}, [ip]

	// o0 = i0 ^ (x0 + s0)
	vadd.i32	q0, q0, q8
	veor		q0, q0, q4

	// o1 = i1 ^ (x1 + s1)
	vadd.i32	q1, q1, q9
	veor		q1, q1, q5

	// o2 = i2 ^ (x2 + s2)
	vadd.i32	q2, q2, q10
	veor		q2, q2, q6

	// o3 = i3 ^ (x3 + s3)
	vadd.i32	q3, q3, q11
	veor		q3, q3, q7

	add		ip, r1, #0x20
	vst1.8		{q0-q1}, [r1]
	vst1.8		{q2-q3}, [ip]

	pop		{pc}
ENDPROC(chacha_block_xor_neon)

/*
 * hchacha_block_neon - permute the state and output its first and last rows
 *
 * Loads the 64-byte state from [r0], runs chacha_permute on it and stores
 * q0 followed by q3 (8 32-bit words in total) at [r1].  Unlike
 * chacha_block_xor_neon, the original state is not added back.
 *
 * r0 and r1 are advanced by the post-indexed accesses; r3 is overwritten
 * with the round count before calling chacha_permute.
 */
ENTRY(hchacha_block_neon)
	// r0: Input state matrix, s
	// r1: output (8 32-bit words)
	// r2: nrounds
	push		{lr}

	vld1.32		{q0-q1}, [r0]!
	vld1.32		{q2-q3}, [r0]

	mov		r3, r2			// chacha_permute expects nrounds in r3
	bl		chacha_permute

	vst1.32		{q0}, [r1]!
	vst1.32		{q3}, [r1]

	pop		{pc}
ENDPROC(hchacha_block_neon)

	/*
	 * .Lctrinc:     per-block counter increments, added to x12 of the four
	 *               interleaved blocks in chacha_4block_xor_neon.
	 * .Lrol8_table: vtbl.8 index vector; within each 32-bit lane it selects
	 *               source bytes 3, 0, 1, 2, which is how the code realises
	 *               the rotl32(x, 8) steps.
	 *
	 * .Lctrinc is loaded with a :128 alignment hint and .Lrol8_table (16
	 * bytes later) with a :64 hint, hence the alignment directive below.
	 */
	.align		4
.Lctrinc:	.word	0, 1, 2, 3
.Lrol8_table:	.byte	3, 0, 1, 2, 7, 4, 5, 6

/*
 * chacha_4block_xor_neon - XOR up to four 64-byte blocks with ChaCha keystream
 *
 * Register arguments are listed below.  In addition, the number of bytes to
 * process is read from the first word above the two registers pushed on entry
 * (presumably the fifth C argument, passed on the stack - confirm against the
 * caller).
 *
 * The first 32 output bytes are stored and the next 32 input bytes are loaded
 * unconditionally before the byte count is examined, so the routine appears
 * to expect more than 64 bytes - NOTE(review): confirm against the caller.
 *
 * A 32-byte, 32-byte-aligned scratch buffer is carved out below the incoming
 * stack pointer to spill x8..x9, because q8-q9 double as temporaries inside
 * the round loop.  The original sp is kept in r4 until the buffer is no longer
 * needed.
 */
	.align		5
ENTRY(chacha_4block_xor_neon)
	push		{r4, lr}
	mov		r4, sp			// preserve the stack pointer
	sub		ip, sp, #0x20		// allocate a 32 byte buffer
	bic		ip, ip, #0x1f		// aligned to 32 bytes
	mov		sp, ip

	// r0: Input state matrix, s
	// r1: 4 data blocks output, o
	// r2: 4 data blocks input, i
	// r3: nrounds

	//
	// This function encrypts four consecutive ChaCha blocks by loading
	// the state matrix in NEON registers four times. The algorithm performs
	// each operation on the corresponding word of each state matrix, hence
	// requires no word shuffling. The words are re-interleaved before the
	// final addition of the original state and the XORing step.
	//

	// x0..15[0-3] = s0..15[0-3]
	add		ip, r0, #0x20
	vld1.32		{q0-q1}, [r0]
	vld1.32		{q2-q3}, [ip]

	// Broadcast each state word into its own q register, working from
	// x15 down to x0 so that the source registers q0-q3 are overwritten
	// last.
	adr		lr, .Lctrinc
	vdup.32		q15, d7[1]
	vdup.32		q14, d7[0]
	vld1.32		{q4}, [lr, :128]
	vdup.32		q13, d6[1]
	vdup.32		q12, d6[0]
	vdup.32		q11, d5[1]
	vdup.32		q10, d5[0]
	vadd.u32	q12, q12, q4		// x12 += counter values 0-3
	vdup.32		q9, d4[1]
	vdup.32		q8, d4[0]
	vdup.32		q7, d3[1]
	vdup.32		q6, d3[0]
	vdup.32		q5, d2[1]
	vdup.32		q4, d2[0]
	vdup.32		q3, d1[1]
	vdup.32		q2, d1[0]
	vdup.32		q1, d0[1]
	vdup.32		q0, d0[0]

	adr		ip, .Lrol8_table
	// x8..x9 are still live in q8-q9 on the first pass, so skip the
	// reload from the stack buffer.
	b		1f

.Ldoubleround4:
	vld1.32		{q8-q9}, [sp, :256]
1:
	// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	vadd.i32	q0, q0, q4
	vadd.i32	q1, q1, q5
	vadd.i32	q2, q2, q6
	vadd.i32	q3, q3, q7

	veor		q12, q12, q0
	veor		q13, q13, q1
	veor		q14, q14, q2
	veor		q15, q15, q3

	vrev32.16	q12, q12
	vrev32.16	q13, q13
	vrev32.16	q14, q14
	vrev32.16	q15, q15

	// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	vadd.i32	q8, q8, q12
	vadd.i32	q9, q9, q13
	vadd.i32	q10, q10, q14
	vadd.i32	q11, q11, q15

	// spill x8..x9; q8-q9 are used as temporaries below
	vst1.32		{q8-q9}, [sp, :256]

	veor		q8, q4, q8
	veor		q9, q5, q9
	vshl.u32	q4, q8, #12
	vshl.u32	q5, q9, #12
	vsri.u32	q4, q8, #20
	vsri.u32	q5, q9, #20

	veor		q8, q6, q10
	veor		q9, q7, q11
	vshl.u32	q6, q8, #12
	vshl.u32	q7, q9, #12
	vsri.u32	q6, q8, #20
	vsri.u32	q7, q9, #20

	// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	vld1.8		{d16}, [ip, :64]
	vadd.i32	q0, q0, q4
	vadd.i32	q1, q1, q5
	vadd.i32	q2, q2, q6
	vadd.i32	q3, q3, q7

	veor		q12, q12, q0
	veor		q13, q13, q1
	veor		q14, q14, q2
	veor		q15, q15, q3

	vtbl.8		d24, {d24}, d16
	vtbl.8		d25, {d25}, d16
	vtbl.8		d26, {d26}, d16
	vtbl.8		d27, {d27}, d16
	vtbl.8		d28, {d28}, d16
	vtbl.8		d29, {d29}, d16
	vtbl.8		d30, {d30}, d16
	vtbl.8		d31, {d31}, d16

	vld1.32		{q8-q9}, [sp, :256]

	// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	vadd.i32	q8, q8, q12
	vadd.i32	q9, q9, q13
	vadd.i32	q10, q10, q14
	vadd.i32	q11, q11, q15

	vst1.32		{q8-q9}, [sp, :256]

	veor		q8, q4, q8
	veor		q9, q5, q9
	vshl.u32	q4, q8, #7
	vshl.u32	q5, q9, #7
	vsri.u32	q4, q8, #25
	vsri.u32	q5, q9, #25

	veor		q8, q6, q10
	veor		q9, q7, q11
	vshl.u32	q6, q8, #7
	vshl.u32	q7, q9, #7
	vsri.u32	q6, q8, #25
	vsri.u32	q7, q9, #25

	vld1.32		{q8-q9}, [sp, :256]

	// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	vadd.i32	q0, q0, q5
	vadd.i32	q1, q1, q6
	vadd.i32	q2, q2, q7
	vadd.i32	q3, q3, q4

	veor		q15, q15, q0
	veor		q12, q12, q1
	veor		q13, q13, q2
	veor		q14, q14, q3

	vrev32.16	q15, q15
	vrev32.16	q12, q12
	vrev32.16	q13, q13
	vrev32.16	q14, q14

	// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	vadd.i32	q10, q10, q15
	vadd.i32	q11, q11, q12
	vadd.i32	q8, q8, q13
	vadd.i32	q9, q9, q14

	vst1.32		{q8-q9}, [sp, :256]

	veor		q8, q7, q8
	veor		q9, q4, q9
	vshl.u32	q7, q8, #12
	vshl.u32	q4, q9, #12
	vsri.u32	q7, q8, #20
	vsri.u32	q4, q9, #20

	veor		q8, q5, q10
	veor		q9, q6, q11
	vshl.u32	q5, q8, #12
	vshl.u32	q6, q9, #12
	vsri.u32	q5, q8, #20
	vsri.u32	q6, q9, #20

	// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	vld1.8		{d16}, [ip, :64]
	vadd.i32	q0, q0, q5
	vadd.i32	q1, q1, q6
	vadd.i32	q2, q2, q7
	vadd.i32	q3, q3, q4

	veor		q15, q15, q0
	veor		q12, q12, q1
	veor		q13, q13, q2
	veor		q14, q14, q3

	vtbl.8		d30, {d30}, d16
	vtbl.8		d31, {d31}, d16
	vtbl.8		d24, {d24}, d16
	vtbl.8		d25, {d25}, d16
	vtbl.8		d26, {d26}, d16
	vtbl.8		d27, {d27}, d16
	vtbl.8		d28, {d28}, d16
	vtbl.8		d29, {d29}, d16

	vld1.32		{q8-q9}, [sp, :256]

	// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	vadd.i32	q10, q10, q15
	vadd.i32	q11, q11, q12
	vadd.i32	q8, q8, q13
	vadd.i32	q9, q9, q14

	vst1.32		{q8-q9}, [sp, :256]

	veor		q8, q7, q8
	veor		q9, q4, q9
	vshl.u32	q7, q8, #7
	vshl.u32	q4, q9, #7
	vsri.u32	q7, q8, #25
	vsri.u32	q4, q9, #25

	veor		q8, q5, q10
	veor		q9, q6, q11
	vshl.u32	q5, q8, #7
	vshl.u32	q6, q9, #7
	vsri.u32	q5, q8, #25
	vsri.u32	q6, q9, #25

	subs		r3, r3, #2
	bne		.Ldoubleround4

	// x0..7[0-3] are in q0-q7, x10..15[0-3] are in q10-q15.
	// x8..9[0-3] are on the stack.

	// Re-interleave the words in the first two rows of each block (x0..7).
	// Also add the counter values 0-3 to x12[0-3].
	vld1.32		{q8}, [lr, :128]	// load counter values 0-3
	vzip.32		q0, q1			// => (0 1 0 1) (0 1 0 1)
	vzip.32		q2, q3			// => (2 3 2 3) (2 3 2 3)
	vzip.32		q4, q5			// => (4 5 4 5) (4 5 4 5)
	vzip.32		q6, q7			// => (6 7 6 7) (6 7 6 7)
	vadd.u32	q12, q8			// x12 += counter values 0-3
	vswp		d1, d4
	vswp		d3, d6
	vld1.32		{q8-q9}, [r0]!		// load s0..7
	vswp		d9, d12
	vswp		d11, d14

	// Swap q1 and q4 so that we'll free up consecutive registers (q0-q1)
	// after XORing the first 32 bytes.
	vswp		q1, q4

	// First two rows of each block are (q0 q1) (q2 q6) (q4 q5) (q3 q7)

	// x0..3[0-3] += s0..3[0-3]	(add orig state to 1st row of each block)
	vadd.u32	q0, q0, q8
	vadd.u32	q2, q2, q8
	vadd.u32	q4, q4, q8
	vadd.u32	q3, q3, q8

	// x4..7[0-3] += s4..7[0-3]	(add orig state to 2nd row of each block)
	vadd.u32	q1, q1, q9
	vadd.u32	q6, q6, q9
	vadd.u32	q5, q5, q9
	vadd.u32	q7, q7, q9

	// XOR first 32 bytes using keystream from first two rows of first block
	vld1.8		{q8-q9}, [r2]!
	veor		q8, q8, q0
	veor		q9, q9, q1
	vst1.8		{q8-q9}, [r1]!

	// Re-interleave the words in the last two rows of each block (x8..15).
	vld1.32		{q8-q9}, [sp, :256]
	mov		sp, r4			// restore original stack pointer
	ldr		r4, [r4, #8]		// load number of bytes
	vzip.32		q12, q13		// => (12 13 12 13) (12 13 12 13)
	vzip.32		q14, q15		// => (14 15 14 15) (14 15 14 15)
	vzip.32		q8, q9			// => (8 9 8 9) (8 9 8 9)
	vzip.32		q10, q11		// => (10 11 10 11) (10 11 10 11)
	vld1.32		{q0-q1}, [r0]		// load s8..15
	vswp		d25, d28
	vswp		d27, d30
	vswp		d17, d20
	vswp		d19, d22

	// Last two rows of each block are (q8 q12) (q10 q14) (q9 q13) (q11 q15)

	// x8..11[0-3] += s8..11[0-3]	(add orig state to 3rd row of each block)
	vadd.u32	q8, q8, q0
	vadd.u32	q10, q10, q0
	vadd.u32	q9, q9, q0
	vadd.u32	q11, q11, q0

	// x12..15[0-3] += s12..15[0-3] (add orig state to 4th row of each block)
	vadd.u32	q12, q12, q1
	vadd.u32	q14, q14, q1
	vadd.u32	q13, q13, q1
	vadd.u32	q15, q15, q1

	// XOR the rest of the data with the keystream
	//
	// Each step below loads the next 32 input bytes, XORs them into q0-q1
	// and then checks the remaining byte count in r4 before storing.  When
	// the count shows that the following 32-byte chunk is the last one,
	// control leaves for one of the .Lle*/.Llt256 stubs with the pending
	// output still in q0-q1.

	vld1.8		{q0-q1}, [r2]!
	subs		r4, r4, #96
	veor		q0, q0, q8
	veor		q1, q1, q12
	ble		.Lle96
	vst1.8		{q0-q1}, [r1]!

	vld1.8		{q0-q1}, [r2]!
	subs		r4, r4, #32
	veor		q0, q0, q2
	veor		q1, q1, q6
	ble		.Lle128
	vst1.8		{q0-q1}, [r1]!

	vld1.8		{q0-q1}, [r2]!
	subs		r4, r4, #32
	veor		q0, q0, q10
	veor		q1, q1, q14
	ble		.Lle160
	vst1.8		{q0-q1}, [r1]!

	vld1.8		{q0-q1}, [r2]!
	subs		r4, r4, #32
	veor		q0, q0, q4
	veor		q1, q1, q5
	ble		.Lle192
	vst1.8		{q0-q1}, [r1]!

	vld1.8		{q0-q1}, [r2]!
	subs		r4, r4, #32
	veor		q0, q0, q9
	veor		q1, q1, q13
	ble		.Lle224
	vst1.8		{q0-q1}, [r1]!

	vld1.8		{q0-q1}, [r2]!
	subs		r4, r4, #32
	veor		q0, q0, q3
	veor		q1, q1, q7
	blt		.Llt256
.Lout:
	// Store the pending 32 bytes, then XOR and store one final full
	// 32-byte chunk using the keystream in q11/q15.
	vst1.8		{q0-q1}, [r1]!

	vld1.8		{q0-q1}, [r2]
	veor		q0, q0, q11
	veor		q1, q1, q15
	vst1.8		{q0-q1}, [r1]

	pop		{r4, pc}

.Lle192:
	vmov		q4, q9
	vmov		q5, q13

.Lle160:
	// nothing to do

.Lfinalblock:
	// Process the final block if processing less than 4 full blocks.
	// Entered with 32 bytes of ChaCha cipher stream in q4-q5, and the
	// previous 32 byte output block that still needs to be written at
	// [r1] in q0-q1.
	//
	// The flags are still those of the last 'subs': Z set means the
	// remaining chunk is exactly 32 bytes.
	beq		.Lfullblock

.Lpartialblock:
	// r4 is negative here.  Step the input pointer, the output pointer
	// and the .Lpermute window back by the same amount, so that the final
	// 32-byte load/store ends at the last byte of the data and the
	// keystream bytes are permuted to match.
	adr		lr, .Lpermute + 32
	add		r2, r2, r4
	add		lr, lr, r4
	add		r4, r4, r1

	vld1.8		{q2-q3}, [lr]
	vld1.8		{q6-q7}, [r2]

	add		r4, r4, #32

	vtbl.8		d4, {q4-q5}, d4
	vtbl.8		d5, {q4-q5}, d5
	vtbl.8		d6, {q4-q5}, d6
	vtbl.8		d7, {q4-q5}, d7

	veor		q6, q6, q2
	veor		q7, q7, q3

	vst1.8		{q6-q7}, [r4]	// overlapping stores
	vst1.8		{q0-q1}, [r1]
	pop		{r4, pc}

.Lfullblock:
	// .Lout expects the keystream for the last chunk in q11/q15
	vmov		q11, q4
	vmov		q15, q5
	b		.Lout
.Lle96:
	vmov		q4, q2
	vmov		q5, q6
	b		.Lfinalblock
.Lle128:
	vmov		q4, q10
	vmov		q5, q14
	b		.Lfinalblock
.Lle224:
	vmov		q4, q3
	vmov		q5, q7
	b		.Lfinalblock
.Llt256:
	vmov		q4, q11
	vmov		q5, q15
	b		.Lpartialblock
ENDPROC(chacha_4block_xor_neon)

	/*
	 * Byte indices 0x00..0x1f, repeated twice.  .Lpartialblock loads a
	 * 32-byte window starting at .Lpermute + 32 + r4 (r4 negative) and
	 * uses it as the vtbl.8 index vector for the final partial chunk.
	 */
	.align		L1_CACHE_SHIFT
.Lpermute:
	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	.byte		0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
	.byte		0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	.byte		0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
	.byte		0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f