/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * BLAKE2s digest algorithm, ARM scalar implementation
 *
 * Copyright 2020 Google LLC
 *
 * Author: Eric Biggers <ebiggers@google.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	// Registers used to hold message words temporarily.  There aren't
	// enough ARM registers to hold the whole message block, so we have to
	// load the words on-demand.
	M_0		.req	r12
	M_1		.req	r14

// The BLAKE2s initialization vector
.Lblake2s_IV:
	.word	0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A
	.word	0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19

// Load the two consecutive 32-bit words at [src + offset] and
// [src + offset + 4] into registers a and b.  A single 'ldrd' is used when
// building for ARMv6 or later; otherwise two separate 'ldr' instructions.
.macro __ldrd		a, b, src, offset
#if __LINUX_ARM_ARCH__ >= 6
	ldrd		\a, \b, [\src, #\offset]
#else
	ldr		\a, [\src, #\offset]
	ldr		\b, [\src, #\offset + 4]
#endif
.endm

// Store registers a and b to the two consecutive 32-bit words at
// [dst + offset] and [dst + offset + 4].  A single 'strd' is used when
// building for ARMv6 or later; otherwise two separate 'str' instructions.
.macro __strd		a, b, dst, offset
#if __LINUX_ARM_ARCH__ >= 6
	strd		\a, \b, [\dst, #\offset]
#else
	str		\a, [\dst, #\offset]
	str		\b, [\dst, #\offset + 4]
#endif
.endm

// Convert the little-endian 32-bit word in register a to CPU byte order.
// This expands to nothing unless __ARMEB__ (big-endian build) is defined, in
// which case the word is byte-swapped with rev_l, using tmp as scratch.
// NOTE(review): rev_l is not defined in this file; presumably it comes from
// <asm/assembler.h>.
.macro _le32_bswap	a, tmp
#ifdef __ARMEB__
	rev_l		\a, \tmp
#endif
.endm

// Apply _le32_bswap to each of the eight registers a-h in turn, sharing the
// single scratch register tmp.
.macro _le32_bswap_8x	a, b, c, d, e, f, g, h,  tmp
	_le32_bswap	\a, \tmp
	_le32_bswap	\b, \tmp
	_le32_bswap	\c, \tmp
	_le32_bswap	\d, \tmp
	_le32_bswap	\e, \tmp
	_le32_bswap	\f, \tmp
	_le32_bswap	\g, \tmp
	_le32_bswap	\h, \tmp
.endm

// Execute a quarter-round of BLAKE2s by mixing two columns or two diagonals.
// (a0, b0, c0, d0) and (a1, b1, c1, d1) give the registers containing the two
// columns/diagonals.  s0-s1 are the word offsets to the message words the first
// column/diagonal needs, and likewise s2-s3 for the second column/diagonal.
// M_0 and M_1 are free to use, and the message block can be found at sp + 32.
//
// Note that to save instructions, the rotations don't happen when the
// pseudocode says they should, but rather they are delayed until the values are
// used.  See the comment above _blake2s_round().
.macro _blake2s_quarterround  a0, b0, c0, d0,  a1, b1, c1, d1,  s0, s1, s2, s3

	// 'brot' and 'drot' are assembler symbols (set with .set by the code
	// that invokes this macro) giving the right-rotation still owed to the
	// incoming b and d values.  On exit, b is left needing a further
	// rotate by 7 and d a further rotate by 8.

	ldr		M_0, [sp, #32 + 4 * \s0]
	ldr		M_1, [sp, #32 + 4 * \s2]

	// a += b + m[blake2s_sigma[r][2*i + 0]];
	add		\a0, \a0, \b0, ror #brot
	add		\a1, \a1, \b1, ror #brot
	add		\a0, \a0, M_0
	add		\a1, \a1, M_1

	// d = ror32(d ^ a, 16);
	eor		\d0, \a0, \d0, ror #drot
	eor		\d1, \a1, \d1, ror #drot

	// c += d;
	add		\c0, \c0, \d0, ror #16
	add		\c1, \c1, \d1, ror #16

	// b = ror32(b ^ c, 12);
	eor		\b0, \c0, \b0, ror #brot
	eor		\b1, \c1, \b1, ror #brot

	ldr		M_0, [sp, #32 + 4 * \s1]
	ldr		M_1, [sp, #32 + 4 * \s3]

	// a += b + m[blake2s_sigma[r][2*i + 1]];
	add		\a0, \a0, \b0, ror #12
	add		\a1, \a1, \b1, ror #12
	add		\a0, \a0, M_0
	add		\a1, \a1, M_1

	// d = ror32(d ^ a, 8);
	eor		\d0, \a0, \d0, ror#16
	eor		\d1, \a1, \d1, ror#16

	// c += d;
	add		\c0, \c0, \d0, ror#8
	add		\c1, \c1, \d1, ror#8

	// b = ror32(b ^ c, 7);
	eor		\b0, \c0, \b0, ror#12
	eor		\b1, \c1, \b1, ror#12
.endm

// Execute one round of BLAKE2s by updating the state matrix v[0..15].  v[0..9]
// are in r0..r9.  The stack pointer points to 8 bytes of scratch space for
// spilling v[8..9], then to v[10..15], then to the message block.  r10-r12 and
// r14 are free to use.  The macro arguments s0-s15 give the order in which the
// message words are used in this round.
//
// All rotates are performed using the implicit rotate operand accepted by the
// 'add' and 'eor' instructions.  This is faster than using explicit rotate
// instructions.  To make this work, we allow the values in the second and last
// rows of the BLAKE2s state matrix (rows 'b' and 'd') to temporarily have the
// wrong rotation amount.  The rotation amount is then fixed up just in time
// when the values are used.  'brot' is the number of bits the values in row 'b'
// need to be rotated right to arrive at the correct values, and 'drot'
// similarly for row 'd'.  (brot, drot) start out as (0, 0) but we make it such
// that they end up as (7, 8) after every round.
.macro	_blake2s_round	s0, s1, s2, s3, s4, s5, s6, s7, \
			s8, s9, s10, s11, s12, s13, s14, s15

	// Mix first two columns:
	// (v[0], v[4], v[8], v[12]) and (v[1], v[5], v[9], v[13]).
	__ldrd		r10, r11, sp, 16	// load v[12] and v[13]
	_blake2s_quarterround	r0, r4, r8, r10,  r1, r5, r9, r11, \
				\s0, \s1, \s2, \s3
	__strd		r8, r9, sp, 0		// spill v[8] and v[9]
	__strd		r10, r11, sp, 16	// store v[12] and v[13]

	// Mix second two columns:
	// (v[2], v[6], v[10], v[14]) and (v[3], v[7], v[11], v[15]).
	__ldrd		r8, r9, sp, 8		// load v[10] and v[11]
	__ldrd		r10, r11, sp, 24	// load v[14] and v[15]
	_blake2s_quarterround	r2, r6, r8, r10,  r3, r7, r9, r11, \
				\s4, \s5, \s6, \s7
	str		r10, [sp, #24]		// store v[14]
	// v[10], v[11], and v[15] are used below, so no need to store them yet.

	// Every b and d value has now been through one quarter-round, so from
	// here on they carry pending rotations of 7 and 8 bits respectively.
	.set brot, 7
	.set drot, 8

	// Mix first two diagonals:
	// (v[0], v[5], v[10], v[15]) and (v[1], v[6], v[11], v[12]).
	ldr		r10, [sp, #16]		// load v[12]
	_blake2s_quarterround	r0, r5, r8, r11,  r1, r6, r9, r10, \
				\s8, \s9, \s10, \s11
	__strd		r8, r9, sp, 8		// store v[10] and v[11]
	str		r11, [sp, #28]		// store v[15]
	str		r10, [sp, #16]		// store v[12]

	// Mix second two diagonals:
	// (v[2], v[7], v[8], v[13]) and (v[3], v[4], v[9], v[14]).
	__ldrd		r8, r9, sp, 0		// load v[8] and v[9]
	__ldrd		r10, r11, sp, 20	// load v[13] and v[14]
	_blake2s_quarterround	r2, r7, r8, r10,  r3, r4, r9, r11, \
				\s12, \s13, \s14, \s15
	__strd		r10, r11, sp, 20	// store v[13] and v[14]
	// v[8] and v[9] stay live in r8-r9 for the next round (or the final
	// fold), so they are not written back here.
.endm

//
// void blake2s_compress(struct blake2s_state *state,
//			 const u8 *block, size_t nblocks, u32 inc);
//
// Only the first three fields of struct blake2s_state are used:
//	u32 h[8];	(inout)
//	u32 t[2];	(inout)
//	u32 f[2];	(in)
//
// Stack layout while the rounds execute (offsets from sp):
//	  0..7    scratch space for spilling v[8..9]
//	  8..31   v[10..15]
//	 32..95   copy of the current message block, as 16 CPU-endian words
//	 96       saved r0 ('state')
//	100       saved r1 ('block', advanced past each block once copied)
//	104       saved r2 ('nblocks')
//	108..     saved r4-r11, lr
//
	.align		5
ENTRY(blake2s_compress)
	push		{r0-r2,r4-r11,lr}	// keep this an even number

.Lnext_block:
	// r0 is 'state'
	// r1 is 'block'
	// r3 is 'inc'

	// Load and increment the counter t[0..1].
	__ldrd		r10, r11, r0, 32
	adds		r10, r10, r3
	adc		r11, r11, #0		// propagate carry into t[1]
	__strd		r10, r11, r0, 32

	// _blake2s_round is very short on registers, so copy the message block
	// to the stack to save a register during the rounds.  This also has the
	// advantage that misalignment only needs to be dealt with in one place.
	sub		sp, sp, #64
	mov		r12, sp
	tst		r1, #3
	bne		.Lcopy_block_misaligned
	ldmia		r1!, {r2-r9}
	_le32_bswap_8x	r2, r3, r4, r5, r6, r7, r8, r9,  r14
	stmia		r12!, {r2-r9}
	ldmia		r1!, {r2-r9}
	_le32_bswap_8x	r2, r3, r4, r5, r6, r7, r8, r9,  r14
	stmia		r12, {r2-r9}
.Lcopy_block_done:
	// The 64-byte buffer sits directly below the registers pushed on
	// entry, so the saved 'block' slot is at sp + 64 + 4.
	str		r1, [sp, #68]		// Update message pointer

	// Calculate v[8..15].  Push v[10..15] onto the stack, and leave space
	// for spilling v[8..9].  Leave v[8..9] in r8-r9.
	mov		r14, r0			// r14 = state
	adr		r12, .Lblake2s_IV
	ldmia		r12!, {r8-r9}		// load IV[0..1]
	__ldrd		r0, r1, r14, 40		// load f[0..1]
	ldm		r12, {r2-r7}		// load IV[2..7]
	eor		r4, r4, r10		// v[12] = IV[4] ^ t[0]
	eor		r5, r5, r11		// v[13] = IV[5] ^ t[1]
	eor		r6, r6, r0		// v[14] = IV[6] ^ f[0]
	eor		r7, r7, r1		// v[15] = IV[7] ^ f[1]
	push		{r2-r7}			// push v[10..15]
	sub		sp, sp, #8		// leave space for v[8..9]

	// Load h[0..7] == v[0..7].
	ldm		r14, {r0-r7}

	// Execute the rounds.  Each round is provided the order in which it
	// needs to use the message words.
	.set brot, 0
	.set drot, 0
	_blake2s_round	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
	_blake2s_round	14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
	_blake2s_round	11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
	_blake2s_round	7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
	_blake2s_round	9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
	_blake2s_round	2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
	_blake2s_round	12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
	_blake2s_round	13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
	_blake2s_round	6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
	_blake2s_round	10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0

	// Fold the final state matrix into the hash chaining value:
	//
	//	for (i = 0; i < 8; i++)
	//		h[i] ^= v[i] ^ v[i + 8];
	//
	// Rows 'b' (v[4..7]) and 'd' (v[12..15]) still carry their pending
	// rotations, which are applied by the 'ror #brot' / 'ror #drot'
	// operands below.
	ldr		r14, [sp, #96]		// r14 = &h[0]
	add		sp, sp, #8		// v[8..9] are already loaded.
	pop		{r10-r11}		// load v[10..11]
	eor		r0, r0, r8
	eor		r1, r1, r9
	eor		r2, r2, r10
	eor		r3, r3, r11
	ldm		r14, {r8-r11}		// load h[0..3]
	eor		r0, r0, r8
	eor		r1, r1, r9
	eor		r2, r2, r10
	eor		r3, r3, r11
	stmia		r14!, {r0-r3}		// store new h[0..3]
	ldm		r14, {r0-r3}		// load old h[4..7]
	pop		{r8-r11}		// load v[12..15]
	eor		r0, r0, r4, ror #brot
	eor		r1, r1, r5, ror #brot
	eor		r2, r2, r6, ror #brot
	eor		r3, r3, r7, ror #brot
	eor		r0, r0, r8, ror #drot
	eor		r1, r1, r9, ror #drot
	eor		r2, r2, r10, ror #drot
	eor		r3, r3, r11, ror #drot
	add		sp, sp, #64		// skip copy of message block
	stm		r14, {r0-r3}		// store new h[4..7]

	// Advance to the next block, if there is one.  Note that if there are
	// multiple blocks, then 'inc' (the counter increment amount) must be
	// 64.  So we can simply set it to 64 without re-loading it.
	ldm		sp, {r0, r1, r2}	// load (state, block, nblocks)
	mov		r3, #64			// set 'inc'
	subs		r2, r2, #1		// nblocks--
	str		r2, [sp, #8]
	bne		.Lnext_block		// nblocks != 0?

	pop		{r0-r2,r4-r11,pc}

	// The next message block (pointed to by r1) isn't 4-byte aligned, so it
	// can't be loaded using ldmia.  Copy it to the stack buffer (pointed to
	// by r12) using an alternative method.  r2-r9 are free to use.
.Lcopy_block_misaligned:
	mov		r2, #64			// r2 = bytes left to copy
1:
#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
	ldr		r3, [r1], #4
	_le32_bswap	r3, r4
#else
	// Assemble one little-endian word from four single-byte loads.
	ldrb		r3, [r1, #0]
	ldrb		r4, [r1, #1]
	ldrb		r5, [r1, #2]
	ldrb		r6, [r1, #3]
	add		r1, r1, #4
	orr		r3, r3, r4, lsl #8
	orr		r3, r3, r5, lsl #16
	orr		r3, r3, r6, lsl #24
#endif
	subs		r2, r2, #4
	str		r3, [r12], #4
	bne		1b
	b		.Lcopy_block_done
ENDPROC(blake2s_compress)