/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * BLAKE2b digest algorithm, NEON accelerated
 *
 * Copyright 2020 Google LLC
 *
 * Author: Eric Biggers <ebiggers@google.com>
 */

#include <linux/linkage.h>

	.text
	.fpu		neon

	// The arguments to blake2b_compress_neon()
	STATE		.req	r0
	BLOCK		.req	r1
	NBLOCKS		.req	r2
	INC		.req	r3

	// Pointers to the rotation tables
	ROR24_TABLE	.req	r4
	ROR16_TABLE	.req	r5

	// The original stack pointer
	ORIG_SP		.req	r6

	// NEON registers which contain the message words of the current block.
	// M_0-M_3 are occasionally used for other purposes too.
	M_0		.req	d16
	M_1		.req	d17
	M_2		.req	d18
	M_3		.req	d19
	M_4		.req	d20
	M_5		.req	d21
	M_6		.req	d22
	M_7		.req	d23
	M_8		.req	d24
	M_9		.req	d25
	M_10		.req	d26
	M_11		.req	d27
	M_12		.req	d28
	M_13		.req	d29
	M_14		.req	d30
	M_15		.req	d31

	.align		4
	// Tables for computing ror64(x, 24) and ror64(x, 16) using the vtbl.8
	// instruction.  This is the most efficient way to implement these
	// rotation amounts with NEON.  (On Cortex-A53 it's the same speed as
	// vshr.u64 + vsli.u64, while on Cortex-A7 it's faster.)
	//
	// Each table is a byte permutation: output byte i of the 64-bit lane
	// is taken from input byte table[i].
.Lror24_table:
	.byte		3, 4, 5, 6, 7, 0, 1, 2
.Lror16_table:
	.byte		2, 3, 4, 5, 6, 7, 0, 1
	// The BLAKE2b initialization vector
.Lblake2b_IV:
	.quad		0x6a09e667f3bcc908, 0xbb67ae8584caa73b
	.quad		0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1
	.quad		0x510e527fade682d1, 0x9b05688c2b3e6c1f
	.quad		0x1f83d9abfb41bd6b, 0x5be0cd19137e2179

// Execute one round of BLAKE2b by updating the state matrix v[0..15] in the
// NEON registers q0-q7.  The message block is in q8..q15 (M_0-M_15).  The stack
// pointer points to a 32-byte aligned buffer containing a copy of q8 and q9
// (M_0-M_3), so that they can be reloaded if they are used as temporary
// registers.  The macro arguments s0-s15 give the order in which the message
// words are used in this round.  'final' is 1 if this is the final round.
//
// Register layout of the state matrix, in terms of the G function's operands:
//	a = v[0..3]   in q0-q1 (d0-d3)
//	b = v[4..7]   in q2-q3 (d4-d7)
//	c = v[8..11]  in q4-q5 (d8-d11)
//	d = v[12..15] in q6-q7 (d12-d15)
//
// Requires ROR24_TABLE and ROR16_TABLE to point to the rotation tables.
// On exit q8-q9 (M_0-M_3) hold the message words again, except when final=1,
// in which case they are left holding temporaries.
.macro	_blake2b_round	s0, s1, s2, s3, s4, s5, s6, s7, \
			s8, s9, s10, s11, s12, s13, s14, s15, final=0

	// Mix the columns:
	// (v[0], v[4], v[8], v[12]), (v[1], v[5], v[9], v[13]),
	// (v[2], v[6], v[10], v[14]), and (v[3], v[7], v[11], v[15]).

	// a += b + m[blake2b_sigma[r][2*i + 0]];
	vadd.u64	q0, q0, q2
	vadd.u64	q1, q1, q3
	vadd.u64	d0, d0, M_\s0
	vadd.u64	d1, d1, M_\s2
	vadd.u64	d2, d2, M_\s4
	vadd.u64	d3, d3, M_\s6

	// d = ror64(d ^ a, 32);
	veor		q6, q6, q0
	veor		q7, q7, q1
	vrev64.32	q6, q6
	vrev64.32	q7, q7

	// c += d;
	vadd.u64	q4, q4, q6
	vadd.u64	q5, q5, q7

	// b = ror64(b ^ c, 24);
	vld1.8		{M_0}, [ROR24_TABLE, :64]
	veor		q2, q2, q4
	veor		q3, q3, q5
	vtbl.8		d4, {d4}, M_0
	vtbl.8		d5, {d5}, M_0
	vtbl.8		d6, {d6}, M_0
	vtbl.8		d7, {d7}, M_0

	// a += b + m[blake2b_sigma[r][2*i + 1]];
	//
	// M_0 got clobbered above, so we have to reload it if any of the four
	// message words this step needs happens to be M_0.  Otherwise we don't
	// need to reload it here, as it will just get clobbered again below.
.if \s1 == 0 || \s3 == 0 || \s5 == 0 || \s7 == 0
	vld1.8		{M_0}, [sp, :64]
.endif
	vadd.u64	q0, q0, q2
	vadd.u64	q1, q1, q3
	vadd.u64	d0, d0, M_\s1
	vadd.u64	d1, d1, M_\s3
	vadd.u64	d2, d2, M_\s5
	vadd.u64	d3, d3, M_\s7

	// d = ror64(d ^ a, 16);
	vld1.8		{M_0}, [ROR16_TABLE, :64]
	veor		q6, q6, q0
	veor		q7, q7, q1
	vtbl.8		d12, {d12}, M_0
	vtbl.8		d13, {d13}, M_0
	vtbl.8		d14, {d14}, M_0
	vtbl.8		d15, {d15}, M_0

	// c += d;
	vadd.u64	q4, q4, q6
	vadd.u64	q5, q5, q7

	// b = ror64(b ^ c, 63);
	//
	// This rotation amount isn't a multiple of 8, so it has to be
	// implemented using a pair of shifts, which requires temporary
	// registers.  Use q8-q9 (M_0-M_3) for this, and reload them afterwards.
	veor		q8, q2, q4
	veor		q9, q3, q5
	vshr.u64	q2, q8, #63
	vshr.u64	q3, q9, #63
	vsli.u64	q2, q8, #1
	vsli.u64	q3, q9, #1
	vld1.8		{q8-q9}, [sp, :256]

	// Mix the diagonals:
	// (v[0], v[5], v[10], v[15]), (v[1], v[6], v[11], v[12]),
	// (v[2], v[7], v[8], v[13]), and (v[3], v[4], v[9], v[14]).
	//
	// There are two possible ways to do this: use 'vext' instructions to
	// shift the rows of the matrix so that the diagonals become columns,
	// and undo it afterwards; or just use 64-bit operations on 'd'
	// registers instead of 128-bit operations on 'q' registers.  We use the
	// latter approach, as it performs much better on Cortex-A7.

	// a += b + m[blake2b_sigma[r][2*i + 0]];
	vadd.u64	d0, d0, d5
	vadd.u64	d1, d1, d6
	vadd.u64	d2, d2, d7
	vadd.u64	d3, d3, d4
	vadd.u64	d0, d0, M_\s8
	vadd.u64	d1, d1, M_\s10
	vadd.u64	d2, d2, M_\s12
	vadd.u64	d3, d3, M_\s14

	// d = ror64(d ^ a, 32);
	veor		d15, d15, d0
	veor		d12, d12, d1
	veor		d13, d13, d2
	veor		d14, d14, d3
	vrev64.32	d15, d15
	vrev64.32	d12, d12
	vrev64.32	d13, d13
	vrev64.32	d14, d14

	// c += d;
	vadd.u64	d10, d10, d15
	vadd.u64	d11, d11, d12
	vadd.u64	d8, d8, d13
	vadd.u64	d9, d9, d14

	// b = ror64(b ^ c, 24);
	vld1.8		{M_0}, [ROR24_TABLE, :64]
	veor		d5, d5, d10
	veor		d6, d6, d11
	veor		d7, d7, d8
	veor		d4, d4, d9
	vtbl.8		d5, {d5}, M_0
	vtbl.8		d6, {d6}, M_0
	vtbl.8		d7, {d7}, M_0
	vtbl.8		d4, {d4}, M_0

	// a += b + m[blake2b_sigma[r][2*i + 1]];
	//
	// As in the column step, M_0 currently holds the ror24 table, so it
	// only needs to be reloaded if one of these four words is M_0.
.if \s9 == 0 || \s11 == 0 || \s13 == 0 || \s15 == 0
	vld1.8		{M_0}, [sp, :64]
.endif
	vadd.u64	d0, d0, d5
	vadd.u64	d1, d1, d6
	vadd.u64	d2, d2, d7
	vadd.u64	d3, d3, d4
	vadd.u64	d0, d0, M_\s9
	vadd.u64	d1, d1, M_\s11
	vadd.u64	d2, d2, M_\s13
	vadd.u64	d3, d3, M_\s15

	// d = ror64(d ^ a, 16);
	vld1.8		{M_0}, [ROR16_TABLE, :64]
	veor		d15, d15, d0
	veor		d12, d12, d1
	veor		d13, d13, d2
	veor		d14, d14, d3
	vtbl.8		d12, {d12}, M_0
	vtbl.8		d13, {d13}, M_0
	vtbl.8		d14, {d14}, M_0
	vtbl.8		d15, {d15}, M_0

	// c += d;
	vadd.u64	d10, d10, d15
	vadd.u64	d11, d11, d12
	vadd.u64	d8, d8, d13
	vadd.u64	d9, d9, d14

	// b = ror64(b ^ c, 63);
	//
	// The XOR results are written to d16-d19 (q8-q9) in v[4..7] order, so
	// that the shifts below can operate on whole 'q' registers and leave
	// b back in its normal position in q2-q3.
	veor		d16, d4, d9
	veor		d17, d5, d10
	veor		d18, d6, d11
	veor		d19, d7, d8
	vshr.u64	q2, q8, #63
	vshr.u64	q3, q9, #63
	vsli.u64	q2, q8, #1
	vsli.u64	q3, q9, #1
	// Reloading q8-q9 can be skipped on the final round.
.if ! \final
	vld1.8		{q8-q9}, [sp, :256]
.endif
.endm

//
// void blake2b_compress_neon(struct blake2b_state *state,
//			      const u8 *block, size_t nblocks, u32 inc);
//
// Only the first three fields of struct blake2b_state are used:
//	u64 h[8];	(inout)
//	u64 t[2];	(inout)
//	u64 f[2];	(in)
//
// Compresses 'nblocks' consecutive 128-byte message blocks starting at
// 'block' into state->h.  Before each block is compressed, the 128-bit counter
// state->t is incremented by 'inc' and written back.
//
// 'nblocks' must be nonzero: the loop counter is only tested after a block has
// been processed.
//
// Register usage:
//	r0-r3	arguments (BLOCK is advanced, NBLOCKS is counted down)
//	r4-r5	rotation table pointers
//	r6	caller's stack pointer
//	r7-r10	counter arithmetic; r10 also holds the IV pointer
//	ip	pointer into *state
//	q0-q7	state matrix v[0..15]
//	q8-q15	message block
// r4-r10 are saved and restored here.  The condition flags are clobbered.
//
// NOTE(review): all of q0-q15 are overwritten, including d8-d15, and none are
// saved here.  Presumably the caller wraps the call in kernel_neon_begin() /
// kernel_neon_end() -- confirm against the caller.
//
	.align		5
ENTRY(blake2b_compress_neon)
	push		{r4-r10}

	// Allocate a 32-byte stack buffer that is 32-byte aligned.
	mov		ORIG_SP, sp
	sub		ip, sp, #32
	bic		ip, ip, #31
	mov		sp, ip

	adr		ROR24_TABLE, .Lror24_table
	adr		ROR16_TABLE, .Lror16_table

	mov		ip, STATE
	vld1.64		{q0-q1}, [ip]!		// Load h[0..3]
	vld1.64		{q2-q3}, [ip]!		// Load h[4..7]
.Lnext_block:
	// Invariant: ip points to state->t, and q0-q3 contain h[0..7].
	adr		r10, .Lblake2b_IV
	vld1.64		{q14-q15}, [ip]		// Load t[0..1] and f[0..1]
	vld1.64		{q4-q5}, [r10]!		// Load IV[0..3]
	vmov		r7, r8, d28		// Copy t[0] to (r7, r8)
	vld1.64		{q6-q7}, [r10]		// Load IV[4..7]
	adds		r7, r7, INC		// Increment counter
	bcs		.Lslow_inc_ctr
	// No carry out of the low 32 bits, so only the low word of t[0] changed.
	vmov.i32	d28[0], r7
	vst1.64		{d28}, [ip]		// Update t[0]
.Linc_ctr_done:

	// Load the next message block and finish initializing the state matrix
	// 'v'.  Fortunately, there are exactly enough NEON registers to fit the
	// entire state matrix in q0-q7 and the entire message block in q8-15.
	//
	// However, _blake2b_round also needs some extra registers for rotates,
	// so we have to spill some registers.  It's better to spill the message
	// registers than the state registers, as the message doesn't change.
	// Therefore we store a copy of the first 32 bytes of the message block
	// (q8-q9) in an aligned buffer on the stack so that they can be
	// reloaded when needed.  (We could just reload directly from the
	// message buffer, but it's faster to use aligned loads.)
	vld1.8		{q8-q9}, [BLOCK]!
	veor		q6, q6, q14	// v[12..13] = IV[4..5] ^ t[0..1]
	vld1.8		{q10-q11}, [BLOCK]!
	veor		q7, q7, q15	// v[14..15] = IV[6..7] ^ f[0..1]
	vld1.8		{q12-q13}, [BLOCK]!
	vst1.8		{q8-q9}, [sp, :256]
	mov		ip, STATE	// For reloading the old h[] after the rounds
	vld1.8		{q14-q15}, [BLOCK]!

	// Execute the rounds.  Each round is provided the order in which it
	// needs to use the message words.
	_blake2b_round	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
	_blake2b_round	14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
	_blake2b_round	11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
	_blake2b_round	7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
	_blake2b_round	9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
	_blake2b_round	2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
	_blake2b_round	12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
	_blake2b_round	13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
	_blake2b_round	6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
	_blake2b_round	10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0
	_blake2b_round	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
	_blake2b_round	14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 \
			final=1

	// Fold the final state matrix into the hash chaining value:
	//
	//	for (i = 0; i < 8; i++)
	//		h[i] ^= v[i] ^ v[i + 8];
	//
	// The message registers q8-q11 are no longer needed, so they are
	// reused to hold the old h[0..7].
	vld1.64		{q8-q9}, [ip]!		// Load old h[0..3]
	veor		q0, q0, q4		// v[0..1] ^= v[8..9]
	veor		q1, q1, q5		// v[2..3] ^= v[10..11]
	vld1.64		{q10-q11}, [ip]		// Load old h[4..7]
	veor		q2, q2, q6		// v[4..5] ^= v[12..13]
	veor		q3, q3, q7		// v[6..7] ^= v[14..15]
	veor		q0, q0, q8		// v[0..1] ^= h[0..1]
	veor		q1, q1, q9		// v[2..3] ^= h[2..3]
	mov		ip, STATE
	subs		NBLOCKS, NBLOCKS, #1	// nblocks--
	vst1.64		{q0-q1}, [ip]!		// Store new h[0..3]
	veor		q2, q2, q10		// v[4..5] ^= h[4..5]
	veor		q3, q3, q11		// v[6..7] ^= h[6..7]
	vst1.64		{q2-q3}, [ip]!		// Store new h[4..7]

	// Advance to the next block, if there is one.  The new h[0..7] stay in
	// q0-q3 and ip now points to state->t, as .Lnext_block expects.
	bne		.Lnext_block		// nblocks != 0?

	mov		sp, ORIG_SP
	pop		{r4-r10}
	mov		pc, lr

.Lslow_inc_ctr:
	// Handle the case where the counter overflowed its low 32 bits, by
	// carrying the overflow bit into the full 128-bit counter.
	//
	// The carry flag from 'adds' is still live here, since vmov doesn't
	// affect the flags.  r10 is free to reuse because both IV loads have
	// already been issued.
	vmov		r9, r10, d29
	adcs		r8, r8, #0
	adcs		r9, r9, #0
	adc		r10, r10, #0
	vmov		d28, r7, r8
	vmov		d29, r9, r10
	vst1.64		{q14}, [ip]		// Update t[0] and t[1]
	b		.Linc_ctr_done
ENDPROC(blake2b_compress_neon)