/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 2018 Google, Inc.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/*
 * Scalar (integer-register only) ChaCha for 32-bit ARM.
 *
 * Syntax: GNU assembler, passed through the C preprocessor.
 * Exports: chacha_doarm() and hchacha_block_arm(); see the C prototypes in the
 * comments above each entry point.
 *
 * Design notes:
 *
 * 16 registers would be needed to hold the state matrix, but only 14 are
 * available because 'sp' and 'pc' cannot be used.  So we spill the elements
 * (x8, x9) to the stack and swap them out with (x10, x11).  This adds one
 * 'ldrd' and one 'strd' instruction per round.
 *
 * All rotates are performed using the implicit rotate operand accepted by the
 * 'add' and 'eor' instructions.  This is faster than using explicit rotate
 * instructions.  To make this work, we allow the values in the second and last
 * rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the
 * wrong rotation amount.  The rotation amount is then fixed up just in time
 * when the values are used.  'brot' is the number of bits the values in row 'b'
 * need to be rotated right to arrive at the correct values, and 'drot'
 * similarly for row 'd'.  (brot, drot) start out as (0, 0) but we make it such
 * that they end up as (25, 24) after every round.
 */

	// ChaCha state registers
	X0	.req	r0
	X1	.req	r1
	X2	.req	r2
	X3	.req	r3
	X4	.req	r4
	X5	.req	r5
	X6	.req	r6
	X7	.req	r7
	X8_X10	.req	r8	// shared by x8 and x10
	X9_X11	.req	r9	// shared by x9 and x11
	X12	.req	r10
	X13	.req	r11
	X14	.req	r12
	X15	.req	r14

/*
 * _le32_bswap_4x a, b, c, d, tmp
 *
 * On big-endian builds (__ARMEB__) apply rev_l (from <asm/assembler.h>) to
 * each of the four registers \a-\d, passing \tmp as its scratch register.
 * Expands to nothing on little-endian builds.
 */
.macro _le32_bswap_4x	a, b, c, d,  tmp
#ifdef __ARMEB__
	rev_l		\a,  \tmp
	rev_l		\b,  \tmp
	rev_l		\c,  \tmp
	rev_l		\d,  \tmp
#endif
.endm

/*
 * __ldrd a, b, src, offset
 *
 * Load the two consecutive words at [\src + \offset] into \a and \b.
 * Uses a single 'ldrd' on ARMv6+ and two 'ldr' instructions otherwise.
 */
.macro __ldrd		a, b, src, offset
#if __LINUX_ARM_ARCH__ >= 6
	ldrd		\a, \b, [\src, #\offset]
#else
	ldr		\a, [\src, #\offset]
	ldr		\b, [\src, #\offset + 4]
#endif
.endm

/*
 * __strd a, b, dst, offset
 *
 * Store \a and \b to the two consecutive words at [\dst + \offset].
 * Uses a single 'strd' on ARMv6+ and two 'str' instructions otherwise.
 */
.macro __strd		a, b, dst, offset
#if __LINUX_ARM_ARCH__ >= 6
	strd		\a, \b, [\dst, #\offset]
#else
	str		\a, [\dst, #\offset]
	str		\b, [\dst, #\offset + 4]
#endif
.endm

/*
 * _halfround a1, b1, c1, d1, a2, b2, c2, d2
 *
 * Two interleaved ChaCha quarter-rounds: one on (a1, b1, c1, d1) and one on
 * (a2, b2, c2, d2).
 *
 * On entry the 'b' registers are expected to be rotated by the current
 * assemble-time value of 'brot' and the 'd' registers by 'drot'.  On exit the
 * 'b' registers are left rotated by 25 and the 'd' registers by 24.  The macro
 * does not update the 'brot'/'drot' symbols itself; the caller does.
 */
.macro _halfround	a1, b1, c1, d1,  a2, b2, c2, d2

	// a += b; d ^= a; d = rol(d, 16);
	add		\a1, \a1, \b1, ror #brot
	add		\a2, \a2, \b2, ror #brot
	eor		\d1, \a1, \d1, ror #drot
	eor		\d2, \a2, \d2, ror #drot
	// drot == 32 - 16 == 16

	// c += d; b ^= c; b = rol(b, 12);
	add		\c1, \c1, \d1, ror #16
	add		\c2, \c2, \d2, ror #16
	eor		\b1, \c1, \b1, ror #brot
	eor		\b2, \c2, \b2, ror #brot
	// brot == 32 - 12 == 20

	// a += b; d ^= a; d = rol(d, 8);
	add		\a1, \a1, \b1, ror #20
	add		\a2, \a2, \b2, ror #20
	eor		\d1, \a1, \d1, ror #16
	eor		\d2, \a2, \d2, ror #16
	// drot == 32 - 8 == 24

	// c += d; b ^= c; b = rol(b, 7);
	add		\c1, \c1, \d1, ror #24
	add		\c2, \c2, \d2, ror #24
	eor		\b1, \c1, \b1, ror #20
	eor		\b2, \c2, \b2, ror #20
	// brot == 32 - 7 == 25
.endm

/*
 * _doubleround
 *
 * One ChaCha double round (column round followed by diagonal round) on the
 * state held in X0-X15.
 *
 * Stack contract: [sp, #0] is the spill slot for (x8, x9) and [sp, #8] is the
 * spill slot for (x10, x11).  On entry and on exit the registers X8_X10 and
 * X9_X11 hold (x8, x9) and [sp, #8] holds (x10, x11).
 *
 * Sets (brot, drot) to (25, 24) after the first two half-rounds, matching the
 * rotation state that _halfround leaves behind.
 */
.macro _doubleround

	// column round

	// quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13)
	_halfround	X0, X4, X8_X10, X12,  X1, X5, X9_X11, X13

	// save (x8, x9); restore (x10, x11)
	__strd		X8_X10, X9_X11, sp, 0
	__ldrd		X8_X10, X9_X11, sp, 8

	// quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15)
	_halfround	X2, X6, X8_X10, X14,  X3, X7, X9_X11, X15

	.set brot, 25
	.set drot, 24

	// diagonal round

	// quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12)
	_halfround	X0, X5, X8_X10, X15,  X1, X6, X9_X11, X12

	// save (x10, x11); restore (x8, x9)
	__strd		X8_X10, X9_X11, sp, 8
	__ldrd		X8_X10, X9_X11, sp, 0

	// quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14)
	_halfround	X2, X7, X8_X10, X13,  X3, X4, X9_X11, X14
.endm

/*
 * _chacha_permute nrounds
 *
 * Emit \nrounds / 2 unrolled double rounds.  Resets (brot, drot) to (0, 0)
 * first, so the state registers must hold unrotated values on entry.  On exit
 * x4-x7 are rotated by 'brot' and x12-x15 by 'drot'.
 */
.macro _chacha_permute	nrounds
	.set brot, 0
	.set drot, 0
	.rept \nrounds / 2
	_doubleround
	.endr
.endm

/*
 * _chacha nrounds
 *
 * Body of chacha_doarm for a fixed round count: generate keystream blocks and
 * XOR them into the data until LEN is exhausted, incrementing the block
 * counter (x12) in the saved state after every block.
 *
 * Expects the stack layout and register contents described at
 * .Lnext_block below.  Falls out at .Ldone with sp pointing at the saved x0.
 *
 * NOTE(review): LEN is tested with signed condition codes (blt, movle/movgt,
 * bgt) although the C prototype declares it 'unsigned int'; lengths with the
 * top bit set are therefore not treated as large values.
 */
.macro _chacha		nrounds

.Lnext_block\@:
	// Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN
	// Registers contain x0-x9,x12-x15.

	// Do the core ChaCha permutation to update x0-x15.
	_chacha_permute	\nrounds

	add		sp, #8
	// Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN
	// Registers contain x0-x9,x12-x15.
	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.

	// Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15).
	push		{X8_X10, X9_X11, X12, X13, X14, X15}

	// Load (OUT, IN, LEN).
	ldr		r14, [sp, #96]
	ldr		r12, [sp, #100]
	ldr		r11, [sp, #104]

	orr		r10, r14, r12		// combined low bits for alignment test

	// Use slow path if fewer than 64 bytes remain.
	cmp		r11, #64
	blt		.Lxor_slowpath\@

	// Use slow path if IN and/or OUT isn't 4-byte aligned.  Needed even on
	// ARMv6+, since ldmia and stmia (used below) still require alignment.
	tst		r10, #3
	bne		.Lxor_slowpath\@

	// Fast path: XOR 64 bytes of aligned data.

	// Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
	// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT.
	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.

	// x0-x3
	__ldrd		r8, r9, sp, 32
	__ldrd		r10, r11, sp, 40
	add		X0, X0, r8
	add		X1, X1, r9
	add		X2, X2, r10
	add		X3, X3, r11
	_le32_bswap_4x	X0, X1, X2, X3,  r8
	ldmia		r12!, {r8-r11}
	eor		X0, X0, r8
	eor		X1, X1, r9
	eor		X2, X2, r10
	eor		X3, X3, r11
	stmia		r14!, {X0-X3}

	// x4-x7
	__ldrd		r8, r9, sp, 48
	__ldrd		r10, r11, sp, 56
	add		X4, r8, X4, ror #brot
	add		X5, r9, X5, ror #brot
	ldmia		r12!, {X0-X3}		// x0-x3 are done; reuse r0-r3 for input
	add		X6, r10, X6, ror #brot
	add		X7, r11, X7, ror #brot
	_le32_bswap_4x	X4, X5, X6, X7,  r8
	eor		X4, X4, X0
	eor		X5, X5, X1
	eor		X6, X6, X2
	eor		X7, X7, X3
	stmia		r14!, {X4-X7}

	// x8-x15
	pop		{r0-r7}			// (x8-x9,x12-x15,x10-x11)
	// Stack: orig_x0-orig_x15 OUT IN LEN
	__ldrd		r8, r9, sp, 32
	__ldrd		r10, r11, sp, 40
	add		r0, r0, r8		// x8
	add		r1, r1, r9		// x9
	add		r6, r6, r10		// x10
	add		r7, r7, r11		// x11
	_le32_bswap_4x	r0, r1, r6, r7,  r8
	ldmia		r12!, {r8-r11}
	eor		r0, r0, r8		// x8
	eor		r1, r1, r9		// x9
	eor		r6, r6, r10		// x10
	eor		r7, r7, r11		// x11
	stmia		r14!, {r0,r1,r6,r7}
	ldmia		r12!, {r0,r1,r6,r7}
	__ldrd		r8, r9, sp, 48		// r8 = orig_x12 (block counter)
	__ldrd		r10, r11, sp, 56
	add		r2, r8, r2, ror #drot	// x12
	add		r3, r9, r3, ror #drot	// x13
	add		r4, r10, r4, ror #drot	// x14
	add		r5, r11, r5, ror #drot	// x15
	_le32_bswap_4x	r2, r3, r4, r5,  r9	// r9 as scratch keeps r8 intact
	ldr		r9, [sp, #72]		// load LEN
	eor		r2, r2, r0		// x12
	eor		r3, r3, r1		// x13
	eor		r4, r4, r6		// x14
	eor		r5, r5, r7		// x15
	subs		r9, #64			// decrement and check LEN
	stmia		r14!, {r2-r5}

	beq		.Ldone\@

.Lprepare_for_next_block\@:

	// Stack: x0-x15 OUT IN LEN
	// Registers: r8 is the block counter (orig_x12); r9 is the updated LEN;
	// r12 is IN; r14 is OUT.

	// Increment block counter (x12)
	add		r8, #1

	// Store updated (OUT, IN, LEN)
	str		r14, [sp, #64]
	str		r12, [sp, #68]
	str		r9, [sp, #72]

	mov		r14, sp

	// Store updated block counter (x12)
	str		r8, [sp, #48]

	sub		sp, #16

	// Reload state and do next block
	ldmia		r14!, {r0-r11}		// load x0-x11
	__strd		r10, r11, sp, 8		// store x10-x11 before state
	ldmia		r14, {r10-r12,r14}	// load x12-x15
	b		.Lnext_block\@

.Lxor_slowpath\@:
	// Slow path: < 64 bytes remaining, or unaligned input or output buffer.
	// We handle it by storing the 64 bytes of keystream to the stack, then
	// XOR-ing the needed portion with the data.

	// Allocate keystream buffer
	sub		sp, #64
	mov		r14, sp

	// Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
	// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0.
	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.

	// Save keystream for x0-x3
	__ldrd		r8, r9, sp, 96
	__ldrd		r10, r11, sp, 104
	add		X0, X0, r8
	add		X1, X1, r9
	add		X2, X2, r10
	add		X3, X3, r11
	_le32_bswap_4x	X0, X1, X2, X3,  r8
	stmia		r14!, {X0-X3}

	// Save keystream for x4-x7
	__ldrd		r8, r9, sp, 112
	__ldrd		r10, r11, sp, 120
	add		X4, r8, X4, ror #brot
	add		X5, r9, X5, ror #brot
	add		X6, r10, X6, ror #brot
	add		X7, r11, X7, ror #brot
	_le32_bswap_4x	X4, X5, X6, X7,  r8
	add		r8, sp, #64		// r8 = &(pushed x8-x9,x12-x15,x10-x11)
	stmia		r14!, {X4-X7}

	// Save keystream for x8-x15
	ldm		r8, {r0-r7}		// (x8-x9,x12-x15,x10-x11)
	__ldrd		r8, r9, sp, 128
	__ldrd		r10, r11, sp, 136
	add		r0, r0, r8		// x8
	add		r1, r1, r9		// x9
	add		r6, r6, r10		// x10
	add		r7, r7, r11		// x11
	_le32_bswap_4x	r0, r1, r6, r7,  r8
	stmia		r14!, {r0,r1,r6,r7}
	__ldrd		r8, r9, sp, 144		// r8 = orig_x12 (block counter)
	__ldrd		r10, r11, sp, 152
	add		r2, r8, r2, ror #drot	// x12
	add		r3, r9, r3, ror #drot	// x13
	add		r4, r10, r4, ror #drot	// x14
	add		r5, r11, r5, ror #drot	// x15
	_le32_bswap_4x	r2, r3, r4, r5,  r9	// r9 as scratch keeps r8 intact
	stmia		r14, {r2-r5}

	// Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN
	// Registers: r8 is block counter, r12 is IN.

	ldr		r9, [sp, #168]		// LEN
	ldr		r14, [sp, #160]		// OUT
	cmp		r9, #64
	mov		r0, sp			// r0 = &ks0; 'mov' leaves the flags alone
	movle		r1, r9
	movgt		r1, #64
	// r1 is number of bytes to XOR, in range [1, 64]

.if __LINUX_ARM_ARCH__ < 6
	orr		r2, r12, r14
	tst		r2, #3			// IN or OUT misaligned?
	bne		.Lxor_next_byte\@
.endif

	// XOR a word at a time
.rept 16
	subs		r1, #4
	blt		.Lxor_words_done\@
	ldr		r2, [r12], #4
	ldr		r3, [r0], #4
	eor		r2, r2, r3
	str		r2, [r14], #4
.endr
	b		.Lxor_slowpath_done\@
.Lxor_words_done\@:
	// r1 went negative above; its low two bits are the leftover byte count.
	ands		r1, r1, #3
	beq		.Lxor_slowpath_done\@

	// XOR a byte at a time
.Lxor_next_byte\@:
	ldrb		r2, [r12], #1
	ldrb		r3, [r0], #1
	eor		r2, r2, r3
	strb		r2, [r14], #1
	subs		r1, #1
	bne		.Lxor_next_byte\@

.Lxor_slowpath_done\@:
	subs		r9, #64
	add		sp, #96			// drop ks0-ks15 and the 8 pushed words; flags kept
	bgt		.Lprepare_for_next_block\@

.Ldone\@:
.endm	// _chacha

/*
 * void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
 *		     const u32 *state, int nrounds);
 *
 * In:  r0 = dst, r1 = src, r2 = bytes, r3 = state, [sp] = nrounds
 * Returns immediately when bytes == 0.  nrounds == 12 selects the 12-round
 * variant; any other value runs 20 rounds.
 *
 * Stack frame built below (lowest address first):
 *   unused0-unused1 x10-x11 x0-x15 OUT IN LEN r4-r11 lr
 */
ENTRY(chacha_doarm)
	cmp		r2, #0			// len == 0?
	reteq		lr

	ldr		ip, [sp]		// nrounds (fifth argument, on the stack)
	cmp		ip, #12			// flags consumed by 'beq 1f' below; nothing
						// in between sets flags

	push		{r0-r2,r4-r11,lr}	// (OUT, IN, LEN) + callee-saved regs

	// Push state x0-x15 onto stack.
	// Also store an extra copy of x10-x11 just before the state.

	add		X12, r3, #48
	ldm		X12, {X12,X13,X14,X15}
	push		{X12,X13,X14,X15}
	sub		sp, sp, #64

	__ldrd		X8_X10, X9_X11, r3, 40	// x10-x11, fetched before r3 is overwritten
	__strd		X8_X10, X9_X11, sp, 8
	__strd		X8_X10, X9_X11, sp, 56
	ldm		r3, {X0-X9_X11}		// x0-x9; overwrites r3 (state pointer)
	__strd		X0, X1, sp, 16
	__strd		X2, X3, sp, 24
	__strd		X4, X5, sp, 32
	__strd		X6, X7, sp, 40
	__strd		X8_X10, X9_X11, sp, 48

	beq		1f
	_chacha		20

	// sp points at the saved x0: skip x0-x15 (64 bytes) and OUT, IN, LEN (12).
0:	add		sp, #76
	pop		{r4-r11, pc}

1:	_chacha		12
	b		0b
ENDPROC(chacha_doarm)

/*
 * void hchacha_block_arm(const u32 state[16], u32 out[8], int nrounds);
 *
 * In:  r0 = state, r1 = out, r2 = nrounds
 * Runs the ChaCha permutation on a copy of 'state' held in registers and
 * writes words (x0-x3, x12-x15) of the permuted state to 'out'.
 * nrounds == 12 selects the 12-round variant; any other value runs 20 rounds.
 */
ENTRY(hchacha_block_arm)
	push		{r1,r4-r11,lr}		// 'out' + callee-saved regs

	cmp		r2, #12			// ChaCha12 ?

	mov		r14, r0
	ldmia		r14!, {r0-r11}		// load x0-x11
	push		{r10-r11}		// store x10-x11 to stack
	ldm		r14, {r10-r12,r14}	// load x12-x15
	sub		sp, #8			// scratch slot for (x8, x9) used by _doubleround

	beq		1f			// flags still from the cmp above
	_chacha_permute	20

	// Skip over (unused0-unused1, x10-x11)
0:	add		sp, #16

	// Fix up rotations of x12-x15
	ror		X12, X12, #drot
	ror		X13, X13, #drot
	pop		{r4}			// load 'out'
	ror		X14, X14, #drot
	ror		X15, X15, #drot

	// Store (x0-x3,x12-x15) to 'out'
	stm		r4, {X0,X1,X2,X3,X12,X13,X14,X15}

	pop		{r4-r11,pc}

1:	_chacha_permute	12
	b		0b
ENDPROC(hchacha_block_arm)