/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions.
 *
 * Copyright (C) 2015 - 2017 Linaro Ltd.
 * Copyright (C) 2023 Google LLC. <ardb@google.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	.arch		armv8-a
	.fpu		crypto-neon-fp-armv8

	/*
	 * Register aliases.
	 *
	 * Each q<n> alias is accompanied by _L/_H (or l/h) aliases for its
	 * two halves d<2n> and d<2n+1>. Several names below intentionally
	 * refer to the same physical register (XH/IN1 = q4, t4q/XH2 = q9,
	 * MASK/SHASH2_p8 = d28, k16/SHASH2_H = d29, k48/SHASH2_p64 = d31,
	 * s1l..s4h = d20..d27 = halves of HH..HH34, t0q..t3q = XL2, XM2, T2,
	 * T3), so two names for one register must never be live at the same
	 * time.
	 */

	/* hash key, scratch and the low/middle/high partial products */
	SHASH		.req	q0
	T1		.req	q1
	XL		.req	q2
	XM		.req	q3
	XH		.req	q4
	IN1		.req	q4

	SHASH_L		.req	d0
	SHASH_H		.req	d1
	T1_L		.req	d2
	T1_H		.req	d3
	XL_L		.req	d4
	XL_H		.req	d5
	XM_L		.req	d6
	XM_H		.req	d7
	XH_L		.req	d8

	/* temporaries of the vmull.p8 based multiplication (__pmull_p8) */
	t0l		.req	d10
	t0h		.req	d11
	t1l		.req	d12
	t1h		.req	d13
	t2l		.req	d14
	t2h		.req	d15
	t3l		.req	d16
	t3h		.req	d17
	t4l		.req	d18
	t4h		.req	d19

	t0q		.req	q5
	t1q		.req	q6
	t2q		.req	q7
	t3q		.req	q8
	t4q		.req	q9
	XH2		.req	q9

	/* copies of SHASH_L/SHASH_H rotated by 1..4 bytes (p8 path) */
	s1l		.req	d20
	s1h		.req	d21
	s2l		.req	d22
	s2h		.req	d23
	s3l		.req	d24
	s3h		.req	d25
	s4l		.req	d26
	s4h		.req	d27

	MASK		.req	d28
	SHASH2_p8	.req	d28

	/* 16/32/48 bit masks used by __pmull_p8 */
	k16		.req	d29
	k32		.req	d30
	k48		.req	d31
	SHASH2_p64	.req	d31

	/* further key values used by the 4-way aggregated p64 path */
	HH		.req	q10
	HH3		.req	q11
	HH4		.req	q12
	HH34		.req	q13

	HH_L		.req	d20
	HH_H		.req	d21
	HH3_L		.req	d22
	HH3_H		.req	d23
	HH4_L		.req	d24
	HH4_H		.req	d25
	HH34_L		.req	d26
	HH34_H		.req	d27
	SHASH2_H	.req	d29

	/* input blocks / partial products of the aggregated p64 path */
	XL2		.req	q5
	XM2		.req	q6
	T2		.req	q7
	T3		.req	q8

	XL2_L		.req	d10
	XL2_H		.req	d11
	XM2_L		.req	d12
	XM2_H		.req	d13
	T3_L		.req	d16
	T3_H		.req	d17

	.text

	/*
	 * 64x64 -> 128 bit polynomial multiplication using a single
	 * vmull.p64: rd = rn * rm. The b1..b4 arguments are accepted and
	 * ignored so that this macro can be invoked with the same argument
	 * list as __pmull_p8.
	 */
	.macro		__pmull_p64, rd, rn, rm, b1, b2, b3, b4
	vmull.p64	\rd, \rn, \rm
	.endm

	/*
	 * This implementation of 64x64 -> 128 bit polynomial multiplication
	 * using vmull.p8 instructions (8x8 -> 16) is taken from the paper
	 * "Fast Software Polynomial Multiplication on ARM Processors Using
	 * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
	 * Ricardo Dahab (https://hal.inria.fr/hal-01506572)
	 *
	 * It has been slightly tweaked for in-order performance, and to allow
	 * 'rq' to overlap with 'ad' or 'bd'.
	 *
	 * Arguments:
	 *   rq      - q register receiving the 128-bit product
	 *   ad, bd  - d registers holding the two 64-bit operands
	 *   b1..b4  - d registers holding bd rotated by 1, 2, 3 and 4 bytes
	 *             (vext.8 bd, bd, #n). When an argument is left at its
	 *             default (t4l/t3l) the rotated value is computed here.
	 *
	 * Reads the k16/k32/k48 masks and clobbers t0q..t4q.
	 */
	.macro		__pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
	vext.8		t0l, \ad, \ad, #1	@ A1
	.ifc		\b1, t4l
	vext.8		t4l, \bd, \bd, #1	@ B1
	.endif
	vmull.p8	t0q, t0l, \bd		@ F = A1*B
	vext.8		t1l, \ad, \ad, #2	@ A2
	vmull.p8	t4q, \ad, \b1		@ E = A*B1
	.ifc		\b2, t3l
	vext.8		t3l, \bd, \bd, #2	@ B2
	.endif
	vmull.p8	t1q, t1l, \bd		@ H = A2*B
	vext.8		t2l, \ad, \ad, #3	@ A3
	vmull.p8	t3q, \ad, \b2		@ G = A*B2
	veor		t0q, t0q, t4q		@ L = E + F
	.ifc		\b3, t4l
	vext.8		t4l, \bd, \bd, #3	@ B3
	.endif
	vmull.p8	t2q, t2l, \bd		@ J = A3*B
	veor		t0l, t0l, t0h		@ t0 = (L) (P0 + P1) << 8
	veor		t1q, t1q, t3q		@ M = G + H
	.ifc		\b4, t3l
	vext.8		t3l, \bd, \bd, #4	@ B4
	.endif
	vmull.p8	t4q, \ad, \b3		@ I = A*B3
	veor		t1l, t1l, t1h		@ t1 = (M) (P2 + P3) << 16
	vmull.p8	t3q, \ad, \b4		@ K = A*B4
	vand		t0h, t0h, k48
	vand		t1h, t1h, k32
	veor		t2q, t2q, t4q		@ N = I + J
	veor		t0l, t0l, t0h
	veor		t1l, t1l, t1h
	veor		t2l, t2l, t2h		@ t2 = (N) (P4 + P5) << 24
	vand		t2h, t2h, k16
	veor		t3l, t3l, t3h		@ t3 = (K) (P6 + P7) << 32
	vmov.i64	t3h, #0
	vext.8		t0q, t0q, t0q, #15
	veor		t2l, t2l, t2h
	vext.8		t1q, t1q, t1q, #14
	vmull.p8	\rq, \ad, \bd		@ D = A*B
	vext.8		t2q, t2q, t2q, #13
	vext.8		t3q, t3q, t3q, #12
	veor		t0q, t0q, t1q
	veor		t2q, t2q, t3q
	veor		\rq, \rq, t0q
	veor		\rq, \rq, t2q
	.endm

	//
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
	//
	// Works in place on XL/XM/XH, uses T1 as scratch and expects MASK
	// to hold 0xe1 replicated per byte and shifted left by 57 (i.e.
	// 0xc200000000000000). Every user in this file follows it with
	// "veor T1, T1, XH; veor XL, XL, T1" to obtain the result in XL.
	//
	.macro		__pmull_reduce_p64
	vmull.p64	T1, XL_L, MASK

	veor		XH_L, XH_L, XM_H
	vext.8		T1, T1, T1, #8
	veor		XL_H, XL_H, XM_L
	veor		T1, T1, XL

	vmull.p64	XL, T1_H, MASK
	.endm

	//
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
	//
	// Same register contract as __pmull_reduce_p64, except that the
	// multiplications by MASK are replaced by shifts (57, 62, 63 and
	// 1, 6, 1) and T2 is clobbered as an additional scratch register.
	//
	.macro		__pmull_reduce_p8
	veor		XL_H, XL_H, XM_L
	veor		XH_L, XH_L, XM_H

	vshl.i64	T1, XL, #57
	vshl.i64	T2, XL, #62
	veor		T1, T1, T2
	vshl.i64	T2, XL, #63
	veor		T1, T1, T2
	veor		XL_H, XL_H, T1_L
	veor		XH_L, XH_L, T1_H

	vshr.u64	T1, XL, #1
	veor		XH, XH, XL
	veor		XL, XL, T1
	vshr.u64	T1, T1, #6
	vshr.u64	XL, XL, #1
	.endm

	/*
	 * Core GHASH loop, shared by the plain GHASH and the GCM entry points.
	 *
	 * Macro arguments:
	 *   pn        - p64 or p8; selects __pmull_<pn>, __pmull_reduce_<pn>
	 *               and SHASH2_<pn>
	 *   enc       - optional; when non-blank, <enc>_1x / <enc>_4x are
	 *               invoked on each input block (or group of four) before
	 *               it is hashed
	 *   aggregate - when non-zero and pn is p64, blocks are consumed four
	 *               at a time once the block count is a multiple of 4
	 *   head      - when non-zero, the word at [sp] is read as a pointer
	 *               to an optional head block that is hashed first; this
	 *               requires sp to be unmodified since function entry
	 *
	 * Register usage:
	 *   r0 - number of blocks left (counted down with subs)
	 *   r1 - address of the digest, loaded into XL (the caller stores XL)
	 *   r2 - input pointer, post-incremented
	 *   r3 - key pointer (only dereferenced here when enc is non-blank)
	 *   ip - scratch
	 *
	 * Local labels:
	 *   0 - loop entry            1 - 4-way aggregated loop
	 *   2 - single block path     3 - multiply XL by SHASH (T1 = input)
	 *   4 - final reduction
	 *
	 * The loop conditions (beq 4f / bne 0b) rely on the flags set by the
	 * preceding subs/teq; the instructions in between are NEON operations,
	 * which leave the condition flags untouched.
	 */
	.macro		ghash_update, pn, enc, aggregate=1, head=1
	vld1.64		{XL}, [r1]

	.if		\head
	/* do the head block first, if supplied */
	ldr		ip, [sp]
	teq		ip, #0
	beq		0f
	vld1.64		{T1}, [ip]
	teq		r0, #0
	b		3f
	.endif

0:	.ifc		\pn, p64
	.if		\aggregate
	tst		r0, #3			// skip until #blocks is a
	bne		2f			// round multiple of 4

	vld1.8		{XL2-XM2}, [r2]!
1:	vld1.8		{T2-T3}, [r2]!

	.ifnb		\enc
	\enc\()_4x	XL2, XM2, T2, T3

	/*
	 * The AES code uses q9-q15, which overlap HH..HH34, MASK and the
	 * SHASH2 values, so reload and recompute them after encrypting.
	 */
	add		ip, r3, #16
	vld1.64		{HH}, [ip, :128]!
	vld1.64		{HH3-HH4}, [ip, :128]

	veor		SHASH2_p64, SHASH_L, SHASH_H
	veor		SHASH2_H, HH_L, HH_H
	veor		HH34_L, HH3_L, HH3_H
	veor		HH34_H, HH4_L, HH4_H

	vmov.i8		MASK, #0xe1
	vshl.u64	MASK, MASK, #57
	.endif

	vrev64.8	XL2, XL2
	vrev64.8	XM2, XM2

	subs		r0, r0, #4

	vext.8		T1, XL2, XL2, #8
	veor		XL2_H, XL2_H, XL_L
	veor		XL, XL, T1

	vrev64.8	T1, T3
	vrev64.8	T3, T2

	vmull.p64	XH, HH4_H, XL_H			// a1 * b1
	veor		XL2_H, XL2_H, XL_H
	vmull.p64	XL, HH4_L, XL_L			// a0 * b0
	vmull.p64	XM, HH34_H, XL2_H		// (a1 + a0)(b1 + b0)

	vmull.p64	XH2, HH3_H, XM2_L		// a1 * b1
	veor		XM2_L, XM2_L, XM2_H
	vmull.p64	XL2, HH3_L, XM2_H		// a0 * b0
	vmull.p64	XM2, HH34_L, XM2_L		// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	vmull.p64	XH2, HH_H, T3_L			// a1 * b1
	veor		T3_L, T3_L, T3_H
	vmull.p64	XL2, HH_L, T3_H			// a0 * b0
	vmull.p64	XM2, SHASH2_H, T3_L		// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	vmull.p64	XH2, SHASH_H, T1_L		// a1 * b1
	veor		T1_L, T1_L, T1_H
	vmull.p64	XL2, SHASH_L, T1_H		// a0 * b0
	vmull.p64	XM2, SHASH2_p64, T1_L		// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	beq		4f

	vld1.8		{XL2-XM2}, [r2]!

	veor		T1, XL, XH
	veor		XM, XM, T1

	__pmull_reduce_p64

	veor		T1, T1, XH
	veor		XL, XL, T1

	b		1b
	.endif
	.endif

2:	vld1.8		{T1}, [r2]!

	.ifnb		\enc
	\enc\()_1x	T1
	/* d28-d31 were used for AES round keys: restore SHASH2 and MASK */
	veor		SHASH2_p64, SHASH_L, SHASH_H
	vmov.i8		MASK, #0xe1
	vshl.u64	MASK, MASK, #57
	.endif

	subs		r0, r0, #1

3:	/* multiply XL by SHASH in GF(2^128) */
	vrev64.8	T1, T1

	vext.8		IN1, T1, T1, #8
	veor		T1_L, T1_L, XL_H
	veor		XL, XL, IN1

	__pmull_\pn	XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h	@ a1 * b1
	veor		T1, T1, XL
	__pmull_\pn	XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l	@ a0 * b0
	__pmull_\pn	XM, T1_L, SHASH2_\pn			@ (a1+a0)(b1+b0)

4:	veor		T1, XL, XH
	veor		XM, XM, T1

	__pmull_reduce_\pn

	veor		T1, T1, XH
	veor		XL, XL, T1

	bne		0b
	.endm

	/*
	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
	 *			   struct ghash_key const *k, const char *head)
	 *
	 * vmull.p64 variant.
	 *
	 * In:  r0 = blocks, r1 = dg, r2 = src, r3 = k, [sp] = head (may be
	 *      NULL, in which case no head block is processed)
	 * Out: the updated digest is written back to dg
	 *
	 * Loads four 16-byte key values (SHASH, HH, HH3, HH4) from consecutive
	 * slots starting at k and precomputes the XOR of the two halves of
	 * each, as needed for the (a1 + a0)(b1 + b0) products.
	 * Modifies r0, r2, r3, ip and the condition flags.
	 */
ENTRY(pmull_ghash_update_p64)
	vld1.64		{SHASH}, [r3]!
	vld1.64		{HH}, [r3]!
	vld1.64		{HH3-HH4}, [r3]

	veor		SHASH2_p64, SHASH_L, SHASH_H
	veor		SHASH2_H, HH_L, HH_H
	veor		HH34_L, HH3_L, HH3_H
	veor		HH34_H, HH4_L, HH4_H

	vmov.i8		MASK, #0xe1
	vshl.u64	MASK, MASK, #57

	ghash_update	p64
	vst1.64		{XL}, [r1]

	bx		lr
ENDPROC(pmull_ghash_update_p64)

	/*
	 * Same interface as pmull_ghash_update_p64, but built on vmull.p8 for
	 * CPUs without the 64x64->128 multiply. Only the first 16-byte key
	 * value at k is used. The byte-rotated copies of the key halves
	 * (s1l..s4h) and the k16/k32/k48 masks consumed by __pmull_p8 are set
	 * up once here, outside the block loop.
	 */
ENTRY(pmull_ghash_update_p8)
	vld1.64		{SHASH}, [r3]
	veor		SHASH2_p8, SHASH_L, SHASH_H

	vext.8		s1l, SHASH_L, SHASH_L, #1
	vext.8		s2l, SHASH_L, SHASH_L, #2
	vext.8		s3l, SHASH_L, SHASH_L, #3
	vext.8		s4l, SHASH_L, SHASH_L, #4
	vext.8		s1h, SHASH_H, SHASH_H, #1
	vext.8		s2h, SHASH_H, SHASH_H, #2
	vext.8		s3h, SHASH_H, SHASH_H, #3
	vext.8		s4h, SHASH_H, SHASH_H, #4

	vmov.i64	k16, #0xffff
	vmov.i64	k32, #0xffffffff
	vmov.i64	k48, #0xffffffffffff

	ghash_update	p8
	vst1.64		{XL}, [r1]

	bx		lr
ENDPROC(pmull_ghash_update_p8)

	/*
	 * Register aliases for the AES-CTR code: e0..e3 hold up to four
	 * counter blocks / key stream blocks. They overlap XH2/t4q (q9) and
	 * HH, HH3, HH4 (q10-q12) of the GHASH code above.
	 */
	e0		.req	q9
	e1		.req	q10
	e2		.req	q11
	e3		.req	q12
	e0l		.req	d18
/* AES-CTR register aliases, continued: d-register halves of e0, e2 and e3 */
	e0h		.req	d19
	e2l		.req	d22
	e2h		.req	d23
	e3l		.req	d24
	e3h		.req	d25
	/* counter block: ctr0 = IV bytes 0-7, ctr1 = IV bytes 8-11 + counter */
	ctr		.req	q13
	ctr0		.req	d26
	ctr1		.req	d27

	/*
	 * Round key registers. q14/q15 cover d28-d31, i.e. the registers that
	 * the GHASH code refers to as MASK, SHASH2_H and SHASH2_p64.
	 */
	ek0		.req	q14
	ek1		.req	q15

	/*
	 * Apply one full AES round (aese + aesmc) with round key rk to every
	 * register listed in regs.
	 */
	.macro		round, rk:req, regs:vararg
	.irp		r, \regs
	aese.8		\r, \rk
	aesmc.8		\r, \r
	.endr
	.endm

	/*
	 * Encrypt the blocks in regs in place.
	 *
	 *   rkp    - register pointing to the round keys; loaded with a
	 *            128-bit alignment hint and advanced as keys are consumed
	 *   rounds - register holding the number of rounds; values below 12
	 *            take the AES-128 path, 12 the AES-192 path, and larger
	 *            values run two more rounds on top of that
	 *   regs   - q registers holding the blocks
	 *
	 * Round keys are fetched alternately into ek0 and ek1 so that each load
	 * can overlap with the round using the other register. The last
	 * round is aese without aesmc, followed by an XOR with the final
	 * round key. Clobbers rkp, ek0, ek1 and the condition flags.
	 */
	.macro		aes_encrypt, rkp, rounds, regs:vararg
	vld1.8		{ek0-ek1}, [\rkp, :128]!
	cmp		\rounds, #12
	blt		.L\@			// AES-128

	round		ek0, \regs
	vld1.8		{ek0}, [\rkp, :128]!
	round		ek1, \regs
	vld1.8		{ek1}, [\rkp, :128]!

	beq		.L\@			// AES-192

	round		ek0, \regs
	vld1.8		{ek0}, [\rkp, :128]!
	round		ek1, \regs
	vld1.8		{ek1}, [\rkp, :128]!

.L\@:	.rept		4
	round		ek0, \regs
	vld1.8		{ek0}, [\rkp, :128]!
	round		ek1, \regs
	vld1.8		{ek1}, [\rkp, :128]!
	.endr

	round		ek0, \regs
	vld1.8		{ek0}, [\rkp, :128]

	.irp		r, \regs
	aese.8		\r, ek1
	.endr
	.irp		r, \regs
	veor		\r, \r, ek0
	.endr
	.endm

	/*
	 * Produce one block of key stream in e0.
	 *
	 * In:  r3 = key structure (round keys are read from offset 64)
	 *      r5 = 12 byte IV
	 *      r6 = number of AES rounds
	 *      r7 = block counter (CPU endian)
	 * Out: e0 = AES(IV || big-endian counter), r7 incremented by one
	 * Clobbers ip, r8, ctr, ek0, ek1 and the condition flags.
	 *
	 * The IV is fetched with two overlapping 8-byte loads (bytes 0-7 and
	 * bytes 4-11) so that no byte beyond the 12th is read.
	 */
pmull_aes_encrypt:
	add		ip, r5, #4
	vld1.8		{ctr0}, [r5]		// load 12 byte IV
	vld1.8		{ctr1}, [ip]
	rev		r8, r7
	vext.8		ctr1, ctr1, ctr1, #4	// IV bytes 8-11 into the low word
	add		r7, r7, #1
	vmov.32		ctr1[1], r8		// big-endian counter in the top word
	vmov		e0, ctr

	add		ip, r3, #64
	aes_encrypt	ip, r6, e0
	bx		lr
ENDPROC(pmull_aes_encrypt)

	/*
	 * Four-block variant of pmull_aes_encrypt: e0..e3 receive the key
	 * stream for counter values r7, r7 + 1, r7 + 2 and r7 + 3, and r7 is
	 * advanced by four. Same inputs and clobbers as pmull_aes_encrypt.
	 */
pmull_aes_encrypt_4x:
	add		ip, r5, #4
	vld1.8		{ctr0}, [r5]
	vld1.8		{ctr1}, [ip]
	rev		r8, r7
	vext.8		ctr1, ctr1, ctr1, #4
	add		r7, r7, #1
	vmov.32		ctr1[1], r8
	rev		ip, r7
	vmov		e0, ctr
	add		r7, r7, #1
	vmov.32		ctr1[1], ip
	rev		r8, r7
	vmov		e1, ctr
	add		r7, r7, #1
	vmov.32		ctr1[1], r8
	rev		ip, r7
	vmov		e2, ctr
	add		r7, r7, #1
	vmov.32		ctr1[1], ip
	vmov		e3, ctr

	add		ip, r3, #64
	aes_encrypt	ip, r6, e0, e1, e2, e3
	bx		lr
ENDPROC(pmull_aes_encrypt_4x)

	/*
	 * Key stream for finalization: e0 is the key stream block for the
	 * current counter value in r7 (used for a trailing partial block) and
	 * e1 is the encryption of the counter block with counter value 1,
	 * which is XORed into the tag. r7 is overwritten. Other inputs and
	 * clobbers are as for pmull_aes_encrypt.
	 */
pmull_aes_encrypt_final:
	add		ip, r5, #4
	vld1.8		{ctr0}, [r5]
	vld1.8		{ctr1}, [ip]
	rev		r8, r7
	vext.8		ctr1, ctr1, ctr1, #4
	mov		r7, #1 << 24		// BE #1 for the tag
	vmov.32		ctr1[1], r8
	vmov		e0, ctr
	vmov.32		ctr1[1], r7
	vmov		e1, ctr

	add		ip, r3, #64
	aes_encrypt	ip, r6, e0, e1
	bx		lr
ENDPROC(pmull_aes_encrypt_final)

	/*
	 * Per-block hooks invoked by ghash_update (as <enc>_1x / <enc>_4x).
	 *
	 * The enc variants XOR the key stream into the input registers, so
	 * that the registers hashed afterwards contain the ciphertext that
	 * was just produced. The dec variants XOR into e0..e3 instead and
	 * leave the input registers, i.e. the ciphertext, untouched for
	 * hashing. In both cases the result is stored through r4, which is
	 * post-incremented. All of them call into the AES helpers and thus
	 * clobber lr, ip, r8, q9-q15 and the flags, and advance r7.
	 */
	.macro		enc_1x, in0
	bl		pmull_aes_encrypt
	veor		\in0, \in0, e0
	vst1.8		{\in0}, [r4]!
	.endm

	.macro		dec_1x, in0
	bl		pmull_aes_encrypt
	veor		e0, e0, \in0
	vst1.8		{e0}, [r4]!
	.endm

	.macro		enc_4x, in0, in1, in2, in3
	bl		pmull_aes_encrypt_4x

	veor		\in0, \in0, e0
	veor		\in1, \in1, e1
	veor		\in2, \in2, e2
	veor		\in3, \in3, e3

	vst1.8		{\in0-\in1}, [r4]!
	vst1.8		{\in2-\in3}, [r4]!
	.endm

	.macro		dec_4x, in0, in1, in2, in3
	bl		pmull_aes_encrypt_4x

	veor		e0, e0, \in0
	veor		e1, e1, \in1
	veor		e2, e2, \in2
	veor		e3, e3, \in3

	vst1.8		{e0-e1}, [r4]!
	vst1.8		{e2-e3}, [r4]!
	.endm

	/*
	 * void pmull_gcm_encrypt(int blocks, u64 dg[], const char *src,
	 *			  struct gcm_key const *k, char *dst,
	 *			  char *iv, int rounds, u32 counter)
	 *
	 * In:  r0 = blocks, r1 = dg, r2 = src, r3 = k; after the push of six
	 *      registers the stacked arguments are found at sp + 24:
	 *      r4 = dst, r5 = iv, r6 = rounds, r7 = counter
	 * Out: ciphertext written to dst, updated digest written to dg
	 *
	 * head=0 is passed to ghash_update because [sp] no longer refers to
	 * an incoming argument once r4-r8/lr have been pushed.
	 */
ENTRY(pmull_gcm_encrypt)
	push		{r4-r8, lr}
	ldrd		r4, r5, [sp, #24]
	ldrd		r6, r7, [sp, #32]

	vld1.64		{SHASH}, [r3]

	ghash_update	p64, enc, head=0
	vst1.64		{XL}, [r1]

	pop		{r4-r8, pc}
ENDPROC(pmull_gcm_encrypt)

	/*
	 * void pmull_gcm_decrypt(int blocks, u64 dg[], const char *src,
	 *			  struct gcm_key const *k, char *dst,
	 *			  char *iv, int rounds, u32 counter)
	 *
	 * Same register assignment as pmull_gcm_encrypt. The input blocks
	 * are hashed as loaded and the decrypted data is written to dst.
	 */
ENTRY(pmull_gcm_decrypt)
	push		{r4-r8, lr}
	ldrd		r4, r5, [sp, #24]
	ldrd		r6, r7, [sp, #32]

	vld1.64		{SHASH}, [r3]

	ghash_update	p64, dec, head=0
	vst1.64		{XL}, [r1]

	pop		{r4-r8, pc}
ENDPROC(pmull_gcm_decrypt)

	/*
	 * void pmull_gcm_enc_final(int bytes, u64 dg[], char *tag,
	 *			    struct gcm_key const *k, char *head,
	 *			    char *iv, int rounds, u32 counter)
	 *
	 * In:  r0 = bytes (size of the trailing partial block, may be 0),
	 *      r1 = dg, r2 = tag, r3 = k, r4 = head, r5 = iv, r6 = rounds,
	 *      r7 = counter (r4-r7 are loaded from the stack)
	 *
	 * If bytes is non-zero, the last bytes of data at head are encrypted
	 * in place and hashed as a zero padded block. Then the 16 bytes found
	 * at tag are hashed as the final block, and the resulting digest,
	 * byte reversed and XORed with e1, is stored over them as the tag.
	 *
	 * NOTE(review): the tail is processed by loading and storing the 16
	 * bytes that end at head + bytes, so the caller presumably guarantees
	 * that this whole range is addressable - confirm against the caller.
	 */
ENTRY(pmull_gcm_enc_final)
	push		{r4-r8, lr}
	ldrd		r4, r5, [sp, #24]
	ldrd		r6, r7, [sp, #32]

	bl		pmull_aes_encrypt_final

	/* the flags set here are consumed by the bne 3f further down */
	cmp		r0, #0
	beq		.Lenc_final

	mov_l		ip, .Lpermute
	sub		r4, r4, #16
	add		r8, ip, r0
	add		ip, ip, #32
	add		r4, r4, r0
	sub		ip, ip, r0

	vld1.8		{e3}, [r8]		// permute vector for key stream
	vld1.8		{e2}, [ip]		// permute vector for ghash input

	/* move the first 'bytes' key stream bytes to the end, zero the rest */
	vtbl.8		e3l, {e0}, e3l
	vtbl.8		e3h, {e0}, e3h

	vld1.8		{e0}, [r4]		// encrypt tail block
	veor		e0, e0, e3
	vst1.8		{e0}, [r4]

	/* move the ciphertext tail to the front and zero pad it */
	vtbl.8		T1_L, {e0}, e2l
	vtbl.8		T1_H, {e0}, e2h

	vld1.64		{XL}, [r1]
.Lenc_final:
	vld1.64		{SHASH}, [r3, :128]
	vmov.i8		MASK, #0xe1
	veor		SHASH2_p64, SHASH_L, SHASH_H
	vshl.u64	MASK, MASK, #57
	mov		r0, #1
	/* label 3 is the multiply step inside the ghash_update expansion */
	bne		3f			// process head block first
	ghash_update	p64, aggregate=0, head=0

	vrev64.8	XL, XL
	vext.8		XL, XL, XL, #8
	veor		XL, XL, e1

	sub		r2, r2, #16		// rewind src pointer
	vst1.8		{XL}, [r2]		// store tag

	pop		{r4-r8, pc}
ENDPROC(pmull_gcm_enc_final)

	/*
	 * int pmull_gcm_dec_final(int bytes, u64 dg[], char *tag,
	 *			   struct gcm_key const *k, char *head,
	 *			   char *iv, int rounds, u32 counter,
	 *			   const char *otag, int authsize)
	 *
	 * Decryption counterpart of pmull_gcm_enc_final with the same first
	 * eight arguments; otag and authsize are read from sp + 40 after the
	 * push. The partial block, if any, is hashed before it is decrypted
	 * in place. Instead of storing the tag, it is compared with the 16
	 * bytes at otag, taking only the first authsize bytes into account.
	 *
	 * Returns 0 in r0 if those bytes match and a non-zero value otherwise.
	 */
ENTRY(pmull_gcm_dec_final)
	push		{r4-r8, lr}
	ldrd		r4, r5, [sp, #24]
	ldrd		r6, r7, [sp, #32]

	bl		pmull_aes_encrypt_final

	/* the flags set here are consumed by the bne 3f further down */
	cmp		r0, #0
	beq		.Ldec_final

	mov_l		ip, .Lpermute
	sub		r4, r4, #16
	add		r8, ip, r0
	add		ip, ip, #32
	add		r4, r4, r0
	sub		ip, ip, r0

	vld1.8		{e3}, [r8]		// permute vector for key stream
	vld1.8		{e2}, [ip]		// permute vector for ghash input

	vtbl.8		e3l, {e0}, e3l
	vtbl.8		e3h, {e0}, e3h

	vld1.8		{e0}, [r4]

	/* hash input is taken from the ciphertext, i.e. before decrypting */
	vtbl.8		T1_L, {e0}, e2l
	vtbl.8		T1_H, {e0}, e2h

	veor		e0, e0, e3
	vst1.8		{e0}, [r4]

	vld1.64		{XL}, [r1]
.Ldec_final:
	vld1.64		{SHASH}, [r3]
	vmov.i8		MASK, #0xe1
	veor		SHASH2_p64, SHASH_L, SHASH_H
	vshl.u64	MASK, MASK, #57
	mov		r0, #1
	/* label 3 is the multiply step inside the ghash_update expansion */
	bne		3f			// process head block first
	ghash_update	p64, aggregate=0, head=0

	vrev64.8	XL, XL
	vext.8		XL, XL, XL, #8
	veor		XL, XL, e1

	mov_l		ip, .Lpermute
	ldrd		r2, r3, [sp, #40]	// otag and authsize
	vld1.8		{T1}, [r2]
	add		ip, ip, r3
	vceq.i8		T1, T1, XL		// compare tags
	vmvn		T1, T1			// 0 for eq, -1 for ne

	vld1.8		{e0}, [ip]
	vtbl.8		XL_L, {T1}, e0l		// keep authsize bytes only
	vtbl.8		XL_H, {T1}, e0h

	vpmin.s8	XL_L, XL_L, XL_H	// take the minimum s8 across the vector
	vpmin.s8	XL_L, XL_L, XL_L
	vmov.32		r0, XL_L[0]		// fail if != 0x0

	pop		{r4-r8, pc}
ENDPROC(pmull_gcm_dec_final)

	/*
	 * Sliding window table for vtbl.8: 16 out-of-range indexes (0xff,
	 * which vtbl turns into zero bytes), the identity permutation 0..15,
	 * and another 16 out-of-range indexes. A 16-byte load at offset n
	 * (0 <= n <= 32) yields a vector that shifts a block by 16 - n bytes
	 * and zero fills the remainder.
	 */
	.section	".rodata", "a", %progbits
	.align		5
.Lpermute:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff