/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Implementation of POLYVAL using ARMv8 Crypto Extensions.
 *
 * Copyright 2021 Google LLC
 */
/*
 * This is an efficient implementation of POLYVAL using ARMv8 Crypto
 * Extensions. It works on 8 blocks at a time, by precomputing the first 8 key
 * powers h^8, ..., h^1 in the POLYVAL finite field. This precomputation allows
 * us to split finite field multiplication into two steps.
 *
 * In the first step, we consider h^i, m_i as normal polynomials of degree less
 * than 128. We then compute p(x) = h^8m_0 + ... + h^1m_7 where multiplication
 * is simply polynomial multiplication.
 *
 * In the second step, we compute the reduction of p(x) modulo the finite field
 * modulus g(x) = x^128 + x^127 + x^126 + x^121 + 1.
 *
 * This two step process is equivalent to computing h^8m_0 + ... + h^1m_7 where
 * multiplication is finite field multiplication. The advantage is that the
 * two-step process only requires 1 finite field reduction for every 8
 * polynomial multiplications. Further parallelism is gained by interleaving the
 * multiplications and polynomial reductions.
2562306a36Sopenharmony_ci */ 2662306a36Sopenharmony_ci 2762306a36Sopenharmony_ci#include <linux/linkage.h> 2862306a36Sopenharmony_ci#define STRIDE_BLOCKS 8 2962306a36Sopenharmony_ci 3062306a36Sopenharmony_ciKEY_POWERS .req x0 3162306a36Sopenharmony_ciMSG .req x1 3262306a36Sopenharmony_ciBLOCKS_LEFT .req x2 3362306a36Sopenharmony_ciACCUMULATOR .req x3 3462306a36Sopenharmony_ciKEY_START .req x10 3562306a36Sopenharmony_ciEXTRA_BYTES .req x11 3662306a36Sopenharmony_ciTMP .req x13 3762306a36Sopenharmony_ci 3862306a36Sopenharmony_ciM0 .req v0 3962306a36Sopenharmony_ciM1 .req v1 4062306a36Sopenharmony_ciM2 .req v2 4162306a36Sopenharmony_ciM3 .req v3 4262306a36Sopenharmony_ciM4 .req v4 4362306a36Sopenharmony_ciM5 .req v5 4462306a36Sopenharmony_ciM6 .req v6 4562306a36Sopenharmony_ciM7 .req v7 4662306a36Sopenharmony_ciKEY8 .req v8 4762306a36Sopenharmony_ciKEY7 .req v9 4862306a36Sopenharmony_ciKEY6 .req v10 4962306a36Sopenharmony_ciKEY5 .req v11 5062306a36Sopenharmony_ciKEY4 .req v12 5162306a36Sopenharmony_ciKEY3 .req v13 5262306a36Sopenharmony_ciKEY2 .req v14 5362306a36Sopenharmony_ciKEY1 .req v15 5462306a36Sopenharmony_ciPL .req v16 5562306a36Sopenharmony_ciPH .req v17 5662306a36Sopenharmony_ciTMP_V .req v18 5762306a36Sopenharmony_ciLO .req v20 5862306a36Sopenharmony_ciMI .req v21 5962306a36Sopenharmony_ciHI .req v22 6062306a36Sopenharmony_ciSUM .req v23 6162306a36Sopenharmony_ciGSTAR .req v24 6262306a36Sopenharmony_ci 6362306a36Sopenharmony_ci .text 6462306a36Sopenharmony_ci 6562306a36Sopenharmony_ci .arch armv8-a+crypto 6662306a36Sopenharmony_ci .align 4 6762306a36Sopenharmony_ci 6862306a36Sopenharmony_ci.Lgstar: 6962306a36Sopenharmony_ci .quad 0xc200000000000000, 0xc200000000000000 7062306a36Sopenharmony_ci 7162306a36Sopenharmony_ci/* 7262306a36Sopenharmony_ci * Computes the product of two 128-bit polynomials in X and Y and XORs the 7362306a36Sopenharmony_ci * components of the 256-bit product into LO, MI, HI. 
7462306a36Sopenharmony_ci * 7562306a36Sopenharmony_ci * Given: 7662306a36Sopenharmony_ci * X = [X_1 : X_0] 7762306a36Sopenharmony_ci * Y = [Y_1 : Y_0] 7862306a36Sopenharmony_ci * 7962306a36Sopenharmony_ci * We compute: 8062306a36Sopenharmony_ci * LO += X_0 * Y_0 8162306a36Sopenharmony_ci * MI += (X_0 + X_1) * (Y_0 + Y_1) 8262306a36Sopenharmony_ci * HI += X_1 * Y_1 8362306a36Sopenharmony_ci * 8462306a36Sopenharmony_ci * Later, the 256-bit result can be extracted as: 8562306a36Sopenharmony_ci * [HI_1 : HI_0 + HI_1 + MI_1 + LO_1 : LO_1 + HI_0 + MI_0 + LO_0 : LO_0] 8662306a36Sopenharmony_ci * This step is done when computing the polynomial reduction for efficiency 8762306a36Sopenharmony_ci * reasons. 8862306a36Sopenharmony_ci * 8962306a36Sopenharmony_ci * Karatsuba multiplication is used instead of Schoolbook multiplication because 9062306a36Sopenharmony_ci * it was found to be slightly faster on ARM64 CPUs. 9162306a36Sopenharmony_ci * 9262306a36Sopenharmony_ci */ 9362306a36Sopenharmony_ci.macro karatsuba1 X Y 9462306a36Sopenharmony_ci X .req \X 9562306a36Sopenharmony_ci Y .req \Y 9662306a36Sopenharmony_ci ext v25.16b, X.16b, X.16b, #8 9762306a36Sopenharmony_ci ext v26.16b, Y.16b, Y.16b, #8 9862306a36Sopenharmony_ci eor v25.16b, v25.16b, X.16b 9962306a36Sopenharmony_ci eor v26.16b, v26.16b, Y.16b 10062306a36Sopenharmony_ci pmull2 v28.1q, X.2d, Y.2d 10162306a36Sopenharmony_ci pmull v29.1q, X.1d, Y.1d 10262306a36Sopenharmony_ci pmull v27.1q, v25.1d, v26.1d 10362306a36Sopenharmony_ci eor HI.16b, HI.16b, v28.16b 10462306a36Sopenharmony_ci eor LO.16b, LO.16b, v29.16b 10562306a36Sopenharmony_ci eor MI.16b, MI.16b, v27.16b 10662306a36Sopenharmony_ci .unreq X 10762306a36Sopenharmony_ci .unreq Y 10862306a36Sopenharmony_ci.endm 10962306a36Sopenharmony_ci 11062306a36Sopenharmony_ci/* 11162306a36Sopenharmony_ci * Same as karatsuba1, except overwrites HI, LO, MI rather than XORing into 11262306a36Sopenharmony_ci * them. 
11362306a36Sopenharmony_ci */ 11462306a36Sopenharmony_ci.macro karatsuba1_store X Y 11562306a36Sopenharmony_ci X .req \X 11662306a36Sopenharmony_ci Y .req \Y 11762306a36Sopenharmony_ci ext v25.16b, X.16b, X.16b, #8 11862306a36Sopenharmony_ci ext v26.16b, Y.16b, Y.16b, #8 11962306a36Sopenharmony_ci eor v25.16b, v25.16b, X.16b 12062306a36Sopenharmony_ci eor v26.16b, v26.16b, Y.16b 12162306a36Sopenharmony_ci pmull2 HI.1q, X.2d, Y.2d 12262306a36Sopenharmony_ci pmull LO.1q, X.1d, Y.1d 12362306a36Sopenharmony_ci pmull MI.1q, v25.1d, v26.1d 12462306a36Sopenharmony_ci .unreq X 12562306a36Sopenharmony_ci .unreq Y 12662306a36Sopenharmony_ci.endm 12762306a36Sopenharmony_ci 12862306a36Sopenharmony_ci/* 12962306a36Sopenharmony_ci * Computes the 256-bit polynomial represented by LO, HI, MI. Stores 13062306a36Sopenharmony_ci * the result in PL, PH. 13162306a36Sopenharmony_ci * [PH : PL] = 13262306a36Sopenharmony_ci * [HI_1 : HI_1 + HI_0 + MI_1 + LO_1 : HI_0 + MI_0 + LO_1 + LO_0 : LO_0] 13362306a36Sopenharmony_ci */ 13462306a36Sopenharmony_ci.macro karatsuba2 13562306a36Sopenharmony_ci // v4 = [HI_1 + MI_1 : HI_0 + MI_0] 13662306a36Sopenharmony_ci eor v4.16b, HI.16b, MI.16b 13762306a36Sopenharmony_ci // v4 = [HI_1 + MI_1 + LO_1 : HI_0 + MI_0 + LO_0] 13862306a36Sopenharmony_ci eor v4.16b, v4.16b, LO.16b 13962306a36Sopenharmony_ci // v5 = [HI_0 : LO_1] 14062306a36Sopenharmony_ci ext v5.16b, LO.16b, HI.16b, #8 14162306a36Sopenharmony_ci // v4 = [HI_1 + HI_0 + MI_1 + LO_1 : HI_0 + MI_0 + LO_1 + LO_0] 14262306a36Sopenharmony_ci eor v4.16b, v4.16b, v5.16b 14362306a36Sopenharmony_ci // HI = [HI_0 : HI_1] 14462306a36Sopenharmony_ci ext HI.16b, HI.16b, HI.16b, #8 14562306a36Sopenharmony_ci // LO = [LO_0 : LO_1] 14662306a36Sopenharmony_ci ext LO.16b, LO.16b, LO.16b, #8 14762306a36Sopenharmony_ci // PH = [HI_1 : HI_1 + HI_0 + MI_1 + LO_1] 14862306a36Sopenharmony_ci ext PH.16b, v4.16b, HI.16b, #8 14962306a36Sopenharmony_ci // PL = [HI_0 + MI_0 + LO_1 + LO_0 : LO_0] 15062306a36Sopenharmony_ci 
ext PL.16b, LO.16b, v4.16b, #8 15162306a36Sopenharmony_ci.endm 15262306a36Sopenharmony_ci 15362306a36Sopenharmony_ci/* 15462306a36Sopenharmony_ci * Computes the 128-bit reduction of PH : PL. Stores the result in dest. 15562306a36Sopenharmony_ci * 15662306a36Sopenharmony_ci * This macro computes p(x) mod g(x) where p(x) is in montgomery form and g(x) = 15762306a36Sopenharmony_ci * x^128 + x^127 + x^126 + x^121 + 1. 15862306a36Sopenharmony_ci * 15962306a36Sopenharmony_ci * We have a 256-bit polynomial PH : PL = P_3 : P_2 : P_1 : P_0 that is the 16062306a36Sopenharmony_ci * product of two 128-bit polynomials in Montgomery form. We need to reduce it 16162306a36Sopenharmony_ci * mod g(x). Also, since polynomials in Montgomery form have an "extra" factor 16262306a36Sopenharmony_ci * of x^128, this product has two extra factors of x^128. To get it back into 16362306a36Sopenharmony_ci * Montgomery form, we need to remove one of these factors by dividing by x^128. 16462306a36Sopenharmony_ci * 16562306a36Sopenharmony_ci * To accomplish both of these goals, we add multiples of g(x) that cancel out 16662306a36Sopenharmony_ci * the low 128 bits P_1 : P_0, leaving just the high 128 bits. Since the low 16762306a36Sopenharmony_ci * bits are zero, the polynomial division by x^128 can be done by right 16862306a36Sopenharmony_ci * shifting. 16962306a36Sopenharmony_ci * 17062306a36Sopenharmony_ci * Since the only nonzero term in the low 64 bits of g(x) is the constant term, 17162306a36Sopenharmony_ci * the multiple of g(x) needed to cancel out P_0 is P_0 * g(x). The CPU can 17262306a36Sopenharmony_ci * only do 64x64 bit multiplications, so split P_0 * g(x) into x^128 * P_0 + 17362306a36Sopenharmony_ci * x^64 * g*(x) * P_0 + P_0, where g*(x) is bits 64-127 of g(x). Adding this to 17462306a36Sopenharmony_ci * the original polynomial gives P_3 : P_2 + P_0 + T_1 : P_1 + T_0 : 0, where T 17562306a36Sopenharmony_ci * = T_1 : T_0 = g*(x) * P_0. Thus, bits 0-63 got "folded" into bits 64-191. 
 *
 * Repeating this same process on the next 64 bits "folds" bits 64-127 into bits
 * 128-255, giving the answer in bits 128-255. This time, we need to cancel P_1
 * + T_0 in bits 64-127. The multiple of g(x) required is (P_1 + T_0) * g(x) *
 * x^64. Adding this to our previous computation gives P_3 + P_1 + T_0 + V_1 :
 * P_2 + P_0 + T_1 + V_0 : 0 : 0, where V = V_1 : V_0 = g*(x) * (P_1 + T_0).
 *
 * So our final computation is:
 *   T = T_1 : T_0 = g*(x) * P_0
 *   V = V_1 : V_0 = g*(x) * (P_1 + T_0)
 *   p(x) / x^{128} mod g(x) = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0
 *
 * The implementation below saves a XOR instruction by computing P_1 + T_0 : P_0
 * + T_1 and XORing into dest, rather than separately XORing P_1 : P_0 and T_0 :
 * T_1 into dest. This allows us to reuse P_1 + T_0 when computing V.
19162306a36Sopenharmony_ci */ 19262306a36Sopenharmony_ci.macro montgomery_reduction dest 19362306a36Sopenharmony_ci DEST .req \dest 19462306a36Sopenharmony_ci // TMP_V = T_1 : T_0 = P_0 * g*(x) 19562306a36Sopenharmony_ci pmull TMP_V.1q, PL.1d, GSTAR.1d 19662306a36Sopenharmony_ci // TMP_V = T_0 : T_1 19762306a36Sopenharmony_ci ext TMP_V.16b, TMP_V.16b, TMP_V.16b, #8 19862306a36Sopenharmony_ci // TMP_V = P_1 + T_0 : P_0 + T_1 19962306a36Sopenharmony_ci eor TMP_V.16b, PL.16b, TMP_V.16b 20062306a36Sopenharmony_ci // PH = P_3 + P_1 + T_0 : P_2 + P_0 + T_1 20162306a36Sopenharmony_ci eor PH.16b, PH.16b, TMP_V.16b 20262306a36Sopenharmony_ci // TMP_V = V_1 : V_0 = (P_1 + T_0) * g*(x) 20362306a36Sopenharmony_ci pmull2 TMP_V.1q, TMP_V.2d, GSTAR.2d 20462306a36Sopenharmony_ci eor DEST.16b, PH.16b, TMP_V.16b 20562306a36Sopenharmony_ci .unreq DEST 20662306a36Sopenharmony_ci.endm 20762306a36Sopenharmony_ci 20862306a36Sopenharmony_ci/* 20962306a36Sopenharmony_ci * Compute Polyval on 8 blocks. 21062306a36Sopenharmony_ci * 21162306a36Sopenharmony_ci * If reduce is set, also computes the montgomery reduction of the 21262306a36Sopenharmony_ci * previous full_stride call and XORs with the first message block. 21362306a36Sopenharmony_ci * (m_0 + REDUCE(PL, PH))h^8 + ... + m_7h^1. 21462306a36Sopenharmony_ci * I.e., the first multiplication uses m_0 + REDUCE(PL, PH) instead of m_0. 21562306a36Sopenharmony_ci * 21662306a36Sopenharmony_ci * Sets PL, PH. 
21762306a36Sopenharmony_ci */ 21862306a36Sopenharmony_ci.macro full_stride reduce 21962306a36Sopenharmony_ci eor LO.16b, LO.16b, LO.16b 22062306a36Sopenharmony_ci eor MI.16b, MI.16b, MI.16b 22162306a36Sopenharmony_ci eor HI.16b, HI.16b, HI.16b 22262306a36Sopenharmony_ci 22362306a36Sopenharmony_ci ld1 {M0.16b, M1.16b, M2.16b, M3.16b}, [MSG], #64 22462306a36Sopenharmony_ci ld1 {M4.16b, M5.16b, M6.16b, M7.16b}, [MSG], #64 22562306a36Sopenharmony_ci 22662306a36Sopenharmony_ci karatsuba1 M7 KEY1 22762306a36Sopenharmony_ci .if \reduce 22862306a36Sopenharmony_ci pmull TMP_V.1q, PL.1d, GSTAR.1d 22962306a36Sopenharmony_ci .endif 23062306a36Sopenharmony_ci 23162306a36Sopenharmony_ci karatsuba1 M6 KEY2 23262306a36Sopenharmony_ci .if \reduce 23362306a36Sopenharmony_ci ext TMP_V.16b, TMP_V.16b, TMP_V.16b, #8 23462306a36Sopenharmony_ci .endif 23562306a36Sopenharmony_ci 23662306a36Sopenharmony_ci karatsuba1 M5 KEY3 23762306a36Sopenharmony_ci .if \reduce 23862306a36Sopenharmony_ci eor TMP_V.16b, PL.16b, TMP_V.16b 23962306a36Sopenharmony_ci .endif 24062306a36Sopenharmony_ci 24162306a36Sopenharmony_ci karatsuba1 M4 KEY4 24262306a36Sopenharmony_ci .if \reduce 24362306a36Sopenharmony_ci eor PH.16b, PH.16b, TMP_V.16b 24462306a36Sopenharmony_ci .endif 24562306a36Sopenharmony_ci 24662306a36Sopenharmony_ci karatsuba1 M3 KEY5 24762306a36Sopenharmony_ci .if \reduce 24862306a36Sopenharmony_ci pmull2 TMP_V.1q, TMP_V.2d, GSTAR.2d 24962306a36Sopenharmony_ci .endif 25062306a36Sopenharmony_ci 25162306a36Sopenharmony_ci karatsuba1 M2 KEY6 25262306a36Sopenharmony_ci .if \reduce 25362306a36Sopenharmony_ci eor SUM.16b, PH.16b, TMP_V.16b 25462306a36Sopenharmony_ci .endif 25562306a36Sopenharmony_ci 25662306a36Sopenharmony_ci karatsuba1 M1 KEY7 25762306a36Sopenharmony_ci eor M0.16b, M0.16b, SUM.16b 25862306a36Sopenharmony_ci 25962306a36Sopenharmony_ci karatsuba1 M0 KEY8 26062306a36Sopenharmony_ci karatsuba2 26162306a36Sopenharmony_ci.endm 26262306a36Sopenharmony_ci 26362306a36Sopenharmony_ci/* 
26462306a36Sopenharmony_ci * Handle any extra blocks after full_stride loop. 26562306a36Sopenharmony_ci */ 26662306a36Sopenharmony_ci.macro partial_stride 26762306a36Sopenharmony_ci add KEY_POWERS, KEY_START, #(STRIDE_BLOCKS << 4) 26862306a36Sopenharmony_ci sub KEY_POWERS, KEY_POWERS, BLOCKS_LEFT, lsl #4 26962306a36Sopenharmony_ci ld1 {KEY1.16b}, [KEY_POWERS], #16 27062306a36Sopenharmony_ci 27162306a36Sopenharmony_ci ld1 {TMP_V.16b}, [MSG], #16 27262306a36Sopenharmony_ci eor SUM.16b, SUM.16b, TMP_V.16b 27362306a36Sopenharmony_ci karatsuba1_store KEY1 SUM 27462306a36Sopenharmony_ci sub BLOCKS_LEFT, BLOCKS_LEFT, #1 27562306a36Sopenharmony_ci 27662306a36Sopenharmony_ci tst BLOCKS_LEFT, #4 27762306a36Sopenharmony_ci beq .Lpartial4BlocksDone 27862306a36Sopenharmony_ci ld1 {M0.16b, M1.16b, M2.16b, M3.16b}, [MSG], #64 27962306a36Sopenharmony_ci ld1 {KEY8.16b, KEY7.16b, KEY6.16b, KEY5.16b}, [KEY_POWERS], #64 28062306a36Sopenharmony_ci karatsuba1 M0 KEY8 28162306a36Sopenharmony_ci karatsuba1 M1 KEY7 28262306a36Sopenharmony_ci karatsuba1 M2 KEY6 28362306a36Sopenharmony_ci karatsuba1 M3 KEY5 28462306a36Sopenharmony_ci.Lpartial4BlocksDone: 28562306a36Sopenharmony_ci tst BLOCKS_LEFT, #2 28662306a36Sopenharmony_ci beq .Lpartial2BlocksDone 28762306a36Sopenharmony_ci ld1 {M0.16b, M1.16b}, [MSG], #32 28862306a36Sopenharmony_ci ld1 {KEY8.16b, KEY7.16b}, [KEY_POWERS], #32 28962306a36Sopenharmony_ci karatsuba1 M0 KEY8 29062306a36Sopenharmony_ci karatsuba1 M1 KEY7 29162306a36Sopenharmony_ci.Lpartial2BlocksDone: 29262306a36Sopenharmony_ci tst BLOCKS_LEFT, #1 29362306a36Sopenharmony_ci beq .LpartialDone 29462306a36Sopenharmony_ci ld1 {M0.16b}, [MSG], #16 29562306a36Sopenharmony_ci ld1 {KEY8.16b}, [KEY_POWERS], #16 29662306a36Sopenharmony_ci karatsuba1 M0 KEY8 29762306a36Sopenharmony_ci.LpartialDone: 29862306a36Sopenharmony_ci karatsuba2 29962306a36Sopenharmony_ci montgomery_reduction SUM 30062306a36Sopenharmony_ci.endm 30162306a36Sopenharmony_ci 30262306a36Sopenharmony_ci/* 
30362306a36Sopenharmony_ci * Perform montgomery multiplication in GF(2^128) and store result in op1. 30462306a36Sopenharmony_ci * 30562306a36Sopenharmony_ci * Computes op1*op2*x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1 30662306a36Sopenharmony_ci * If op1, op2 are in montgomery form, this computes the montgomery 30762306a36Sopenharmony_ci * form of op1*op2. 30862306a36Sopenharmony_ci * 30962306a36Sopenharmony_ci * void pmull_polyval_mul(u8 *op1, const u8 *op2); 31062306a36Sopenharmony_ci */ 31162306a36Sopenharmony_ciSYM_FUNC_START(pmull_polyval_mul) 31262306a36Sopenharmony_ci adr TMP, .Lgstar 31362306a36Sopenharmony_ci ld1 {GSTAR.2d}, [TMP] 31462306a36Sopenharmony_ci ld1 {v0.16b}, [x0] 31562306a36Sopenharmony_ci ld1 {v1.16b}, [x1] 31662306a36Sopenharmony_ci karatsuba1_store v0 v1 31762306a36Sopenharmony_ci karatsuba2 31862306a36Sopenharmony_ci montgomery_reduction SUM 31962306a36Sopenharmony_ci st1 {SUM.16b}, [x0] 32062306a36Sopenharmony_ci ret 32162306a36Sopenharmony_ciSYM_FUNC_END(pmull_polyval_mul) 32262306a36Sopenharmony_ci 32362306a36Sopenharmony_ci/* 32462306a36Sopenharmony_ci * Perform polynomial evaluation as specified by POLYVAL. This computes: 32562306a36Sopenharmony_ci * h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1} 32662306a36Sopenharmony_ci * where n=nblocks, h is the hash key, and m_i are the message blocks. 32762306a36Sopenharmony_ci * 32862306a36Sopenharmony_ci * x0 - pointer to precomputed key powers h^8 ... 
h^1 32962306a36Sopenharmony_ci * x1 - pointer to message blocks 33062306a36Sopenharmony_ci * x2 - number of blocks to hash 33162306a36Sopenharmony_ci * x3 - pointer to accumulator 33262306a36Sopenharmony_ci * 33362306a36Sopenharmony_ci * void pmull_polyval_update(const struct polyval_ctx *ctx, const u8 *in, 33462306a36Sopenharmony_ci * size_t nblocks, u8 *accumulator); 33562306a36Sopenharmony_ci */ 33662306a36Sopenharmony_ciSYM_FUNC_START(pmull_polyval_update) 33762306a36Sopenharmony_ci adr TMP, .Lgstar 33862306a36Sopenharmony_ci mov KEY_START, KEY_POWERS 33962306a36Sopenharmony_ci ld1 {GSTAR.2d}, [TMP] 34062306a36Sopenharmony_ci ld1 {SUM.16b}, [ACCUMULATOR] 34162306a36Sopenharmony_ci subs BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS 34262306a36Sopenharmony_ci blt .LstrideLoopExit 34362306a36Sopenharmony_ci ld1 {KEY8.16b, KEY7.16b, KEY6.16b, KEY5.16b}, [KEY_POWERS], #64 34462306a36Sopenharmony_ci ld1 {KEY4.16b, KEY3.16b, KEY2.16b, KEY1.16b}, [KEY_POWERS], #64 34562306a36Sopenharmony_ci full_stride 0 34662306a36Sopenharmony_ci subs BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS 34762306a36Sopenharmony_ci blt .LstrideLoopExitReduce 34862306a36Sopenharmony_ci.LstrideLoop: 34962306a36Sopenharmony_ci full_stride 1 35062306a36Sopenharmony_ci subs BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS 35162306a36Sopenharmony_ci bge .LstrideLoop 35262306a36Sopenharmony_ci.LstrideLoopExitReduce: 35362306a36Sopenharmony_ci montgomery_reduction SUM 35462306a36Sopenharmony_ci.LstrideLoopExit: 35562306a36Sopenharmony_ci adds BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS 35662306a36Sopenharmony_ci beq .LskipPartial 35762306a36Sopenharmony_ci partial_stride 35862306a36Sopenharmony_ci.LskipPartial: 35962306a36Sopenharmony_ci st1 {SUM.16b}, [ACCUMULATOR] 36062306a36Sopenharmony_ci ret 36162306a36Sopenharmony_ciSYM_FUNC_END(pmull_polyval_update) 362