162306a36Sopenharmony_ci/* SPDX-License-Identifier: GPL-2.0 */ 262306a36Sopenharmony_ci/* 362306a36Sopenharmony_ci * Copyright 2021 Google LLC 462306a36Sopenharmony_ci */ 562306a36Sopenharmony_ci/* 662306a36Sopenharmony_ci * This is an efficient implementation of POLYVAL using intel PCLMULQDQ-NI 762306a36Sopenharmony_ci * instructions. It works on 8 blocks at a time, by precomputing the first 8 862306a36Sopenharmony_ci * keys powers h^8, ..., h^1 in the POLYVAL finite field. This precomputation 962306a36Sopenharmony_ci * allows us to split finite field multiplication into two steps. 1062306a36Sopenharmony_ci * 1162306a36Sopenharmony_ci * In the first step, we consider h^i, m_i as normal polynomials of degree less 1262306a36Sopenharmony_ci * than 128. We then compute p(x) = h^8m_0 + ... + h^1m_7 where multiplication 1362306a36Sopenharmony_ci * is simply polynomial multiplication. 1462306a36Sopenharmony_ci * 1562306a36Sopenharmony_ci * In the second step, we compute the reduction of p(x) modulo the finite field 1662306a36Sopenharmony_ci * modulus g(x) = x^128 + x^127 + x^126 + x^121 + 1. 1762306a36Sopenharmony_ci * 1862306a36Sopenharmony_ci * This two step process is equivalent to computing h^8m_0 + ... + h^1m_7 where 1962306a36Sopenharmony_ci * multiplication is finite field multiplication. The advantage is that the 2062306a36Sopenharmony_ci * two-step process only requires 1 finite field reduction for every 8 2162306a36Sopenharmony_ci * polynomial multiplications. Further parallelism is gained by interleaving the 2262306a36Sopenharmony_ci * multiplications and polynomial reductions. 2362306a36Sopenharmony_ci */ 2462306a36Sopenharmony_ci 2562306a36Sopenharmony_ci#include <linux/linkage.h> 2662306a36Sopenharmony_ci#include <asm/frame.h> 2762306a36Sopenharmony_ci 2862306a36Sopenharmony_ci#define STRIDE_BLOCKS 8 2962306a36Sopenharmony_ci 3062306a36Sopenharmony_ci#define GSTAR %xmm7 3162306a36Sopenharmony_ci#define PL %xmm8 3262306a36Sopenharmony_ci#define PH %xmm9 3362306a36Sopenharmony_ci#define TMP_XMM %xmm11 3462306a36Sopenharmony_ci#define LO %xmm12 3562306a36Sopenharmony_ci#define HI %xmm13 3662306a36Sopenharmony_ci#define MI %xmm14 3762306a36Sopenharmony_ci#define SUM %xmm15 3862306a36Sopenharmony_ci 3962306a36Sopenharmony_ci#define KEY_POWERS %rdi 4062306a36Sopenharmony_ci#define MSG %rsi 4162306a36Sopenharmony_ci#define BLOCKS_LEFT %rdx 4262306a36Sopenharmony_ci#define ACCUMULATOR %rcx 4362306a36Sopenharmony_ci#define TMP %rax 4462306a36Sopenharmony_ci 4562306a36Sopenharmony_ci.section .rodata.cst16.gstar, "aM", @progbits, 16 4662306a36Sopenharmony_ci.align 16 4762306a36Sopenharmony_ci 4862306a36Sopenharmony_ci.Lgstar: 4962306a36Sopenharmony_ci .quad 0xc200000000000000, 0xc200000000000000 5062306a36Sopenharmony_ci 5162306a36Sopenharmony_ci.text 5262306a36Sopenharmony_ci 5362306a36Sopenharmony_ci/* 5462306a36Sopenharmony_ci * Performs schoolbook1_iteration on two lists of 128-bit polynomials of length 5562306a36Sopenharmony_ci * count pointed to by MSG and KEY_POWERS. 5662306a36Sopenharmony_ci */ 5762306a36Sopenharmony_ci.macro schoolbook1 count 5862306a36Sopenharmony_ci .set i, 0 5962306a36Sopenharmony_ci .rept (\count) 6062306a36Sopenharmony_ci schoolbook1_iteration i 0 6162306a36Sopenharmony_ci .set i, (i +1) 6262306a36Sopenharmony_ci .endr 6362306a36Sopenharmony_ci.endm 6462306a36Sopenharmony_ci 6562306a36Sopenharmony_ci/* 6662306a36Sopenharmony_ci * Computes the product of two 128-bit polynomials at the memory locations 6762306a36Sopenharmony_ci * specified by (MSG + 16*i) and (KEY_POWERS + 16*i) and XORs the components of 6862306a36Sopenharmony_ci * the 256-bit product into LO, MI, HI. 6962306a36Sopenharmony_ci * 7062306a36Sopenharmony_ci * Given: 7162306a36Sopenharmony_ci * X = [X_1 : X_0] 7262306a36Sopenharmony_ci * Y = [Y_1 : Y_0] 7362306a36Sopenharmony_ci * 7462306a36Sopenharmony_ci * We compute: 7562306a36Sopenharmony_ci * LO += X_0 * Y_0 7662306a36Sopenharmony_ci * MI += X_0 * Y_1 + X_1 * Y_0 7762306a36Sopenharmony_ci * HI += X_1 * Y_1 7862306a36Sopenharmony_ci * 7962306a36Sopenharmony_ci * Later, the 256-bit result can be extracted as: 8062306a36Sopenharmony_ci * [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0] 8162306a36Sopenharmony_ci * This step is done when computing the polynomial reduction for efficiency 8262306a36Sopenharmony_ci * reasons. 8362306a36Sopenharmony_ci * 8462306a36Sopenharmony_ci * If xor_sum == 1, then also XOR the value of SUM into m_0. This avoids an 8562306a36Sopenharmony_ci * extra multiplication of SUM and h^8. 8662306a36Sopenharmony_ci */ 8762306a36Sopenharmony_ci.macro schoolbook1_iteration i xor_sum 8862306a36Sopenharmony_ci movups (16*\i)(MSG), %xmm0 8962306a36Sopenharmony_ci .if (\i == 0 && \xor_sum == 1) 9062306a36Sopenharmony_ci pxor SUM, %xmm0 9162306a36Sopenharmony_ci .endif 9262306a36Sopenharmony_ci vpclmulqdq $0x01, (16*\i)(KEY_POWERS), %xmm0, %xmm2 9362306a36Sopenharmony_ci vpclmulqdq $0x00, (16*\i)(KEY_POWERS), %xmm0, %xmm1 9462306a36Sopenharmony_ci vpclmulqdq $0x10, (16*\i)(KEY_POWERS), %xmm0, %xmm3 9562306a36Sopenharmony_ci vpclmulqdq $0x11, (16*\i)(KEY_POWERS), %xmm0, %xmm4 9662306a36Sopenharmony_ci vpxor %xmm2, MI, MI 9762306a36Sopenharmony_ci vpxor %xmm1, LO, LO 9862306a36Sopenharmony_ci vpxor %xmm4, HI, HI 9962306a36Sopenharmony_ci vpxor %xmm3, MI, MI 10062306a36Sopenharmony_ci.endm 10162306a36Sopenharmony_ci 10262306a36Sopenharmony_ci/* 10362306a36Sopenharmony_ci * Performs the same computation as schoolbook1_iteration, except we expect the 10462306a36Sopenharmony_ci * arguments to already be loaded into xmm0 and xmm1 and we set the result 10562306a36Sopenharmony_ci * registers LO, MI, and HI directly rather than XOR'ing into them. 10662306a36Sopenharmony_ci */ 10762306a36Sopenharmony_ci.macro schoolbook1_noload 10862306a36Sopenharmony_ci vpclmulqdq $0x01, %xmm0, %xmm1, MI 10962306a36Sopenharmony_ci vpclmulqdq $0x10, %xmm0, %xmm1, %xmm2 11062306a36Sopenharmony_ci vpclmulqdq $0x00, %xmm0, %xmm1, LO 11162306a36Sopenharmony_ci vpclmulqdq $0x11, %xmm0, %xmm1, HI 11262306a36Sopenharmony_ci vpxor %xmm2, MI, MI 11362306a36Sopenharmony_ci.endm 11462306a36Sopenharmony_ci 11562306a36Sopenharmony_ci/* 11662306a36Sopenharmony_ci * Computes the 256-bit polynomial represented by LO, HI, MI. Stores 11762306a36Sopenharmony_ci * the result in PL, PH. 11862306a36Sopenharmony_ci * [PH : PL] = [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0] 11962306a36Sopenharmony_ci */ 12062306a36Sopenharmony_ci.macro schoolbook2 12162306a36Sopenharmony_ci vpslldq $8, MI, PL 12262306a36Sopenharmony_ci vpsrldq $8, MI, PH 12362306a36Sopenharmony_ci pxor LO, PL 12462306a36Sopenharmony_ci pxor HI, PH 12562306a36Sopenharmony_ci.endm 12662306a36Sopenharmony_ci 12762306a36Sopenharmony_ci/* 12862306a36Sopenharmony_ci * Computes the 128-bit reduction of PH : PL. Stores the result in dest. 12962306a36Sopenharmony_ci * 13062306a36Sopenharmony_ci * This macro computes p(x) mod g(x) where p(x) is in montgomery form and g(x) = 13162306a36Sopenharmony_ci * x^128 + x^127 + x^126 + x^121 + 1. 13262306a36Sopenharmony_ci * 13362306a36Sopenharmony_ci * We have a 256-bit polynomial PH : PL = P_3 : P_2 : P_1 : P_0 that is the 13462306a36Sopenharmony_ci * product of two 128-bit polynomials in Montgomery form. We need to reduce it 13562306a36Sopenharmony_ci * mod g(x). Also, since polynomials in Montgomery form have an "extra" factor 13662306a36Sopenharmony_ci * of x^128, this product has two extra factors of x^128. To get it back into 13762306a36Sopenharmony_ci * Montgomery form, we need to remove one of these factors by dividing by x^128. 13862306a36Sopenharmony_ci * 13962306a36Sopenharmony_ci * To accomplish both of these goals, we add multiples of g(x) that cancel out 14062306a36Sopenharmony_ci * the low 128 bits P_1 : P_0, leaving just the high 128 bits. Since the low 14162306a36Sopenharmony_ci * bits are zero, the polynomial division by x^128 can be done by right shifting. 14262306a36Sopenharmony_ci * 14362306a36Sopenharmony_ci * Since the only nonzero term in the low 64 bits of g(x) is the constant term, 14462306a36Sopenharmony_ci * the multiple of g(x) needed to cancel out P_0 is P_0 * g(x). The CPU can 14562306a36Sopenharmony_ci * only do 64x64 bit multiplications, so split P_0 * g(x) into x^128 * P_0 + 14662306a36Sopenharmony_ci * x^64 * g*(x) * P_0 + P_0, where g*(x) is bits 64-127 of g(x). Adding this to 14762306a36Sopenharmony_ci * the original polynomial gives P_3 : P_2 + P_0 + T_1 : P_1 + T_0 : 0, where T 14862306a36Sopenharmony_ci * = T_1 : T_0 = g*(x) * P_0. Thus, bits 0-63 got "folded" into bits 64-191. 14962306a36Sopenharmony_ci * 15062306a36Sopenharmony_ci * Repeating this same process on the next 64 bits "folds" bits 64-127 into bits 15162306a36Sopenharmony_ci * 128-255, giving the answer in bits 128-255. This time, we need to cancel P_1 15262306a36Sopenharmony_ci * + T_0 in bits 64-127. The multiple of g(x) required is (P_1 + T_0) * g(x) * 15362306a36Sopenharmony_ci * x^64. Adding this to our previous computation gives P_3 + P_1 + T_0 + V_1 : 15462306a36Sopenharmony_ci * P_2 + P_0 + T_1 + V_0 : 0 : 0, where V = V_1 : V_0 = g*(x) * (P_1 + T_0). 15562306a36Sopenharmony_ci * 15662306a36Sopenharmony_ci * So our final computation is: 15762306a36Sopenharmony_ci * T = T_1 : T_0 = g*(x) * P_0 15862306a36Sopenharmony_ci * V = V_1 : V_0 = g*(x) * (P_1 + T_0) 15962306a36Sopenharmony_ci * p(x) / x^{128} mod g(x) = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0 16062306a36Sopenharmony_ci * 16162306a36Sopenharmony_ci * The implementation below saves a XOR instruction by computing P_1 + T_0 : P_0 16262306a36Sopenharmony_ci * + T_1 and XORing into dest, rather than separately XORing P_1 : P_0 and T_0 : 16362306a36Sopenharmony_ci * T_1 into dest. This allows us to reuse P_1 + T_0 when computing V. 16462306a36Sopenharmony_ci */ 16562306a36Sopenharmony_ci.macro montgomery_reduction dest 16662306a36Sopenharmony_ci vpclmulqdq $0x00, PL, GSTAR, TMP_XMM # TMP_XMM = T_1 : T_0 = P_0 * g*(x) 16762306a36Sopenharmony_ci pshufd $0b01001110, TMP_XMM, TMP_XMM # TMP_XMM = T_0 : T_1 16862306a36Sopenharmony_ci pxor PL, TMP_XMM # TMP_XMM = P_1 + T_0 : P_0 + T_1 16962306a36Sopenharmony_ci pxor TMP_XMM, PH # PH = P_3 + P_1 + T_0 : P_2 + P_0 + T_1 17062306a36Sopenharmony_ci pclmulqdq $0x11, GSTAR, TMP_XMM # TMP_XMM = V_1 : V_0 = V = [(P_1 + T_0) * g*(x)] 17162306a36Sopenharmony_ci vpxor TMP_XMM, PH, \dest 17262306a36Sopenharmony_ci.endm 17362306a36Sopenharmony_ci 17462306a36Sopenharmony_ci/* 17562306a36Sopenharmony_ci * Compute schoolbook multiplication for 8 blocks 17662306a36Sopenharmony_ci * m_0h^8 + ... + m_7h^1 17762306a36Sopenharmony_ci * 17862306a36Sopenharmony_ci * If reduce is set, also computes the montgomery reduction of the 17962306a36Sopenharmony_ci * previous full_stride call and XORs with the first message block. 18062306a36Sopenharmony_ci * (m_0 + REDUCE(PL, PH))h^8 + ... + m_7h^1. 18162306a36Sopenharmony_ci * I.e., the first multiplication uses m_0 + REDUCE(PL, PH) instead of m_0. 18262306a36Sopenharmony_ci */ 18362306a36Sopenharmony_ci.macro full_stride reduce 18462306a36Sopenharmony_ci pxor LO, LO 18562306a36Sopenharmony_ci pxor HI, HI 18662306a36Sopenharmony_ci pxor MI, MI 18762306a36Sopenharmony_ci 18862306a36Sopenharmony_ci schoolbook1_iteration 7 0 18962306a36Sopenharmony_ci .if \reduce 19062306a36Sopenharmony_ci vpclmulqdq $0x00, PL, GSTAR, TMP_XMM 19162306a36Sopenharmony_ci .endif 19262306a36Sopenharmony_ci 19362306a36Sopenharmony_ci schoolbook1_iteration 6 0 19462306a36Sopenharmony_ci .if \reduce 19562306a36Sopenharmony_ci pshufd $0b01001110, TMP_XMM, TMP_XMM 19662306a36Sopenharmony_ci .endif 19762306a36Sopenharmony_ci 19862306a36Sopenharmony_ci schoolbook1_iteration 5 0 19962306a36Sopenharmony_ci .if \reduce 20062306a36Sopenharmony_ci pxor PL, TMP_XMM 20162306a36Sopenharmony_ci .endif 20262306a36Sopenharmony_ci 20362306a36Sopenharmony_ci schoolbook1_iteration 4 0 20462306a36Sopenharmony_ci .if \reduce 20562306a36Sopenharmony_ci pxor TMP_XMM, PH 20662306a36Sopenharmony_ci .endif 20762306a36Sopenharmony_ci 20862306a36Sopenharmony_ci schoolbook1_iteration 3 0 20962306a36Sopenharmony_ci .if \reduce 21062306a36Sopenharmony_ci pclmulqdq $0x11, GSTAR, TMP_XMM 21162306a36Sopenharmony_ci .endif 21262306a36Sopenharmony_ci 21362306a36Sopenharmony_ci schoolbook1_iteration 2 0 21462306a36Sopenharmony_ci .if \reduce 21562306a36Sopenharmony_ci vpxor TMP_XMM, PH, SUM 21662306a36Sopenharmony_ci .endif 21762306a36Sopenharmony_ci 21862306a36Sopenharmony_ci schoolbook1_iteration 1 0 21962306a36Sopenharmony_ci 22062306a36Sopenharmony_ci schoolbook1_iteration 0 1 22162306a36Sopenharmony_ci 22262306a36Sopenharmony_ci addq $(8*16), MSG 22362306a36Sopenharmony_ci schoolbook2 22462306a36Sopenharmony_ci.endm 22562306a36Sopenharmony_ci 22662306a36Sopenharmony_ci/* 22762306a36Sopenharmony_ci * Process BLOCKS_LEFT blocks, where 0 < BLOCKS_LEFT < STRIDE_BLOCKS 22862306a36Sopenharmony_ci */ 22962306a36Sopenharmony_ci.macro partial_stride 23062306a36Sopenharmony_ci mov BLOCKS_LEFT, TMP 23162306a36Sopenharmony_ci shlq $4, TMP 23262306a36Sopenharmony_ci addq $(16*STRIDE_BLOCKS), KEY_POWERS 23362306a36Sopenharmony_ci subq TMP, KEY_POWERS 23462306a36Sopenharmony_ci 23562306a36Sopenharmony_ci movups (MSG), %xmm0 23662306a36Sopenharmony_ci pxor SUM, %xmm0 23762306a36Sopenharmony_ci movaps (KEY_POWERS), %xmm1 23862306a36Sopenharmony_ci schoolbook1_noload 23962306a36Sopenharmony_ci dec BLOCKS_LEFT 24062306a36Sopenharmony_ci addq $16, MSG 24162306a36Sopenharmony_ci addq $16, KEY_POWERS 24262306a36Sopenharmony_ci 24362306a36Sopenharmony_ci test $4, BLOCKS_LEFT 24462306a36Sopenharmony_ci jz .Lpartial4BlocksDone 24562306a36Sopenharmony_ci schoolbook1 4 24662306a36Sopenharmony_ci addq $(4*16), MSG 24762306a36Sopenharmony_ci addq $(4*16), KEY_POWERS 24862306a36Sopenharmony_ci.Lpartial4BlocksDone: 24962306a36Sopenharmony_ci test $2, BLOCKS_LEFT 25062306a36Sopenharmony_ci jz .Lpartial2BlocksDone 25162306a36Sopenharmony_ci schoolbook1 2 25262306a36Sopenharmony_ci addq $(2*16), MSG 25362306a36Sopenharmony_ci addq $(2*16), KEY_POWERS 25462306a36Sopenharmony_ci.Lpartial2BlocksDone: 25562306a36Sopenharmony_ci test $1, BLOCKS_LEFT 25662306a36Sopenharmony_ci jz .LpartialDone 25762306a36Sopenharmony_ci schoolbook1 1 25862306a36Sopenharmony_ci.LpartialDone: 25962306a36Sopenharmony_ci schoolbook2 26062306a36Sopenharmony_ci montgomery_reduction SUM 26162306a36Sopenharmony_ci.endm 26262306a36Sopenharmony_ci 26362306a36Sopenharmony_ci/* 26462306a36Sopenharmony_ci * Perform montgomery multiplication in GF(2^128) and store result in op1. 26562306a36Sopenharmony_ci * 26662306a36Sopenharmony_ci * Computes op1*op2*x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1 26762306a36Sopenharmony_ci * If op1, op2 are in montgomery form, this computes the montgomery 26862306a36Sopenharmony_ci * form of op1*op2. 26962306a36Sopenharmony_ci * 27062306a36Sopenharmony_ci * void clmul_polyval_mul(u8 *op1, const u8 *op2); 27162306a36Sopenharmony_ci */ 27262306a36Sopenharmony_ciSYM_FUNC_START(clmul_polyval_mul) 27362306a36Sopenharmony_ci FRAME_BEGIN 27462306a36Sopenharmony_ci vmovdqa .Lgstar(%rip), GSTAR 27562306a36Sopenharmony_ci movups (%rdi), %xmm0 27662306a36Sopenharmony_ci movups (%rsi), %xmm1 27762306a36Sopenharmony_ci schoolbook1_noload 27862306a36Sopenharmony_ci schoolbook2 27962306a36Sopenharmony_ci montgomery_reduction SUM 28062306a36Sopenharmony_ci movups SUM, (%rdi) 28162306a36Sopenharmony_ci FRAME_END 28262306a36Sopenharmony_ci RET 28362306a36Sopenharmony_ciSYM_FUNC_END(clmul_polyval_mul) 28462306a36Sopenharmony_ci 28562306a36Sopenharmony_ci/* 28662306a36Sopenharmony_ci * Perform polynomial evaluation as specified by POLYVAL. This computes: 28762306a36Sopenharmony_ci * h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1} 28862306a36Sopenharmony_ci * where n=nblocks, h is the hash key, and m_i are the message blocks. 28962306a36Sopenharmony_ci * 29062306a36Sopenharmony_ci * rdi - pointer to precomputed key powers h^8 ... h^1 29162306a36Sopenharmony_ci * rsi - pointer to message blocks 29262306a36Sopenharmony_ci * rdx - number of blocks to hash 29362306a36Sopenharmony_ci * rcx - pointer to the accumulator 29462306a36Sopenharmony_ci * 29562306a36Sopenharmony_ci * void clmul_polyval_update(const struct polyval_tfm_ctx *keys, 29662306a36Sopenharmony_ci * const u8 *in, size_t nblocks, u8 *accumulator); 29762306a36Sopenharmony_ci */ 29862306a36Sopenharmony_ciSYM_FUNC_START(clmul_polyval_update) 29962306a36Sopenharmony_ci FRAME_BEGIN 30062306a36Sopenharmony_ci vmovdqa .Lgstar(%rip), GSTAR 30162306a36Sopenharmony_ci movups (ACCUMULATOR), SUM 30262306a36Sopenharmony_ci subq $STRIDE_BLOCKS, BLOCKS_LEFT 30362306a36Sopenharmony_ci js .LstrideLoopExit 30462306a36Sopenharmony_ci full_stride 0 30562306a36Sopenharmony_ci subq $STRIDE_BLOCKS, BLOCKS_LEFT 30662306a36Sopenharmony_ci js .LstrideLoopExitReduce 30762306a36Sopenharmony_ci.LstrideLoop: 30862306a36Sopenharmony_ci full_stride 1 30962306a36Sopenharmony_ci subq $STRIDE_BLOCKS, BLOCKS_LEFT 31062306a36Sopenharmony_ci jns .LstrideLoop 31162306a36Sopenharmony_ci.LstrideLoopExitReduce: 31262306a36Sopenharmony_ci montgomery_reduction SUM 31362306a36Sopenharmony_ci.LstrideLoopExit: 31462306a36Sopenharmony_ci add $STRIDE_BLOCKS, BLOCKS_LEFT 31562306a36Sopenharmony_ci jz .LskipPartial 31662306a36Sopenharmony_ci partial_stride 31762306a36Sopenharmony_ci.LskipPartial: 31862306a36Sopenharmony_ci movups SUM, (ACCUMULATOR) 31962306a36Sopenharmony_ci FRAME_END 32062306a36Sopenharmony_ci RET 32162306a36Sopenharmony_ciSYM_FUNC_END(clmul_polyval_update) 322