/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Hardware-accelerated CRC-32 variants for Linux on z Systems
 *
 * Use the z/Architecture Vector Extension Facility to accelerate the
 * computing of CRC-32 checksums.
 *
 * This CRC-32 implementation algorithm processes the most-significant
 * bit first (BE).
 *
 * Copyright IBM Corp. 2015
 * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
 */

#include <linux/linkage.h>
#include <asm/nospec-insn.h>
#include <asm/vx-insn.h>

/* Vector register range containing CRC-32 constants */
#define CONST_R1R2		%v9
#define CONST_R3R4		%v10
#define CONST_R5		%v11
#define CONST_R6		%v12
#define CONST_RU_POLY		%v13
#define CONST_CRC_POLY		%v14

/*
 * The constant pool is only ever read (it is loaded once with VLM), so it
 * lives in read-only data rather than in the writable .data section.
 */
	.section	.rodata, "a", %progbits
	.align	8

/*
 * The CRC-32 constant block contains reduction constants to fold and
 * process particular chunks of the input data stream in parallel.
 *
 * For the CRC-32 variants, the constants are precomputed according to
 * these definitions:
 *
 *	R1 = x^(4*128+64) mod P(x)
 *	R2 = x^(4*128)    mod P(x)
 *	R3 = x^(128+64)   mod P(x)
 *	R4 = x^128        mod P(x)
 *	R5 = x^96         mod P(x)
 *	R6 = x^64         mod P(x)
 *
 *	Barrett reduction constant, u, is defined as floor(x^64 / P(x)).
 *
 *	where P(x) is the polynomial in the normal domain and the P'(x) is the
 *	polynomial in the reversed (bitreflected) domain.
 *
 * Note that the constant definitions below are extended in order to compute
 * intermediate results with a single VECTOR GALOIS FIELD MULTIPLY instruction.
 * The rightmost doubleword can be 0 to prevent contribution to the result or
 * can be multiplied by 1 to perform an XOR without the need for a separate
 * VECTOR EXCLUSIVE OR instruction.
 *
 * CRC-32 (IEEE 802.3 Ethernet, ...) polynomials:
 *
 *	P(x)  = 0x04C11DB7
 *	P'(x) = 0xEDB88320
 *
 * Layout: six 16-byte entries, in the order in which a single VLM loads
 * them into CONST_R1R2 .. CONST_CRC_POLY (V9 .. V14).
 */

.Lconstants_CRC_32_BE:
	.quad		0x08833794c, 0x0e6228b11	/* R1, R2 */
	.quad		0x0c5b9cd4c, 0x0e8a45605	/* R3, R4 */
	.quad		0x0f200aa66, 1 << 32		/* R5, x32 */
	.quad		0x0490d678d, 1			/* R6, 1 */
	.quad		0x104d101df, 0			/* u */
	.quad		0x104C11DB7, 0			/* P(x) */

.previous

	GEN_BR_THUNK %r14

.text
/*
 * The CRC-32 function(s) use these calling conventions:
 *
 * Parameters:
 *
 *	%r2:	Initial CRC value, typically ~0; and final CRC (return) value.
 *	%r3:	Input buffer pointer, performance might be improved if the
 *		buffer is on a doubleword boundary.
 *	%r4:	Length of the buffer, must be 64 bytes or greater.  The data
 *		is consumed in one initial 64-byte chunk, then further
 *		64-byte chunks, then 16-byte chunks; a trailing remainder of
 *		fewer than 16 bytes is not read by this routine.
 *
 * Register usage:
 *
 *	%r5:	CRC-32 constant pool base pointer.
 *	V0:	Initial CRC value and intermediate constants and results.
 *	V1..V4:	Data for CRC computation.
 *	V5..V8:	Next data chunks that are fetched from the input buffer.
 *
 *	V9..V14: CRC-32 constants.
 *
 * %r3 and %r4 are advanced/decremented in place while the buffer is walked.
 */
ENTRY(crc32_be_vgfm_16)
	/* Load CRC-32 constants */
	larl	%r5,.Lconstants_CRC_32_BE
	VLM	CONST_R1R2,CONST_CRC_POLY,0,%r5

	/* Load the initial CRC value into the leftmost word of V0. */
	VZERO	%v0
	VLVGF	%v0,%r2,0

	/* Load a 64-byte data chunk and XOR with CRC */
	VLM	%v1,%v4,0,%r3		/* 64-bytes into V1..V4 */
	VX	%v1,%v0,%v1		/* V1 ^= CRC */
	aghi	%r3,64			/* BUF = BUF + 64 */
	aghi	%r4,-64			/* LEN = LEN - 64 */

	/* Check remaining buffer size and jump to proper folding method */
	cghi	%r4,64
	jl	.Lless_than_64bytes

.Lfold_64bytes_loop:
	/* Load the next 64-byte data chunk into V5 to V8 */
	VLM	%v5,%v8,0,%r3

	/*
	 * Perform a GF(2) multiplication of the doublewords in V1 with
	 * the reduction constants in CONST_R1R2.  The intermediate result
	 * is then folded (accumulated) with the next data chunk in V5 and
	 * stored in V1.  Repeat this step for the register contents
	 * in V2, V3, and V4 respectively.
	 */
	VGFMAG	%v1,CONST_R1R2,%v1,%v5
	VGFMAG	%v2,CONST_R1R2,%v2,%v6
	VGFMAG	%v3,CONST_R1R2,%v3,%v7
	VGFMAG	%v4,CONST_R1R2,%v4,%v8

	/* Adjust buffer pointer and length for next loop */
	aghi	%r3,64			/* BUF = BUF + 64 */
	aghi	%r4,-64			/* LEN = LEN - 64 */

	cghi	%r4,64
	jnl	.Lfold_64bytes_loop

.Lless_than_64bytes:
	/* Fold V1 to V4 into a single 128-bit value in V1 */
	VGFMAG	%v1,CONST_R3R4,%v1,%v2
	VGFMAG	%v1,CONST_R3R4,%v1,%v3
	VGFMAG	%v1,CONST_R3R4,%v1,%v4

	/* Check whether to continue with 16-byte (128-bit) folding */
	cghi	%r4,16
	jl	.Lfinal_fold

.Lfold_16bytes_loop:

	VL	%v2,0,,%r3		/* Load next data chunk */
	VGFMAG	%v1,CONST_R3R4,%v1,%v2	/* Fold next data chunk */

	/* Adjust buffer pointer and size for folding next data chunk */
	aghi	%r3,16
	aghi	%r4,-16

	/* Process remaining data chunks */
	cghi	%r4,16
	jnl	.Lfold_16bytes_loop

.Lfinal_fold:
	/*
	 * The R5 constant is used to fold a 128-bit value into a 96-bit value
	 * that is XORed with the next 96-bit input data chunk.  To use a single
	 * VGFMG instruction, multiply the rightmost 64-bit with x^32 (1<<32) to
	 * form an intermediate 96-bit value (with appended zeros) which is then
	 * XORed with the intermediate reduction result.
	 */
	VGFMG	%v1,CONST_R5,%v1

	/*
	 * Further reduce the remaining 96-bit value to a 64-bit value using a
	 * single VGFMG, the rightmost doubleword is multiplied with 0x1. The
	 * intermediate result is then XORed with the product of the leftmost
	 * doubleword with R6.  The result is a 64-bit value and is subject to
	 * the Barrett reduction.
	 */
	VGFMG	%v1,CONST_R6,%v1

	/*
	 * The input values to the Barrett reduction are the degree-63 polynomial
	 * in V1 (R(x)), degree-32 generator polynomial, and the reduction
	 * constant u.  The Barrett reduction result is the CRC value of R(x) mod
	 * P(x).
	 *
	 * The Barrett reduction algorithm is defined as:
	 *
	 *    1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
	 *    2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
	 *    3. C(x)  = R(x) XOR T2(x) mod x^32
	 *
	 * Note: To compensate the division by x^32, use the vector unpack
	 * instruction to move the leftmost word into the leftmost doubleword
	 * of the vector register.  The rightmost doubleword is multiplied
	 * with zero to not contribute to the intermediate results.
	 */

	/* T1(x) = floor( R(x) / x^32 ) GF2MUL u */
	VUPLLF	%v2,%v1
	VGFMG	%v2,CONST_RU_POLY,%v2

	/*
	 * Compute the GF(2) product of the CRC polynomial in CONST_CRC_POLY
	 * with T1(x) in V2 and XOR the intermediate result, T2(x), with the
	 * value in V1.  The final result is in the rightmost word of V2.
	 */
	VUPLLF	%v2,%v2
	VGFMAG	%v2,CONST_CRC_POLY,%v2,%v1

.Ldone:
	VLGVF	%r2,%v2,3		/* Return word element 3 (rightmost) of V2 */
	BR_EX	%r14
ENDPROC(crc32_be_vgfm_16)

.previous