/* GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see http://www.gnu.org/licenses
 *
 * Please visit http://www.xyratex.com/contact if you need additional
 * information or have any questions.
 *
 * GPL HEADER END
 */

/*
 * Copyright 2012 Xyratex Technology Limited
 *
 * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32
 * calculation.
 * CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE)
 * PCLMULQDQ is Intel's carry-less multiplication instruction (it has its own
 * CPUID feature bit and is not part of SSE4.2); the reference can be found
 * at:
 * http://www.intel.com/products/processor/manuals/
 * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
 * Volume 2B: Instruction Set Reference, N-Z
 *
 * Authors:   Gregory Prestas <Gregory_Prestas@us.xyratex.com>
 *	      Alexander Boyko <Alexander_Boyko@xyratex.com>
 */

#include <linux/linkage.h>


.section .rodata
.align 16
/*
 * Folding constants.  Each .octa below packs two 64-bit multipliers: the
 * second-named constant sits in the low quadword (selected by pclmulqdq
 * immediate 0x00) and the first-named one in the high quadword (selected
 * by immediate 0x11).
 */
/*
 * [(x4*128+32 mod P(x) << 32)]' << 1   = 0x154442bd4
 * #define CONSTANT_R1  0x154442bd4LL
 *
 * [(x4*128-32 mod P(x) << 32)]' << 1   = 0x1c6e41596
 * #define CONSTANT_R2  0x1c6e41596LL
 */
.Lconstant_R2R1:
	.octa 0x00000001c6e415960000000154442bd4
/*
 * [(x128+32 mod P(x) << 32)]'   << 1   = 0x1751997d0
 * #define CONSTANT_R3  0x1751997d0LL
 *
 * [(x128-32 mod P(x) << 32)]'   << 1   = 0x0ccaa009e
 * #define CONSTANT_R4  0x0ccaa009eLL
 */
.Lconstant_R4R3:
	.octa 0x00000000ccaa009e00000001751997d0
/*
 * [(x64 mod P(x) << 32)]'       << 1   = 0x163cd6124
 * #define CONSTANT_R5  0x163cd6124LL
 */
.Lconstant_R5:
	.octa 0x00000000000000000000000163cd6124
/* Keeps only the low 32 bits of a 128-bit register. */
.Lconstant_mask32:
	.octa 0x000000000000000000000000FFFFFFFF
/*
 * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
 *
 * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))` = 0x1F7011641LL
 * #define CONSTANT_RU  0x1F7011641LL
 */
.Lconstant_RUpoly:
	.octa 0x00000001F701164100000001DB710641

/* Register holding the folding constants currently in use. */
#define CONSTANT %xmm0

/*
 * Argument registers.
 * NOTE(review): the 32-bit mapping (%eax, %edx, %ecx) looks like the
 * kernel's register-argument calling convention - confirm against the
 * build flags used for the C caller.
 */
#ifdef __x86_64__
#define BUF     %rdi
#define LEN     %rsi
#define CRC     %edx
#else
#define BUF     %eax
#define LEN     %edx
#define CRC     %ecx
#endif



.text
/**
 *      Calculate crc32
 *      BUF - buffer (16 bytes aligned)
 *      LEN - sizeof buffer (16 bytes aligned), LEN should be greater than 63
 *      CRC - initial crc32
 *      return %eax crc32
 *      uint crc32_pclmul_le_16(unsigned char const *buffer,
 *	                     size_t len, uint crc32)
 *
 *      Preconditions (not checked here):
 *        - BUF must be 16-byte aligned: every load uses movdqa or an SSE
 *          memory operand, both of which fault on unaligned addresses.
 *        - LEN must be a multiple of 16 and at least 64: the first 64 bytes
 *          are loaded unconditionally and LEN is decremented by 64 before
 *          it is first compared, so a smaller LEN would wrap around.
 *
 *      LEN is an unsigned quantity, so every length comparison below uses
 *      an unsigned condition code (jb / jae).
 *
 *      Clobbers: BUF, LEN, EFLAGS, %xmm0-%xmm7 (and %xmm8 on x86_64).
 *      NOTE(review): the caller presumably has to own the SIMD register
 *      state around this call - confirm against the C glue code.
 */

SYM_FUNC_START(crc32_pclmul_le_16) /* buffer and buffer size are 16 bytes aligned */
	/* Load the first 64 bytes into four 128-bit accumulators. */
	movdqa	(BUF), %xmm1
	movdqa	0x10(BUF), %xmm2
	movdqa	0x20(BUF), %xmm3
	movdqa	0x30(BUF), %xmm4
	/* Mix the initial CRC into the low 32 bits of the first accumulator. */
	movd	CRC, CONSTANT
	pxor	CONSTANT, %xmm1
	sub	$0x40, LEN
	add	$0x40, BUF
	cmp	$0x40, LEN
	jb	.Lless_64

#ifdef __x86_64__
	movdqa	.Lconstant_R2R1(%rip), CONSTANT
#else
	movdqa	.Lconstant_R2R1, CONSTANT
#endif

.Lloop_64:	/* 64 bytes Full cache line folding */
	prefetchnta	0x40(BUF)
	movdqa	%xmm1, %xmm5
	movdqa	%xmm2, %xmm6
	movdqa	%xmm3, %xmm7
#ifdef __x86_64__
	movdqa	%xmm4, %xmm8
#endif
	/* low quadword of each accumulator times R1 */
	pclmulqdq	$0x00, CONSTANT, %xmm1
	pclmulqdq	$0x00, CONSTANT, %xmm2
	pclmulqdq	$0x00, CONSTANT, %xmm3
#ifdef __x86_64__
	pclmulqdq	$0x00, CONSTANT, %xmm4
#endif
	/* high quadword of each accumulator times R2 */
	pclmulqdq	$0x11, CONSTANT, %xmm5
	pclmulqdq	$0x11, CONSTANT, %xmm6
	pclmulqdq	$0x11, CONSTANT, %xmm7
#ifdef __x86_64__
	pclmulqdq	$0x11, CONSTANT, %xmm8
#endif
	pxor	%xmm5, %xmm1
	pxor	%xmm6, %xmm2
	pxor	%xmm7, %xmm3
#ifdef __x86_64__
	pxor	%xmm8, %xmm4
#else
	/* %xmm8 is not available in 32-bit mode: reuse %xmm5 as scratch */
	movdqa	%xmm4, %xmm5
	pclmulqdq	$0x00, CONSTANT, %xmm4
	pclmulqdq	$0x11, CONSTANT, %xmm5
	pxor	%xmm5, %xmm4
#endif

	/* Add in the next 64 bytes of input. */
	pxor	(BUF), %xmm1
	pxor	0x10(BUF), %xmm2
	pxor	0x20(BUF), %xmm3
	pxor	0x30(BUF), %xmm4

	sub	$0x40, LEN
	add	$0x40, BUF
	cmp	$0x40, LEN
	/* LEN is a size_t: compare unsigned, matching the jb at entry */
	jae	.Lloop_64
.Lless_64:	/* Folding cache line into 128bit */
#ifdef __x86_64__
	movdqa	.Lconstant_R4R3(%rip), CONSTANT
#else
	movdqa	.Lconstant_R4R3, CONSTANT
#endif
	prefetchnta	(BUF)

	/* Fold %xmm1 forward by 128 bits and merge %xmm2 ... */
	movdqa	%xmm1, %xmm5
	pclmulqdq	$0x00, CONSTANT, %xmm1
	pclmulqdq	$0x11, CONSTANT, %xmm5
	pxor	%xmm5, %xmm1
	pxor	%xmm2, %xmm1

	/* ... then %xmm3 ... */
	movdqa	%xmm1, %xmm5
	pclmulqdq	$0x00, CONSTANT, %xmm1
	pclmulqdq	$0x11, CONSTANT, %xmm5
	pxor	%xmm5, %xmm1
	pxor	%xmm3, %xmm1

	/* ... then %xmm4, leaving a single 128-bit accumulator in %xmm1. */
	movdqa	%xmm1, %xmm5
	pclmulqdq	$0x00, CONSTANT, %xmm1
	pclmulqdq	$0x11, CONSTANT, %xmm5
	pxor	%xmm5, %xmm1
	pxor	%xmm4, %xmm1

	cmp	$0x10, LEN
	jb	.Lfold_64
.Lloop_16:	/* Folding rest buffer into 128bit */
	movdqa	%xmm1, %xmm5
	pclmulqdq	$0x00, CONSTANT, %xmm1
	pclmulqdq	$0x11, CONSTANT, %xmm5
	pxor	%xmm5, %xmm1
	pxor	(BUF), %xmm1
	sub	$0x10, LEN
	add	$0x10, BUF
	cmp	$0x10, LEN
	/* unsigned compare, see the note in the 64-byte loop */
	jae	.Lloop_16

.Lfold_64:
	/* perform the last 64 bit fold, also adds 32 zeroes
	 * to the input stream */
	pclmulqdq	$0x01, %xmm1, CONSTANT /* R4 * xmm1.low */
	psrldq	$0x08, %xmm1
	pxor	CONSTANT, %xmm1

	/* final 32-bit fold */
	movdqa	%xmm1, %xmm2
#ifdef __x86_64__
	movdqa	.Lconstant_R5(%rip), CONSTANT
	movdqa	.Lconstant_mask32(%rip), %xmm3
#else
	movdqa	.Lconstant_R5, CONSTANT
	movdqa	.Lconstant_mask32, %xmm3
#endif
	psrldq	$0x04, %xmm2
	pand	%xmm3, %xmm1
	pclmulqdq	$0x00, CONSTANT, %xmm1
	pxor	%xmm2, %xmm1

	/* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
#ifdef __x86_64__
	movdqa	.Lconstant_RUpoly(%rip), CONSTANT
#else
	movdqa	.Lconstant_RUpoly, CONSTANT
#endif
	movdqa	%xmm1, %xmm2
	pand	%xmm3, %xmm1
	pclmulqdq	$0x10, CONSTANT, %xmm1
	pand	%xmm3, %xmm1
	pclmulqdq	$0x00, CONSTANT, %xmm1
	pxor	%xmm2, %xmm1
	/* The result is the second 32-bit lane of %xmm1. */
	pextrd	$0x01, %xmm1, %eax

	RET
SYM_FUNC_END(crc32_pclmul_le_16)