162306a36Sopenharmony_ci/* SPDX-License-Identifier: GPL-2.0-only */
262306a36Sopenharmony_ci/*
362306a36Sopenharmony_ci * Copyright 2012 Xyratex Technology Limited
462306a36Sopenharmony_ci *
562306a36Sopenharmony_ci * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32
662306a36Sopenharmony_ci * calculation.
762306a36Sopenharmony_ci * CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE)
 * PCLMULQDQ is a carry-less multiplication instruction with its own CPUID
 * feature flag (it is not part of SSE4.2); the reference can be found at:
1062306a36Sopenharmony_ci * http://www.intel.com/products/processor/manuals/
1162306a36Sopenharmony_ci * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
1262306a36Sopenharmony_ci * Volume 2B: Instruction Set Reference, N-Z
1362306a36Sopenharmony_ci *
1462306a36Sopenharmony_ci * Authors:   Gregory Prestas <Gregory_Prestas@us.xyratex.com>
1562306a36Sopenharmony_ci *	      Alexander Boyko <Alexander_Boyko@xyratex.com>
1662306a36Sopenharmony_ci */
1762306a36Sopenharmony_ci
1862306a36Sopenharmony_ci#include <linux/linkage.h>
1962306a36Sopenharmony_ci
2062306a36Sopenharmony_ci
.section .rodata
.balign 16
/*
 * Folding constants for the bit-reflected CRC32 polynomial.
 *
 * Every entry below is one 16-byte value written as two little-endian
 * qwords: the first .quad is the low qword (picked by a pclmulqdq
 * selector bit of 0), the second .quad is the high qword (selector bit
 * of 1).
 *
 * Fold across 4 * 128 bits (used by the 64-byte main loop):
 *   CONSTANT_R1 = [(x^(4*128+32) mod P(x)) << 32]' << 1 = 0x154442bd4
 *   CONSTANT_R2 = [(x^(4*128-32) mod P(x)) << 32]' << 1 = 0x1c6e41596
 */
.Lconstant_R2R1:
	.quad 0x0000000154442bd4	/* CONSTANT_R1 (low qword)  */
	.quad 0x00000001c6e41596	/* CONSTANT_R2 (high qword) */
/*
 * Fold across 128 bits (used to merge the accumulators and for the
 * 16-byte tail loop):
 *   CONSTANT_R3 = [(x^(128+32) mod P(x)) << 32]' << 1 = 0x1751997d0
 *   CONSTANT_R4 = [(x^(128-32) mod P(x)) << 32]' << 1 = 0x0ccaa009e
 */
.Lconstant_R4R3:
	.quad 0x00000001751997d0	/* CONSTANT_R3 (low qword)  */
	.quad 0x00000000ccaa009e	/* CONSTANT_R4 (high qword) */
/*
 * Final 64 -> 32 bit fold:
 *   CONSTANT_R5 = [(x^64 mod P(x)) << 32]' << 1 = 0x163cd6124
 */
.Lconstant_R5:
	.quad 0x0000000163cd6124	/* CONSTANT_R5 (low qword)  */
	.quad 0x0000000000000000	/* high qword unused        */
/* Keeps only the low 32 bits of a 128-bit register. */
.Lconstant_mask32:
	.quad 0x00000000FFFFFFFF
	.quad 0x0000000000000000
/*
 * Barrett reduction pair:
 *   CRCPOLY_TRUE_LE_FULL         = 0x1DB710641
 *   CONSTANT_RU = u' = (x^64 / P(x))' = 0x1F7011641
 */
.Lconstant_RUpoly:
	.quad 0x00000001DB710641	/* CRCPOLY_TRUE_LE_FULL (low qword) */
	.quad 0x00000001F7011641	/* CONSTANT_RU (high qword)         */
5762306a36Sopenharmony_ci
/* %xmm0 always holds the constant pair used by the current folding stage. */
#define CONSTANT %xmm0

/*
 * Registers carrying the three arguments (buffer, len, crc32).
 * The 32-bit build receives them in %eax/%edx/%ecx, the 64-bit build in
 * %rdi/%rsi/%edx.
 */
#ifndef __x86_64__
#define BUF     %eax
#define LEN     %edx
#define CRC     %ecx
#else /* __x86_64__ */
#define BUF     %rdi
#define LEN     %rsi
#define CRC     %edx
#endif
6962306a36Sopenharmony_ci
7062306a36Sopenharmony_ci
7162306a36Sopenharmony_ci
.text
/**
 *      crc32_pclmul_le_16 - CRC32 (little-endian polynomial) of an aligned
 *      buffer, computed by carry-less-multiply folding.
 *
 *      uint crc32_pclmul_le_16(unsigned char const *buffer,
 *	                     size_t len, uint crc32)
 *
 *      BUF - buffer, 16 bytes aligned (it is read with aligned SSE accesses)
 *      LEN - sizeof buffer in bytes: a multiple of 16 and at least 64.
 *            The first 64 bytes are loaded before LEN is examined, and a
 *            trailing remainder shorter than 16 bytes is never read.
 *      CRC - initial crc32
 *      return %eax crc32
 *
 *      Register use:
 *        %xmm0 (CONSTANT) - constant pair of the current stage
 *        %xmm1-%xmm4      - four 128-bit accumulators (one 64-byte line)
 *        %xmm5-%xmm8      - temporaries (%xmm8 only on x86_64)
 *      Clobbers BUF, LEN, %eax, the flags and the %xmm registers above.
 *      NOTE(review): callers presumably have to own the FPU/SIMD state
 *      while this runs - confirm against the glue code.
 */

SYM_FUNC_START(crc32_pclmul_le_16) /* buffer and buffer size are 16 bytes aligned */
	/* Seed the four accumulators with the first 64 bytes of the buffer. */
	movdqa  (BUF), %xmm1
	movdqa  0x10(BUF), %xmm2
	movdqa  0x20(BUF), %xmm3
	movdqa  0x30(BUF), %xmm4
	/* Mix the initial CRC into the low 32 bits of the first block. */
	movd    CRC, CONSTANT
	pxor    CONSTANT, %xmm1
	/* From here on: BUF = next unread byte, LEN = bytes still unread. */
	sub     $0x40, LEN
	add     $0x40, BUF
	cmp     $0x40, LEN
	jb      .Lless_64		/* no further full 64-byte line */

#ifdef __x86_64__
	movdqa .Lconstant_R2R1(%rip), CONSTANT
#else
	movdqa .Lconstant_R2R1, CONSTANT
#endif

.Lloop_64:/*  64 bytes Full cache line folding */
	/* Hint the line after the one that is XORed in below. */
	prefetchnta    0x40(BUF)
	/*
	 * For each accumulator: acc = (acc.low * R1) ^ (acc.high * R2).
	 * The copies in %xmm5-%xmm8 supply the high-half products.
	 */
	movdqa  %xmm1, %xmm5
	movdqa  %xmm2, %xmm6
	movdqa  %xmm3, %xmm7
#ifdef __x86_64__
	movdqa  %xmm4, %xmm8
#endif
	pclmulqdq $0x00, CONSTANT, %xmm1
	pclmulqdq $0x00, CONSTANT, %xmm2
	pclmulqdq $0x00, CONSTANT, %xmm3
#ifdef __x86_64__
	pclmulqdq $0x00, CONSTANT, %xmm4
#endif
	pclmulqdq $0x11, CONSTANT, %xmm5
	pclmulqdq $0x11, CONSTANT, %xmm6
	pclmulqdq $0x11, CONSTANT, %xmm7
#ifdef __x86_64__
	pclmulqdq $0x11, CONSTANT, %xmm8
#endif
	pxor    %xmm5, %xmm1
	pxor    %xmm6, %xmm2
	pxor    %xmm7, %xmm3
#ifdef __x86_64__
	pxor    %xmm8, %xmm4
#else
	/*
	 * %xmm8 does not exist in 32-bit mode: fold %xmm4 on its own,
	 * reusing %xmm5, whose value was already consumed above.
	 */
	movdqa  %xmm4, %xmm5
	pclmulqdq $0x00, CONSTANT, %xmm4
	pclmulqdq $0x11, CONSTANT, %xmm5
	pxor    %xmm5, %xmm4
#endif

	/* XOR the next 64 bytes of input into the folded accumulators. */
	pxor    (BUF), %xmm1
	pxor    0x10(BUF), %xmm2
	pxor    0x20(BUF), %xmm3
	pxor    0x30(BUF), %xmm4

	sub     $0x40, LEN
	add     $0x40, BUF
	cmp     $0x40, LEN
	/* LEN is an unsigned size_t: test it unsigned, like the jb above. */
	jae     .Lloop_64
.Lless_64:/*  Folding cache line into 128bit */
#ifdef __x86_64__
	movdqa  .Lconstant_R4R3(%rip), CONSTANT
#else
	movdqa  .Lconstant_R4R3, CONSTANT
#endif
	prefetchnta     (BUF)

	/* %xmm1 = fold(%xmm1) ^ %xmm2 */
	movdqa  %xmm1, %xmm5
	pclmulqdq $0x00, CONSTANT, %xmm1
	pclmulqdq $0x11, CONSTANT, %xmm5
	pxor    %xmm5, %xmm1
	pxor    %xmm2, %xmm1

	/* %xmm1 = fold(%xmm1) ^ %xmm3 */
	movdqa  %xmm1, %xmm5
	pclmulqdq $0x00, CONSTANT, %xmm1
	pclmulqdq $0x11, CONSTANT, %xmm5
	pxor    %xmm5, %xmm1
	pxor    %xmm3, %xmm1

	/* %xmm1 = fold(%xmm1) ^ %xmm4 */
	movdqa  %xmm1, %xmm5
	pclmulqdq $0x00, CONSTANT, %xmm1
	pclmulqdq $0x11, CONSTANT, %xmm5
	pxor    %xmm5, %xmm1
	pxor    %xmm4, %xmm1

	cmp     $0x10, LEN
	jb      .Lfold_64		/* no 16-byte block left */
.Lloop_16:/* Folding rest buffer into 128bit */
	movdqa  %xmm1, %xmm5
	pclmulqdq $0x00, CONSTANT, %xmm1
	pclmulqdq $0x11, CONSTANT, %xmm5
	pxor    %xmm5, %xmm1
	pxor    (BUF), %xmm1
	sub     $0x10, LEN
	add     $0x10, BUF
	cmp     $0x10, LEN
	/* Unsigned test, consistent with the jb that guards this loop. */
	jae     .Lloop_16

.Lfold_64:
	/* perform the last 64 bit fold, also adds 32 zeroes
	 * to the input stream */
	pclmulqdq $0x01, %xmm1, CONSTANT /* R4 * xmm1.low */
	psrldq  $0x08, %xmm1		/* xmm1 = former high qword */
	pxor    CONSTANT, %xmm1

	/* final 32-bit fold */
	movdqa  %xmm1, %xmm2
#ifdef __x86_64__
	movdqa  .Lconstant_R5(%rip), CONSTANT
	movdqa  .Lconstant_mask32(%rip), %xmm3
#else
	movdqa  .Lconstant_R5, CONSTANT
	movdqa  .Lconstant_mask32, %xmm3
#endif
	psrldq  $0x04, %xmm2		/* xmm2 = value shifted right by 32 bits */
	pand    %xmm3, %xmm1		/* keep only the low 32 bits */
	pclmulqdq $0x00, CONSTANT, %xmm1	/* low 32 bits * R5 */
	pxor    %xmm2, %xmm1

	/* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
#ifdef __x86_64__
	movdqa  .Lconstant_RUpoly(%rip), CONSTANT
#else
	movdqa  .Lconstant_RUpoly, CONSTANT
#endif
	movdqa  %xmm1, %xmm2
	pand    %xmm3, %xmm1
	pclmulqdq $0x10, CONSTANT, %xmm1	/* low 32 bits * RU (high qword) */
	pand    %xmm3, %xmm1
	pclmulqdq $0x00, CONSTANT, %xmm1	/* low 32 bits * poly (low qword) */
	pxor    %xmm2, %xmm1
	pextrd  $0x01, %xmm1, %eax	/* result is dword 1 (bits 32..63) */

	RET
SYM_FUNC_END(crc32_pclmul_le_16)
219