/*
 * Accelerated CRC32(C) using ARM CRC, NEON and Crypto Extensions instructions
 *
 * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

/* GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see http://www.gnu.org/licenses
 *
 * Please  visit http://www.xyratex.com/contact if you need additional
 * information or have any questions.
 *
 * GPL HEADER END
 */

/*
 * Copyright 2012 Xyratex Technology Limited
 *
 * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32
 * calculation.
 * CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE)
 * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found
 * at:
 * https://www.intel.com/products/processor/manuals/
 * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
 * Volume 2B: Instruction Set Reference, N-Z
 *
 * Authors:   Gregory Prestas <Gregory_Prestas@us.xyratex.com>
 *	      Alexander Boyko <Alexander_Boyko@xyratex.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	/*
	 * Everything in this file, including the constant tables, is placed in
	 * .text.  On ARM the .align operand is a power of two, so the tables
	 * that follow start on a 64-byte boundary.
	 *
	 * The architecture directives make the assembler accept the
	 * instructions used below: the crc32 / crc32c family (.arch_extension
	 * crc) and the 64-bit polynomial multiply vmull.p64
	 * (.fpu crypto-neon-fp-armv8).
	 */
	.text
	.align		6
	.arch		armv8-a
	.arch_extension	crc
	.fpu		crypto-neon-fp-armv8

/*
 * Folding and reduction constants for CRC32, consumed by the PMULL based
 * routine through a base pointer held in r3.
 *
 * Layout (byte offsets from the label), as read by that routine:
 *    0 / 8   R1 / R2          loaded together as one 128-bit vector for
 *                             the 64-bytes-per-iteration folding loop
 *   16 / 24  R3 / R4          used when folding 128 bits at a time and
 *                             (R4) for the last 64-bit fold
 *   32       R5               final 32-bit fold
 *   40       0xFFFFFFFF       mask selecting the low 32 bits of a lane
 *   48 / 56  polynomial / u   Barrett reduction
 *
 * .Lcrc32c_constants must keep exactly the same layout, because the same
 * code indexes either table.
 */
.Lcrc32_constants:
	/*
	 * [(x4*128+32 mod P(x) << 32)]' << 1   = 0x154442bd4
	 * #define CONSTANT_R1  0x154442bd4LL
	 *
	 * [(x4*128-32 mod P(x) << 32)]' << 1   = 0x1c6e41596
	 * #define CONSTANT_R2  0x1c6e41596LL
	 */
	.quad		0x0000000154442bd4
	.quad		0x00000001c6e41596

	/*
	 * [(x128+32 mod P(x) << 32)]'   << 1   = 0x1751997d0
	 * #define CONSTANT_R3  0x1751997d0LL
	 *
	 * [(x128-32 mod P(x) << 32)]'   << 1   = 0x0ccaa009e
	 * #define CONSTANT_R4  0x0ccaa009eLL
	 */
	.quad		0x00000001751997d0
	.quad		0x00000000ccaa009e

	/*
	 * [(x64 mod P(x) << 32)]'       << 1   = 0x163cd6124
	 * #define CONSTANT_R5  0x163cd6124LL
	 *
	 * The second quadword is not a folding constant: it is the mask that
	 * keeps only the low 32 bits of a 64-bit lane.
	 */
	.quad		0x0000000163cd6124
	.quad		0x00000000FFFFFFFF

	/*
	 * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
	 *
	 * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))`
	 *                                                      = 0x1F7011641LL
	 * #define CONSTANT_RU  0x1F7011641LL
	 */
	.quad		0x00000001DB710641
	.quad		0x00000001F7011641

/*
 * CRC32C counterpart of .Lcrc32_constants.  The PMULL based routine reads
 * both tables through the same base register and the same offsets, so the
 * eight quadwords must stay in the same order as in .Lcrc32_constants.
 *
 * NOTE(review): the per-entry roles below are taken from that shared
 * layout; the derivation of the individual values is not recorded in this
 * file.
 */
.Lcrc32c_constants:
	.quad		0x00000000740eef02	/* offset  0: 64-byte fold, low  */
	.quad		0x000000009e4addf8	/* offset  8: 64-byte fold, high */
	.quad		0x00000000f20c0dfe	/* offset 16: 16-byte fold, low  */
	.quad		0x000000014cd00bd6	/* offset 24: 16-byte fold, high */
	.quad		0x00000000dd45aab8	/* offset 32: final 32-bit fold  */
	.quad		0x00000000FFFFFFFF	/* offset 40: low 32-bit mask    */
	.quad		0x0000000105ec76f0	/* offset 48: Barrett, low       */
	.quad		0x00000000dea713f1	/* offset 56: Barrett, high      */

/*
 * Register aliases used by crc32_pmull_le / crc32c_pmull_le.
 *
 * dCONSTANTl / dCONSTANTh are the low and high 64-bit halves of qCONSTANT
 * (d0 and d1 make up q0), so loading qCONSTANT sets both at once.
 *
 * Note that the CRC instruction based routines further down do not use
 * these aliases: they address r0-r2 directly and with different roles
 * (r0 = crc, r1 = buffer, r2 = length).
 */
	dCONSTANTl	.req	d0
	dCONSTANTh	.req	d1
	qCONSTANT	.req	q0

	BUF		.req	r0	/* 1st argument: buffer pointer */
	LEN		.req	r1	/* 2nd argument: length in bytes */
	CRC		.req	r2	/* 3rd argument: initial crc */

	qzr		.req	q9	/* zeroed on entry, kept as all-zero */

/*
 * crc32_pmull_le / crc32c_pmull_le - calculate CRC32 / CRC32C of a buffer
 * by folding it with 64x64 bit polynomial multiplications (vmull.p64).
 *
 * uint crc32_pmull_le(unsigned char const *buffer,
 *                     size_t len, uint crc32)
 *
 * crc32c_pmull_le takes the same arguments in the same registers; the two
 * entry points differ only in the constant table that r3 is pointed at.
 *
 * In:    r0 (BUF) - buffer.  It is read with 128-bit alignment qualifiers
 *                   (":128"), so it has to be 16-byte aligned.
 *        r1 (LEN) - sizeof buffer in bytes, LEN should be > 63.  The low
 *                   four bits are cleared on entry, so only whole 16-byte
 *                   blocks are consumed.  The first 64 bytes are loaded
 *                   unconditionally.
 *        r2 (CRC) - initial crc32
 * Out:   r0       - crc32
 * Clobb: r1, r3, q0-q9, flags
 *
 * NOTE(review): q4-q7 (d8-d15) are callee-saved under the AAPCS and are not
 * preserved here; presumably the C caller brackets the call with
 * kernel_neon_begin() / kernel_neon_end() - confirm against the glue code.
 *
 * The labels inside the function carry the .L prefix so that they stay
 * assembler-local and do not show up as symbols in the object file.
 */
ENTRY(crc32_pmull_le)
	adr		r3, .Lcrc32_constants
	b		0f

ENTRY(crc32c_pmull_le)
	adr		r3, .Lcrc32c_constants

	/* Common body: r3 = base of the selected constant table. */
0:	bic		LEN, LEN, #15
	vld1.8		{q1-q2}, [BUF, :128]!
	vld1.8		{q3-q4}, [BUF, :128]!
	vmov.i8		qzr, #0

	/* XOR the initial crc into the low 32 bits of the first block. */
	vmov.i8		qCONSTANT, #0
	vmov.32		dCONSTANTl[0], CRC
	veor.8		d2, d2, dCONSTANTl

	/* Account for the 64 bytes already held in q1-q4. */
	sub		LEN, LEN, #0x40
	cmp		LEN, #0x40
	blt		.Lless_64

	/* d0/d1 = the two constants for folding across 64 bytes. */
	vld1.64		{qCONSTANT}, [r3]

.Lloop_64:		/* 64 bytes Full cache line folding */
	sub		LEN, LEN, #0x40

	/* High halves of the four accumulators times dCONSTANTh ... */
	vmull.p64	q5, d3, dCONSTANTh
	vmull.p64	q6, d5, dCONSTANTh
	vmull.p64	q7, d7, dCONSTANTh
	vmull.p64	q8, d9, dCONSTANTh

	/* ... low halves times dCONSTANTl ... */
	vmull.p64	q1, d2, dCONSTANTl
	vmull.p64	q2, d4, dCONSTANTl
	vmull.p64	q3, d6, dCONSTANTl
	vmull.p64	q4, d8, dCONSTANTl

	/*
	 * ... combine both products; q5-q8 are reused right away to fetch
	 * the next 64 bytes of input.
	 */
	veor.8		q1, q1, q5
	vld1.8		{q5}, [BUF, :128]!
	veor.8		q2, q2, q6
	vld1.8		{q6}, [BUF, :128]!
	veor.8		q3, q3, q7
	vld1.8		{q7}, [BUF, :128]!
	veor.8		q4, q4, q8
	vld1.8		{q8}, [BUF, :128]!

	/* Mix the new input into the folded accumulators. */
	veor.8		q1, q1, q5
	veor.8		q2, q2, q6
	veor.8		q3, q3, q7
	veor.8		q4, q4, q8

	cmp		LEN, #0x40
	bge		.Lloop_64

.Lless_64:		/* Folding cache line into 128bit */
	/* Switch to the constants for folding across 16 bytes. */
	vldr		dCONSTANTl, [r3, #16]
	vldr		dCONSTANTh, [r3, #24]

	/* q1 = fold(q1) ^ q2 */
	vmull.p64	q5, d3, dCONSTANTh
	vmull.p64	q1, d2, dCONSTANTl
	veor.8		q1, q1, q5
	veor.8		q1, q1, q2

	/* q1 = fold(q1) ^ q3 */
	vmull.p64	q5, d3, dCONSTANTh
	vmull.p64	q1, d2, dCONSTANTl
	veor.8		q1, q1, q5
	veor.8		q1, q1, q3

	/* q1 = fold(q1) ^ q4 */
	vmull.p64	q5, d3, dCONSTANTh
	vmull.p64	q1, d2, dCONSTANTl
	veor.8		q1, q1, q5
	veor.8		q1, q1, q4

	teq		LEN, #0
	beq		.Lfold_64

.Lloop_16:		/* Folding rest buffer into 128bit */
	subs		LEN, LEN, #0x10

	/* None of the NEON instructions below touch the flags set above. */
	vld1.8		{q2}, [BUF, :128]!
	vmull.p64	q5, d3, dCONSTANTh
	vmull.p64	q1, d2, dCONSTANTl
	veor.8		q1, q1, q5
	veor.8		q1, q1, q2

	bne		.Lloop_16

.Lfold_64:
	/* perform the last 64 bit fold, also adds 32 zeroes
	 * to the input stream */
	vmull.p64	q2, d2, dCONSTANTh
	vext.8		q1, q1, qzr, #8
	veor.8		q1, q1, q2

	/* final 32-bit fold */
	vldr		dCONSTANTl, [r3, #32]
	vldr		d6, [r3, #40]
	vmov.i8		d7, #0		/* q3 = mask for the low 32 bits */

	vext.8		q2, q1, qzr, #4
	vand.8		d2, d2, d6
	vmull.p64	q1, d2, dCONSTANTl
	veor.8		q1, q1, q2

	/* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
	vldr		dCONSTANTl, [r3, #48]
	vldr		dCONSTANTh, [r3, #56]

	vand.8		q2, q1, q3
	vext.8		q2, qzr, q2, #8
	vmull.p64	q2, d5, dCONSTANTh
	vand.8		q2, q2, q3
	vmull.p64	q2, d4, dCONSTANTl
	veor.8		q1, q1, q2
	vmov		r0, s5		/* result = bits 63:32 of q1 */

	bx		lr
ENDPROC(crc32_pmull_le)
ENDPROC(crc32c_pmull_le)

/*
 * __crc32 - body shared by crc32_armv8_le and crc32c_armv8_le, built on the
 * CRC32 instructions.
 *
 * \c is appended to the "crc32" mnemonic stem and to the local label names:
 * an empty argument selects crc32{b,h,w}, "c" selects crc32c{b,h,w}.
 *
 * In:    r0 = running crc, r1 = buffer, r2 = length in bytes
 * Out:   r0 = updated crc
 * Clobb: r1, r2, r3, ip, flags
 *
 * The ARM_BE8() lines byte-swap the loaded data; presumably they are only
 * emitted on big-endian (BE8) builds - the macro is not defined in this
 * file, confirm in <asm/assembler.h>.
 */
	.macro		__crc32, c
	subs		ip, r2, #8	/* ip = length - 8 */
	bmi		.Ltail\c	/* fewer than 8 bytes in total */

	tst		r1, #3
	bne		.Lunaligned\c	/* not on a 4-byte boundary */

	/*
	 * Main loop, 8 bytes per iteration.  On entry to each iteration ip is
	 * the number of bytes left after this iteration and Z is set when
	 * that number is zero.  Nothing between the flag-setting instruction
	 * and the bxeq alters the flags, so bxeq returns right after the
	 * final 8 bytes.
	 */
	teq		ip, #0
.Laligned8\c:
	ldrd		r2, r3, [r1], #8
ARM_BE8(rev		r2, r2		)
ARM_BE8(rev		r3, r3		)
	crc32\c\()w	r0, r0, r2
	crc32\c\()w	r0, r0, r3
	bxeq		lr
	subs		ip, ip, #8
	bpl		.Laligned8\c

	/*
	 * Fewer than 8 bytes left.  ip is (bytes left - 8), which has the same
	 * low three bits as the number of bytes left, so testing bits 2, 1
	 * and 0 selects a trailing word, halfword and byte.
	 */
.Ltail\c:
	tst		ip, #4
	beq		2f
	ldr		r3, [r1], #4
ARM_BE8(rev		r3, r3		)
	crc32\c\()w	r0, r0, r3

2:	tst		ip, #2
	beq		1f
	ldrh		r3, [r1], #2
ARM_BE8(rev16		r3, r3		)
	crc32\c\()h	r0, r0, r3

1:	tst		ip, #1
	bxeq		lr
	ldrb		r3, [r1]
	crc32\c\()b	r0, r0, r3
	bx		lr

	/*
	 * Only reached with at least 8 bytes available.  Consume one byte
	 * and/or one halfword to bring r1 up to a 4-byte boundary, then
	 * re-enter the main loop or the tail with ip recomputed.
	 */
.Lunaligned\c:
	tst		r1, #1
	beq		2f		/* even address: bit 1 must be set */
	ldrb		r3, [r1], #1
	subs		r2, r2, #1
	crc32\c\()b	r0, r0, r3

	tst		r1, #2
	beq		0f		/* the single byte was enough */
2:	ldrh		r3, [r1], #2
	subs		r2, r2, #2
ARM_BE8(rev16		r3, r3		)
	crc32\c\()h	r0, r0, r3

0:	subs		ip, r2, #8	/* also sets Z for the main loop */
	bpl		.Laligned8\c
	b		.Ltail\c
	.endm

/*
 * crc32_armv8_le - CRC32 over a buffer using the crc32{b,h,w} instructions.
 *
 * Expands __crc32 with an empty suffix.
 * In:  r0 = initial crc, r1 = buffer, r2 = length in bytes
 * Out: r0 = crc
 * NOTE(review): presumably declared in the C glue as
 * u32 crc32_armv8_le(u32 crc, const u8 *buf, u32 len) - confirm there.
 *
 * The entry point is aligned to 32 bytes (.align operand is a power of two).
 */
	.align		5
ENTRY(crc32_armv8_le)
	__crc32
ENDPROC(crc32_armv8_le)

/*
 * crc32c_armv8_le - CRC32C over a buffer using the crc32c{b,h,w}
 * instructions.
 *
 * Expands __crc32 with the suffix "c".
 * In:  r0 = initial crc, r1 = buffer, r2 = length in bytes
 * Out: r0 = crc
 * NOTE(review): presumably declared in the C glue as
 * u32 crc32c_armv8_le(u32 crc, const u8 *buf, u32 len) - confirm there.
 *
 * The entry point is aligned to 32 bytes (.align operand is a power of two).
 */
	.align		5
ENTRY(crc32c_armv8_le)
	__crc32		c
ENDPROC(crc32c_armv8_le)