//
// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
//
// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
// Copyright (C) 2019 Google LLC <ebiggers@google.com>
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License version 2 as
// published by the Free Software Foundation.
//

// Derived from the x86 version:
//
// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
//
// Copyright (c) 2013, Intel Corporation
//
// Authors:
//     Erdinc Ozturk <erdinc.ozturk@intel.com>
//     Vinodh Gopal <vinodh.gopal@intel.com>
//     James Guilford <james.guilford@intel.com>
//     Tim Chen <tim.c.chen@linux.intel.com>
//
// This software is available to you under a choice of one of two
// licenses.  You may choose to be licensed under the terms of the GNU
// General Public License (GPL) Version 2, available from the file
// COPYING in the main directory of this source tree, or the
// OpenIB.org BSD license below:
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
//   notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
//   notice, this list of conditions and the following disclaimer in the
//   documentation and/or other materials provided with the
//   distribution.
//
// * Neither the name of the Intel Corporation nor the names of its
//   contributors may be used to endorse or promote products derived from
//   this software without specific prior written permission.
//
//
// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
//       Reference paper titled "Fast CRC Computation for Generic
//	Polynomials Using PCLMULQDQ Instruction"
//       URL: http://www.intel.com/content/dam/www/public/us/en/documents
//  /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
//

#include <linux/linkage.h>
#include <asm/assembler.h>

	.text
	.arch		armv8-a+crypto

	// Function arguments (see the prototypes above the two entry points).
	init_crc	.req	w0
	buf		.req	x1
	len		.req	x2
	// Cursor into the fold-constant tables in .rodata.
	fold_consts_ptr	.req	x3

	// The pair of 64-bit fold constants currently in use.
	fold_consts	.req	v10

	// Copy of the multiplicand consumed by __pmull_p8_core.
	ad		.req	v14

	// Masks set up by __pmull_init_p8.
	k00_16		.req	v15
	k32_48		.req	v16

	// Scratch registers written by __pmull_p8_core.
	t3		.req	v17
	t4		.req	v18
	t5		.req	v19
	t6		.req	v20
	t7		.req	v21
	t8		.req	v22
	t9		.req	v23

	// TBL index vectors set up by __pmull_init_p8.
	perm1		.req	v24
	perm2		.req	v25
	perm3		.req	v26
	perm4		.req	v27

	// fold_consts permuted by perm1..perm4; refreshed by __pmull_pre_p8.
	bd1		.req	v28
	bd2		.req	v29
	bd3		.req	v30
	bd4		.req	v31

// __pmull_init_p64: intentionally empty.  The p64 backend needs no masks or
// permutation vectors; the macro exists so that crc_t10dif_pmull can expand
// __pmull_init_\p for either backend.
	.macro		__pmull_init_p64
	.endm

// __pmull_pre_p64 bd: intentionally empty.  The p64 backend multiplies by the
// constants directly, so nothing has to be derived after loading them; the
// macro exists so that callers can expand __pmull_pre_\p for either backend.
	.macro		__pmull_pre_p64, bd
	.endm

// __pmull_init_p8: one-time setup for the p8 backend.
//
// Out:   k00_16, k32_48 = masks used by __pmull_p8_core
//        perm1..perm4   = TBL index vectors; permN makes result byte i take
//                         source byte (i + N) mod 8 within each 64-bit half
// Clobb: x5
	.macro		__pmull_init_p8
	// k00_16 := 0x0000000000000000_000000000000ffff
	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
	movi		k32_48.2d, #0xffffffff
	mov		k32_48.h[2], k32_48.h[0]
	ushr		k00_16.2d, k32_48.2d, #32

	// prepare the permutation vectors
	mov_q		x5, 0x080f0e0d0c0b0a09
	movi		perm4.8b, #8
	dup		perm1.2d, x5
	// XOR with 8 in the low half only, turning indices 9..15,8 into 1..7,0
	eor		perm1.16b, perm1.16b, perm4.16b
	// ushr + sli rotates each 64-bit lane of perm1 right by 8/16/24 bits
	ushr		perm2.2d, perm1.2d, #8
	ushr		perm3.2d, perm1.2d, #16
	ushr		perm4.2d, perm1.2d, #24
	sli		perm2.2d, perm1.2d, #56
	sli		perm3.2d, perm1.2d, #48
	sli		perm4.2d, perm1.2d, #40
	.endm

// __pmull_pre_p8 bd: derive bd1..bd4 from \bd by permuting it with
// perm1..perm4.  Must be re-run every time new constants are loaded into the
// register passed as \bd, because __pmull_p8_core reads bd1..bd4.
	.macro		__pmull_pre_p8, bd
	tbl		bd1.16b, {\bd\().16b}, perm1.16b
	tbl		bd2.16b, {\bd\().16b}, perm2.16b
	tbl		bd3.16b, {\bd\().16b}, perm3.16b
	tbl		bd4.16b, {\bd\().16b}, perm4.16b
	.endm

// __pmull_p8_core: helper for the __pmull_p8 macro, which stands in for the
// 64-bit PMULL of __pmull_p64 using only 8-bit PMULL instructions.
//
// Entry points (selected by the macro via 'bl .L__pmull_p8_core\i'):
//   .L__pmull_p8_core   - low 64 bits of the operands  (pmull)
//   .L__pmull_p8_core2  - high 64 bits of the operands (pmull2)
//
// In:    ad          = operand A
//        fold_consts = operand B
//        bd1..bd4    = B permuted by __pmull_pre_p8
//        perm1..perm3, k32_48, k00_16 as set up by __pmull_init_p8
// Out:   t4, t6      = partial results; the caller XORs both into D = A*B
// Clobb: t3..t9
SYM_FUNC_START_LOCAL(__pmull_p8_core)
.L__pmull_p8_core:
	ext		t4.8b, ad.8b, ad.8b, #1			// A1
	ext		t5.8b, ad.8b, ad.8b, #2			// A2
	ext		t6.8b, ad.8b, ad.8b, #3			// A3

	pmull		t4.8h, t4.8b, fold_consts.8b		// F = A1*B
	pmull		t8.8h, ad.8b, bd1.8b			// E = A*B1
	pmull		t5.8h, t5.8b, fold_consts.8b		// H = A2*B
	pmull		t7.8h, ad.8b, bd2.8b			// G = A*B2
	pmull		t6.8h, t6.8b, fold_consts.8b		// J = A3*B
	pmull		t9.8h, ad.8b, bd3.8b			// I = A*B3
	pmull		t3.8h, ad.8b, bd4.8b			// K = A*B4
	b		0f

.L__pmull_p8_core2:
	// Same as above for the high halves; TBL with perm1..perm3 replaces
	// the 8-byte EXT rotations.
	tbl		t4.16b, {ad.16b}, perm1.16b		// A1
	tbl		t5.16b, {ad.16b}, perm2.16b		// A2
	tbl		t6.16b, {ad.16b}, perm3.16b		// A3

	pmull2		t4.8h, t4.16b, fold_consts.16b		// F = A1*B
	pmull2		t8.8h, ad.16b, bd1.16b			// E = A*B1
	pmull2		t5.8h, t5.16b, fold_consts.16b		// H = A2*B
	pmull2		t7.8h, ad.16b, bd2.16b			// G = A*B2
	pmull2		t6.8h, t6.16b, fold_consts.16b		// J = A3*B
	pmull2		t9.8h, ad.16b, bd3.16b			// I = A*B3
	pmull2		t3.8h, ad.16b, bd4.16b			// K = A*B4

	// Common tail shared by both entry points.
0:	eor		t4.16b, t4.16b, t8.16b			// L = E + F
	eor		t5.16b, t5.16b, t7.16b			// M = G + H
	eor		t6.16b, t6.16b, t9.16b			// N = I + J

	uzp1		t8.2d, t4.2d, t5.2d
	uzp2		t4.2d, t4.2d, t5.2d
	uzp1		t7.2d, t6.2d, t3.2d
	uzp2		t6.2d, t6.2d, t3.2d

	// t4 = (L) (P0 + P1) << 8
	// t5 = (M) (P2 + P3) << 16
	eor		t8.16b, t8.16b, t4.16b
	and		t4.16b, t4.16b, k32_48.16b

	// t6 = (N) (P4 + P5) << 24
	// t7 = (K) (P6 + P7) << 32
	eor		t7.16b, t7.16b, t6.16b
	and		t6.16b, t6.16b, k00_16.16b

	eor		t8.16b, t8.16b, t4.16b
	eor		t7.16b, t7.16b, t6.16b

	zip2		t5.2d, t8.2d, t4.2d
	zip1		t4.2d, t8.2d, t4.2d
	zip2		t3.2d, t7.2d, t6.2d
	zip1		t6.2d, t7.2d, t6.2d

	// Rotate each term into its byte position (15/14/13/12-byte EXT).
	ext		t4.16b, t4.16b, t4.16b, #15
	ext		t5.16b, t5.16b, t5.16b, #14
	ext		t6.16b, t6.16b, t6.16b, #13
	ext		t3.16b, t3.16b, t3.16b, #12

	eor		t4.16b, t4.16b, t5.16b
	eor		t6.16b, t6.16b, t3.16b
	ret
SYM_FUNC_END(__pmull_p8_core)

// __pmull_p8 rq, ad, bd, i: p8 counterpart of __pmull_p64.
//
//   \rq  destination vector register
//   \ad  multiplicand vector register
//   \bd  must be the literal 'fold_consts' (anything else is an assembly
//        error), because the core routine reads fold_consts and bd1..bd4
//   \i   blank = multiply the low 64-bit halves, '2' = the high halves
//
// Clobb: the 'ad' alias register, t3..t9, and x30 (via 'bl').
	.macro		__pmull_p8, rq, ad, bd, i
	.ifnc		\bd, fold_consts
	.err
	.endif
	// Unescaped 'ad' is the .req alias; '\ad' is the macro argument.
	mov		ad.16b, \ad\().16b
	.ifb		\i
	pmull		\rq\().8h, \ad\().8b, \bd\().8b		// D = A*B
	.else
	pmull2		\rq\().8h, \ad\().16b, \bd\().16b	// D = A*B
	.endif

	// \i is appended to the label to pick the matching core entry point.
	bl		.L__pmull_p8_core\i

	eor		\rq\().16b, \rq\().16b, t4.16b
	eor		\rq\().16b, \rq\().16b, t6.16b
	.endm

// fold_32_bytes p, reg1, reg2
//
// Fold reg1, reg2 into the next 32 data bytes, storing the result back
// into reg1, reg2.
//
//   \p           multiplication backend (p8 or p64)
//   \reg1 \reg2  accumulator vector registers
//
// Advances buf by 32.  Clobb: v8, v9, v11, v12, plus whatever __pmull_\p
// clobbers.
	.macro		fold_32_bytes, p, reg1, reg2
	ldp		q11, q12, [buf], #0x20

	// v8 = high half * constant, reg1 = low half * constant
	__pmull_\p	v8, \reg1, fold_consts, 2
	__pmull_\p	\reg1, \reg1, fold_consts

	// On little-endian, rev64 here plus the 'ext #8' below reverse all 16
	// bytes of each freshly loaded vector.
CPU_LE(	rev64		v11.16b, v11.16b		)
CPU_LE(	rev64		v12.16b, v12.16b		)

	__pmull_\p	v9, \reg2, fold_consts, 2
	__pmull_\p	\reg2, \reg2, fold_consts

CPU_LE(	ext		v11.16b, v11.16b, v11.16b, #8	)
CPU_LE(	ext		v12.16b, v12.16b, v12.16b, #8	)

	eor		\reg1\().16b, \reg1\().16b, v8.16b
	eor		\reg2\().16b, \reg2\().16b, v9.16b
	eor		\reg1\().16b, \reg1\().16b, v11.16b
	eor		\reg2\().16b, \reg2\().16b, v12.16b
	.endm

// fold_16_bytes p, src_reg, dst_reg, load_next_consts
//
// Fold src_reg into dst_reg, optionally loading the next fold constants.
//
//   \p                 multiplication backend (p8 or p64)
//   \src_reg           vector to fold; overwritten
//   \dst_reg           vector that receives the folded value
//   \load_next_consts  if non-blank, load the next 16 bytes of constants
//                      from fold_consts_ptr (post-incremented) after the
//                      multiplications, and refresh the backend's derived
//                      values with __pmull_pre_\p
//
// Clobb: v8, plus whatever __pmull_\p clobbers.
	.macro		fold_16_bytes, p, src_reg, dst_reg, load_next_consts
	__pmull_\p	v8, \src_reg, fold_consts
	__pmull_\p	\src_reg, \src_reg, fold_consts, 2
	.ifnb		\load_next_consts
	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
	__pmull_pre_\p	fold_consts
	.endif
	eor		\dst_reg\().16b, \dst_reg\().16b, v8.16b
	eor		\dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b
	.endm

// __pmull_p64 rd, rn, rm, n: 64x64 -> 128-bit polynomial multiply.
//
//   \rd       destination vector register
//   \rn \rm   source vector registers
//   \n        blank = multiply the low 64-bit halves (pmull),
//             non-blank = multiply the high 64-bit halves (pmull2)
	.macro		__pmull_p64, rd, rn, rm, n
	.ifb		\n
	pmull		\rd\().1q, \rn\().1d, \rm\().1d
	.else
	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
	.endif
	.endm

// crc_t10dif_pmull p: function body shared by both entry points.
//
//   \p   multiplication backend: p64 (64-bit PMULL) or p8 (8-bit PMULL plus
//        __pmull_p8_core)
//
// In:    init_crc (w0), buf (x1), len (x2); callers guarantee len >= 16
// Out:   w0 = CRC; the macro ends the function with 'ret' (preceded by
//        frame_pop for p8, matching the frame_push in crc_t10dif_pmull_p8)
// Uses:  x3 (fold_consts_ptr), x4, v0-v12, flags; the p8 backend additionally
//        uses x5, x30 and v14-v31
//
// Labels carry the \@ suffix so that the two expansions do not collide.
	.macro		crc_t10dif_pmull, p
	__pmull_init_\p

	// For sizes less than 256 bytes, we can't fold 128 bytes at a time.
	cmp		len, #256
	b.lt		.Lless_than_256_bytes_\@

	adr_l		fold_consts_ptr, .Lfold_across_128_bytes_consts

	// Load the first 128 data bytes.  Byte swapping is necessary to make
	// the bit order match the polynomial coefficient order.
	ldp		q0, q1, [buf]
	ldp		q2, q3, [buf, #0x20]
	ldp		q4, q5, [buf, #0x40]
	ldp		q6, q7, [buf, #0x60]
	add		buf, buf, #0x80
CPU_LE(	rev64		v0.16b, v0.16b			)
CPU_LE(	rev64		v1.16b, v1.16b			)
CPU_LE(	rev64		v2.16b, v2.16b			)
CPU_LE(	rev64		v3.16b, v3.16b			)
CPU_LE(	rev64		v4.16b, v4.16b			)
CPU_LE(	rev64		v5.16b, v5.16b			)
CPU_LE(	rev64		v6.16b, v6.16b			)
CPU_LE(	rev64		v7.16b, v7.16b			)
CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
CPU_LE(	ext		v1.16b, v1.16b, v1.16b, #8	)
CPU_LE(	ext		v2.16b, v2.16b, v2.16b, #8	)
CPU_LE(	ext		v3.16b, v3.16b, v3.16b, #8	)
CPU_LE(	ext		v4.16b, v4.16b, v4.16b, #8	)
CPU_LE(	ext		v5.16b, v5.16b, v5.16b, #8	)
CPU_LE(	ext		v6.16b, v6.16b, v6.16b, #8	)
CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)

	// XOR the first 16 data *bits* with the initial CRC value.
	movi		v8.16b, #0
	mov		v8.h[7], init_crc
	eor		v0.16b, v0.16b, v8.16b

	// Load the constants for folding across 128 bytes.  No post-increment
	// here: the pointer is advanced once, after the loop below.
	ld1		{fold_consts.2d}, [fold_consts_ptr]
	__pmull_pre_\p	fold_consts

	// Subtract 128 for the 128 data bytes just consumed.  Subtract another
	// 128 to simplify the termination condition of the following loop.
	sub		len, len, #256

	// While >= 128 data bytes remain (not counting v0-v7), fold the 128
	// bytes v0-v7 into them, storing the result back into v0-v7.
.Lfold_128_bytes_loop_\@:
	fold_32_bytes	\p, v0, v1
	fold_32_bytes	\p, v2, v3
	fold_32_bytes	\p, v4, v5
	fold_32_bytes	\p, v6, v7

	subs		len, len, #128
	b.ge		.Lfold_128_bytes_loop_\@

	// Now fold the 112 bytes in v0-v6 into the 16 bytes in v7.

	// Fold across 64 bytes.  The 'add' steps over the 128-byte constants.
	add		fold_consts_ptr, fold_consts_ptr, #16
	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
	__pmull_pre_\p	fold_consts
	fold_16_bytes	\p, v0, v4
	fold_16_bytes	\p, v1, v5
	fold_16_bytes	\p, v2, v6
	fold_16_bytes	\p, v3, v7, 1
	// Fold across 32 bytes.
	fold_16_bytes	\p, v4, v6
	fold_16_bytes	\p, v5, v7, 1
	// Fold across 16 bytes.
	fold_16_bytes	\p, v6, v7

	// Add 128 to get the correct number of data bytes remaining in 0...127
	// (not counting v7), following the previous extra subtraction by 128.
	// Then subtract 16 to simplify the termination condition of the
	// following loop.
	adds		len, len, #(128-16)

	// While >= 16 data bytes remain (not counting v7), fold the 16 bytes v7
	// into them, storing the result back into v7.
	b.lt		.Lfold_16_bytes_loop_done_\@
.Lfold_16_bytes_loop_\@:
	__pmull_\p	v8, v7, fold_consts
	__pmull_\p	v7, v7, fold_consts, 2
	eor		v7.16b, v7.16b, v8.16b
	ldr		q0, [buf], #16
CPU_LE(	rev64		v0.16b, v0.16b			)
CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
	eor		v7.16b, v7.16b, v0.16b
	subs		len, len, #16
	b.ge		.Lfold_16_bytes_loop_\@

.Lfold_16_bytes_loop_done_\@:
	// Add 16 to get the correct number of data bytes remaining in 0...15
	// (not counting v7), following the previous extra subtraction by 16.
	adds		len, len, #16
	b.eq		.Lreduce_final_16_bytes_\@

.Lhandle_partial_segment_\@:
	// Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
	// 16 bytes are in v7 and the rest are the remaining data in 'buf'.  To
	// do this without needing a fold constant for each possible 'len',
	// redivide the bytes into a first chunk of 'len' bytes and a second
	// chunk of 16 bytes, then fold the first chunk into the second.

	// v0 = last 16 original data bytes
	add		buf, buf, len
	ldr		q0, [buf, #-16]
CPU_LE(	rev64		v0.16b, v0.16b			)
CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)

	// v1 = high order part of second chunk: v7 left-shifted by 'len' bytes.
	adr_l		x4, .Lbyteshift_table + 16
	sub		x4, x4, len
	ld1		{v2.16b}, [x4]
	tbl		v1.16b, {v7.16b}, v2.16b

	// v3 = first chunk: v7 right-shifted by '16-len' bytes.
	movi		v3.16b, #0x80
	eor		v2.16b, v2.16b, v3.16b
	tbl		v3.16b, {v7.16b}, v2.16b

	// Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes.
	sshr		v2.16b, v2.16b, #7

	// v2 = second chunk: 'len' bytes from v0 (low-order bytes),
	// then '16-len' bytes from v1 (high-order bytes).
	bsl		v2.16b, v1.16b, v0.16b

	// Fold the first chunk into the second chunk, storing the result in v7.
	__pmull_\p	v0, v3, fold_consts
	__pmull_\p	v7, v3, fold_consts, 2
	eor		v7.16b, v7.16b, v0.16b
	eor		v7.16b, v7.16b, v2.16b

.Lreduce_final_16_bytes_\@:
	// Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC.

	movi		v2.16b, #0		// init zero register

	// Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
	__pmull_pre_\p	fold_consts

	// Fold the high 64 bits into the low 64 bits, while also multiplying by
	// x^64.  This produces a 128-bit value congruent to x^64 * M(x) and
	// whose low 48 bits are 0.
	ext		v0.16b, v2.16b, v7.16b, #8
	__pmull_\p	v7, v7, fold_consts, 2	// high bits * x^48 * (x^80 mod G(x))
	eor		v0.16b, v0.16b, v7.16b	// + low bits * x^64

	// Fold the high 32 bits into the low 96 bits.  This produces a 96-bit
	// value congruent to x^64 * M(x) and whose low 48 bits are 0.
	ext		v1.16b, v0.16b, v2.16b, #12	// extract high 32 bits
	mov		v0.s[3], v2.s[0]	// zero high 32 bits
	__pmull_\p	v1, v1, fold_consts	// high 32 bits * x^48 * (x^48 mod G(x))
	eor		v0.16b, v0.16b, v1.16b	// + low bits

	// Load G(x) and floor(x^48 / G(x)).
	ld1		{fold_consts.2d}, [fold_consts_ptr]
	__pmull_pre_\p	fold_consts

	// Use Barrett reduction to compute the final CRC value.
	__pmull_\p	v1, v0, fold_consts, 2	// high 32 bits * floor(x^48 / G(x))
	ushr		v1.2d, v1.2d, #32	// /= x^32
	__pmull_\p	v1, v1, fold_consts	// *= G(x)
	ushr		v0.2d, v0.2d, #48
	eor		v0.16b, v0.16b, v1.16b	// + low 16 nonzero bits
	// Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.

	umov		w0, v0.h[0]
	// Only the p8 entry point pushed a frame (it needs x30 for 'bl').
	.ifc		\p, p8
	frame_pop
	.endif
	ret

.Lless_than_256_bytes_\@:
	// Checksumming a buffer of length 16...255 bytes

	adr_l		fold_consts_ptr, .Lfold_across_16_bytes_consts

	// Load the first 16 data bytes.
	ldr		q7, [buf], #0x10
CPU_LE(	rev64		v7.16b, v7.16b			)
CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)

	// XOR the first 16 data *bits* with the initial CRC value.
	movi		v0.16b, #0
	mov		v0.h[7], init_crc
	eor		v7.16b, v7.16b, v0.16b

	// Load the fold-across-16-bytes constants.
	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
	__pmull_pre_\p	fold_consts

	cmp		len, #16
	b.eq		.Lreduce_final_16_bytes_\@	// len == 16
	subs		len, len, #32
	b.ge		.Lfold_16_bytes_loop_\@		// 32 <= len <= 255
	add		len, len, #16
	b		.Lhandle_partial_segment_\@	// 17 <= len <= 31
	.endm

//
// u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len);
//
// Assumes len >= 16.
//
// Variant built on 8-bit PMULL.  The expanded body reaches __pmull_p8_core
// with 'bl', which overwrites x30, so a frame is pushed here; the matching
// frame_pop and the 'ret' are emitted by the crc_t10dif_pmull macro.
//
SYM_FUNC_START(crc_t10dif_pmull_p8)
	frame_push	1
	crc_t10dif_pmull p8
SYM_FUNC_END(crc_t10dif_pmull_p8)

47362306a36Sopenharmony_ci	.align		5
47462306a36Sopenharmony_ci//
47562306a36Sopenharmony_ci// u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
47662306a36Sopenharmony_ci//
47762306a36Sopenharmony_ci// Assumes len >= 16.
47862306a36Sopenharmony_ci//
47962306a36Sopenharmony_ciSYM_FUNC_START(crc_t10dif_pmull_p64)
48062306a36Sopenharmony_ci	crc_t10dif_pmull	p64
48162306a36Sopenharmony_ciSYM_FUNC_END(crc_t10dif_pmull_p64)
48262306a36Sopenharmony_ci
48362306a36Sopenharmony_ci	.section	".rodata", "a"
48462306a36Sopenharmony_ci	.align		4
48562306a36Sopenharmony_ci
48662306a36Sopenharmony_ci// Fold constants precomputed from the polynomial 0x18bb7
48762306a36Sopenharmony_ci// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
48862306a36Sopenharmony_ci.Lfold_across_128_bytes_consts:
48962306a36Sopenharmony_ci	.quad		0x0000000000006123	// x^(8*128)	mod G(x)
49062306a36Sopenharmony_ci	.quad		0x0000000000002295	// x^(8*128+64)	mod G(x)
49162306a36Sopenharmony_ci// .Lfold_across_64_bytes_consts:
49262306a36Sopenharmony_ci	.quad		0x0000000000001069	// x^(4*128)	mod G(x)
49362306a36Sopenharmony_ci	.quad		0x000000000000dd31	// x^(4*128+64)	mod G(x)
49462306a36Sopenharmony_ci// .Lfold_across_32_bytes_consts:
49562306a36Sopenharmony_ci	.quad		0x000000000000857d	// x^(2*128)	mod G(x)
49662306a36Sopenharmony_ci	.quad		0x0000000000007acc	// x^(2*128+64)	mod G(x)
49762306a36Sopenharmony_ci.Lfold_across_16_bytes_consts:
49862306a36Sopenharmony_ci	.quad		0x000000000000a010	// x^(1*128)	mod G(x)
49962306a36Sopenharmony_ci	.quad		0x0000000000001faa	// x^(1*128+64)	mod G(x)
50062306a36Sopenharmony_ci// .Lfinal_fold_consts:
50162306a36Sopenharmony_ci	.quad		0x1368000000000000	// x^48 * (x^48 mod G(x))
50262306a36Sopenharmony_ci	.quad		0x2d56000000000000	// x^48 * (x^80 mod G(x))
50362306a36Sopenharmony_ci// .Lbarrett_reduction_consts:
50462306a36Sopenharmony_ci	.quad		0x0000000000018bb7	// G(x)
50562306a36Sopenharmony_ci	.quad		0x00000001f65a57f8	// floor(x^48 / G(x))
50662306a36Sopenharmony_ci
// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 -
// len] is the index vector to shift left by 'len' bytes, and is also {0x80,
// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes.
//
// Indices with the top bit set (0x80 and up) are out of range for a
// one-register TBL and therefore produce zero bytes.
.Lbyteshift_table:
	.byte		 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
	.byte		0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0x0