//
// Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions instructions
//
// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
// Copyright (C) 2019 Google LLC <ebiggers@google.com>
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License version 2 as
// published by the Free Software Foundation.
//

// Derived from the x86 version:
//
// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
//
// Copyright (c) 2013, Intel Corporation
//
// Authors:
//     Erdinc Ozturk <erdinc.ozturk@intel.com>
//     Vinodh Gopal <vinodh.gopal@intel.com>
//     James Guilford <james.guilford@intel.com>
//     Tim Chen <tim.c.chen@linux.intel.com>
//
// This software is available to you under a choice of one of two
// licenses.  You may choose to be licensed under the terms of the GNU
// General Public License (GPL) Version 2, available from the file
// COPYING in the main directory of this source tree, or the
// OpenIB.org BSD license below:
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
//   notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
//   notice, this list of conditions and the following disclaimer in the
//   documentation and/or other materials provided with the
//   distribution.
//
// * Neither the name of the Intel Corporation nor the names of its
//   contributors may be used to endorse or promote products derived from
//   this software without specific prior written permission.
//
//
// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
//       Reference paper titled "Fast CRC Computation for Generic
//	Polynomials Using PCLMULQDQ Instruction"
//       URL: http://www.intel.com/content/dam/www/public/us/en/documents
//  /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
//

#include <linux/linkage.h>
#include <asm/assembler.h>

// CPU_LE(code...) expands to 'code' unless CONFIG_CPU_ENDIAN_BE8 is defined,
// in which case it expands to nothing.  It wraps the vrev64.8 byte-reversal
// instructions below so that they are only assembled when that config symbol
// is not set.
#ifdef CONFIG_CPU_ENDIAN_BE8
#define CPU_LE(code...)
#else
#define CPU_LE(code...)		code
#endif

7462306a36Sopenharmony_ci	.text
7562306a36Sopenharmony_ci	.arch		armv8-a
7662306a36Sopenharmony_ci	.fpu		crypto-neon-fp-armv8
7762306a36Sopenharmony_ci
7862306a36Sopenharmony_ci	init_crc	.req	r0
7962306a36Sopenharmony_ci	buf		.req	r1
8062306a36Sopenharmony_ci	len		.req	r2
8162306a36Sopenharmony_ci
8262306a36Sopenharmony_ci	fold_consts_ptr	.req	ip
8362306a36Sopenharmony_ci
8462306a36Sopenharmony_ci	q0l		.req	d0
8562306a36Sopenharmony_ci	q0h		.req	d1
8662306a36Sopenharmony_ci	q1l		.req	d2
8762306a36Sopenharmony_ci	q1h		.req	d3
8862306a36Sopenharmony_ci	q2l		.req	d4
8962306a36Sopenharmony_ci	q2h		.req	d5
9062306a36Sopenharmony_ci	q3l		.req	d6
9162306a36Sopenharmony_ci	q3h		.req	d7
9262306a36Sopenharmony_ci	q4l		.req	d8
9362306a36Sopenharmony_ci	q4h		.req	d9
9462306a36Sopenharmony_ci	q5l		.req	d10
9562306a36Sopenharmony_ci	q5h		.req	d11
9662306a36Sopenharmony_ci	q6l		.req	d12
9762306a36Sopenharmony_ci	q6h		.req	d13
9862306a36Sopenharmony_ci	q7l		.req	d14
9962306a36Sopenharmony_ci	q7h		.req	d15
10062306a36Sopenharmony_ci	q8l		.req	d16
10162306a36Sopenharmony_ci	q8h		.req	d17
10262306a36Sopenharmony_ci	q9l		.req	d18
10362306a36Sopenharmony_ci	q9h		.req	d19
10462306a36Sopenharmony_ci	q10l		.req	d20
10562306a36Sopenharmony_ci	q10h		.req	d21
10662306a36Sopenharmony_ci	q11l		.req	d22
10762306a36Sopenharmony_ci	q11h		.req	d23
10862306a36Sopenharmony_ci	q12l		.req	d24
10962306a36Sopenharmony_ci	q12h		.req	d25
11062306a36Sopenharmony_ci
11162306a36Sopenharmony_ci	FOLD_CONSTS	.req	q10
11262306a36Sopenharmony_ci	FOLD_CONST_L	.req	q10l
11362306a36Sopenharmony_ci	FOLD_CONST_H	.req	q10h
11462306a36Sopenharmony_ci
11562306a36Sopenharmony_ci	// Fold reg1, reg2 into the next 32 data bytes, storing the result back
11662306a36Sopenharmony_ci	// into reg1, reg2.
11762306a36Sopenharmony_ci	.macro		fold_32_bytes, reg1, reg2
11862306a36Sopenharmony_ci	vld1.64		{q11-q12}, [buf]!
11962306a36Sopenharmony_ci
12062306a36Sopenharmony_ci	vmull.p64	q8, \reg1\()h, FOLD_CONST_H
12162306a36Sopenharmony_ci	vmull.p64	\reg1, \reg1\()l, FOLD_CONST_L
12262306a36Sopenharmony_ci	vmull.p64	q9, \reg2\()h, FOLD_CONST_H
12362306a36Sopenharmony_ci	vmull.p64	\reg2, \reg2\()l, FOLD_CONST_L
12462306a36Sopenharmony_ci
12562306a36Sopenharmony_ciCPU_LE(	vrev64.8	q11, q11	)
12662306a36Sopenharmony_ciCPU_LE(	vrev64.8	q12, q12	)
12762306a36Sopenharmony_ci	vswp		q11l, q11h
12862306a36Sopenharmony_ci	vswp		q12l, q12h
12962306a36Sopenharmony_ci
13062306a36Sopenharmony_ci	veor.8		\reg1, \reg1, q8
13162306a36Sopenharmony_ci	veor.8		\reg2, \reg2, q9
13262306a36Sopenharmony_ci	veor.8		\reg1, \reg1, q11
13362306a36Sopenharmony_ci	veor.8		\reg2, \reg2, q12
13462306a36Sopenharmony_ci	.endm
13562306a36Sopenharmony_ci
13662306a36Sopenharmony_ci	// Fold src_reg into dst_reg, optionally loading the next fold constants
13762306a36Sopenharmony_ci	.macro		fold_16_bytes, src_reg, dst_reg, load_next_consts
13862306a36Sopenharmony_ci	vmull.p64	q8, \src_reg\()l, FOLD_CONST_L
13962306a36Sopenharmony_ci	vmull.p64	\src_reg, \src_reg\()h, FOLD_CONST_H
14062306a36Sopenharmony_ci	.ifnb		\load_next_consts
14162306a36Sopenharmony_ci	vld1.64		{FOLD_CONSTS}, [fold_consts_ptr, :128]!
14262306a36Sopenharmony_ci	.endif
14362306a36Sopenharmony_ci	veor.8		\dst_reg, \dst_reg, q8
14462306a36Sopenharmony_ci	veor.8		\dst_reg, \dst_reg, \src_reg
14562306a36Sopenharmony_ci	.endm
14662306a36Sopenharmony_ci
14762306a36Sopenharmony_ci	.macro		__adrl, out, sym
14862306a36Sopenharmony_ci	movw		\out, #:lower16:\sym
14962306a36Sopenharmony_ci	movt		\out, #:upper16:\sym
15062306a36Sopenharmony_ci	.endm
15162306a36Sopenharmony_ci
//
// u16 crc_t10dif_pmull(u16 init_crc, const u8 *buf, size_t len);
//
// Assumes len >= 16.
//
// In:		r0 = init_crc, r1 = buf, r2 = len
// Out:		r0 = CRC, zero-extended from 16 bits
// Clobbers:	r1, r2, r3, ip, condition flags, q0-q12
//
// Notes:
//  - q4-q7 (d8-d15) are overwritten without being saved, and no stack is
//    used.  NOTE(review): d8-d15 are callee-saved under the AAPCS, so the
//    caller presumably owns the whole NEON register file around this call
//    (e.g. kernel_neon_begin()/kernel_neon_end()) - confirm in the caller.
//  - 'len' is tested with signed conditions (blt/bge) after being biased
//    by subtraction, so it must be representable as a non-negative 32-bit
//    signed value.
//  - fold_consts_ptr is advanced with post-increment loads, so it walks the
//    constant table in the order in which the entries are laid out.
//
ENTRY(crc_t10dif_pmull)

	// For sizes less than 256 bytes, we can't fold 128 bytes at a time.
	cmp		len, #256
	blt		.Lless_than_256_bytes

	__adrl		fold_consts_ptr, .Lfold_across_128_bytes_consts

	// Load the first 128 data bytes.  Byte swapping is necessary to make
	// the bit order match the polynomial coefficient order.
	vld1.64		{q0-q1}, [buf]!
	vld1.64		{q2-q3}, [buf]!
	vld1.64		{q4-q5}, [buf]!
	vld1.64		{q6-q7}, [buf]!
CPU_LE(	vrev64.8	q0, q0	)
CPU_LE(	vrev64.8	q1, q1	)
CPU_LE(	vrev64.8	q2, q2	)
CPU_LE(	vrev64.8	q3, q3	)
CPU_LE(	vrev64.8	q4, q4	)
CPU_LE(	vrev64.8	q5, q5	)
CPU_LE(	vrev64.8	q6, q6	)
CPU_LE(	vrev64.8	q7, q7	)
	vswp		q0l, q0h
	vswp		q1l, q1h
	vswp		q2l, q2h
	vswp		q3l, q3h
	vswp		q4l, q4h
	vswp		q5l, q5h
	vswp		q6l, q6h
	vswp		q7l, q7h

	// XOR the first 16 data *bits* with the initial CRC value.  After the
	// swap above those bits are the top 16-bit lane of q0h.
	vmov.i8		q8h, #0
	vmov.u16	q8h[3], init_crc
	veor.8		q0h, q0h, q8h

	// Load the constants for folding across 128 bytes.
	vld1.64		{FOLD_CONSTS}, [fold_consts_ptr, :128]!

	// Subtract 128 for the 128 data bytes just consumed.  Subtract another
	// 128 to simplify the termination condition of the following loop.
	sub		len, len, #256

	// While >= 128 data bytes remain (not counting q0-q7), fold the 128
	// bytes q0-q7 into them, storing the result back into q0-q7.
	// len >= 256 on entry guarantees the first iteration is valid.
.Lfold_128_bytes_loop:
	fold_32_bytes	q0, q1
	fold_32_bytes	q2, q3
	fold_32_bytes	q4, q5
	fold_32_bytes	q6, q7
	subs		len, len, #128
	bge		.Lfold_128_bytes_loop

	// Now fold the 112 bytes in q0-q6 into the 16 bytes in q7.

	// Fold across 64 bytes.
	vld1.64		{FOLD_CONSTS}, [fold_consts_ptr, :128]!
	fold_16_bytes	q0, q4
	fold_16_bytes	q1, q5
	fold_16_bytes	q2, q6
	fold_16_bytes	q3, q7, 1	// also loads the 32-byte constants
	// Fold across 32 bytes.
	fold_16_bytes	q4, q6
	fold_16_bytes	q5, q7, 1	// also loads the 16-byte constants
	// Fold across 16 bytes.
	fold_16_bytes	q6, q7

	// Add 128 to get the correct number of data bytes remaining in 0...127
	// (not counting q7), following the previous extra subtraction by 128.
	// Then subtract 16 to simplify the termination condition of the
	// following loop.
	adds		len, len, #(128-16)

	// While >= 16 data bytes remain (not counting q7), fold the 16 bytes q7
	// into them, storing the result back into q7.
	blt		.Lfold_16_bytes_loop_done
.Lfold_16_bytes_loop:
	vmull.p64	q8, q7l, FOLD_CONST_L
	vmull.p64	q7, q7h, FOLD_CONST_H
	veor.8		q7, q7, q8
	vld1.64		{q0}, [buf]!
CPU_LE(	vrev64.8	q0, q0	)
	vswp		q0l, q0h
	veor.8		q7, q7, q0
	subs		len, len, #16
	bge		.Lfold_16_bytes_loop

.Lfold_16_bytes_loop_done:
	// Add 16 to get the correct number of data bytes remaining in 0...15
	// (not counting q7), following the previous extra subtraction by 16.
	adds		len, len, #16
	beq		.Lreduce_final_16_bytes

.Lhandle_partial_segment:
	// Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
	// 16 bytes are in q7 and the rest are the remaining data in 'buf'.  To
	// do this without needing a fold constant for each possible 'len',
	// redivide the bytes into a first chunk of 'len' bytes and a second
	// chunk of 16 bytes, then fold the first chunk into the second.

	// q0 = last 16 original data bytes
	add		buf, buf, len
	sub		buf, buf, #16
	vld1.64		{q0}, [buf]
CPU_LE(	vrev64.8	q0, q0	)
	vswp		q0l, q0h

	// q1 = high order part of second chunk: q7 left-shifted by 'len' bytes.
	// (vtbl yields 0 for the out-of-range 0x8x indices in the table.)
	__adrl		r3, .Lbyteshift_table + 16
	sub		r3, r3, len
	vld1.8		{q2}, [r3]
	vtbl.8		q1l, {q7l-q7h}, q2l
	vtbl.8		q1h, {q7l-q7h}, q2h

	// q3 = first chunk: q7 right-shifted by '16-len' bytes.
	vmov.i8		q3, #0x80
	veor.8		q2, q2, q3
	vtbl.8		q3l, {q7l-q7h}, q2l
	vtbl.8		q3h, {q7l-q7h}, q2h

	// Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes.
	vshr.s8		q2, q2, #7

	// q2 = second chunk: 'len' bytes from q0 (low-order bytes),
	// then '16-len' bytes from q1 (high-order bytes).
	vbsl.8		q2, q1, q0

	// Fold the first chunk into the second chunk, storing the result in q7.
	vmull.p64	q0, q3l, FOLD_CONST_L
	vmull.p64	q7, q3h, FOLD_CONST_H
	veor.8		q7, q7, q0
	veor.8		q7, q7, q2

.Lreduce_final_16_bytes:
	// Reduce the 128-bit value M(x), stored in q7, to the final 16-bit CRC.
	// On every path reaching this label fold_consts_ptr points at the
	// final-fold constants.

	// Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
	vld1.64		{FOLD_CONSTS}, [fold_consts_ptr, :128]!

	// Fold the high 64 bits into the low 64 bits, while also multiplying by
	// x^64.  This produces a 128-bit value congruent to x^64 * M(x) and
	// whose low 48 bits are 0.
	vmull.p64	q0, q7h, FOLD_CONST_H	// high bits * x^48 * (x^80 mod G(x))
	veor.8		q0h, q0h, q7l		// + low bits * x^64

	// Fold the high 32 bits into the low 96 bits.  This produces a 96-bit
	// value congruent to x^64 * M(x) and whose low 48 bits are 0.
	vmov.i8		q1, #0
	vmov		s4, s3			// extract high 32 bits
	vmov		s3, s5			// zero high 32 bits
	vmull.p64	q1, q1l, FOLD_CONST_L	// high 32 bits * x^48 * (x^48 mod G(x))
	veor.8		q0, q0, q1		// + low bits

	// Load G(x) and floor(x^48 / G(x)).  Last table entry: no writeback.
	vld1.64		{FOLD_CONSTS}, [fold_consts_ptr, :128]

	// Use Barrett reduction to compute the final CRC value.
	vmull.p64	q1, q0h, FOLD_CONST_H	// high 32 bits * floor(x^48 / G(x))
	vshr.u64	q1l, q1l, #32		// /= x^32
	vmull.p64	q1, q1l, FOLD_CONST_L	// *= G(x)
	vshr.u64	q0l, q0l, #48
	veor.8		q0l, q0l, q1l		// + low 16 nonzero bits
	// Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of q0.

	vmov.u16	r0, q0l[0]
	bx		lr

.Lless_than_256_bytes:
	// Checksumming a buffer of length 16...255 bytes

	__adrl		fold_consts_ptr, .Lfold_across_16_bytes_consts

	// Load the first 16 data bytes.
	vld1.64		{q7}, [buf]!
CPU_LE(	vrev64.8	q7, q7	)
	vswp		q7l, q7h

	// XOR the first 16 data *bits* with the initial CRC value.
	vmov.i8		q0h, #0
	vmov.u16	q0h[3], init_crc
	veor.8		q7h, q7h, q0h

	// Load the fold-across-16-bytes constants.
	vld1.64		{FOLD_CONSTS}, [fold_consts_ptr, :128]!

	cmp		len, #16
	beq		.Lreduce_final_16_bytes		// len == 16
	// Bias len by -32: 16 for the bytes already in q7 and 16 for the
	// loop's termination test.  addlt leaves the flags from subs intact,
	// so the following blt still tests the result of the subtraction.
	subs		len, len, #32
	addlt		len, len, #16
	blt		.Lhandle_partial_segment	// 17 <= len <= 31
	b		.Lfold_16_bytes_loop		// 32 <= len <= 255
ENDPROC(crc_t10dif_pmull)

35062306a36Sopenharmony_ci	.section	".rodata", "a"
35162306a36Sopenharmony_ci	.align		4
35262306a36Sopenharmony_ci
35362306a36Sopenharmony_ci// Fold constants precomputed from the polynomial 0x18bb7
35462306a36Sopenharmony_ci// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
35562306a36Sopenharmony_ci.Lfold_across_128_bytes_consts:
35662306a36Sopenharmony_ci	.quad		0x0000000000006123	// x^(8*128)	mod G(x)
35762306a36Sopenharmony_ci	.quad		0x0000000000002295	// x^(8*128+64)	mod G(x)
35862306a36Sopenharmony_ci// .Lfold_across_64_bytes_consts:
35962306a36Sopenharmony_ci	.quad		0x0000000000001069	// x^(4*128)	mod G(x)
36062306a36Sopenharmony_ci	.quad		0x000000000000dd31	// x^(4*128+64)	mod G(x)
36162306a36Sopenharmony_ci// .Lfold_across_32_bytes_consts:
36262306a36Sopenharmony_ci	.quad		0x000000000000857d	// x^(2*128)	mod G(x)
36362306a36Sopenharmony_ci	.quad		0x0000000000007acc	// x^(2*128+64)	mod G(x)
36462306a36Sopenharmony_ci.Lfold_across_16_bytes_consts:
36562306a36Sopenharmony_ci	.quad		0x000000000000a010	// x^(1*128)	mod G(x)
36662306a36Sopenharmony_ci	.quad		0x0000000000001faa	// x^(1*128+64)	mod G(x)
36762306a36Sopenharmony_ci// .Lfinal_fold_consts:
36862306a36Sopenharmony_ci	.quad		0x1368000000000000	// x^48 * (x^48 mod G(x))
36962306a36Sopenharmony_ci	.quad		0x2d56000000000000	// x^48 * (x^80 mod G(x))
37062306a36Sopenharmony_ci// .Lbarrett_reduction_consts:
37162306a36Sopenharmony_ci	.quad		0x0000000000018bb7	// G(x)
37262306a36Sopenharmony_ci	.quad		0x00000001f65a57f8	// floor(x^48 / G(x))
37362306a36Sopenharmony_ci
37462306a36Sopenharmony_ci// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 -
37562306a36Sopenharmony_ci// len] is the index vector to shift left by 'len' bytes, and is also {0x80,
37662306a36Sopenharmony_ci// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes.
37762306a36Sopenharmony_ci.Lbyteshift_table:
37862306a36Sopenharmony_ci	.byte		 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
37962306a36Sopenharmony_ci	.byte		0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
38062306a36Sopenharmony_ci	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
38162306a36Sopenharmony_ci	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe , 0x0
382