//
// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
//
// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
// Copyright (C) 2019 Google LLC <ebiggers@google.com>
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License version 2 as
// published by the Free Software Foundation.
//

// Derived from the x86 version:
//
// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
//
// Copyright (c) 2013, Intel Corporation
//
// Authors:
//     Erdinc Ozturk <erdinc.ozturk@intel.com>
//     Vinodh Gopal <vinodh.gopal@intel.com>
//     James Guilford <james.guilford@intel.com>
//     Tim Chen <tim.c.chen@linux.intel.com>
//
// This software is available to you under a choice of one of two
// licenses.  You may choose to be licensed under the terms of the GNU
// General Public License (GPL) Version 2, available from the file
// COPYING in the main directory of this source tree, or the
// OpenIB.org BSD license below:
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
//   notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
//   notice, this list of conditions and the following disclaimer in the
//   documentation and/or other materials provided with the
//   distribution.
//
// * Neither the name of the Intel Corporation nor the names of its
//   contributors may be used to endorse or promote products derived from
//   this software without specific prior written permission.
//
//
// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
//       Reference paper titled "Fast CRC Computation for Generic
//	Polynomials Using PCLMULQDQ Instruction"
//       URL: http://www.intel.com/content/dam/www/public/us/en/documents
//  /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
//

// The SYM_FUNC_*, CPU_LE, mov_q and adr_l helpers used below are not defined
// in this file; they are expected to come from these headers.
#include <linux/linkage.h>
#include <asm/assembler.h>

	.text
	.arch		armv8-a+crypto

	// Arguments of the two entry points, whose C prototype is
	// (u16 init_crc, const u8 *buf, size_t len).
	init_crc	.req	w0
	buf		.req	x1
	len		.req	x2
	// Cursor into the fold constant tables in .rodata.
	fold_consts_ptr	.req	x3

	// The pair of 64-bit fold constants currently in use.
	fold_consts	.req	v10

	// Copy of the multiplicand that __pmull_p8 hands to __pmull_p8_core.
	ad		.req	v14

	// Masks built by __pmull_init_p8.
	k00_16		.req	v15
	k32_48		.req	v16

	// Temporaries of __pmull_p8_core; t4 and t6 carry its result.
	t3		.req	v17
	t4		.req	v18
	t5		.req	v19
	t6		.req	v20
	t7		.req	v21
	t8		.req	v22
	t9		.req	v23

	// Byte permutation vectors built by __pmull_init_p8.
	perm1		.req	v24
	perm2		.req	v25
	perm3		.req	v26
	perm4		.req	v27

	// fold_consts permuted by perm1-perm4; refreshed by __pmull_pre_p8.
	bd1		.req	v28
	bd2		.req	v29
	bd3		.req	v30
	bd4		.req	v31

1018c2ecf20Sopenharmony_ci	.macro		__pmull_init_p64
1028c2ecf20Sopenharmony_ci	.endm
1038c2ecf20Sopenharmony_ci
1048c2ecf20Sopenharmony_ci	.macro		__pmull_pre_p64, bd
1058c2ecf20Sopenharmony_ci	.endm
1068c2ecf20Sopenharmony_ci
1078c2ecf20Sopenharmony_ci	.macro		__pmull_init_p8
1088c2ecf20Sopenharmony_ci	// k00_16 := 0x0000000000000000_000000000000ffff
1098c2ecf20Sopenharmony_ci	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
1108c2ecf20Sopenharmony_ci	movi		k32_48.2d, #0xffffffff
1118c2ecf20Sopenharmony_ci	mov		k32_48.h[2], k32_48.h[0]
1128c2ecf20Sopenharmony_ci	ushr		k00_16.2d, k32_48.2d, #32
1138c2ecf20Sopenharmony_ci
1148c2ecf20Sopenharmony_ci	// prepare the permutation vectors
1158c2ecf20Sopenharmony_ci	mov_q		x5, 0x080f0e0d0c0b0a09
1168c2ecf20Sopenharmony_ci	movi		perm4.8b, #8
1178c2ecf20Sopenharmony_ci	dup		perm1.2d, x5
1188c2ecf20Sopenharmony_ci	eor		perm1.16b, perm1.16b, perm4.16b
1198c2ecf20Sopenharmony_ci	ushr		perm2.2d, perm1.2d, #8
1208c2ecf20Sopenharmony_ci	ushr		perm3.2d, perm1.2d, #16
1218c2ecf20Sopenharmony_ci	ushr		perm4.2d, perm1.2d, #24
1228c2ecf20Sopenharmony_ci	sli		perm2.2d, perm1.2d, #56
1238c2ecf20Sopenharmony_ci	sli		perm3.2d, perm1.2d, #48
1248c2ecf20Sopenharmony_ci	sli		perm4.2d, perm1.2d, #40
1258c2ecf20Sopenharmony_ci	.endm
1268c2ecf20Sopenharmony_ci
1278c2ecf20Sopenharmony_ci	.macro		__pmull_pre_p8, bd
1288c2ecf20Sopenharmony_ci	tbl		bd1.16b, {\bd\().16b}, perm1.16b
1298c2ecf20Sopenharmony_ci	tbl		bd2.16b, {\bd\().16b}, perm2.16b
1308c2ecf20Sopenharmony_ci	tbl		bd3.16b, {\bd\().16b}, perm3.16b
1318c2ecf20Sopenharmony_ci	tbl		bd4.16b, {\bd\().16b}, perm4.16b
1328c2ecf20Sopenharmony_ci	.endm
1338c2ecf20Sopenharmony_ci
// Out-of-line part of the __pmull_p8 macro.  The macro computes D = A*B with
// a single 8-bit pmull; this routine computes the remaining partial products
// (byte-rotated A times B, and A times byte-rotated B), combines them and
// moves them to their final byte positions.
//
// There are two entry points, reached with 'bl' from __pmull_p8:
//   .L__pmull_p8_core   operates on the low 64-bit halves  (pmull)
//   .L__pmull_p8_core2  operates on the high 64-bit halves (pmull2)
//
// In:		ad (A), fold_consts (B), bd1-bd4, perm1-perm3, k00_16, k32_48
// Out:		t4, t6 - the caller XORs both into D
// Clobbers:	t3, t5, t7, t8, t9
SYM_FUNC_START_LOCAL(__pmull_p8_core)
.L__pmull_p8_core:
	ext		t4.8b, ad.8b, ad.8b, #1			// A1
	ext		t5.8b, ad.8b, ad.8b, #2			// A2
	ext		t6.8b, ad.8b, ad.8b, #3			// A3

	pmull		t4.8h, t4.8b, fold_consts.8b		// F = A1*B
	pmull		t8.8h, ad.8b, bd1.8b			// E = A*B1
	pmull		t5.8h, t5.8b, fold_consts.8b		// H = A2*B
	pmull		t7.8h, ad.8b, bd2.8b			// G = A*B2
	pmull		t6.8h, t6.8b, fold_consts.8b		// J = A3*B
	pmull		t9.8h, ad.8b, bd3.8b			// I = A*B3
	pmull		t3.8h, ad.8b, bd4.8b			// K = A*B4
	b		0f

.L__pmull_p8_core2:
	// Same products as above for the high halves; the rotated copies of A
	// come from tbl with perm1-perm3 instead of ext.
	tbl		t4.16b, {ad.16b}, perm1.16b		// A1
	tbl		t5.16b, {ad.16b}, perm2.16b		// A2
	tbl		t6.16b, {ad.16b}, perm3.16b		// A3

	pmull2		t4.8h, t4.16b, fold_consts.16b		// F = A1*B
	pmull2		t8.8h, ad.16b, bd1.16b			// E = A*B1
	pmull2		t5.8h, t5.16b, fold_consts.16b		// H = A2*B
	pmull2		t7.8h, ad.16b, bd2.16b			// G = A*B2
	pmull2		t6.8h, t6.16b, fold_consts.16b		// J = A3*B
	pmull2		t9.8h, ad.16b, bd3.16b			// I = A*B3
	pmull2		t3.8h, ad.16b, bd4.16b			// K = A*B4

	// Common tail for both entry points.
0:	eor		t4.16b, t4.16b, t8.16b			// L = E + F
	eor		t5.16b, t5.16b, t7.16b			// M = G + H
	eor		t6.16b, t6.16b, t9.16b			// N = I + J

	// Gather the low halves of L/M (and N/K) into one register and the
	// high halves into another, so each pair can be masked in one go.
	uzp1		t8.2d, t4.2d, t5.2d
	uzp2		t4.2d, t4.2d, t5.2d
	uzp1		t7.2d, t6.2d, t3.2d
	uzp2		t6.2d, t6.2d, t3.2d

	// t4 = (L) (P0 + P1) << 8
	// t5 = (M) (P2 + P3) << 16
	eor		t8.16b, t8.16b, t4.16b
	and		t4.16b, t4.16b, k32_48.16b

	// t6 = (N) (P4 + P5) << 24
	// t7 = (K) (P6 + P7) << 32
	eor		t7.16b, t7.16b, t6.16b
	and		t6.16b, t6.16b, k00_16.16b

	eor		t8.16b, t8.16b, t4.16b
	eor		t7.16b, t7.16b, t6.16b

	// Undo the uzp pairing.  Each zip2 must come before the zip1 that
	// overwrites one of its source registers.
	zip2		t5.2d, t8.2d, t4.2d
	zip1		t4.2d, t8.2d, t4.2d
	zip2		t3.2d, t7.2d, t6.2d
	zip1		t6.2d, t7.2d, t6.2d

	// Rotate each term to its byte position (by 1, 2, 3 and 4 bytes).
	ext		t4.16b, t4.16b, t4.16b, #15
	ext		t5.16b, t5.16b, t5.16b, #14
	ext		t6.16b, t6.16b, t6.16b, #13
	ext		t3.16b, t3.16b, t3.16b, #12

	eor		t4.16b, t4.16b, t5.16b
	eor		t6.16b, t6.16b, t3.16b
	ret
SYM_FUNC_END(__pmull_p8_core)

1998c2ecf20Sopenharmony_ci	.macro		__pmull_p8, rq, ad, bd, i
2008c2ecf20Sopenharmony_ci	.ifnc		\bd, fold_consts
2018c2ecf20Sopenharmony_ci	.err
2028c2ecf20Sopenharmony_ci	.endif
2038c2ecf20Sopenharmony_ci	mov		ad.16b, \ad\().16b
2048c2ecf20Sopenharmony_ci	.ifb		\i
2058c2ecf20Sopenharmony_ci	pmull		\rq\().8h, \ad\().8b, \bd\().8b		// D = A*B
2068c2ecf20Sopenharmony_ci	.else
2078c2ecf20Sopenharmony_ci	pmull2		\rq\().8h, \ad\().16b, \bd\().16b	// D = A*B
2088c2ecf20Sopenharmony_ci	.endif
2098c2ecf20Sopenharmony_ci
2108c2ecf20Sopenharmony_ci	bl		.L__pmull_p8_core\i
2118c2ecf20Sopenharmony_ci
2128c2ecf20Sopenharmony_ci	eor		\rq\().16b, \rq\().16b, t4.16b
2138c2ecf20Sopenharmony_ci	eor		\rq\().16b, \rq\().16b, t6.16b
2148c2ecf20Sopenharmony_ci	.endm
2158c2ecf20Sopenharmony_ci
2168c2ecf20Sopenharmony_ci	// Fold reg1, reg2 into the next 32 data bytes, storing the result back
2178c2ecf20Sopenharmony_ci	// into reg1, reg2.
2188c2ecf20Sopenharmony_ci	.macro		fold_32_bytes, p, reg1, reg2
2198c2ecf20Sopenharmony_ci	ldp		q11, q12, [buf], #0x20
2208c2ecf20Sopenharmony_ci
2218c2ecf20Sopenharmony_ci	__pmull_\p	v8, \reg1, fold_consts, 2
2228c2ecf20Sopenharmony_ci	__pmull_\p	\reg1, \reg1, fold_consts
2238c2ecf20Sopenharmony_ci
2248c2ecf20Sopenharmony_ciCPU_LE(	rev64		v11.16b, v11.16b		)
2258c2ecf20Sopenharmony_ciCPU_LE(	rev64		v12.16b, v12.16b		)
2268c2ecf20Sopenharmony_ci
2278c2ecf20Sopenharmony_ci	__pmull_\p	v9, \reg2, fold_consts, 2
2288c2ecf20Sopenharmony_ci	__pmull_\p	\reg2, \reg2, fold_consts
2298c2ecf20Sopenharmony_ci
2308c2ecf20Sopenharmony_ciCPU_LE(	ext		v11.16b, v11.16b, v11.16b, #8	)
2318c2ecf20Sopenharmony_ciCPU_LE(	ext		v12.16b, v12.16b, v12.16b, #8	)
2328c2ecf20Sopenharmony_ci
2338c2ecf20Sopenharmony_ci	eor		\reg1\().16b, \reg1\().16b, v8.16b
2348c2ecf20Sopenharmony_ci	eor		\reg2\().16b, \reg2\().16b, v9.16b
2358c2ecf20Sopenharmony_ci	eor		\reg1\().16b, \reg1\().16b, v11.16b
2368c2ecf20Sopenharmony_ci	eor		\reg2\().16b, \reg2\().16b, v12.16b
2378c2ecf20Sopenharmony_ci	.endm
2388c2ecf20Sopenharmony_ci
2398c2ecf20Sopenharmony_ci	// Fold src_reg into dst_reg, optionally loading the next fold constants
2408c2ecf20Sopenharmony_ci	.macro		fold_16_bytes, p, src_reg, dst_reg, load_next_consts
2418c2ecf20Sopenharmony_ci	__pmull_\p	v8, \src_reg, fold_consts
2428c2ecf20Sopenharmony_ci	__pmull_\p	\src_reg, \src_reg, fold_consts, 2
2438c2ecf20Sopenharmony_ci	.ifnb		\load_next_consts
2448c2ecf20Sopenharmony_ci	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
2458c2ecf20Sopenharmony_ci	__pmull_pre_\p	fold_consts
2468c2ecf20Sopenharmony_ci	.endif
2478c2ecf20Sopenharmony_ci	eor		\dst_reg\().16b, \dst_reg\().16b, v8.16b
2488c2ecf20Sopenharmony_ci	eor		\dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b
2498c2ecf20Sopenharmony_ci	.endm
2508c2ecf20Sopenharmony_ci
2518c2ecf20Sopenharmony_ci	.macro		__pmull_p64, rd, rn, rm, n
2528c2ecf20Sopenharmony_ci	.ifb		\n
2538c2ecf20Sopenharmony_ci	pmull		\rd\().1q, \rn\().1d, \rm\().1d
2548c2ecf20Sopenharmony_ci	.else
2558c2ecf20Sopenharmony_ci	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
2568c2ecf20Sopenharmony_ci	.endif
2578c2ecf20Sopenharmony_ci	.endm
2588c2ecf20Sopenharmony_ci
2598c2ecf20Sopenharmony_ci	.macro		crc_t10dif_pmull, p
2608c2ecf20Sopenharmony_ci	__pmull_init_\p
2618c2ecf20Sopenharmony_ci
2628c2ecf20Sopenharmony_ci	// For sizes less than 256 bytes, we can't fold 128 bytes at a time.
2638c2ecf20Sopenharmony_ci	cmp		len, #256
2648c2ecf20Sopenharmony_ci	b.lt		.Lless_than_256_bytes_\@
2658c2ecf20Sopenharmony_ci
2668c2ecf20Sopenharmony_ci	adr_l		fold_consts_ptr, .Lfold_across_128_bytes_consts
2678c2ecf20Sopenharmony_ci
2688c2ecf20Sopenharmony_ci	// Load the first 128 data bytes.  Byte swapping is necessary to make
2698c2ecf20Sopenharmony_ci	// the bit order match the polynomial coefficient order.
2708c2ecf20Sopenharmony_ci	ldp		q0, q1, [buf]
2718c2ecf20Sopenharmony_ci	ldp		q2, q3, [buf, #0x20]
2728c2ecf20Sopenharmony_ci	ldp		q4, q5, [buf, #0x40]
2738c2ecf20Sopenharmony_ci	ldp		q6, q7, [buf, #0x60]
2748c2ecf20Sopenharmony_ci	add		buf, buf, #0x80
2758c2ecf20Sopenharmony_ciCPU_LE(	rev64		v0.16b, v0.16b			)
2768c2ecf20Sopenharmony_ciCPU_LE(	rev64		v1.16b, v1.16b			)
2778c2ecf20Sopenharmony_ciCPU_LE(	rev64		v2.16b, v2.16b			)
2788c2ecf20Sopenharmony_ciCPU_LE(	rev64		v3.16b, v3.16b			)
2798c2ecf20Sopenharmony_ciCPU_LE(	rev64		v4.16b, v4.16b			)
2808c2ecf20Sopenharmony_ciCPU_LE(	rev64		v5.16b, v5.16b			)
2818c2ecf20Sopenharmony_ciCPU_LE(	rev64		v6.16b, v6.16b			)
2828c2ecf20Sopenharmony_ciCPU_LE(	rev64		v7.16b, v7.16b			)
2838c2ecf20Sopenharmony_ciCPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
2848c2ecf20Sopenharmony_ciCPU_LE(	ext		v1.16b, v1.16b, v1.16b, #8	)
2858c2ecf20Sopenharmony_ciCPU_LE(	ext		v2.16b, v2.16b, v2.16b, #8	)
2868c2ecf20Sopenharmony_ciCPU_LE(	ext		v3.16b, v3.16b, v3.16b, #8	)
2878c2ecf20Sopenharmony_ciCPU_LE(	ext		v4.16b, v4.16b, v4.16b, #8	)
2888c2ecf20Sopenharmony_ciCPU_LE(	ext		v5.16b, v5.16b, v5.16b, #8	)
2898c2ecf20Sopenharmony_ciCPU_LE(	ext		v6.16b, v6.16b, v6.16b, #8	)
2908c2ecf20Sopenharmony_ciCPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)
2918c2ecf20Sopenharmony_ci
2928c2ecf20Sopenharmony_ci	// XOR the first 16 data *bits* with the initial CRC value.
2938c2ecf20Sopenharmony_ci	movi		v8.16b, #0
2948c2ecf20Sopenharmony_ci	mov		v8.h[7], init_crc
2958c2ecf20Sopenharmony_ci	eor		v0.16b, v0.16b, v8.16b
2968c2ecf20Sopenharmony_ci
2978c2ecf20Sopenharmony_ci	// Load the constants for folding across 128 bytes.
2988c2ecf20Sopenharmony_ci	ld1		{fold_consts.2d}, [fold_consts_ptr]
2998c2ecf20Sopenharmony_ci	__pmull_pre_\p	fold_consts
3008c2ecf20Sopenharmony_ci
3018c2ecf20Sopenharmony_ci	// Subtract 128 for the 128 data bytes just consumed.  Subtract another
3028c2ecf20Sopenharmony_ci	// 128 to simplify the termination condition of the following loop.
3038c2ecf20Sopenharmony_ci	sub		len, len, #256
3048c2ecf20Sopenharmony_ci
3058c2ecf20Sopenharmony_ci	// While >= 128 data bytes remain (not counting v0-v7), fold the 128
3068c2ecf20Sopenharmony_ci	// bytes v0-v7 into them, storing the result back into v0-v7.
3078c2ecf20Sopenharmony_ci.Lfold_128_bytes_loop_\@:
3088c2ecf20Sopenharmony_ci	fold_32_bytes	\p, v0, v1
3098c2ecf20Sopenharmony_ci	fold_32_bytes	\p, v2, v3
3108c2ecf20Sopenharmony_ci	fold_32_bytes	\p, v4, v5
3118c2ecf20Sopenharmony_ci	fold_32_bytes	\p, v6, v7
3128c2ecf20Sopenharmony_ci
3138c2ecf20Sopenharmony_ci	subs		len, len, #128
3148c2ecf20Sopenharmony_ci	b.ge		.Lfold_128_bytes_loop_\@
3158c2ecf20Sopenharmony_ci
3168c2ecf20Sopenharmony_ci	// Now fold the 112 bytes in v0-v6 into the 16 bytes in v7.
3178c2ecf20Sopenharmony_ci
3188c2ecf20Sopenharmony_ci	// Fold across 64 bytes.
3198c2ecf20Sopenharmony_ci	add		fold_consts_ptr, fold_consts_ptr, #16
3208c2ecf20Sopenharmony_ci	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
3218c2ecf20Sopenharmony_ci	__pmull_pre_\p	fold_consts
3228c2ecf20Sopenharmony_ci	fold_16_bytes	\p, v0, v4
3238c2ecf20Sopenharmony_ci	fold_16_bytes	\p, v1, v5
3248c2ecf20Sopenharmony_ci	fold_16_bytes	\p, v2, v6
3258c2ecf20Sopenharmony_ci	fold_16_bytes	\p, v3, v7, 1
3268c2ecf20Sopenharmony_ci	// Fold across 32 bytes.
3278c2ecf20Sopenharmony_ci	fold_16_bytes	\p, v4, v6
3288c2ecf20Sopenharmony_ci	fold_16_bytes	\p, v5, v7, 1
3298c2ecf20Sopenharmony_ci	// Fold across 16 bytes.
3308c2ecf20Sopenharmony_ci	fold_16_bytes	\p, v6, v7
3318c2ecf20Sopenharmony_ci
3328c2ecf20Sopenharmony_ci	// Add 128 to get the correct number of data bytes remaining in 0...127
3338c2ecf20Sopenharmony_ci	// (not counting v7), following the previous extra subtraction by 128.
3348c2ecf20Sopenharmony_ci	// Then subtract 16 to simplify the termination condition of the
3358c2ecf20Sopenharmony_ci	// following loop.
3368c2ecf20Sopenharmony_ci	adds		len, len, #(128-16)
3378c2ecf20Sopenharmony_ci
3388c2ecf20Sopenharmony_ci	// While >= 16 data bytes remain (not counting v7), fold the 16 bytes v7
3398c2ecf20Sopenharmony_ci	// into them, storing the result back into v7.
3408c2ecf20Sopenharmony_ci	b.lt		.Lfold_16_bytes_loop_done_\@
3418c2ecf20Sopenharmony_ci.Lfold_16_bytes_loop_\@:
3428c2ecf20Sopenharmony_ci	__pmull_\p	v8, v7, fold_consts
3438c2ecf20Sopenharmony_ci	__pmull_\p	v7, v7, fold_consts, 2
3448c2ecf20Sopenharmony_ci	eor		v7.16b, v7.16b, v8.16b
3458c2ecf20Sopenharmony_ci	ldr		q0, [buf], #16
3468c2ecf20Sopenharmony_ciCPU_LE(	rev64		v0.16b, v0.16b			)
3478c2ecf20Sopenharmony_ciCPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
3488c2ecf20Sopenharmony_ci	eor		v7.16b, v7.16b, v0.16b
3498c2ecf20Sopenharmony_ci	subs		len, len, #16
3508c2ecf20Sopenharmony_ci	b.ge		.Lfold_16_bytes_loop_\@
3518c2ecf20Sopenharmony_ci
3528c2ecf20Sopenharmony_ci.Lfold_16_bytes_loop_done_\@:
3538c2ecf20Sopenharmony_ci	// Add 16 to get the correct number of data bytes remaining in 0...15
3548c2ecf20Sopenharmony_ci	// (not counting v7), following the previous extra subtraction by 16.
3558c2ecf20Sopenharmony_ci	adds		len, len, #16
3568c2ecf20Sopenharmony_ci	b.eq		.Lreduce_final_16_bytes_\@
3578c2ecf20Sopenharmony_ci
3588c2ecf20Sopenharmony_ci.Lhandle_partial_segment_\@:
3598c2ecf20Sopenharmony_ci	// Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
3608c2ecf20Sopenharmony_ci	// 16 bytes are in v7 and the rest are the remaining data in 'buf'.  To
3618c2ecf20Sopenharmony_ci	// do this without needing a fold constant for each possible 'len',
3628c2ecf20Sopenharmony_ci	// redivide the bytes into a first chunk of 'len' bytes and a second
3638c2ecf20Sopenharmony_ci	// chunk of 16 bytes, then fold the first chunk into the second.
3648c2ecf20Sopenharmony_ci
3658c2ecf20Sopenharmony_ci	// v0 = last 16 original data bytes
3668c2ecf20Sopenharmony_ci	add		buf, buf, len
3678c2ecf20Sopenharmony_ci	ldr		q0, [buf, #-16]
3688c2ecf20Sopenharmony_ciCPU_LE(	rev64		v0.16b, v0.16b			)
3698c2ecf20Sopenharmony_ciCPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
3708c2ecf20Sopenharmony_ci
3718c2ecf20Sopenharmony_ci	// v1 = high order part of second chunk: v7 left-shifted by 'len' bytes.
3728c2ecf20Sopenharmony_ci	adr_l		x4, .Lbyteshift_table + 16
3738c2ecf20Sopenharmony_ci	sub		x4, x4, len
3748c2ecf20Sopenharmony_ci	ld1		{v2.16b}, [x4]
3758c2ecf20Sopenharmony_ci	tbl		v1.16b, {v7.16b}, v2.16b
3768c2ecf20Sopenharmony_ci
3778c2ecf20Sopenharmony_ci	// v3 = first chunk: v7 right-shifted by '16-len' bytes.
3788c2ecf20Sopenharmony_ci	movi		v3.16b, #0x80
3798c2ecf20Sopenharmony_ci	eor		v2.16b, v2.16b, v3.16b
3808c2ecf20Sopenharmony_ci	tbl		v3.16b, {v7.16b}, v2.16b
3818c2ecf20Sopenharmony_ci
3828c2ecf20Sopenharmony_ci	// Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes.
3838c2ecf20Sopenharmony_ci	sshr		v2.16b, v2.16b, #7
3848c2ecf20Sopenharmony_ci
3858c2ecf20Sopenharmony_ci	// v2 = second chunk: 'len' bytes from v0 (low-order bytes),
3868c2ecf20Sopenharmony_ci	// then '16-len' bytes from v1 (high-order bytes).
3878c2ecf20Sopenharmony_ci	bsl		v2.16b, v1.16b, v0.16b
3888c2ecf20Sopenharmony_ci
3898c2ecf20Sopenharmony_ci	// Fold the first chunk into the second chunk, storing the result in v7.
3908c2ecf20Sopenharmony_ci	__pmull_\p	v0, v3, fold_consts
3918c2ecf20Sopenharmony_ci	__pmull_\p	v7, v3, fold_consts, 2
3928c2ecf20Sopenharmony_ci	eor		v7.16b, v7.16b, v0.16b
3938c2ecf20Sopenharmony_ci	eor		v7.16b, v7.16b, v2.16b
3948c2ecf20Sopenharmony_ci
3958c2ecf20Sopenharmony_ci.Lreduce_final_16_bytes_\@:
3968c2ecf20Sopenharmony_ci	// Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC.
3978c2ecf20Sopenharmony_ci
3988c2ecf20Sopenharmony_ci	movi		v2.16b, #0		// init zero register
3998c2ecf20Sopenharmony_ci
4008c2ecf20Sopenharmony_ci	// Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
4018c2ecf20Sopenharmony_ci	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
4028c2ecf20Sopenharmony_ci	__pmull_pre_\p	fold_consts
4038c2ecf20Sopenharmony_ci
4048c2ecf20Sopenharmony_ci	// Fold the high 64 bits into the low 64 bits, while also multiplying by
4058c2ecf20Sopenharmony_ci	// x^64.  This produces a 128-bit value congruent to x^64 * M(x) and
4068c2ecf20Sopenharmony_ci	// whose low 48 bits are 0.
4078c2ecf20Sopenharmony_ci	ext		v0.16b, v2.16b, v7.16b, #8
4088c2ecf20Sopenharmony_ci	__pmull_\p	v7, v7, fold_consts, 2	// high bits * x^48 * (x^80 mod G(x))
4098c2ecf20Sopenharmony_ci	eor		v0.16b, v0.16b, v7.16b	// + low bits * x^64
4108c2ecf20Sopenharmony_ci
4118c2ecf20Sopenharmony_ci	// Fold the high 32 bits into the low 96 bits.  This produces a 96-bit
4128c2ecf20Sopenharmony_ci	// value congruent to x^64 * M(x) and whose low 48 bits are 0.
4138c2ecf20Sopenharmony_ci	ext		v1.16b, v0.16b, v2.16b, #12	// extract high 32 bits
4148c2ecf20Sopenharmony_ci	mov		v0.s[3], v2.s[0]	// zero high 32 bits
4158c2ecf20Sopenharmony_ci	__pmull_\p	v1, v1, fold_consts	// high 32 bits * x^48 * (x^48 mod G(x))
4168c2ecf20Sopenharmony_ci	eor		v0.16b, v0.16b, v1.16b	// + low bits
4178c2ecf20Sopenharmony_ci
4188c2ecf20Sopenharmony_ci	// Load G(x) and floor(x^48 / G(x)).
4198c2ecf20Sopenharmony_ci	ld1		{fold_consts.2d}, [fold_consts_ptr]
4208c2ecf20Sopenharmony_ci	__pmull_pre_\p	fold_consts
4218c2ecf20Sopenharmony_ci
4228c2ecf20Sopenharmony_ci	// Use Barrett reduction to compute the final CRC value.
4238c2ecf20Sopenharmony_ci	__pmull_\p	v1, v0, fold_consts, 2	// high 32 bits * floor(x^48 / G(x))
4248c2ecf20Sopenharmony_ci	ushr		v1.2d, v1.2d, #32	// /= x^32
4258c2ecf20Sopenharmony_ci	__pmull_\p	v1, v1, fold_consts	// *= G(x)
4268c2ecf20Sopenharmony_ci	ushr		v0.2d, v0.2d, #48
4278c2ecf20Sopenharmony_ci	eor		v0.16b, v0.16b, v1.16b	// + low 16 nonzero bits
4288c2ecf20Sopenharmony_ci	// Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.
4298c2ecf20Sopenharmony_ci
4308c2ecf20Sopenharmony_ci	umov		w0, v0.h[0]
4318c2ecf20Sopenharmony_ci	.ifc		\p, p8
4328c2ecf20Sopenharmony_ci	ldp		x29, x30, [sp], #16
4338c2ecf20Sopenharmony_ci	.endif
4348c2ecf20Sopenharmony_ci	ret
4358c2ecf20Sopenharmony_ci
4368c2ecf20Sopenharmony_ci.Lless_than_256_bytes_\@:
4378c2ecf20Sopenharmony_ci	// Checksumming a buffer of length 16...255 bytes
4388c2ecf20Sopenharmony_ci
4398c2ecf20Sopenharmony_ci	adr_l		fold_consts_ptr, .Lfold_across_16_bytes_consts
4408c2ecf20Sopenharmony_ci
4418c2ecf20Sopenharmony_ci	// Load the first 16 data bytes.
4428c2ecf20Sopenharmony_ci	ldr		q7, [buf], #0x10
4438c2ecf20Sopenharmony_ciCPU_LE(	rev64		v7.16b, v7.16b			)
4448c2ecf20Sopenharmony_ciCPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)
4458c2ecf20Sopenharmony_ci
4468c2ecf20Sopenharmony_ci	// XOR the first 16 data *bits* with the initial CRC value.
4478c2ecf20Sopenharmony_ci	movi		v0.16b, #0
4488c2ecf20Sopenharmony_ci	mov		v0.h[7], init_crc
4498c2ecf20Sopenharmony_ci	eor		v7.16b, v7.16b, v0.16b
4508c2ecf20Sopenharmony_ci
4518c2ecf20Sopenharmony_ci	// Load the fold-across-16-bytes constants.
4528c2ecf20Sopenharmony_ci	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
4538c2ecf20Sopenharmony_ci	__pmull_pre_\p	fold_consts
4548c2ecf20Sopenharmony_ci
4558c2ecf20Sopenharmony_ci	cmp		len, #16
4568c2ecf20Sopenharmony_ci	b.eq		.Lreduce_final_16_bytes_\@	// len == 16
4578c2ecf20Sopenharmony_ci	subs		len, len, #32
4588c2ecf20Sopenharmony_ci	b.ge		.Lfold_16_bytes_loop_\@		// 32 <= len <= 255
4598c2ecf20Sopenharmony_ci	add		len, len, #16
4608c2ecf20Sopenharmony_ci	b		.Lhandle_partial_segment_\@	// 17 <= len <= 31
4618c2ecf20Sopenharmony_ci	.endm
4628c2ecf20Sopenharmony_ci
//
// u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len);
//
// Assumes len >= 16.
//
// Variant built on the 8-bit polynomial multiply.  The p8 multiply macro
// calls __pmull_p8_core with 'bl', which overwrites x30, so a frame record is
// pushed here.  The matching pop and the 'ret' are emitted by the
// crc_t10dif_pmull macro itself.
//
SYM_FUNC_START(crc_t10dif_pmull_p8)
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp
	crc_t10dif_pmull p8
SYM_FUNC_END(crc_t10dif_pmull_p8)

4748c2ecf20Sopenharmony_ci	.align		5
4758c2ecf20Sopenharmony_ci//
4768c2ecf20Sopenharmony_ci// u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
4778c2ecf20Sopenharmony_ci//
4788c2ecf20Sopenharmony_ci// Assumes len >= 16.
4798c2ecf20Sopenharmony_ci//
4808c2ecf20Sopenharmony_ciSYM_FUNC_START(crc_t10dif_pmull_p64)
4818c2ecf20Sopenharmony_ci	crc_t10dif_pmull	p64
4828c2ecf20Sopenharmony_ciSYM_FUNC_END(crc_t10dif_pmull_p64)
4838c2ecf20Sopenharmony_ci
4848c2ecf20Sopenharmony_ci	.section	".rodata", "a"
4858c2ecf20Sopenharmony_ci	.align		4
4868c2ecf20Sopenharmony_ci
4878c2ecf20Sopenharmony_ci// Fold constants precomputed from the polynomial 0x18bb7
4888c2ecf20Sopenharmony_ci// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
4898c2ecf20Sopenharmony_ci.Lfold_across_128_bytes_consts:
4908c2ecf20Sopenharmony_ci	.quad		0x0000000000006123	// x^(8*128)	mod G(x)
4918c2ecf20Sopenharmony_ci	.quad		0x0000000000002295	// x^(8*128+64)	mod G(x)
4928c2ecf20Sopenharmony_ci// .Lfold_across_64_bytes_consts:
4938c2ecf20Sopenharmony_ci	.quad		0x0000000000001069	// x^(4*128)	mod G(x)
4948c2ecf20Sopenharmony_ci	.quad		0x000000000000dd31	// x^(4*128+64)	mod G(x)
4958c2ecf20Sopenharmony_ci// .Lfold_across_32_bytes_consts:
4968c2ecf20Sopenharmony_ci	.quad		0x000000000000857d	// x^(2*128)	mod G(x)
4978c2ecf20Sopenharmony_ci	.quad		0x0000000000007acc	// x^(2*128+64)	mod G(x)
4988c2ecf20Sopenharmony_ci.Lfold_across_16_bytes_consts:
4998c2ecf20Sopenharmony_ci	.quad		0x000000000000a010	// x^(1*128)	mod G(x)
5008c2ecf20Sopenharmony_ci	.quad		0x0000000000001faa	// x^(1*128+64)	mod G(x)
5018c2ecf20Sopenharmony_ci// .Lfinal_fold_consts:
5028c2ecf20Sopenharmony_ci	.quad		0x1368000000000000	// x^48 * (x^48 mod G(x))
5038c2ecf20Sopenharmony_ci	.quad		0x2d56000000000000	// x^48 * (x^80 mod G(x))
5048c2ecf20Sopenharmony_ci// .Lbarrett_reduction_consts:
5058c2ecf20Sopenharmony_ci	.quad		0x0000000000018bb7	// G(x)
5068c2ecf20Sopenharmony_ci	.quad		0x00000001f65a57f8	// floor(x^48 / G(x))
5078c2ecf20Sopenharmony_ci
// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 -
// len] is the index vector to shift left by 'len' bytes, and is also {0x80,
// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes.
//
// Indices with bit 7 set are out of range for a one-register tbl and so
// select zero, which is what fills the vacated bytes.
.Lbyteshift_table:
	.byte		 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
	.byte		0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0x0