arm64/crypto/crct10dif-ce-core.S

8c2ecf20Sopenharmony_ci//
8c2ecf20Sopenharmony_ci// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
8c2ecf20Sopenharmony_ci//
8c2ecf20Sopenharmony_ci// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
8c2ecf20Sopenharmony_ci// Copyright (C) 2019 Google LLC <ebiggers@google.com>
8c2ecf20Sopenharmony_ci//
8c2ecf20Sopenharmony_ci// This program is free software; you can redistribute it and/or modify
8c2ecf20Sopenharmony_ci// it under the terms of the GNU General Public License version 2 as
8c2ecf20Sopenharmony_ci// published by the Free Software Foundation.
8c2ecf20Sopenharmony_ci//
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ci// Derived from the x86 version:
8c2ecf20Sopenharmony_ci//
8c2ecf20Sopenharmony_ci// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
8c2ecf20Sopenharmony_ci//
8c2ecf20Sopenharmony_ci// Copyright (c) 2013, Intel Corporation
8c2ecf20Sopenharmony_ci//
8c2ecf20Sopenharmony_ci// Authors:
8c2ecf20Sopenharmony_ci//     Erdinc Ozturk <erdinc.ozturk@intel.com>
8c2ecf20Sopenharmony_ci//     Vinodh Gopal <vinodh.gopal@intel.com>
8c2ecf20Sopenharmony_ci//     James Guilford <james.guilford@intel.com>
8c2ecf20Sopenharmony_ci//     Tim Chen <tim.c.chen@linux.intel.com>
8c2ecf20Sopenharmony_ci//
8c2ecf20Sopenharmony_ci// This software is available to you under a choice of one of two
8c2ecf20Sopenharmony_ci// licenses.  You may choose to be licensed under the terms of the GNU
8c2ecf20Sopenharmony_ci// General Public License (GPL) Version 2, available from the file
8c2ecf20Sopenharmony_ci// COPYING in the main directory of this source tree, or the
8c2ecf20Sopenharmony_ci// OpenIB.org BSD license below:
8c2ecf20Sopenharmony_ci//
8c2ecf20Sopenharmony_ci// Redistribution and use in source and binary forms, with or without
8c2ecf20Sopenharmony_ci// modification, are permitted provided that the following conditions are
8c2ecf20Sopenharmony_ci// met:
8c2ecf20Sopenharmony_ci//
8c2ecf20Sopenharmony_ci// * Redistributions of source code must retain the above copyright
8c2ecf20Sopenharmony_ci//   notice, this list of conditions and the following disclaimer.
8c2ecf20Sopenharmony_ci//
8c2ecf20Sopenharmony_ci// * Redistributions in binary form must reproduce the above copyright
8c2ecf20Sopenharmony_ci//   notice, this list of conditions and the following disclaimer in the
8c2ecf20Sopenharmony_ci//   documentation and/or other materials provided with the
8c2ecf20Sopenharmony_ci//   distribution.
8c2ecf20Sopenharmony_ci//
8c2ecf20Sopenharmony_ci// * Neither the name of the Intel Corporation nor the names of its
8c2ecf20Sopenharmony_ci//   contributors may be used to endorse or promote products derived from
8c2ecf20Sopenharmony_ci//   this software without specific prior written permission.
8c2ecf20Sopenharmony_ci//
8c2ecf20Sopenharmony_ci//
8c2ecf20Sopenharmony_ci// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
8c2ecf20Sopenharmony_ci// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
8c2ecf20Sopenharmony_ci// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
8c2ecf20Sopenharmony_ci// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
8c2ecf20Sopenharmony_ci// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
8c2ecf20Sopenharmony_ci// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
8c2ecf20Sopenharmony_ci// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
8c2ecf20Sopenharmony_ci// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
8c2ecf20Sopenharmony_ci// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
8c2ecf20Sopenharmony_ci// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
8c2ecf20Sopenharmony_ci// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
8c2ecf20Sopenharmony_ci//
8c2ecf20Sopenharmony_ci//       Reference paper titled "Fast CRC Computation for Generic
8c2ecf20Sopenharmony_ci//	Polynomials Using PCLMULQDQ Instruction"
8c2ecf20Sopenharmony_ci//       URL: http://www.intel.com/content/dam/www/public/us/en/documents
8c2ecf20Sopenharmony_ci//  /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
8c2ecf20Sopenharmony_ci//
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ci#include <linux/linkage.h>
8c2ecf20Sopenharmony_ci#include <asm/assembler.h>
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ci	.text
8c2ecf20Sopenharmony_ci	.arch		armv8-a+crypto
8c2ecf20Sopenharmony_ci
// Register aliases used by every macro and function in this file.
//
// init_crc, buf and len are w0, x1 and x2, i.e. the first three argument
// registers, in the order of the C prototypes given in the comments above
// the two exported entry points.
//
// NOTE(review): this file writes v8-v12, v14 and v15 without saving them,
// although AAPCS64 treats the low 64 bits of v8-v15 as callee-saved.
// Presumably the callers bracket the call with kernel_neon_begin()/
// kernel_neon_end() -- confirm against the glue code.
8c2ecf20Sopenharmony_ci	init_crc	.req	w0
8c2ecf20Sopenharmony_ci	buf		.req	x1
8c2ecf20Sopenharmony_ci	len		.req	x2
	// Cursor into the constant table in .rodata.
8c2ecf20Sopenharmony_ci	fold_consts_ptr	.req	x3
8c2ecf20Sopenharmony_ci
	// The pair of 64-bit constants most recently loaded via fold_consts_ptr.
8c2ecf20Sopenharmony_ci	fold_consts	.req	v10
8c2ecf20Sopenharmony_ci
	// Everything below is only referenced by the p8 code path.
	// ad: copy of operand A, written by __pmull_p8 and read by
	// __pmull_p8_core.
8c2ecf20Sopenharmony_ci	ad		.req	v14
8c2ecf20Sopenharmony_ci
	// Bit masks built by __pmull_init_p8.
8c2ecf20Sopenharmony_ci	k00_16		.req	v15
8c2ecf20Sopenharmony_ci	k32_48		.req	v16
8c2ecf20Sopenharmony_ci
	// Temporaries of __pmull_p8_core; t4 and t6 carry its results.
8c2ecf20Sopenharmony_ci	t3		.req	v17
8c2ecf20Sopenharmony_ci	t4		.req	v18
8c2ecf20Sopenharmony_ci	t5		.req	v19
8c2ecf20Sopenharmony_ci	t6		.req	v20
8c2ecf20Sopenharmony_ci	t7		.req	v21
8c2ecf20Sopenharmony_ci	t8		.req	v22
8c2ecf20Sopenharmony_ci	t9		.req	v23
8c2ecf20Sopenharmony_ci
	// tbl index vectors built by __pmull_init_p8.
8c2ecf20Sopenharmony_ci	perm1		.req	v24
8c2ecf20Sopenharmony_ci	perm2		.req	v25
8c2ecf20Sopenharmony_ci	perm3		.req	v26
8c2ecf20Sopenharmony_ci	perm4		.req	v27
8c2ecf20Sopenharmony_ci
	// Operand B permuted by perm1..perm4, written by __pmull_pre_p8.
8c2ecf20Sopenharmony_ci	bd1		.req	v28
8c2ecf20Sopenharmony_ci	bd2		.req	v29
8c2ecf20Sopenharmony_ci	bd3		.req	v30
8c2ecf20Sopenharmony_ci	bd4		.req	v31
8c2ecf20Sopenharmony_ci
// __pmull_init_p64 - intentionally empty.  It exists so that
// crc_t10dif_pmull can invoke __pmull_init_\p for either value of \p; the
// p64 path needs no masks or permutation vectors.
8c2ecf20Sopenharmony_ci	.macro		__pmull_init_p64
8c2ecf20Sopenharmony_ci	.endm
8c2ecf20Sopenharmony_ci
// __pmull_pre_p64 bd - intentionally empty.  It exists so that
// __pmull_pre_\p can be invoked after every load of fold_consts for either
// value of \p; the p64 path uses the constants as loaded.
8c2ecf20Sopenharmony_ci	.macro		__pmull_pre_p64, bd
8c2ecf20Sopenharmony_ci	.endm
8c2ecf20Sopenharmony_ci
// __pmull_init_p8 - one-time setup for the p8 code path.
//
// Builds the two masks (k00_16, k32_48) and the four tbl index vectors
// (perm1..perm4) that __pmull_pre_p8 and __pmull_p8_core read.
// Clobbers x5.
//
// Used as a tbl index, permN makes result byte i of each 64-bit half equal
// to source byte (i + N) mod 8 of the same half.  For example the index
// bytes of perm1 are 1,2,3,4,5,6,7,0 in the low half and
// 9,10,11,12,13,14,15,8 in the high half.
8c2ecf20Sopenharmony_ci	.macro		__pmull_init_p8
8c2ecf20Sopenharmony_ci	// k00_16 := 0x0000000000000000_000000000000ffff
8c2ecf20Sopenharmony_ci	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
8c2ecf20Sopenharmony_ci	movi		k32_48.2d, #0xffffffff
8c2ecf20Sopenharmony_ci	mov		k32_48.h[2], k32_48.h[0]
8c2ecf20Sopenharmony_ci	ushr		k00_16.2d, k32_48.2d, #32
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ci	// prepare the permutation vectors
	// x5 holds index bytes 9,10,...,15,8 (lowest byte first).  perm4 is
	// used as scratch for the moment: 8 in each of its low eight bytes and
	// zero in the high eight, so the eor below turns the low half of
	// perm1 into 1,2,...,7,0 and leaves the high half as 9,10,...,15,8.
8c2ecf20Sopenharmony_ci	mov_q		x5, 0x080f0e0d0c0b0a09
8c2ecf20Sopenharmony_ci	movi		perm4.8b, #8
8c2ecf20Sopenharmony_ci	dup		perm1.2d, x5
8c2ecf20Sopenharmony_ci	eor		perm1.16b, perm1.16b, perm4.16b
	// Each ushr/sli pair rotates both 64-bit lanes of perm1 right by
	// 8, 16 and 24 bits, giving the index vectors for N = 2, 3 and 4.
8c2ecf20Sopenharmony_ci	ushr		perm2.2d, perm1.2d, #8
8c2ecf20Sopenharmony_ci	ushr		perm3.2d, perm1.2d, #16
8c2ecf20Sopenharmony_ci	ushr		perm4.2d, perm1.2d, #24
8c2ecf20Sopenharmony_ci	sli		perm2.2d, perm1.2d, #56
8c2ecf20Sopenharmony_ci	sli		perm3.2d, perm1.2d, #48
8c2ecf20Sopenharmony_ci	sli		perm4.2d, perm1.2d, #40
8c2ecf20Sopenharmony_ci	.endm
8c2ecf20Sopenharmony_ci
// __pmull_pre_p8 bd - precompute the permuted copies of operand B.
//
// Writes bd1..bd4 with \bd permuted by perm1..perm4.  It is invoked after
// each load of fold_consts, because __pmull_p8_core reads bd1..bd4 rather
// than deriving them on every multiplication.  Requires perm1..perm4 to
// have been set up by __pmull_init_p8.
8c2ecf20Sopenharmony_ci	.macro		__pmull_pre_p8, bd
8c2ecf20Sopenharmony_ci	tbl		bd1.16b, {\bd\().16b}, perm1.16b
8c2ecf20Sopenharmony_ci	tbl		bd2.16b, {\bd\().16b}, perm2.16b
8c2ecf20Sopenharmony_ci	tbl		bd3.16b, {\bd\().16b}, perm3.16b
8c2ecf20Sopenharmony_ci	tbl		bd4.16b, {\bd\().16b}, perm4.16b
8c2ecf20Sopenharmony_ci	.endm
8c2ecf20Sopenharmony_ci
// __pmull_p8_core - helper reached via 'bl' from the __pmull_p8 macro.
//
// It multiplies byte-rotated copies of A by B, and A by byte-rotated copies
// of B, using 8-bit polynomial multiplies, then masks, realigns and
// combines the partial products.
//
// Inputs:
//   ad			copy of operand A (written by __pmull_p8)
//   fold_consts	operand B
//   bd1..bd4		B permuted by perm1..perm4 (written by __pmull_pre_p8)
//   perm1..perm3, k32_48, k00_16
//			set up by __pmull_init_p8
// Outputs:
//   t4, t6		__pmull_p8 XORs both into its own D = A*B
// Clobbers:
//   t3..t9 (v17..v23)
//
// There are two entry labels.  .L__pmull_p8_core works on the low 64 bits
// of the operands (ext on .8b views, pmull).  .L__pmull_p8_core2 works on
// the high 64 bits (tbl with perm1..perm3, pmull2).  Both paths join at
// local label 0.
8c2ecf20Sopenharmony_ciSYM_FUNC_START_LOCAL(__pmull_p8_core)
8c2ecf20Sopenharmony_ci.L__pmull_p8_core:
8c2ecf20Sopenharmony_ci	ext		t4.8b, ad.8b, ad.8b, #1			// A1
8c2ecf20Sopenharmony_ci	ext		t5.8b, ad.8b, ad.8b, #2			// A2
8c2ecf20Sopenharmony_ci	ext		t6.8b, ad.8b, ad.8b, #3			// A3
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ci	pmull		t4.8h, t4.8b, fold_consts.8b		// F = A1*B
8c2ecf20Sopenharmony_ci	pmull		t8.8h, ad.8b, bd1.8b			// E = A*B1
8c2ecf20Sopenharmony_ci	pmull		t5.8h, t5.8b, fold_consts.8b		// H = A2*B
8c2ecf20Sopenharmony_ci	pmull		t7.8h, ad.8b, bd2.8b			// G = A*B2
8c2ecf20Sopenharmony_ci	pmull		t6.8h, t6.8b, fold_consts.8b		// J = A3*B
8c2ecf20Sopenharmony_ci	pmull		t9.8h, ad.8b, bd3.8b			// I = A*B3
8c2ecf20Sopenharmony_ci	pmull		t3.8h, ad.8b, bd4.8b			// K = A*B4
	// Skip the high-half variant; the combining code at 0: is shared.
8c2ecf20Sopenharmony_ci	b		0f
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ci.L__pmull_p8_core2:
	// Same rotations as the ext instructions above, but done with tbl so
	// that the high 64 bits (the half pmull2 reads) are rotated in place.
8c2ecf20Sopenharmony_ci	tbl		t4.16b, {ad.16b}, perm1.16b		// A1
8c2ecf20Sopenharmony_ci	tbl		t5.16b, {ad.16b}, perm2.16b		// A2
8c2ecf20Sopenharmony_ci	tbl		t6.16b, {ad.16b}, perm3.16b		// A3
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ci	pmull2		t4.8h, t4.16b, fold_consts.16b		// F = A1*B
8c2ecf20Sopenharmony_ci	pmull2		t8.8h, ad.16b, bd1.16b			// E = A*B1
8c2ecf20Sopenharmony_ci	pmull2		t5.8h, t5.16b, fold_consts.16b		// H = A2*B
8c2ecf20Sopenharmony_ci	pmull2		t7.8h, ad.16b, bd2.16b			// G = A*B2
8c2ecf20Sopenharmony_ci	pmull2		t6.8h, t6.16b, fold_consts.16b		// J = A3*B
8c2ecf20Sopenharmony_ci	pmull2		t9.8h, ad.16b, bd3.16b			// I = A*B3
8c2ecf20Sopenharmony_ci	pmull2		t3.8h, ad.16b, bd4.16b			// K = A*B4
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ci0:	eor		t4.16b, t4.16b, t8.16b			// L = E + F
8c2ecf20Sopenharmony_ci	eor		t5.16b, t5.16b, t7.16b			// M = G + H
8c2ecf20Sopenharmony_ci	eor		t6.16b, t6.16b, t9.16b			// N = I + J
8c2ecf20Sopenharmony_ci
	// Regroup so that the low 64-bit lanes of L and M end up in t8 and
	// their high lanes in t4; likewise N and K go to t7 (low) and t6
	// (high).  The zip instructions further down undo this regrouping.
8c2ecf20Sopenharmony_ci	uzp1		t8.2d, t4.2d, t5.2d
8c2ecf20Sopenharmony_ci	uzp2		t4.2d, t4.2d, t5.2d
8c2ecf20Sopenharmony_ci	uzp1		t7.2d, t6.2d, t3.2d
8c2ecf20Sopenharmony_ci	uzp2		t6.2d, t6.2d, t3.2d
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ci	// t4 = (L) (P0 + P1) << 8
8c2ecf20Sopenharmony_ci	// t5 = (M) (P2 + P3) << 16
8c2ecf20Sopenharmony_ci	eor		t8.16b, t8.16b, t4.16b
8c2ecf20Sopenharmony_ci	and		t4.16b, t4.16b, k32_48.16b
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ci	// t6 = (N) (P4 + P5) << 24
8c2ecf20Sopenharmony_ci	// t7 = (K) (P6 + P7) << 32
8c2ecf20Sopenharmony_ci	eor		t7.16b, t7.16b, t6.16b
8c2ecf20Sopenharmony_ci	and		t6.16b, t6.16b, k00_16.16b
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ci	eor		t8.16b, t8.16b, t4.16b
8c2ecf20Sopenharmony_ci	eor		t7.16b, t7.16b, t6.16b
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ci	zip2		t5.2d, t8.2d, t4.2d
8c2ecf20Sopenharmony_ci	zip1		t4.2d, t8.2d, t4.2d
8c2ecf20Sopenharmony_ci	zip2		t3.2d, t7.2d, t6.2d
8c2ecf20Sopenharmony_ci	zip1		t6.2d, t7.2d, t6.2d
8c2ecf20Sopenharmony_ci
	// Byte-rotate each term by 15, 14, 13 and 12 bytes before summing.
8c2ecf20Sopenharmony_ci	ext		t4.16b, t4.16b, t4.16b, #15
8c2ecf20Sopenharmony_ci	ext		t5.16b, t5.16b, t5.16b, #14
8c2ecf20Sopenharmony_ci	ext		t6.16b, t6.16b, t6.16b, #13
8c2ecf20Sopenharmony_ci	ext		t3.16b, t3.16b, t3.16b, #12
8c2ecf20Sopenharmony_ci
	// Leave the result as two terms, t4 and t6, for the caller to XOR in.
8c2ecf20Sopenharmony_ci	eor		t4.16b, t4.16b, t5.16b
8c2ecf20Sopenharmony_ci	eor		t6.16b, t6.16b, t3.16b
8c2ecf20Sopenharmony_ci	ret
8c2ecf20Sopenharmony_ciSYM_FUNC_END(__pmull_p8_core)
8c2ecf20Sopenharmony_ci
// __pmull_p8 rq, ad, bd[, i] - p8 counterpart of __pmull_p64, selected
// through __pmull_\p.
//
//   rq	destination vector register
//   ad	vector register holding operand A
//   bd	operand B; must be spelled exactly 'fold_consts', anything else
//	stops the assembly with .err, because __pmull_p8_core reads the
//	fixed registers fold_consts and bd1..bd4
//   i	blank: use the low 64 bits of the operands (pmull, entry label
//	.L__pmull_p8_core); '2': use the high 64 bits (pmull2, entry label
//	.L__pmull_p8_core2)
//
// Clobbers v14 (the 'ad' alias), t3..t9 and x30 (through bl), so the
// enclosing function must have saved its return address.
8c2ecf20Sopenharmony_ci	.macro		__pmull_p8, rq, ad, bd, i
8c2ecf20Sopenharmony_ci	.ifnc		\bd, fold_consts
8c2ecf20Sopenharmony_ci	.err
8c2ecf20Sopenharmony_ci	.endif
	// Bare 'ad' is the v14 register alias; '\ad' is the macro argument.
	// The copy is taken before \rq is written because call sites pass the
	// same register for \rq and \ad (e.g. fold_32_bytes).
8c2ecf20Sopenharmony_ci	mov		ad.16b, \ad\().16b
8c2ecf20Sopenharmony_ci	.ifb		\i
8c2ecf20Sopenharmony_ci	pmull		\rq\().8h, \ad\().8b, \bd\().8b		// D = A*B
8c2ecf20Sopenharmony_ci	.else
8c2ecf20Sopenharmony_ci	pmull2		\rq\().8h, \ad\().16b, \bd\().16b	// D = A*B
8c2ecf20Sopenharmony_ci	.endif
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ci	bl		.L__pmull_p8_core\i
8c2ecf20Sopenharmony_ci
	// Add the two terms returned by the core to D.
8c2ecf20Sopenharmony_ci	eor		\rq\().16b, \rq\().16b, t4.16b
8c2ecf20Sopenharmony_ci	eor		\rq\().16b, \rq\().16b, t6.16b
8c2ecf20Sopenharmony_ci	.endm
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ci	// Fold reg1, reg2 into the next 32 data bytes, storing the result back
8c2ecf20Sopenharmony_ci	// into reg1, reg2.
	//
	//   p		'p64' or 'p8'; selects __pmull_\p
	//   reg1, reg2	vector registers holding two 16-byte accumulators
	//
	// Loads 32 bytes from [buf] and advances buf by 32.  Each accumulator
	// is replaced by
	//   (its high 64 bits * high half of fold_consts)
	//   + (its low 64 bits * low half of fold_consts)
	//   + the corresponding 16 new data bytes
	// where '+' is XOR.  Clobbers v8, v9, v11 and v12, plus whatever
	// __pmull_\p clobbers.
	//
	// The byte reversal of the new data (rev64 reverses the bytes inside
	// each 64-bit lane, ext #8 then swaps the two lanes) is interleaved
	// with the multiplications rather than done straight after the load.
	// CPU_LE() comes from <asm/assembler.h>; presumably it emits its
	// argument only on little-endian kernels -- confirm there.
8c2ecf20Sopenharmony_ci	.macro		fold_32_bytes, p, reg1, reg2
8c2ecf20Sopenharmony_ci	ldp		q11, q12, [buf], #0x20
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ci	__pmull_\p	v8, \reg1, fold_consts, 2
8c2ecf20Sopenharmony_ci	__pmull_\p	\reg1, \reg1, fold_consts
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ciCPU_LE(	rev64		v11.16b, v11.16b		)
8c2ecf20Sopenharmony_ciCPU_LE(	rev64		v12.16b, v12.16b		)
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ci	__pmull_\p	v9, \reg2, fold_consts, 2
8c2ecf20Sopenharmony_ci	__pmull_\p	\reg2, \reg2, fold_consts
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ciCPU_LE(	ext		v11.16b, v11.16b, v11.16b, #8	)
8c2ecf20Sopenharmony_ciCPU_LE(	ext		v12.16b, v12.16b, v12.16b, #8	)
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ci	eor		\reg1\().16b, \reg1\().16b, v8.16b
8c2ecf20Sopenharmony_ci	eor		\reg2\().16b, \reg2\().16b, v9.16b
8c2ecf20Sopenharmony_ci	eor		\reg1\().16b, \reg1\().16b, v11.16b
8c2ecf20Sopenharmony_ci	eor		\reg2\().16b, \reg2\().16b, v12.16b
8c2ecf20Sopenharmony_ci	.endm
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ci	// Fold src_reg into dst_reg, optionally loading the next fold constants
	//
	//   p			'p64' or 'p8'; selects __pmull_\p
	//   src_reg		16-byte value to fold; overwritten
	//   dst_reg		accumulator that receives the folded value
	//   load_next_consts	if non-blank, load the next 16 bytes of
	//			constants from [fold_consts_ptr] (advancing it
	//			by 16) and run __pmull_pre_\p on them
	//
	// The next constants are loaded only after both multiplications have
	// used the current ones.  Clobbers v8.
8c2ecf20Sopenharmony_ci	.macro		fold_16_bytes, p, src_reg, dst_reg, load_next_consts
8c2ecf20Sopenharmony_ci	__pmull_\p	v8, \src_reg, fold_consts
8c2ecf20Sopenharmony_ci	__pmull_\p	\src_reg, \src_reg, fold_consts, 2
8c2ecf20Sopenharmony_ci	.ifnb		\load_next_consts
8c2ecf20Sopenharmony_ci	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
8c2ecf20Sopenharmony_ci	__pmull_pre_\p	fold_consts
8c2ecf20Sopenharmony_ci	.endif
8c2ecf20Sopenharmony_ci	eor		\dst_reg\().16b, \dst_reg\().16b, v8.16b
8c2ecf20Sopenharmony_ci	eor		\dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b
8c2ecf20Sopenharmony_ci	.endm
8c2ecf20Sopenharmony_ci
// __pmull_p64 rd, rn, rm[, n] - 64x64 -> 128 bit polynomial multiplication
// with a single instruction.
//
//   rd		destination vector register (written as one 128-bit value)
//   rn, rm	source vector registers
//   n		blank: multiply the low 64-bit lanes (pmull); non-blank:
//		multiply the high 64-bit lanes (pmull2)
//
// The .1q/.1d/.2d forms used here are why the file selects
// '.arch armv8-a+crypto'.
8c2ecf20Sopenharmony_ci	.macro		__pmull_p64, rd, rn, rm, n
8c2ecf20Sopenharmony_ci	.ifb		\n
8c2ecf20Sopenharmony_ci	pmull		\rd\().1q, \rn\().1d, \rm\().1d
8c2ecf20Sopenharmony_ci	.else
8c2ecf20Sopenharmony_ci	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
8c2ecf20Sopenharmony_ci	.endif
8c2ecf20Sopenharmony_ci	.endm
8c2ecf20Sopenharmony_ci
// crc_t10dif_pmull p - function body shared by the two exported entry
// points.
//
//   p	'p64' or 'p8'; selects the __pmull_\p, __pmull_init_\p and
//	__pmull_pre_\p macros.
//
// On entry: init_crc (w0), buf (x1) and len (x2) hold the arguments.  The
//	     comments on both entry points state that len >= 16 is assumed;
//	     nothing here checks it.
// On exit:  w0 holds the CRC, taken from v0.h[0].  The macro emits the
//	     final 'ret' itself and, for p8 only, first pops the x29/x30
//	     pair that crc_t10dif_pmull_p8 pushed.
//
// Writes x3 (fold_consts_ptr), x4, the flags and v0-v12; __pmull_\p and
// __pmull_init_\p add their own clobbers for p8.
//
// Labels carry a '\@' suffix so that the two expansions of this macro do
// not define the same label twice.
//
// len is handled with signed conditions (b.lt / b.ge) throughout and is
// deliberately driven negative by the "extra" subtractions described
// below.
8c2ecf20Sopenharmony_ci	.macro		crc_t10dif_pmull, p
8c2ecf20Sopenharmony_ci	__pmull_init_\p
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ci	// For sizes less than 256 bytes, we can't fold 128 bytes at a time.
8c2ecf20Sopenharmony_ci	cmp		len, #256
8c2ecf20Sopenharmony_ci	b.lt		.Lless_than_256_bytes_\@
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ci	adr_l		fold_consts_ptr, .Lfold_across_128_bytes_consts
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ci	// Load the first 128 data bytes.  Byte swapping is necessary to make
8c2ecf20Sopenharmony_ci	// the bit order match the polynomial coefficient order.
8c2ecf20Sopenharmony_ci	ldp		q0, q1, [buf]
8c2ecf20Sopenharmony_ci	ldp		q2, q3, [buf, #0x20]
8c2ecf20Sopenharmony_ci	ldp		q4, q5, [buf, #0x40]
8c2ecf20Sopenharmony_ci	ldp		q6, q7, [buf, #0x60]
8c2ecf20Sopenharmony_ci	add		buf, buf, #0x80
8c2ecf20Sopenharmony_ciCPU_LE(	rev64		v0.16b, v0.16b			)
8c2ecf20Sopenharmony_ciCPU_LE(	rev64		v1.16b, v1.16b			)
8c2ecf20Sopenharmony_ciCPU_LE(	rev64		v2.16b, v2.16b			)
8c2ecf20Sopenharmony_ciCPU_LE(	rev64		v3.16b, v3.16b			)
8c2ecf20Sopenharmony_ciCPU_LE(	rev64		v4.16b, v4.16b			)
8c2ecf20Sopenharmony_ciCPU_LE(	rev64		v5.16b, v5.16b			)
8c2ecf20Sopenharmony_ciCPU_LE(	rev64		v6.16b, v6.16b			)
8c2ecf20Sopenharmony_ciCPU_LE(	rev64		v7.16b, v7.16b			)
8c2ecf20Sopenharmony_ciCPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
8c2ecf20Sopenharmony_ciCPU_LE(	ext		v1.16b, v1.16b, v1.16b, #8	)
8c2ecf20Sopenharmony_ciCPU_LE(	ext		v2.16b, v2.16b, v2.16b, #8	)
8c2ecf20Sopenharmony_ciCPU_LE(	ext		v3.16b, v3.16b, v3.16b, #8	)
8c2ecf20Sopenharmony_ciCPU_LE(	ext		v4.16b, v4.16b, v4.16b, #8	)
8c2ecf20Sopenharmony_ciCPU_LE(	ext		v5.16b, v5.16b, v5.16b, #8	)
8c2ecf20Sopenharmony_ciCPU_LE(	ext		v6.16b, v6.16b, v6.16b, #8	)
8c2ecf20Sopenharmony_ciCPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ci	// XOR the first 16 data *bits* with the initial CRC value.
8c2ecf20Sopenharmony_ci	movi		v8.16b, #0
8c2ecf20Sopenharmony_ci	mov		v8.h[7], init_crc
8c2ecf20Sopenharmony_ci	eor		v0.16b, v0.16b, v8.16b
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ci	// Load the constants for folding across 128 bytes.
	// (No post-increment here: fold_consts_ptr keeps pointing at these
	// constants while the loop below runs.)
8c2ecf20Sopenharmony_ci	ld1		{fold_consts.2d}, [fold_consts_ptr]
8c2ecf20Sopenharmony_ci	__pmull_pre_\p	fold_consts
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ci	// Subtract 128 for the 128 data bytes just consumed.  Subtract another
8c2ecf20Sopenharmony_ci	// 128 to simplify the termination condition of the following loop.
8c2ecf20Sopenharmony_ci	sub		len, len, #256
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ci	// While >= 128 data bytes remain (not counting v0-v7), fold the 128
8c2ecf20Sopenharmony_ci	// bytes v0-v7 into them, storing the result back into v0-v7.
8c2ecf20Sopenharmony_ci.Lfold_128_bytes_loop_\@:
8c2ecf20Sopenharmony_ci	fold_32_bytes	\p, v0, v1
8c2ecf20Sopenharmony_ci	fold_32_bytes	\p, v2, v3
8c2ecf20Sopenharmony_ci	fold_32_bytes	\p, v4, v5
8c2ecf20Sopenharmony_ci	fold_32_bytes	\p, v6, v7
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ci	subs		len, len, #128
8c2ecf20Sopenharmony_ci	b.ge		.Lfold_128_bytes_loop_\@
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ci	// Now fold the 112 bytes in v0-v6 into the 16 bytes in v7.
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ci	// Fold across 64 bytes.
	// Step over the 128-byte constants first; from here on the constant
	// table is consumed in order with post-incrementing loads.
8c2ecf20Sopenharmony_ci	add		fold_consts_ptr, fold_consts_ptr, #16
8c2ecf20Sopenharmony_ci	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
8c2ecf20Sopenharmony_ci	__pmull_pre_\p	fold_consts
8c2ecf20Sopenharmony_ci	fold_16_bytes	\p, v0, v4
8c2ecf20Sopenharmony_ci	fold_16_bytes	\p, v1, v5
8c2ecf20Sopenharmony_ci	fold_16_bytes	\p, v2, v6
8c2ecf20Sopenharmony_ci	fold_16_bytes	\p, v3, v7, 1
8c2ecf20Sopenharmony_ci	// Fold across 32 bytes.
8c2ecf20Sopenharmony_ci	fold_16_bytes	\p, v4, v6
8c2ecf20Sopenharmony_ci	fold_16_bytes	\p, v5, v7, 1
8c2ecf20Sopenharmony_ci	// Fold across 16 bytes.
8c2ecf20Sopenharmony_ci	fold_16_bytes	\p, v6, v7
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ci	// Add 128 to get the correct number of data bytes remaining in 0...127
8c2ecf20Sopenharmony_ci	// (not counting v7), following the previous extra subtraction by 128.
8c2ecf20Sopenharmony_ci	// Then subtract 16 to simplify the termination condition of the
8c2ecf20Sopenharmony_ci	// following loop.
8c2ecf20Sopenharmony_ci	adds		len, len, #(128-16)
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ci	// While >= 16 data bytes remain (not counting v7), fold the 16 bytes v7
8c2ecf20Sopenharmony_ci	// into them, storing the result back into v7.
8c2ecf20Sopenharmony_ci	b.lt		.Lfold_16_bytes_loop_done_\@
8c2ecf20Sopenharmony_ci.Lfold_16_bytes_loop_\@:
8c2ecf20Sopenharmony_ci	__pmull_\p	v8, v7, fold_consts
8c2ecf20Sopenharmony_ci	__pmull_\p	v7, v7, fold_consts, 2
8c2ecf20Sopenharmony_ci	eor		v7.16b, v7.16b, v8.16b
8c2ecf20Sopenharmony_ci	ldr		q0, [buf], #16
8c2ecf20Sopenharmony_ciCPU_LE(	rev64		v0.16b, v0.16b			)
8c2ecf20Sopenharmony_ciCPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
8c2ecf20Sopenharmony_ci	eor		v7.16b, v7.16b, v0.16b
8c2ecf20Sopenharmony_ci	subs		len, len, #16
8c2ecf20Sopenharmony_ci	b.ge		.Lfold_16_bytes_loop_\@
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ci.Lfold_16_bytes_loop_done_\@:
8c2ecf20Sopenharmony_ci	// Add 16 to get the correct number of data bytes remaining in 0...15
8c2ecf20Sopenharmony_ci	// (not counting v7), following the previous extra subtraction by 16.
8c2ecf20Sopenharmony_ci	adds		len, len, #16
8c2ecf20Sopenharmony_ci	b.eq		.Lreduce_final_16_bytes_\@
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ci.Lhandle_partial_segment_\@:
8c2ecf20Sopenharmony_ci	// Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
8c2ecf20Sopenharmony_ci	// 16 bytes are in v7 and the rest are the remaining data in 'buf'.  To
8c2ecf20Sopenharmony_ci	// do this without needing a fold constant for each possible 'len',
8c2ecf20Sopenharmony_ci	// redivide the bytes into a first chunk of 'len' bytes and a second
8c2ecf20Sopenharmony_ci	// chunk of 16 bytes, then fold the first chunk into the second.
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ci	// v0 = last 16 original data bytes
	// (this load starts '16 - len' bytes before the current buf, so it
	// re-reads data that has already been consumed)
8c2ecf20Sopenharmony_ci	add		buf, buf, len
8c2ecf20Sopenharmony_ci	ldr		q0, [buf, #-16]
8c2ecf20Sopenharmony_ciCPU_LE(	rev64		v0.16b, v0.16b			)
8c2ecf20Sopenharmony_ciCPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ci	// v1 = high order part of second chunk: v7 left-shifted by 'len' bytes.
8c2ecf20Sopenharmony_ci	adr_l		x4, .Lbyteshift_table + 16
8c2ecf20Sopenharmony_ci	sub		x4, x4, len
8c2ecf20Sopenharmony_ci	ld1		{v2.16b}, [x4]
8c2ecf20Sopenharmony_ci	tbl		v1.16b, {v7.16b}, v2.16b
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ci	// v3 = first chunk: v7 right-shifted by '16-len' bytes.
8c2ecf20Sopenharmony_ci	movi		v3.16b, #0x80
8c2ecf20Sopenharmony_ci	eor		v2.16b, v2.16b, v3.16b
8c2ecf20Sopenharmony_ci	tbl		v3.16b, {v7.16b}, v2.16b
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ci	// Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes.
8c2ecf20Sopenharmony_ci	sshr		v2.16b, v2.16b, #7
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ci	// v2 = second chunk: 'len' bytes from v0 (low-order bytes),
8c2ecf20Sopenharmony_ci	// then '16-len' bytes from v1 (high-order bytes).
8c2ecf20Sopenharmony_ci	bsl		v2.16b, v1.16b, v0.16b
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ci	// Fold the first chunk into the second chunk, storing the result in v7.
8c2ecf20Sopenharmony_ci	__pmull_\p	v0, v3, fold_consts
8c2ecf20Sopenharmony_ci	__pmull_\p	v7, v3, fold_consts, 2
8c2ecf20Sopenharmony_ci	eor		v7.16b, v7.16b, v0.16b
8c2ecf20Sopenharmony_ci	eor		v7.16b, v7.16b, v2.16b
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ci.Lreduce_final_16_bytes_\@:
8c2ecf20Sopenharmony_ci	// Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC.
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ci	movi		v2.16b, #0		// init zero register
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ci	// Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
	// On every path that reaches this label, fold_consts_ptr points just
	// past .Lfold_across_16_bytes_consts, i.e. at these two constants.
8c2ecf20Sopenharmony_ci	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
8c2ecf20Sopenharmony_ci	__pmull_pre_\p	fold_consts
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ci	// Fold the high 64 bits into the low 64 bits, while also multiplying by
8c2ecf20Sopenharmony_ci	// x^64.  This produces a 128-bit value congruent to x^64 * M(x) and
8c2ecf20Sopenharmony_ci	// whose low 48 bits are 0.
8c2ecf20Sopenharmony_ci	ext		v0.16b, v2.16b, v7.16b, #8
8c2ecf20Sopenharmony_ci	__pmull_\p	v7, v7, fold_consts, 2	// high bits * x^48 * (x^80 mod G(x))
8c2ecf20Sopenharmony_ci	eor		v0.16b, v0.16b, v7.16b	// + low bits * x^64
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ci	// Fold the high 32 bits into the low 96 bits.  This produces a 96-bit
8c2ecf20Sopenharmony_ci	// value congruent to x^64 * M(x) and whose low 48 bits are 0.
8c2ecf20Sopenharmony_ci	ext		v1.16b, v0.16b, v2.16b, #12	// extract high 32 bits
8c2ecf20Sopenharmony_ci	mov		v0.s[3], v2.s[0]	// zero high 32 bits
8c2ecf20Sopenharmony_ci	__pmull_\p	v1, v1, fold_consts	// high 32 bits * x^48 * (x^48 mod G(x))
8c2ecf20Sopenharmony_ci	eor		v0.16b, v0.16b, v1.16b	// + low bits
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ci	// Load G(x) and floor(x^48 / G(x)).
8c2ecf20Sopenharmony_ci	ld1		{fold_consts.2d}, [fold_consts_ptr]
8c2ecf20Sopenharmony_ci	__pmull_pre_\p	fold_consts
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ci	// Use Barrett reduction to compute the final CRC value.
8c2ecf20Sopenharmony_ci	__pmull_\p	v1, v0, fold_consts, 2	// high 32 bits * floor(x^48 / G(x))
8c2ecf20Sopenharmony_ci	ushr		v1.2d, v1.2d, #32	// /= x^32
8c2ecf20Sopenharmony_ci	__pmull_\p	v1, v1, fold_consts	// *= G(x)
8c2ecf20Sopenharmony_ci	ushr		v0.2d, v0.2d, #48
8c2ecf20Sopenharmony_ci	eor		v0.16b, v0.16b, v1.16b	// + low 16 nonzero bits
8c2ecf20Sopenharmony_ci	// Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ci	umov		w0, v0.h[0]
	// Only the p8 entry point pushed a frame record (its multiplications
	// use bl), so only that variant pops one here.
8c2ecf20Sopenharmony_ci	.ifc		\p, p8
8c2ecf20Sopenharmony_ci	ldp		x29, x30, [sp], #16
8c2ecf20Sopenharmony_ci	.endif
8c2ecf20Sopenharmony_ci	ret
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ci.Lless_than_256_bytes_\@:
8c2ecf20Sopenharmony_ci	// Checksumming a buffer of length 16...255 bytes
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ci	adr_l		fold_consts_ptr, .Lfold_across_16_bytes_consts
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ci	// Load the first 16 data bytes.
8c2ecf20Sopenharmony_ci	ldr		q7, [buf], #0x10
8c2ecf20Sopenharmony_ciCPU_LE(	rev64		v7.16b, v7.16b			)
8c2ecf20Sopenharmony_ciCPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ci	// XOR the first 16 data *bits* with the initial CRC value.
8c2ecf20Sopenharmony_ci	movi		v0.16b, #0
8c2ecf20Sopenharmony_ci	mov		v0.h[7], init_crc
8c2ecf20Sopenharmony_ci	eor		v7.16b, v7.16b, v0.16b
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ci	// Load the fold-across-16-bytes constants.
8c2ecf20Sopenharmony_ci	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
8c2ecf20Sopenharmony_ci	__pmull_pre_\p	fold_consts
8c2ecf20Sopenharmony_ci
	// Dispatch on len.  The 16-byte loop expects len to be "bytes left
	// after v7, minus 16" and the partial-segment code expects "bytes
	// left after v7", hence the subtraction of 32 and the re-add of 16.
8c2ecf20Sopenharmony_ci	cmp		len, #16
8c2ecf20Sopenharmony_ci	b.eq		.Lreduce_final_16_bytes_\@	// len == 16
8c2ecf20Sopenharmony_ci	subs		len, len, #32
8c2ecf20Sopenharmony_ci	b.ge		.Lfold_16_bytes_loop_\@		// 32 <= len <= 255
8c2ecf20Sopenharmony_ci	add		len, len, #16
8c2ecf20Sopenharmony_ci	b		.Lhandle_partial_segment_\@	// 17 <= len <= 31
8c2ecf20Sopenharmony_ci	.endm
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ci//
8c2ecf20Sopenharmony_ci// u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len);
8c2ecf20Sopenharmony_ci//
8c2ecf20Sopenharmony_ci// Assumes len >= 16.
8c2ecf20Sopenharmony_ci//
// Variant built on the 8-bit polynomial multiply (__pmull_p8).  A frame
// record is pushed here because __pmull_p8 calls __pmull_p8_core with 'bl',
// which overwrites x30.  The matching 'ldp x29, x30' and the 'ret' are
// emitted inside crc_t10dif_pmull, so nothing follows the macro invocation.
8c2ecf20Sopenharmony_ciSYM_FUNC_START(crc_t10dif_pmull_p8)
8c2ecf20Sopenharmony_ci	stp		x29, x30, [sp, #-16]!
8c2ecf20Sopenharmony_ci	mov		x29, sp
8c2ecf20Sopenharmony_ci	crc_t10dif_pmull p8
8c2ecf20Sopenharmony_ciSYM_FUNC_END(crc_t10dif_pmull_p8)
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ci	.align		5
8c2ecf20Sopenharmony_ci//
8c2ecf20Sopenharmony_ci// u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
8c2ecf20Sopenharmony_ci//
8c2ecf20Sopenharmony_ci// Assumes len >= 16.
8c2ecf20Sopenharmony_ci//
// Variant built on the 64-bit polynomial multiply (__pmull_p64).  It makes
// no calls, so no frame record is pushed; the 'ret' is emitted inside
// crc_t10dif_pmull.
8c2ecf20Sopenharmony_ciSYM_FUNC_START(crc_t10dif_pmull_p64)
8c2ecf20Sopenharmony_ci	crc_t10dif_pmull	p64
8c2ecf20Sopenharmony_ciSYM_FUNC_END(crc_t10dif_pmull_p64)
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ci	.section	".rodata", "a"
8c2ecf20Sopenharmony_ci	.align		4
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ci// Fold constants precomputed from the polynomial 0x18bb7
8c2ecf20Sopenharmony_ci// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
//
// Each entry is a pair of .quad values that is loaded as one 16-byte vector
// into fold_consts.  The order of the entries is part of the contract:
// crc_t10dif_pmull takes the address of only two labels
// (.Lfold_across_128_bytes_consts and .Lfold_across_16_bytes_consts) and
// reaches every other entry by advancing fold_consts_ptr in 16-byte steps.
// That is why the remaining labels exist only as comments.
8c2ecf20Sopenharmony_ci.Lfold_across_128_bytes_consts:
8c2ecf20Sopenharmony_ci	.quad		0x0000000000006123	// x^(8*128)	mod G(x)
8c2ecf20Sopenharmony_ci	.quad		0x0000000000002295	// x^(8*128+64)	mod G(x)
8c2ecf20Sopenharmony_ci// .Lfold_across_64_bytes_consts:
8c2ecf20Sopenharmony_ci	.quad		0x0000000000001069	// x^(4*128)	mod G(x)
8c2ecf20Sopenharmony_ci	.quad		0x000000000000dd31	// x^(4*128+64)	mod G(x)
8c2ecf20Sopenharmony_ci// .Lfold_across_32_bytes_consts:
8c2ecf20Sopenharmony_ci	.quad		0x000000000000857d	// x^(2*128)	mod G(x)
8c2ecf20Sopenharmony_ci	.quad		0x0000000000007acc	// x^(2*128+64)	mod G(x)
8c2ecf20Sopenharmony_ci.Lfold_across_16_bytes_consts:
8c2ecf20Sopenharmony_ci	.quad		0x000000000000a010	// x^(1*128)	mod G(x)
8c2ecf20Sopenharmony_ci	.quad		0x0000000000001faa	// x^(1*128+64)	mod G(x)
8c2ecf20Sopenharmony_ci// .Lfinal_fold_consts:
8c2ecf20Sopenharmony_ci	.quad		0x1368000000000000	// x^48 * (x^48 mod G(x))
8c2ecf20Sopenharmony_ci	.quad		0x2d56000000000000	// x^48 * (x^80 mod G(x))
8c2ecf20Sopenharmony_ci// .Lbarrett_reduction_consts:
8c2ecf20Sopenharmony_ci	.quad		0x0000000000018bb7	// G(x)
8c2ecf20Sopenharmony_ci	.quad		0x00000001f65a57f8	// floor(x^48 / G(x))
8c2ecf20Sopenharmony_ci
8c2ecf20Sopenharmony_ci// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 -
8c2ecf20Sopenharmony_ci// len] is the index vector to shift left by 'len' bytes, and is also {0x80,
8c2ecf20Sopenharmony_ci// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes.
//
// The vectors are used as tbl indices on a single 16-byte source register.
// tbl writes zero for any index outside 0..15, so the 0x8N entries produce
// the zero bytes that a shift brings in.  XORing with 0x80 swaps which
// entries are in range, which is how one table serves both directions.
//
// Because the load offset '16 - len' is always in 1..15, the first and the
// last byte of the table are never read.
8c2ecf20Sopenharmony_ci.Lbyteshift_table:
8c2ecf20Sopenharmony_ci	.byte		 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
8c2ecf20Sopenharmony_ci	.byte		0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
8c2ecf20Sopenharmony_ci	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
8c2ecf20Sopenharmony_ci	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe , 0x0