//
// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
//
// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
// Copyright (C) 2019 Google LLC <ebiggers@google.com>
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License version 2 as
// published by the Free Software Foundation.
//

// Derived from the x86 version:
//
// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
//
// Copyright (c) 2013, Intel Corporation
//
// Authors:
//     Erdinc Ozturk <erdinc.ozturk@intel.com>
//     Vinodh Gopal <vinodh.gopal@intel.com>
//     James Guilford <james.guilford@intel.com>
//     Tim Chen <tim.c.chen@linux.intel.com>
//
// This software is available to you under a choice of one of two
// licenses.  You may choose to be licensed under the terms of the GNU
// General Public License (GPL) Version 2, available from the file
// COPYING in the main directory of this source tree, or the
// OpenIB.org BSD license below:
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
//   notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
//   notice, this list of conditions and the following disclaimer in the
//   documentation and/or other materials provided with the
//   distribution.
//
// * Neither the name of the Intel Corporation nor the names of its
//   contributors may be used to endorse or promote products derived from
//   this software without specific prior written permission.
//
//
// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
//       Reference paper titled "Fast CRC Computation for Generic
//	Polynomials Using PCLMULQDQ Instruction"
//       URL: http://www.intel.com/content/dam/www/public/us/en/documents
//  /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
//

#include <linux/linkage.h>
#include <asm/assembler.h>

	.text
	.arch		armv8-a+crypto

	// Arguments, in the order of the C prototypes documented at the
	// entry points below: (u16 init_crc, const u8 *buf, size_t len).
	init_crc	.req	w0
	buf		.req	x1
	len		.req	x2
	fold_consts_ptr	.req	x3

	// Current pair of 64-bit fold constants (used by both variants).
	fold_consts	.req	v10

	// Everything below is only referenced by the p8 (8-bit PMULL)
	// variant: its multiplicand copy, masks, temporaries, permutation
	// index vectors and permuted copies of fold_consts.
	ad		.req	v14

	k00_16		.req	v15
	k32_48		.req	v16

	t3		.req	v17
	t4		.req	v18
	t5		.req	v19
	t6		.req	v20
	t7		.req	v21
	t8		.req	v22
	t9		.req	v23

	perm1		.req	v24
	perm2		.req	v25
	perm3		.req	v26
	perm4		.req	v27

	bd1		.req	v28
	bd2		.req	v29
	bd3		.req	v30
	bd4		.req	v31

	// The p64 variant needs no setup; these are intentionally empty so
	// that shared code can invoke __pmull_init_\p / __pmull_pre_\p
	// regardless of the selected variant.
	.macro		__pmull_init_p64
	.endm

	.macro		__pmull_pre_p64, bd
	.endm

	// One-time setup for the p8 variant: builds the k00_16/k32_48 masks
	// and the perm1..perm4 TBL index vectors.  Clobbers x5.
	.macro		__pmull_init_p8
	// k00_16 := 0x0000000000000000_000000000000ffff
	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
	movi		k32_48.2d, #0xffffffff
	mov		k32_48.h[2], k32_48.h[0]
	ushr		k00_16.2d, k32_48.2d, #32

	// prepare the permutation vectors
	// (perm1..perm4 index each 64-bit half rotated by 1..4 bytes)
	mov_q		x5, 0x080f0e0d0c0b0a09
	movi		perm4.8b, #8
	dup		perm1.2d, x5
	eor		perm1.16b, perm1.16b, perm4.16b
	ushr		perm2.2d, perm1.2d, #8
	ushr		perm3.2d, perm1.2d, #16
	ushr		perm4.2d, perm1.2d, #24
	sli		perm2.2d, perm1.2d, #56
	sli		perm3.2d, perm1.2d, #48
	sli		perm4.2d, perm1.2d, #40
	.endm

	// Derive bd1..bd4 from \bd by applying perm1..perm4.  Must be
	// re-run whenever \bd (i.e. fold_consts) is reloaded, because
	// __pmull_p8_core reads bd1..bd4.
	.macro		__pmull_pre_p8, bd
	tbl		bd1.16b, {\bd\().16b}, perm1.16b
	tbl		bd2.16b, {\bd\().16b}, perm2.16b
	tbl		bd3.16b, {\bd\().16b}, perm3.16b
	tbl		bd4.16b, {\bd\().16b}, perm4.16b
	.endm

//
// Helper for __pmull_p8: computes the partial products that must be added
// to D = A*B to complete a 64x64 polynomial multiply using 8-bit PMULL.
//
// In:    ad = A, fold_consts = B, bd1..bd4 = permuted B (__pmull_pre_p8),
//        k00_16/k32_48/perm1..perm3 as set up by __pmull_init_p8
// Out:   t4, t6 (the caller XORs both into its result)
// Clobb: t3-t9
//
// Entered at .L__pmull_p8_core for the low 64-bit halves (pmull) or at
// .L__pmull_p8_core2 for the high 64-bit halves (pmull2).
//
SYM_FUNC_START_LOCAL(__pmull_p8_core)
.L__pmull_p8_core:
	ext		t4.8b, ad.8b, ad.8b, #1			// A1
	ext		t5.8b, ad.8b, ad.8b, #2			// A2
	ext		t6.8b, ad.8b, ad.8b, #3			// A3

	pmull		t4.8h, t4.8b, fold_consts.8b		// F = A1*B
	pmull		t8.8h, ad.8b, bd1.8b			// E = A*B1
	pmull		t5.8h, t5.8b, fold_consts.8b		// H = A2*B
	pmull		t7.8h, ad.8b, bd2.8b			// G = A*B2
	pmull		t6.8h, t6.8b, fold_consts.8b		// J = A3*B
	pmull		t9.8h, ad.8b, bd3.8b			// I = A*B3
	pmull		t3.8h, ad.8b, bd4.8b			// K = A*B4
	b		0f

.L__pmull_p8_core2:
	tbl		t4.16b, {ad.16b}, perm1.16b		// A1
	tbl		t5.16b, {ad.16b}, perm2.16b		// A2
	tbl		t6.16b, {ad.16b}, perm3.16b		// A3

	pmull2		t4.8h, t4.16b, fold_consts.16b		// F = A1*B
	pmull2		t8.8h, ad.16b, bd1.16b			// E = A*B1
	pmull2		t5.8h, t5.16b, fold_consts.16b		// H = A2*B
	pmull2		t7.8h, ad.16b, bd2.16b			// G = A*B2
	pmull2		t6.8h, t6.16b, fold_consts.16b		// J = A3*B
	pmull2		t9.8h, ad.16b, bd3.16b			// I = A*B3
	pmull2		t3.8h, ad.16b, bd4.16b			// K = A*B4

0:	eor		t4.16b, t4.16b, t8.16b			// L = E + F
	eor		t5.16b, t5.16b, t7.16b			// M = G + H
	eor		t6.16b, t6.16b, t9.16b			// N = I + J

	uzp1		t8.2d, t4.2d, t5.2d
	uzp2		t4.2d, t4.2d, t5.2d
	uzp1		t7.2d, t6.2d, t3.2d
	uzp2		t6.2d, t6.2d, t3.2d

	// t4 = (L) (P0 + P1) << 8
	// t5 = (M) (P2 + P3) << 16
	eor		t8.16b, t8.16b, t4.16b
	and		t4.16b, t4.16b, k32_48.16b

	// t6 = (N) (P4 + P5) << 24
	// t7 = (K) (P6 + P7) << 32
	eor		t7.16b, t7.16b, t6.16b
	and		t6.16b, t6.16b, k00_16.16b

	eor		t8.16b, t8.16b, t4.16b
	eor		t7.16b, t7.16b, t6.16b

	zip2		t5.2d, t8.2d, t4.2d
	zip1		t4.2d, t8.2d, t4.2d
	zip2		t3.2d, t7.2d, t6.2d
	zip1		t6.2d, t7.2d, t6.2d

	ext		t4.16b, t4.16b, t4.16b, #15
	ext		t5.16b, t5.16b, t5.16b, #14
	ext		t6.16b, t6.16b, t6.16b, #13
	ext		t3.16b, t3.16b, t3.16b, #12

	eor		t4.16b, t4.16b, t5.16b
	eor		t6.16b, t6.16b, t3.16b
	ret
SYM_FUNC_END(__pmull_p8_core)

	// \rq := \ad * \bd as a 64x64 polynomial multiply built from 8-bit
	// PMULLs.  \bd must be fold_consts (anything else is an assembly-time
	// error).  With \i blank the low 64-bit halves are multiplied; with
	// \i == 2 the high halves.  Clobbers ad, t3-t9 and, because of the
	// bl, the link register.
	.macro		__pmull_p8, rq, ad, bd, i
	.ifnc		\bd, fold_consts
	.err
	.endif
	mov		ad.16b, \ad\().16b
	.ifb		\i
	pmull		\rq\().8h, \ad\().8b, \bd\().8b		// D = A*B
	.else
	pmull2		\rq\().8h, \ad\().16b, \bd\().16b	// D = A*B
	.endif

	bl		.L__pmull_p8_core\i

	eor		\rq\().16b, \rq\().16b, t4.16b
	eor		\rq\().16b, \rq\().16b, t6.16b
	.endm

	// Fold reg1, reg2 into the next 32 data bytes, storing the result back
	// into reg1, reg2.
	// Clobbers v8, v9, v11, v12 and advances buf by 32.
	.macro		fold_32_bytes, p, reg1, reg2
	ldp		q11, q12, [buf], #0x20

	__pmull_\p	v8, \reg1, fold_consts, 2
	__pmull_\p	\reg1, \reg1, fold_consts

CPU_LE(	rev64		v11.16b, v11.16b		)
CPU_LE(	rev64		v12.16b, v12.16b		)

	__pmull_\p	v9, \reg2, fold_consts, 2
	__pmull_\p	\reg2, \reg2, fold_consts

CPU_LE(	ext		v11.16b, v11.16b, v11.16b, #8	)
CPU_LE(	ext		v12.16b, v12.16b, v12.16b, #8	)

	eor		\reg1\().16b, \reg1\().16b, v8.16b
	eor		\reg2\().16b, \reg2\().16b, v9.16b
	eor		\reg1\().16b, \reg1\().16b, v11.16b
	eor		\reg2\().16b, \reg2\().16b, v12.16b
	.endm

	// Fold src_reg into dst_reg, optionally loading the next fold constants
	// Clobbers v8 and src_reg; when load_next_consts is non-blank it also
	// advances fold_consts_ptr by 16.
	.macro		fold_16_bytes, p, src_reg, dst_reg, load_next_consts
	__pmull_\p	v8, \src_reg, fold_consts
	__pmull_\p	\src_reg, \src_reg, fold_consts, 2
	.ifnb		\load_next_consts
	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
	__pmull_pre_\p	fold_consts
	.endif
	eor		\dst_reg\().16b, \dst_reg\().16b, v8.16b
	eor		\dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b
	.endm

	// \rd := \rn * \rm as a 64x64 polynomial multiply using the 64-bit
	// PMULL instructions.  With \n blank the low 64-bit halves are
	// multiplied (pmull); otherwise the high halves (pmull2).
	.macro		__pmull_p64, rd, rn, rm, n
	.ifb		\n
	pmull		\rd\().1q, \rn\().1d, \rm\().1d
	.else
	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
	.endif
	.endm

	// Function body shared by both entry points; \p selects the multiply
	// flavour (p8 or p64).  Returns the CRC in w0.
	.macro		crc_t10dif_pmull, p
	__pmull_init_\p

	// For sizes less than 256 bytes, we can't fold 128 bytes at a time.
	cmp		len, #256
	b.lt		.Lless_than_256_bytes_\@

	adr_l		fold_consts_ptr, .Lfold_across_128_bytes_consts

	// Load the first 128 data bytes.  Byte swapping is necessary to make
	// the bit order match the polynomial coefficient order.
	ldp		q0, q1, [buf]
	ldp		q2, q3, [buf, #0x20]
	ldp		q4, q5, [buf, #0x40]
	ldp		q6, q7, [buf, #0x60]
	add		buf, buf, #0x80
CPU_LE(	rev64		v0.16b, v0.16b			)
CPU_LE(	rev64		v1.16b, v1.16b			)
CPU_LE(	rev64		v2.16b, v2.16b			)
CPU_LE(	rev64		v3.16b, v3.16b			)
CPU_LE(	rev64		v4.16b, v4.16b			)
CPU_LE(	rev64		v5.16b, v5.16b			)
CPU_LE(	rev64		v6.16b, v6.16b			)
CPU_LE(	rev64		v7.16b, v7.16b			)
CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
CPU_LE(	ext		v1.16b, v1.16b, v1.16b, #8	)
CPU_LE(	ext		v2.16b, v2.16b, v2.16b, #8	)
CPU_LE(	ext		v3.16b, v3.16b, v3.16b, #8	)
CPU_LE(	ext		v4.16b, v4.16b, v4.16b, #8	)
CPU_LE(	ext		v5.16b, v5.16b, v5.16b, #8	)
CPU_LE(	ext		v6.16b, v6.16b, v6.16b, #8	)
CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)

	// XOR the first 16 data *bits* with the initial CRC value.
	movi		v8.16b, #0
	mov		v8.h[7], init_crc
	eor		v0.16b, v0.16b, v8.16b

	// Load the constants for folding across 128 bytes.
	ld1		{fold_consts.2d}, [fold_consts_ptr]
	__pmull_pre_\p	fold_consts

	// Subtract 128 for the 128 data bytes just consumed.  Subtract another
	// 128 to simplify the termination condition of the following loop.
	sub		len, len, #256

	// While >= 128 data bytes remain (not counting v0-v7), fold the 128
	// bytes v0-v7 into them, storing the result back into v0-v7.
.Lfold_128_bytes_loop_\@:
	fold_32_bytes	\p, v0, v1
	fold_32_bytes	\p, v2, v3
	fold_32_bytes	\p, v4, v5
	fold_32_bytes	\p, v6, v7

	subs		len, len, #128
	b.ge		.Lfold_128_bytes_loop_\@

	// Now fold the 112 bytes in v0-v6 into the 16 bytes in v7.

	// Fold across 64 bytes.
	// (fold_consts_ptr still points at the 128-byte constants, so step
	// over them first.)
	add		fold_consts_ptr, fold_consts_ptr, #16
	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
	__pmull_pre_\p	fold_consts
	fold_16_bytes	\p, v0, v4
	fold_16_bytes	\p, v1, v5
	fold_16_bytes	\p, v2, v6
	fold_16_bytes	\p, v3, v7, 1
	// Fold across 32 bytes.
	fold_16_bytes	\p, v4, v6
	fold_16_bytes	\p, v5, v7, 1
	// Fold across 16 bytes.
	fold_16_bytes	\p, v6, v7

	// Add 128 to get the correct number of data bytes remaining in 0...127
	// (not counting v7), following the previous extra subtraction by 128.
	// Then subtract 16 to simplify the termination condition of the
	// following loop.
	adds		len, len, #(128-16)

	// While >= 16 data bytes remain (not counting v7), fold the 16 bytes v7
	// into them, storing the result back into v7.
	b.lt		.Lfold_16_bytes_loop_done_\@
.Lfold_16_bytes_loop_\@:
	__pmull_\p	v8, v7, fold_consts
	__pmull_\p	v7, v7, fold_consts, 2
	eor		v7.16b, v7.16b, v8.16b
	ldr		q0, [buf], #16
CPU_LE(	rev64		v0.16b, v0.16b			)
CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
	eor		v7.16b, v7.16b, v0.16b
	subs		len, len, #16
	b.ge		.Lfold_16_bytes_loop_\@

.Lfold_16_bytes_loop_done_\@:
	// Add 16 to get the correct number of data bytes remaining in 0...15
	// (not counting v7), following the previous extra subtraction by 16.
	adds		len, len, #16
	b.eq		.Lreduce_final_16_bytes_\@

.Lhandle_partial_segment_\@:
	// Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
	// 16 bytes are in v7 and the rest are the remaining data in 'buf'.  To
	// do this without needing a fold constant for each possible 'len',
	// redivide the bytes into a first chunk of 'len' bytes and a second
	// chunk of 16 bytes, then fold the first chunk into the second.

	// v0 = last 16 original data bytes
	add		buf, buf, len
	ldr		q0, [buf, #-16]
CPU_LE(	rev64		v0.16b, v0.16b			)
CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)

	// v1 = high order part of second chunk: v7 left-shifted by 'len' bytes.
	adr_l		x4, .Lbyteshift_table + 16
	sub		x4, x4, len
	ld1		{v2.16b}, [x4]
	tbl		v1.16b, {v7.16b}, v2.16b

	// v3 = first chunk: v7 right-shifted by '16-len' bytes.
	movi		v3.16b, #0x80
	eor		v2.16b, v2.16b, v3.16b
	tbl		v3.16b, {v7.16b}, v2.16b

	// Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes.
	sshr		v2.16b, v2.16b, #7

	// v2 = second chunk: 'len' bytes from v0 (low-order bytes),
	// then '16-len' bytes from v1 (high-order bytes).
	bsl		v2.16b, v1.16b, v0.16b

	// Fold the first chunk into the second chunk, storing the result in v7.
	__pmull_\p	v0, v3, fold_consts
	__pmull_\p	v7, v3, fold_consts, 2
	eor		v7.16b, v7.16b, v0.16b
	eor		v7.16b, v7.16b, v2.16b

.Lreduce_final_16_bytes_\@:
	// Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC.

	movi		v2.16b, #0		// init zero register

	// Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
	__pmull_pre_\p	fold_consts

	// Fold the high 64 bits into the low 64 bits, while also multiplying by
	// x^64.  This produces a 128-bit value congruent to x^64 * M(x) and
	// whose low 48 bits are 0.
	ext		v0.16b, v2.16b, v7.16b, #8
	__pmull_\p	v7, v7, fold_consts, 2	// high bits * x^48 * (x^80 mod G(x))
	eor		v0.16b, v0.16b, v7.16b	// + low bits * x^64

	// Fold the high 32 bits into the low 96 bits.  This produces a 96-bit
	// value congruent to x^64 * M(x) and whose low 48 bits are 0.
	ext		v1.16b, v0.16b, v2.16b, #12	// extract high 32 bits
	mov		v0.s[3], v2.s[0]	// zero high 32 bits
	__pmull_\p	v1, v1, fold_consts	// high 32 bits * x^48 * (x^48 mod G(x))
	eor		v0.16b, v0.16b, v1.16b	// + low bits

	// Load G(x) and floor(x^48 / G(x)).
	ld1		{fold_consts.2d}, [fold_consts_ptr]
	__pmull_pre_\p	fold_consts

	// Use Barrett reduction to compute the final CRC value.
	__pmull_\p	v1, v0, fold_consts, 2	// high 32 bits * floor(x^48 / G(x))
	ushr		v1.2d, v1.2d, #32	// /= x^32
	__pmull_\p	v1, v1, fold_consts	// *= G(x)
	ushr		v0.2d, v0.2d, #48
	eor		v0.16b, v0.16b, v1.16b	// + low 16 nonzero bits
	// Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.

	umov		w0, v0.h[0]
	// Only the p8 entry point does a frame_push (its multiply macro uses
	// bl, which overwrites the link register), so only p8 pops here.
	.ifc		\p, p8
	frame_pop
	.endif
	ret

.Lless_than_256_bytes_\@:
	// Checksumming a buffer of length 16...255 bytes

	adr_l		fold_consts_ptr, .Lfold_across_16_bytes_consts

	// Load the first 16 data bytes.
	ldr		q7, [buf], #0x10
CPU_LE(	rev64		v7.16b, v7.16b			)
CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)

	// XOR the first 16 data *bits* with the initial CRC value.
	movi		v0.16b, #0
	mov		v0.h[7], init_crc
	eor		v7.16b, v7.16b, v0.16b

	// Load the fold-across-16-bytes constants.
	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
	__pmull_pre_\p	fold_consts

	cmp		len, #16
	b.eq		.Lreduce_final_16_bytes_\@	// len == 16
	subs		len, len, #32
	b.ge		.Lfold_16_bytes_loop_\@		// 32 <= len <= 255
	add		len, len, #16
	b		.Lhandle_partial_segment_\@	// 17 <= len <= 31
	.endm

//
// u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len);
//
// Assumes len >= 16.
//
SYM_FUNC_START(crc_t10dif_pmull_p8)
	frame_push	1
	crc_t10dif_pmull p8
SYM_FUNC_END(crc_t10dif_pmull_p8)

	.align		5
//
// u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
//
// Assumes len >= 16.
//
SYM_FUNC_START(crc_t10dif_pmull_p64)
	crc_t10dif_pmull	p64
SYM_FUNC_END(crc_t10dif_pmull_p64)

	.section	".rodata", "a"
	.align		4

// Fold constants precomputed from the polynomial 0x18bb7
// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
//
// The code walks this table sequentially through fold_consts_ptr, so the
// order of the entries (including the ones whose labels are commented out)
// must not change.
.Lfold_across_128_bytes_consts:
	.quad		0x0000000000006123	// x^(8*128)	mod G(x)
	.quad		0x0000000000002295	// x^(8*128+64)	mod G(x)
// .Lfold_across_64_bytes_consts:
	.quad		0x0000000000001069	// x^(4*128)	mod G(x)
	.quad		0x000000000000dd31	// x^(4*128+64)	mod G(x)
// .Lfold_across_32_bytes_consts:
	.quad		0x000000000000857d	// x^(2*128)	mod G(x)
	.quad		0x0000000000007acc	// x^(2*128+64)	mod G(x)
.Lfold_across_16_bytes_consts:
	.quad		0x000000000000a010	// x^(1*128)	mod G(x)
	.quad		0x0000000000001faa	// x^(1*128+64)	mod G(x)
// .Lfinal_fold_consts:
	.quad		0x1368000000000000	// x^48 * (x^48 mod G(x))
	.quad		0x2d56000000000000	// x^48 * (x^80 mod G(x))
// .Lbarrett_reduction_consts:
	.quad		0x0000000000018bb7	// G(x)
	.quad		0x00000001f65a57f8	// floor(x^48 / G(x))

// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 -
// len] is the index vector to shift left by 'len' bytes, and is also {0x80,
// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes.
.Lbyteshift_table:
	.byte		 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
	.byte		0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe , 0x0