//
// Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions instructions
//
// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
// Copyright (C) 2019 Google LLC <ebiggers@google.com>
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License version 2 as
// published by the Free Software Foundation.
//

// Derived from the x86 version:
//
// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
//
// Copyright (c) 2013, Intel Corporation
//
// Authors:
//     Erdinc Ozturk <erdinc.ozturk@intel.com>
//     Vinodh Gopal <vinodh.gopal@intel.com>
//     James Guilford <james.guilford@intel.com>
//     Tim Chen <tim.c.chen@linux.intel.com>
//
// This software is available to you under a choice of one of two
// licenses.  You may choose to be licensed under the terms of the GNU
// General Public License (GPL) Version 2, available from the file
// COPYING in the main directory of this source tree, or the
// OpenIB.org BSD license below:
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
//   notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
//   notice, this list of conditions and the following disclaimer in the
//   documentation and/or other materials provided with the
//   distribution.
//
// * Neither the name of the Intel Corporation nor the names of its
//   contributors may be used to endorse or promote products derived from
//   this software without specific prior written permission.
//
//
// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL INTEL CORPORATION OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Reference paper titled "Fast CRC Computation for Generic
// Polynomials Using PCLMULQDQ Instruction"
// URL: http://www.intel.com/content/dam/www/public/us/en/documents
// /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
//

#include <linux/linkage.h>
#include <asm/assembler.h>

// CPU_LE(insn) emits 'insn' only when CONFIG_CPU_ENDIAN_BE8 is not defined.
// It is used below to make the per-64-bit byte reversal of freshly loaded
// data conditional on the build's endianness.
#ifdef CONFIG_CPU_ENDIAN_BE8
#define CPU_LE(code...)
#else
#define CPU_LE(code...)		code
#endif

	.text
	.arch		armv8-a
	.fpu		crypto-neon-fp-armv8

	// Core register roles (function arguments, in argument order).
	init_crc	.req	r0
	buf		.req	r1
	len		.req	r2

	// Walks forward through the constant table in .rodata.
	fold_consts_ptr	.req	ip

	// Names for the low ('l') and high ('h') 64-bit halves of q0-q12, so
	// that macros can derive a half from a q register name by appending a
	// suffix.
	q0l		.req	d0
	q0h		.req	d1
	q1l		.req	d2
	q1h		.req	d3
	q2l		.req	d4
	q2h		.req	d5
	q3l		.req	d6
	q3h		.req	d7
	q4l		.req	d8
	q4h		.req	d9
	q5l		.req	d10
	q5h		.req	d11
	q6l		.req	d12
	q6h		.req	d13
	q7l		.req	d14
	q7h		.req	d15
	q8l		.req	d16
	q8h		.req	d17
	q9l		.req	d18
	q9h		.req	d19
	q10l		.req	d20
	q10h		.req	d21
	q11l		.req	d22
	q11h		.req	d23
	q12l		.req	d24
	q12h		.req	d25

	// The currently loaded pair of 64-bit fold constants.
	FOLD_CONSTS	.req	q10
	FOLD_CONST_L	.req	q10l
	FOLD_CONST_H	.req	q10h

	// Fold reg1, reg2 into the next 32 data bytes, storing the result back
	// into reg1, reg2.
	//
	// Side effects: advances buf by 32; overwrites q8, q9, q11 and q12.
	.macro		fold_32_bytes, reg1, reg2
	vld1.64		{q11-q12}, [buf]!

	vmull.p64	q8, \reg1\()h, FOLD_CONST_H
	vmull.p64	\reg1, \reg1\()l, FOLD_CONST_L
	vmull.p64	q9, \reg2\()h, FOLD_CONST_H
	vmull.p64	\reg2, \reg2\()l, FOLD_CONST_L

	// vrev64.8 reverses the bytes inside each 64-bit half and vswp then
	// exchanges the two halves; together they reverse all 16 bytes.
CPU_LE(	vrev64.8	q11, q11	)
CPU_LE(	vrev64.8	q12, q12	)
	vswp		q11l, q11h
	vswp		q12l, q12h

	veor.8		\reg1, \reg1, q8
	veor.8		\reg2, \reg2, q9
	veor.8		\reg1, \reg1, q11
	veor.8		\reg2, \reg2, q12
	.endm

	// Fold src_reg into dst_reg, optionally loading the next fold constants
	//
	// Side effects: overwrites q8 and src_reg.  When load_next_consts is
	// non-blank, FOLD_CONSTS is reloaded and fold_consts_ptr advances by
	// 16; the reload is issued after both multiplies have read the old
	// constants.
	.macro		fold_16_bytes, src_reg, dst_reg, load_next_consts
	vmull.p64	q8, \src_reg\()l, FOLD_CONST_L
	vmull.p64	\src_reg, \src_reg\()h, FOLD_CONST_H
	.ifnb		\load_next_consts
	vld1.64		{FOLD_CONSTS}, [fold_consts_ptr, :128]!
	.endif
	veor.8		\dst_reg, \dst_reg, q8
	veor.8		\dst_reg, \dst_reg, \src_reg
	.endm

	// Load the 32-bit absolute address of 'sym' into 'out' using a
	// movw/movt pair.
	.macro		__adrl, out, sym
	movw		\out, #:lower16:\sym
	movt		\out, #:upper16:\sym
	.endm

//
// u16 crc_t10dif_pmull(u16 init_crc, const u8 *buf, size_t len);
//
// Assumes len >= 16.
//
// In:       r0 = init_crc, r1 = buf, r2 = len
// Out:      r0 = CRC (low 16 bits of the reduced value, zero-extended)
// Clobbers: r1, r2, r3, ip, condition flags, q0-q12
//
// NOTE(review): q4-q7 (d8-d15) are overwritten without being saved here,
// although the AAPCS treats d8-d15 as callee-saved.  Presumably callers only
// invoke this while they own the NEON unit (e.g. between kernel_neon_begin()
// and kernel_neon_end()) - confirm against the C glue code.
//
ENTRY(crc_t10dif_pmull)

	// For sizes less than 256 bytes, we can't fold 128 bytes at a time.
	cmp		len, #256
	blt		.Lless_than_256_bytes

	__adrl		fold_consts_ptr, .Lfold_across_128_bytes_consts

	// Load the first 128 data bytes.  Byte swapping is necessary to make
	// the bit order match the polynomial coefficient order.
	vld1.64		{q0-q1}, [buf]!
	vld1.64		{q2-q3}, [buf]!
	vld1.64		{q4-q5}, [buf]!
	vld1.64		{q6-q7}, [buf]!
CPU_LE(	vrev64.8	q0, q0	)
CPU_LE(	vrev64.8	q1, q1	)
CPU_LE(	vrev64.8	q2, q2	)
CPU_LE(	vrev64.8	q3, q3	)
CPU_LE(	vrev64.8	q4, q4	)
CPU_LE(	vrev64.8	q5, q5	)
CPU_LE(	vrev64.8	q6, q6	)
CPU_LE(	vrev64.8	q7, q7	)
	vswp		q0l, q0h
	vswp		q1l, q1h
	vswp		q2l, q2h
	vswp		q3l, q3h
	vswp		q4l, q4h
	vswp		q5l, q5h
	vswp		q6l, q6h
	vswp		q7l, q7h

	// XOR the first 16 data *bits* with the initial CRC value.
	vmov.i8		q8h, #0
	vmov.u16	q8h[3], init_crc
	veor		q0h, q0h, q8h

	// Load the constants for folding across 128 bytes.
	vld1.64		{FOLD_CONSTS}, [fold_consts_ptr, :128]!

	// Subtract 128 for the 128 data bytes just consumed.  Subtract another
	// 128 to simplify the termination condition of the following loop.
	sub		len, len, #256

	// While >= 128 data bytes remain (not counting q0-q7), fold the 128
	// bytes q0-q7 into them, storing the result back into q0-q7.
.Lfold_128_bytes_loop:
	fold_32_bytes	q0, q1
	fold_32_bytes	q2, q3
	fold_32_bytes	q4, q5
	fold_32_bytes	q6, q7
	subs		len, len, #128
	bge		.Lfold_128_bytes_loop

	// Now fold the 112 bytes in q0-q6 into the 16 bytes in q7.

	// Fold across 64 bytes.
	vld1.64		{FOLD_CONSTS}, [fold_consts_ptr, :128]!
	fold_16_bytes	q0, q4
	fold_16_bytes	q1, q5
	fold_16_bytes	q2, q6
	fold_16_bytes	q3, q7, 1
	// Fold across 32 bytes.
	fold_16_bytes	q4, q6
	fold_16_bytes	q5, q7, 1
	// Fold across 16 bytes.
	fold_16_bytes	q6, q7

	// Add 128 to get the correct number of data bytes remaining in 0...127
	// (not counting q7), following the previous extra subtraction by 128.
	// Then subtract 16 to simplify the termination condition of the
	// following loop.
	adds		len, len, #(128-16)

	// While >= 16 data bytes remain (not counting q7), fold the 16 bytes q7
	// into them, storing the result back into q7.
	blt		.Lfold_16_bytes_loop_done
.Lfold_16_bytes_loop:
	vmull.p64	q8, q7l, FOLD_CONST_L
	vmull.p64	q7, q7h, FOLD_CONST_H
	veor.8		q7, q7, q8
	vld1.64		{q0}, [buf]!
CPU_LE(	vrev64.8	q0, q0	)
	vswp		q0l, q0h
	veor.8		q7, q7, q0
	subs		len, len, #16
	bge		.Lfold_16_bytes_loop

.Lfold_16_bytes_loop_done:
	// Add 16 to get the correct number of data bytes remaining in 0...15
	// (not counting q7), following the previous extra subtraction by 16.
	adds		len, len, #16
	beq		.Lreduce_final_16_bytes

.Lhandle_partial_segment:
	// Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
	// 16 bytes are in q7 and the rest are the remaining data in 'buf'.  To
	// do this without needing a fold constant for each possible 'len',
	// redivide the bytes into a first chunk of 'len' bytes and a second
	// chunk of 16 bytes, then fold the first chunk into the second.

	// q0 = last 16 original data bytes
	add		buf, buf, len
	sub		buf, buf, #16
	vld1.64		{q0}, [buf]
CPU_LE(	vrev64.8	q0, q0	)
	vswp		q0l, q0h

	// q1 = high order part of second chunk: q7 left-shifted by 'len' bytes.
	__adrl		r3, .Lbyteshift_table + 16
	sub		r3, r3, len
	vld1.8		{q2}, [r3]
	vtbl.8		q1l, {q7l-q7h}, q2l
	vtbl.8		q1h, {q7l-q7h}, q2h

	// q3 = first chunk: q7 right-shifted by '16-len' bytes.
	vmov.i8		q3, #0x80
	veor.8		q2, q2, q3
	vtbl.8		q3l, {q7l-q7h}, q2l
	vtbl.8		q3h, {q7l-q7h}, q2h

	// Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes.
	vshr.s8		q2, q2, #7

	// q2 = second chunk: 'len' bytes from q0 (low-order bytes),
	// then '16-len' bytes from q1 (high-order bytes).
	vbsl.8		q2, q1, q0

	// Fold the first chunk into the second chunk, storing the result in q7.
	vmull.p64	q0, q3l, FOLD_CONST_L
	vmull.p64	q7, q3h, FOLD_CONST_H
	veor.8		q7, q7, q0
	veor.8		q7, q7, q2

.Lreduce_final_16_bytes:
	// Reduce the 128-bit value M(x), stored in q7, to the final 16-bit CRC.

	// Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
	vld1.64		{FOLD_CONSTS}, [fold_consts_ptr, :128]!

	// Fold the high 64 bits into the low 64 bits, while also multiplying by
	// x^64.  This produces a 128-bit value congruent to x^64 * M(x) and
	// whose low 48 bits are 0.
	vmull.p64	q0, q7h, FOLD_CONST_H	// high bits * x^48 * (x^80 mod G(x))
	veor.8		q0h, q0h, q7l		// + low bits * x^64

	// Fold the high 32 bits into the low 96 bits.  This produces a 96-bit
	// value congruent to x^64 * M(x) and whose low 48 bits are 0.
	vmov.i8		q1, #0
	vmov		s4, s3			// extract high 32 bits
	vmov		s3, s5			// zero high 32 bits
	vmull.p64	q1, q1l, FOLD_CONST_L	// high 32 bits * x^48 * (x^48 mod G(x))
	veor.8		q0, q0, q1		// + low bits

	// Load G(x) and floor(x^48 / G(x)).
	vld1.64		{FOLD_CONSTS}, [fold_consts_ptr, :128]

	// Use Barrett reduction to compute the final CRC value.
	vmull.p64	q1, q0h, FOLD_CONST_H	// high 32 bits * floor(x^48 / G(x))
	vshr.u64	q1l, q1l, #32		// /= x^32
	vmull.p64	q1, q1l, FOLD_CONST_L	// *= G(x)
	vshr.u64	q0l, q0l, #48
	veor.8		q0l, q0l, q1l		// + low 16 nonzero bits
	// Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of q0.

	vmov.u16	r0, q0l[0]
	bx		lr

.Lless_than_256_bytes:
	// Checksumming a buffer of length 16...255 bytes

	__adrl		fold_consts_ptr, .Lfold_across_16_bytes_consts

	// Load the first 16 data bytes.
	vld1.64		{q7}, [buf]!
CPU_LE(	vrev64.8	q7, q7	)
	vswp		q7l, q7h

	// XOR the first 16 data *bits* with the initial CRC value.
	vmov.i8		q0h, #0
	vmov.u16	q0h[3], init_crc
	veor.8		q7h, q7h, q0h

	// Load the fold-across-16-bytes constants.
	vld1.64		{FOLD_CONSTS}, [fold_consts_ptr, :128]!

	cmp		len, #16
	beq		.Lreduce_final_16_bytes		// len == 16
	subs		len, len, #32
	addlt		len, len, #16
	blt		.Lhandle_partial_segment	// 17 <= len <= 31
	b		.Lfold_16_bytes_loop		// 32 <= len <= 255
ENDPROC(crc_t10dif_pmull)

	.section	".rodata", "a"
	.align		4

// Fold constants precomputed from the polynomial 0x18bb7
// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
//
// The code consumes this table strictly in order through post-incremented
// 16-byte loads from fold_consts_ptr, so the entries must stay contiguous and
// in this sequence.
.Lfold_across_128_bytes_consts:
	.quad		0x0000000000006123	// x^(8*128)	mod G(x)
	.quad		0x0000000000002295	// x^(8*128+64)	mod G(x)
// .Lfold_across_64_bytes_consts:
	.quad		0x0000000000001069	// x^(4*128)	mod G(x)
	.quad		0x000000000000dd31	// x^(4*128+64)	mod G(x)
// .Lfold_across_32_bytes_consts:
	.quad		0x000000000000857d	// x^(2*128)	mod G(x)
	.quad		0x0000000000007acc	// x^(2*128+64)	mod G(x)
.Lfold_across_16_bytes_consts:
	.quad		0x000000000000a010	// x^(1*128)	mod G(x)
	.quad		0x0000000000001faa	// x^(1*128+64)	mod G(x)
// .Lfinal_fold_consts:
	.quad		0x1368000000000000	// x^48 * (x^48 mod G(x))
	.quad		0x2d56000000000000	// x^48 * (x^80 mod G(x))
// .Lbarrett_reduction_consts:
	.quad		0x0000000000018bb7	// G(x)
	.quad		0x00000001f65a57f8	// floor(x^48 / G(x))

// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 -
// len] is the index vector to shift left by 'len' bytes, and is also {0x80,
// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes.
.Lbyteshift_table:
	.byte		 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
	.byte		0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe , 0x0