########################################################################
# Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
#
# Copyright (c) 2013, Intel Corporation
#
# Authors:
#     Erdinc Ozturk <erdinc.ozturk@intel.com>
#     Vinodh Gopal <vinodh.gopal@intel.com>
#     James Guilford <james.guilford@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses. You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright
#   notice, this list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright
#   notice, this list of conditions and the following disclaimer in the
#   documentation and/or other materials provided with the
#   distribution.
#
# * Neither the name of the Intel Corporation nor the names of its
#   contributors may be used to endorse or promote products derived from
#   this software without specific prior written permission.
#
#
# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# Reference paper titled "Fast CRC Computation for Generic
#	Polynomials Using PCLMULQDQ Instruction"
# URL: http://www.intel.com/content/dam/www/public/us/en/documents
#  /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
#

#include <linux/linkage.h>

.text

# Names for the three argument registers of crc_t10dif_pcl.  buf and len are
# updated in place as data is consumed.
#define		init_crc	%edi
#define		buf		%rsi
#define		len		%rdx

# FOLD_CONSTS holds the pair of 64-bit multipliers for the fold step that is
# currently in progress; it is reloaded whenever the fold distance changes.
# BSWAP_MASK holds the pshufb control vector loaded from .Lbswap_mask.
#define		FOLD_CONSTS	%xmm10
#define		BSWAP_MASK	%xmm11

# Fold reg1, reg2 into the next 32 data bytes, storing the result back into
# reg1, reg2.
#
# Each register is carry-less multiplied half by half with FOLD_CONSTS
# (selector 0x00: low qword by low qword, selector 0x11: high qword by high
# qword).  Both products are then XORed together with the byte-swapped 16
# data bytes loaded from offset(buf) for reg1 and offset+16(buf) for reg2.
#
# Requires FOLD_CONSTS and BSWAP_MASK to be loaded.
# Clobbers %xmm8, %xmm9, %xmm12 and %xmm13.  buf is not advanced.
.macro	fold_32_bytes	offset, reg1, reg2
	# Fetch the next 32 data bytes and byte-swap them.
	movdqu	\offset(buf), %xmm9
	movdqu	\offset+16(buf), %xmm12
	pshufb	BSWAP_MASK, %xmm9
	pshufb	BSWAP_MASK, %xmm12
	# Keep a copy of each accumulator so both halves can be multiplied.
	movdqa	\reg1, %xmm8
	movdqa	\reg2, %xmm13
	pclmulqdq	$0x00, FOLD_CONSTS, \reg1
	pclmulqdq	$0x11, FOLD_CONSTS, %xmm8
	pclmulqdq	$0x00, FOLD_CONSTS, \reg2
	pclmulqdq	$0x11, FOLD_CONSTS, %xmm13
	# Combine: new data XOR low product XOR high product.
	pxor	%xmm9 , \reg1
	xorps	%xmm8 , \reg1
	pxor	%xmm12, \reg2
	xorps	%xmm13, \reg2
.endm

# Fold src_reg into dst_reg.
# Both 64-bit halves of src_reg are carry-less multiplied by the matching
# halves of FOLD_CONSTS and the two products are XORed into dst_reg.
# Clobbers src_reg and %xmm8.
.macro	fold_16_bytes	src_reg, dst_reg
	movdqa	\src_reg, %xmm8
	pclmulqdq	$0x11, FOLD_CONSTS, \src_reg
	pclmulqdq	$0x00, FOLD_CONSTS, %xmm8
	pxor	%xmm8, \dst_reg
	xorps	\src_reg, \dst_reg
.endm

#
# u16 crc_t10dif_pcl(u16 init_crc, const u8 *buf, size_t len);
#
# Assumes len >= 16.
#
# In:	init_crc in %edi, buf in %rsi, len in %rdx
# Out:	CRC value, zero-extended, in %eax
# Clobbers: %rax, %rsi, %rdx, %xmm0-%xmm13, flags
#
# Register usage:
#	%xmm0-%xmm7	128-byte running accumulator in the main loop; after the
#			main loop only %xmm7 carries the running value
#	%xmm8, %xmm9, %xmm12, %xmm13	scratch for the fold macros
#	FOLD_CONSTS	multipliers for the current fold distance
#	BSWAP_MASK	byte-reversal shuffle control
#
# NOTE: len is compared with signed conditions (jl/jge) throughout, because
# it is deliberately biased below its true value so that going negative acts
# as the loop termination test.
#
.align 16
SYM_FUNC_START(crc_t10dif_pcl)

	movdqa	.Lbswap_mask(%rip), BSWAP_MASK

	# For sizes less than 256 bytes, we can't fold 128 bytes at a time.
	cmp	$256, len
	jl	.Lless_than_256_bytes

	# Load the first 128 data bytes.  Byte swapping is necessary to make the
	# bit order match the polynomial coefficient order.
	movdqu	16*0(buf), %xmm0
	movdqu	16*1(buf), %xmm1
	movdqu	16*2(buf), %xmm2
	movdqu	16*3(buf), %xmm3
	movdqu	16*4(buf), %xmm4
	movdqu	16*5(buf), %xmm5
	movdqu	16*6(buf), %xmm6
	movdqu	16*7(buf), %xmm7
	add	$128, buf
	pshufb	BSWAP_MASK, %xmm0
	pshufb	BSWAP_MASK, %xmm1
	pshufb	BSWAP_MASK, %xmm2
	pshufb	BSWAP_MASK, %xmm3
	pshufb	BSWAP_MASK, %xmm4
	pshufb	BSWAP_MASK, %xmm5
	pshufb	BSWAP_MASK, %xmm6
	pshufb	BSWAP_MASK, %xmm7

	# XOR the first 16 data *bits* with the initial CRC value.
	# After the byte swap the first data bytes sit in the most significant
	# word of %xmm0, which is word index 7.
	pxor	%xmm8, %xmm8
	pinsrw	$7, init_crc, %xmm8
	pxor	%xmm8, %xmm0

	movdqa	.Lfold_across_128_bytes_consts(%rip), FOLD_CONSTS

	# Subtract 128 for the 128 data bytes just consumed.  Subtract another
	# 128 to simplify the termination condition of the following loop.
	sub	$256, len

	# While >= 128 data bytes remain (not counting xmm0-7), fold the 128
	# bytes xmm0-7 into them, storing the result back into xmm0-7.
.Lfold_128_bytes_loop:
	fold_32_bytes	0, %xmm0, %xmm1
	fold_32_bytes	32, %xmm2, %xmm3
	fold_32_bytes	64, %xmm4, %xmm5
	fold_32_bytes	96, %xmm6, %xmm7
	add	$128, buf
	sub	$128, len
	jge	.Lfold_128_bytes_loop

	# Now fold the 112 bytes in xmm0-xmm6 into the 16 bytes in xmm7.

	# Fold across 64 bytes.
	movdqa	.Lfold_across_64_bytes_consts(%rip), FOLD_CONSTS
	fold_16_bytes	%xmm0, %xmm4
	fold_16_bytes	%xmm1, %xmm5
	fold_16_bytes	%xmm2, %xmm6
	fold_16_bytes	%xmm3, %xmm7
	# Fold across 32 bytes.
	movdqa	.Lfold_across_32_bytes_consts(%rip), FOLD_CONSTS
	fold_16_bytes	%xmm4, %xmm6
	fold_16_bytes	%xmm5, %xmm7
	# Fold across 16 bytes.
	movdqa	.Lfold_across_16_bytes_consts(%rip), FOLD_CONSTS
	fold_16_bytes	%xmm6, %xmm7

	# Add 128 to get the correct number of data bytes remaining in 0...127
	# (not counting xmm7), following the previous extra subtraction by 128.
	# Then subtract 16 to simplify the termination condition of the
	# following loop.
	add	$128-16, len

	# While >= 16 data bytes remain (not counting xmm7), fold the 16 bytes
	# xmm7 into them, storing the result back into xmm7.
	# The jl below consumes the flags set by the add above.
	jl	.Lfold_16_bytes_loop_done
.Lfold_16_bytes_loop:
	movdqa	%xmm7, %xmm8
	pclmulqdq	$0x11, FOLD_CONSTS, %xmm7
	pclmulqdq	$0x00, FOLD_CONSTS, %xmm8
	pxor	%xmm8, %xmm7
	movdqu	(buf), %xmm0
	pshufb	BSWAP_MASK, %xmm0
	pxor	%xmm0 , %xmm7
	add	$16, buf
	sub	$16, len
	jge	.Lfold_16_bytes_loop

.Lfold_16_bytes_loop_done:
	# Add 16 to get the correct number of data bytes remaining in 0...15
	# (not counting xmm7), following the previous extra subtraction by 16.
	add	$16, len
	je	.Lreduce_final_16_bytes

.Lhandle_partial_segment:
	# Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first 16
	# bytes are in xmm7 and the rest are the remaining data in 'buf'.  To do
	# this without needing a fold constant for each possible 'len', redivide
	# the bytes into a first chunk of 'len' bytes and a second chunk of 16
	# bytes, then fold the first chunk into the second.

	movdqa	%xmm7, %xmm2

	# xmm1 = last 16 original data bytes
	# This load ends exactly at the end of the buffer and overlaps bytes
	# that were already consumed; it stays in bounds because the total
	# length is at least 16.
	movdqu	-16(buf, len), %xmm1
	pshufb	BSWAP_MASK, %xmm1

	# xmm2 = high order part of second chunk: xmm7 left-shifted by 'len' bytes.
	# rax = &byteshift_table[16 - len]
	lea	.Lbyteshift_table+16(%rip), %rax
	sub	len, %rax
	movdqu	(%rax), %xmm0
	pshufb	%xmm0, %xmm2

	# xmm7 = first chunk: xmm7 right-shifted by '16-len' bytes.
	# Flipping bit 7 of every control byte turns the left-shift control
	# into the matching right-shift control.
	pxor	.Lmask1(%rip), %xmm0
	pshufb	%xmm0, %xmm7

	# xmm1 = second chunk: 'len' bytes from xmm1 (low-order bytes),
	# then '16-len' bytes from xmm2 (high-order bytes).
	pblendvb	%xmm2, %xmm1	#xmm0 is implicit

	# Fold the first chunk into the second chunk, storing the result in xmm7.
	movdqa	%xmm7, %xmm8
	pclmulqdq	$0x11, FOLD_CONSTS, %xmm7
	pclmulqdq	$0x00, FOLD_CONSTS, %xmm8
	pxor	%xmm8, %xmm7
	pxor	%xmm1, %xmm7

.Lreduce_final_16_bytes:
	# Reduce the 128-bit value M(x), stored in xmm7, to the final 16-bit CRC

	# Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
	movdqa	.Lfinal_fold_consts(%rip), FOLD_CONSTS

	# Fold the high 64 bits into the low 64 bits, while also multiplying by
	# x^64.  This produces a 128-bit value congruent to x^64 * M(x) and
	# whose low 48 bits are 0.
	movdqa	%xmm7, %xmm0
	pclmulqdq	$0x11, FOLD_CONSTS, %xmm7	# high bits * x^48 * (x^80 mod G(x))
	pslldq	$8, %xmm0
	pxor	%xmm0, %xmm7			# + low bits * x^64

	# Fold the high 32 bits into the low 96 bits.  This produces a 96-bit
	# value congruent to x^64 * M(x) and whose low 48 bits are 0.
	movdqa	%xmm7, %xmm0
	pand	.Lmask2(%rip), %xmm0		# zero high 32 bits
	psrldq	$12, %xmm7			# extract high 32 bits
	pclmulqdq	$0x00, FOLD_CONSTS, %xmm7	# high 32 bits * x^48 * (x^48 mod G(x))
	pxor	%xmm0, %xmm7			# + low bits

	# Load G(x) and floor(x^48 / G(x)).
	movdqa	.Lbarrett_reduction_consts(%rip), FOLD_CONSTS

	# Use Barrett reduction to compute the final CRC value.
	movdqa	%xmm7, %xmm0
	pclmulqdq	$0x11, FOLD_CONSTS, %xmm7	# high 32 bits * floor(x^48 / G(x))
	psrlq	$32, %xmm7			# /= x^32
	pclmulqdq	$0x00, FOLD_CONSTS, %xmm7	# *= G(x)
	psrlq	$48, %xmm0
	pxor	%xmm7, %xmm0			# + low 16 nonzero bits
	# Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of xmm0.

	pextrw	$0, %xmm0, %eax
	RET

.align 16
.Lless_than_256_bytes:
	# Checksumming a buffer of length 16...255 bytes

	# Load the first 16 data bytes.
	movdqu	(buf), %xmm7
	pshufb	BSWAP_MASK, %xmm7
	add	$16, buf

	# XOR the first 16 data *bits* with the initial CRC value.
	pxor	%xmm0, %xmm0
	pinsrw	$7, init_crc, %xmm0
	pxor	%xmm0, %xmm7

	# Join the shared tail code with len biased the way each entry point
	# expects it.
	movdqa	.Lfold_across_16_bytes_consts(%rip), FOLD_CONSTS
	cmp	$16, len
	je	.Lreduce_final_16_bytes		# len == 16
	sub	$32, len
	jge	.Lfold_16_bytes_loop		# 32 <= len <= 255
	add	$16, len
	jmp	.Lhandle_partial_segment	# 17 <= len <= 31
SYM_FUNC_END(crc_t10dif_pcl)

.section	.rodata, "a", @progbits
.align 16

# Fold constants precomputed from the polynomial 0x18bb7
# G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
#
# Each label names a 16-byte pair that is loaded into FOLD_CONSTS as a unit:
# the first quad is the low qword (selector 0x00), the second quad is the
# high qword (selector 0x11).
.Lfold_across_128_bytes_consts:
	.quad		0x0000000000006123	# x^(8*128)	mod G(x)
	.quad		0x0000000000002295	# x^(8*128+64)	mod G(x)
.Lfold_across_64_bytes_consts:
	.quad		0x0000000000001069	# x^(4*128)	mod G(x)
	.quad		0x000000000000dd31	# x^(4*128+64)	mod G(x)
.Lfold_across_32_bytes_consts:
	.quad		0x000000000000857d	# x^(2*128)	mod G(x)
	.quad		0x0000000000007acc	# x^(2*128+64)	mod G(x)
.Lfold_across_16_bytes_consts:
	.quad		0x000000000000a010	# x^(1*128)	mod G(x)
	.quad		0x0000000000001faa	# x^(1*128+64)	mod G(x)
.Lfinal_fold_consts:
	.quad		0x1368000000000000	# x^48 * (x^48 mod G(x))
	.quad		0x2d56000000000000	# x^48 * (x^80 mod G(x))
.Lbarrett_reduction_consts:
	.quad		0x0000000000018bb7	# G(x)
	.quad		0x00000001f65a57f8	# floor(x^48 / G(x))

# Bit 7 set in every byte; XORed into a shuffle control vector to invert
# which lanes pshufb zeroes.
.section	.rodata.cst16.mask1, "aM", @progbits, 16
.align 16
.Lmask1:
	.octa	0x80808080808080808080808080808080

# AND mask that clears the high 32 bits of a 128-bit value.
.section	.rodata.cst16.mask2, "aM", @progbits, 16
.align 16
.Lmask2:
	.octa	0x00000000FFFFFFFFFFFFFFFFFFFFFFFF

# pshufb control vector that reverses the order of the 16 bytes.
.section	.rodata.cst16.bswap_mask, "aM", @progbits, 16
.align 16
.Lbswap_mask:
	.octa	0x000102030405060708090A0B0C0D0E0F

.section	.rodata.cst32.byteshift_table, "aM", @progbits, 32
.align 16
# For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 - len]
# is the index vector to shift left by 'len' bytes, and is also {0x80, ...,
# 0x80} XOR the index vector to shift right by '16 - len' bytes.
.Lbyteshift_table:
	.byte		 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
	.byte		0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe , 0x0