########################################################################
# Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
#
# Copyright (c) 2013, Intel Corporation
#
# Authors:
#     Erdinc Ozturk <erdinc.ozturk@intel.com>
#     Vinodh Gopal <vinodh.gopal@intel.com>
#     James Guilford <james.guilford@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses. You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright
#   notice, this list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright
#   notice, this list of conditions and the following disclaimer in the
#   documentation and/or other materials provided with the
#   distribution.
#
# * Neither the name of the Intel Corporation nor the names of its
#   contributors may be used to endorse or promote products derived from
#   this software without specific prior written permission.
#
#
# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# Reference paper titled "Fast CRC Computation for Generic
# Polynomials Using PCLMULQDQ Instruction"
# URL: http://www.intel.com/content/dam/www/public/us/en/documents
# /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
#

#include <linux/linkage.h>

.text

# Argument registers of crc_t10dif_pcl(): %edi, %rsi and %rdx carry the
# first three integer arguments under the System V AMD64 calling convention.
#define	init_crc	%edi
#define	buf		%rsi
#define	len		%rdx

# Registers with a fixed role for the whole routine:
#   FOLD_CONSTS  the pair of 64-bit fold multipliers currently in use
#   BSWAP_MASK   pshufb index vector that reverses the 16 bytes of a register
#define	FOLD_CONSTS	%xmm10
#define	BSWAP_MASK	%xmm11

# Fold reg1, reg2 into the next 32 data bytes, storing the result back into
# reg1, reg2.
#
# Expects:  FOLD_CONSTS and BSWAP_MASK already loaded; 'offset' is a byte
#           displacement from buf, and 32 bytes are read starting there
#           (unaligned loads, so buf needs no particular alignment).
# Computes, for each of the two registers:
#           reg = (reg.lo * FOLD_CONSTS.lo) ^ (reg.hi * FOLD_CONSTS.hi) ^ data
#           where '*' is the carry-less multiply (pclmulqdq selector 0x00
#           takes the low quadwords, 0x11 the high quadwords) and 'data' is
#           the byte-reversed 16 bytes loaded for that register.
# Clobbers: %xmm8, %xmm9, %xmm12, %xmm13.  buf itself is not advanced.
.macro	fold_32_bytes	offset, reg1, reg2
	movdqu	\offset(buf), %xmm9
	movdqu	\offset+16(buf), %xmm12
	pshufb	BSWAP_MASK, %xmm9
	pshufb	BSWAP_MASK, %xmm12
	movdqa	\reg1, %xmm8
	movdqa	\reg2, %xmm13
	pclmulqdq	$0x00, FOLD_CONSTS, \reg1
	pclmulqdq	$0x11, FOLD_CONSTS, %xmm8
	pclmulqdq	$0x00, FOLD_CONSTS, \reg2
	pclmulqdq	$0x11, FOLD_CONSTS, %xmm13
	pxor	%xmm9 , \reg1
	xorps	%xmm8 , \reg1
	pxor	%xmm12, \reg2
	xorps	%xmm13, \reg2
.endm

# Fold src_reg into dst_reg.
# Computes dst_reg ^= (src_reg.lo * FOLD_CONSTS.lo) ^ (src_reg.hi * FOLD_CONSTS.hi)
# with carry-less multiplication.  Expects FOLD_CONSTS to be loaded.
# Clobbers: %xmm8, and src_reg (left holding the high-half product).
.macro	fold_16_bytes	src_reg, dst_reg
	movdqa	\src_reg, %xmm8
	pclmulqdq	$0x11, FOLD_CONSTS, \src_reg
	pclmulqdq	$0x00, FOLD_CONSTS, %xmm8
	pxor	%xmm8, \dst_reg
	xorps	\src_reg, \dst_reg
.endm

#
# u16 crc_t10dif_pcl(u16 init_crc, const u8 *buf, size_t len);
#
# Assumes len >= 16.
#
# In:       init_crc (%edi) = initial CRC value; only its low 16 bits are used
#           buf (%rsi)      = data pointer; read with unaligned loads
#           len (%rdx)      = byte count; note that the length checks below
#                             use signed condition codes (jl / jge)
# Out:      %eax            = CRC in the low 16 bits, upper bits zero
# Clobbers: %rax, %rsi, %rdx, %xmm0-%xmm13, flags
#
SYM_FUNC_START(crc_t10dif_pcl)

	movdqa	.Lbswap_mask(%rip), BSWAP_MASK

	# For sizes less than 256 bytes, we can't fold 128 bytes at a time.
	cmp	$256, len
	jl	.Lless_than_256_bytes

	# Load the first 128 data bytes.  Byte swapping is necessary to make the
	# bit order match the polynomial coefficient order.
	movdqu	16*0(buf), %xmm0
	movdqu	16*1(buf), %xmm1
	movdqu	16*2(buf), %xmm2
	movdqu	16*3(buf), %xmm3
	movdqu	16*4(buf), %xmm4
	movdqu	16*5(buf), %xmm5
	movdqu	16*6(buf), %xmm6
	movdqu	16*7(buf), %xmm7
	add	$128, buf
	pshufb	BSWAP_MASK, %xmm0
	pshufb	BSWAP_MASK, %xmm1
	pshufb	BSWAP_MASK, %xmm2
	pshufb	BSWAP_MASK, %xmm3
	pshufb	BSWAP_MASK, %xmm4
	pshufb	BSWAP_MASK, %xmm5
	pshufb	BSWAP_MASK, %xmm6
	pshufb	BSWAP_MASK, %xmm7

	# XOR the first 16 data *bits* with the initial CRC value.
	# (Word 7 is the most significant word of the byte-swapped block.)
	pxor	%xmm8, %xmm8
	pinsrw	$7, init_crc, %xmm8
	pxor	%xmm8, %xmm0

	movdqa	.Lfold_across_128_bytes_consts(%rip), FOLD_CONSTS

	# Subtract 128 for the 128 data bytes just consumed.  Subtract another
	# 128 to simplify the termination condition of the following loop.
	sub	$256, len

	# While >= 128 data bytes remain (not counting xmm0-7), fold the 128
	# bytes xmm0-7 into them, storing the result back into xmm0-7.
.Lfold_128_bytes_loop:
	fold_32_bytes	0, %xmm0, %xmm1
	fold_32_bytes	32, %xmm2, %xmm3
	fold_32_bytes	64, %xmm4, %xmm5
	fold_32_bytes	96, %xmm6, %xmm7
	add	$128, buf
	sub	$128, len
	jge	.Lfold_128_bytes_loop

	# Now fold the 112 bytes in xmm0-xmm6 into the 16 bytes in xmm7.

	# Fold across 64 bytes.
	movdqa	.Lfold_across_64_bytes_consts(%rip), FOLD_CONSTS
	fold_16_bytes	%xmm0, %xmm4
	fold_16_bytes	%xmm1, %xmm5
	fold_16_bytes	%xmm2, %xmm6
	fold_16_bytes	%xmm3, %xmm7
	# Fold across 32 bytes.
	movdqa	.Lfold_across_32_bytes_consts(%rip), FOLD_CONSTS
	fold_16_bytes	%xmm4, %xmm6
	fold_16_bytes	%xmm5, %xmm7
	# Fold across 16 bytes.
	movdqa	.Lfold_across_16_bytes_consts(%rip), FOLD_CONSTS
	fold_16_bytes	%xmm6, %xmm7

	# Add 128 to get the correct number of data bytes remaining in 0...127
	# (not counting xmm7), following the previous extra subtraction by 128.
	# Then subtract 16 to simplify the termination condition of the
	# following loop.
	add	$128-16, len

	# While >= 16 data bytes remain (not counting xmm7), fold the 16 bytes
	# xmm7 into them, storing the result back into xmm7.
	# The jl below tests the flags produced by the add above.
	jl	.Lfold_16_bytes_loop_done
.Lfold_16_bytes_loop:
	movdqa	%xmm7, %xmm8
	pclmulqdq	$0x11, FOLD_CONSTS, %xmm7
	pclmulqdq	$0x00, FOLD_CONSTS, %xmm8
	pxor	%xmm8, %xmm7
	movdqu	(buf), %xmm0
	pshufb	BSWAP_MASK, %xmm0
	pxor	%xmm0 , %xmm7
	add	$16, buf
	sub	$16, len
	jge	.Lfold_16_bytes_loop

.Lfold_16_bytes_loop_done:
	# Add 16 to get the correct number of data bytes remaining in 0...15
	# (not counting xmm7), following the previous extra subtraction by 16.
	add	$16, len
	je	.Lreduce_final_16_bytes

.Lhandle_partial_segment:
	# Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first 16
	# bytes are in xmm7 and the rest are the remaining data in 'buf'.  To do
	# this without needing a fold constant for each possible 'len', redivide
	# the bytes into a first chunk of 'len' bytes and a second chunk of 16
	# bytes, then fold the first chunk into the second.
	# FOLD_CONSTS must hold the fold-across-16-bytes constants here; every
	# path that reaches this label has loaded them.

	movdqa	%xmm7, %xmm2

	# xmm1 = last 16 original data bytes
	movdqu	-16(buf, len), %xmm1
	pshufb	BSWAP_MASK, %xmm1

	# xmm2 = high order part of second chunk: xmm7 left-shifted by 'len' bytes.
	lea	.Lbyteshift_table+16(%rip), %rax
	sub	len, %rax
	movdqu	(%rax), %xmm0
	pshufb	%xmm0, %xmm2

	# xmm7 = first chunk: xmm7 right-shifted by '16-len' bytes.
	pxor	.Lmask1(%rip), %xmm0
	pshufb	%xmm0, %xmm7

	# xmm1 = second chunk: 'len' bytes from xmm1 (low-order bytes),
	# then '16-len' bytes from xmm2 (high-order bytes).
	pblendvb	%xmm2, %xmm1	#xmm0 is implicit

	# Fold the first chunk into the second chunk, storing the result in xmm7.
	movdqa	%xmm7, %xmm8
	pclmulqdq	$0x11, FOLD_CONSTS, %xmm7
	pclmulqdq	$0x00, FOLD_CONSTS, %xmm8
	pxor	%xmm8, %xmm7
	pxor	%xmm1, %xmm7

.Lreduce_final_16_bytes:
	# Reduce the 128-bit value M(x), stored in xmm7, to the final 16-bit CRC

	# Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
	movdqa	.Lfinal_fold_consts(%rip), FOLD_CONSTS

	# Fold the high 64 bits into the low 64 bits, while also multiplying by
	# x^64.  This produces a 128-bit value congruent to x^64 * M(x) and
	# whose low 48 bits are 0.
	movdqa	%xmm7, %xmm0
	pclmulqdq	$0x11, FOLD_CONSTS, %xmm7 # high bits * x^48 * (x^80 mod G(x))
	pslldq	$8, %xmm0
	pxor	%xmm0, %xmm7			  # + low bits * x^64

	# Fold the high 32 bits into the low 96 bits.  This produces a 96-bit
	# value congruent to x^64 * M(x) and whose low 48 bits are 0.
	movdqa	%xmm7, %xmm0
	pand	.Lmask2(%rip), %xmm0		  # zero high 32 bits
	psrldq	$12, %xmm7			  # extract high 32 bits
	pclmulqdq	$0x00, FOLD_CONSTS, %xmm7 # high 32 bits * x^48 * (x^48 mod G(x))
	pxor	%xmm0, %xmm7			  # + low bits

	# Load G(x) and floor(x^48 / G(x)).
	movdqa	.Lbarrett_reduction_consts(%rip), FOLD_CONSTS

	# Use Barrett reduction to compute the final CRC value.
	movdqa	%xmm7, %xmm0
	pclmulqdq	$0x11, FOLD_CONSTS, %xmm7 # high 32 bits * floor(x^48 / G(x))
	psrlq	$32, %xmm7			  # /= x^32
	pclmulqdq	$0x00, FOLD_CONSTS, %xmm7 # *= G(x)
	psrlq	$48, %xmm0
	pxor	%xmm7, %xmm0			  # + low 16 nonzero bits
	# Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of xmm0.

	pextrw	$0, %xmm0, %eax
	RET

.align 16
.Lless_than_256_bytes:
	# Checksumming a buffer of length 16...255 bytes

	# Load the first 16 data bytes.
	movdqu	(buf), %xmm7
	pshufb	BSWAP_MASK, %xmm7
	add	$16, buf

	# XOR the first 16 data *bits* with the initial CRC value.
	pxor	%xmm0, %xmm0
	pinsrw	$7, init_crc, %xmm0
	pxor	%xmm0, %xmm7

	# The targets below expect len pre-biased the same way the main path
	# leaves it: minus 32 on entry to the 16-byte loop (16 bytes consumed
	# plus the loop bias of 16), and the plain remaining count (1...15) on
	# entry to the partial-segment handler.
	movdqa	.Lfold_across_16_bytes_consts(%rip), FOLD_CONSTS
	cmp	$16, len
	je	.Lreduce_final_16_bytes		# len == 16
	sub	$32, len
	jge	.Lfold_16_bytes_loop		# 32 <= len <= 255
	add	$16, len
	jmp	.Lhandle_partial_segment	# 17 <= len <= 31
SYM_FUNC_END(crc_t10dif_pcl)

.section	.rodata, "a", @progbits
.align 16

# Fold constants precomputed from the polynomial 0x18bb7
# G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
#
# Each pair is loaded as one 16-byte vector: the first quadword is the low
# half (used with pclmulqdq selector 0x00), the second the high half (0x11).
# The pairs are read with movdqa, so the 16-byte alignment above is required.
.Lfold_across_128_bytes_consts:
	.quad		0x0000000000006123	# x^(8*128)	mod G(x)
	.quad		0x0000000000002295	# x^(8*128+64)	mod G(x)
.Lfold_across_64_bytes_consts:
	.quad		0x0000000000001069	# x^(4*128)	mod G(x)
	.quad		0x000000000000dd31	# x^(4*128+64)	mod G(x)
.Lfold_across_32_bytes_consts:
	.quad		0x000000000000857d	# x^(2*128)	mod G(x)
	.quad		0x0000000000007acc	# x^(2*128+64)	mod G(x)
.Lfold_across_16_bytes_consts:
	.quad		0x000000000000a010	# x^(1*128)	mod G(x)
	.quad		0x0000000000001faa	# x^(1*128+64)	mod G(x)
.Lfinal_fold_consts:
	.quad		0x1368000000000000	# x^48 * (x^48 mod G(x))
	.quad		0x2d56000000000000	# x^48 * (x^80 mod G(x))
.Lbarrett_reduction_consts:
	.quad		0x0000000000018bb7	# G(x)
	.quad		0x00000001f65a57f8	# floor(x^48 / G(x))

.section	.rodata.cst16.mask1, "aM", @progbits, 16
.align 16
# Bit 7 set in every byte: XORed into a byteshift_table vector to turn the
# left-shift index vector into the matching right-shift index vector.
.Lmask1:
	.octa	0x80808080808080808080808080808080

.section	.rodata.cst16.mask2, "aM", @progbits, 16
.align 16
# Keeps the low 96 bits and clears the high 32 bits.
.Lmask2:
	.octa	0x00000000FFFFFFFFFFFFFFFFFFFFFFFF

.section	.rodata.cst16.bswap_mask, "aM", @progbits, 16
.align 16
# pshufb index vector that reverses the byte order of a 16-byte register.
.Lbswap_mask:
	.octa	0x000102030405060708090A0B0C0D0E0F

.section	.rodata.cst32.byteshift_table, "aM", @progbits, 32
.align 16
# For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 - len]
# is the index vector to shift left by 'len' bytes, and is also {0x80, ...,
# 0x80} XOR the index vector to shift right by '16 - len' bytes.
.Lbyteshift_table:
	.byte		 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
	.byte		0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe , 0x0