/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4 Cipher Algorithm for ARMv8 NEON
 * as specified in
 * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
 *
 * Copyright (C) 2022, Alibaba Group.
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

/*
 * Target: AArch64, GNU as syntax, run through the C preprocessor.
 *
 * Register usage shared by every routine in this file:
 *   x0       round key pointer; advanced while encrypting and rewound by
 *            the SM4_CRYPT_BLK* macros before they finish
 *   x5       scratch, clobbered by SM4_PREPARE
 *   x6       round-loop counter, clobbered by the SM4_CRYPT_BLK* macros
 *   v0-v7    cipher state (up to eight blocks)
 *   v8-v15   temporaries / round key / IV (see the aliases below)
 *   v16-v31  the 256-byte S-box, loaded by SM4_PREPARE
 *
 * NOTE(review): v8-v15 are callee-saved (low 64 bits) under AAPCS64 and
 * are NOT preserved by this file. Presumably every caller brackets these
 * routines with kernel_neon_begin()/kernel_neon_end() - confirm against
 * the glue code.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/* Register macros */

#define RTMP0	v8
#define RTMP1	v9
#define RTMP2	v10
#define RTMP3	v11

#define RTMP4	v12
#define RTMP5	v13
#define RTMP6	v14
#define RTMP7	v15

/*
 * RX0/RX1/RKEY/RIV deliberately alias RTMP4..RTMP7: any macro that uses
 * RTMP4-RTMP7 therefore destroys the round key and the IV.
 */
#define RX0	v12
#define RX1	v13
#define RKEY	v14
#define RIV	v15

/* Helper macros. */

/*
 * Load the 256-byte table at crypto_sm4_sbox into v16-v31.
 * Clobbers: x5.
 */
#define SM4_PREPARE()                                       \
	adr_l		x5, crypto_sm4_sbox;                \
	ld1		{v16.16b-v19.16b}, [x5], #64;       \
	ld1		{v20.16b-v23.16b}, [x5], #64;       \
	ld1		{v24.16b-v27.16b}, [x5], #64;       \
	ld1		{v28.16b-v31.16b}, [x5];

/*
 * Transpose the 4x4 matrix of 32-bit words held in s0..s3 in place:
 * afterwards s<i> holds word <i> of each of the four input registers.
 * Clobbers: RTMP0-RTMP3.
 */
#define transpose_4x4(s0, s1, s2, s3)                       \
	zip1		RTMP0.4s, s0.4s, s1.4s;             \
	zip1		RTMP1.4s, s2.4s, s3.4s;             \
	zip2		RTMP2.4s, s0.4s, s1.4s;             \
	zip2		RTMP3.4s, s2.4s, s3.4s;             \
	zip1		s0.2d, RTMP0.2d, RTMP1.2d;          \
	zip2		s1.2d, RTMP0.2d, RTMP1.2d;          \
	zip1		s2.2d, RTMP2.2d, RTMP3.2d;          \
	zip2		s3.2d, RTMP2.2d, RTMP3.2d;

/*
 * Two independent 4x4 word transposes: s0..s3 and s4..s7.
 * Clobbers: RTMP0-RTMP7 (and therefore RX0, RX1, RKEY and RIV).
 */
#define transpose_4x4_2x(s0, s1, s2, s3, s4, s5, s6, s7)    \
	zip1		RTMP0.4s, s0.4s, s1.4s;             \
	zip1		RTMP1.4s, s2.4s, s3.4s;             \
	zip2		RTMP2.4s, s0.4s, s1.4s;             \
	zip2		RTMP3.4s, s2.4s, s3.4s;             \
	zip1		RTMP4.4s, s4.4s, s5.4s;             \
	zip1		RTMP5.4s, s6.4s, s7.4s;             \
	zip2		RTMP6.4s, s4.4s, s5.4s;             \
	zip2		RTMP7.4s, s6.4s, s7.4s;             \
	zip1		s0.2d, RTMP0.2d, RTMP1.2d;          \
	zip2		s1.2d, RTMP0.2d, RTMP1.2d;          \
	zip1		s2.2d, RTMP2.2d, RTMP3.2d;          \
	zip2		s3.2d, RTMP2.2d, RTMP3.2d;          \
	zip1		s4.2d, RTMP4.2d, RTMP5.2d;          \
	zip2		s5.2d, RTMP4.2d, RTMP5.2d;          \
	zip1		s6.2d, RTMP6.2d, RTMP7.2d;          \
	zip2		s7.2d, RTMP6.2d, RTMP7.2d;

/*
 * Transpose s0..s3 and reverse the word order inside every result
 * register: afterwards s<i> = { s3[i], s2[i], s1[i], s0[i] }. This turns
 * the word-sliced cipher state back into one block per register with the
 * four state words in reversed order.
 * Clobbers: RTMP0-RTMP3 only (RKEY and RIV survive).
 */
#define rotate_clockwise_4x4(s0, s1, s2, s3)                \
	zip1		RTMP0.4s, s1.4s, s0.4s;             \
	zip2		RTMP1.4s, s1.4s, s0.4s;             \
	zip1		RTMP2.4s, s3.4s, s2.4s;             \
	zip2		RTMP3.4s, s3.4s, s2.4s;             \
	zip1		s0.2d, RTMP2.2d, RTMP0.2d;          \
	zip2		s1.2d, RTMP2.2d, RTMP0.2d;          \
	zip1		s2.2d, RTMP3.2d, RTMP1.2d;          \
	zip2		s3.2d, RTMP3.2d, RTMP1.2d;

/*
 * rotate_clockwise_4x4 applied to s0..s3 and to s4..s7.
 * Clobbers: RTMP0-RTMP7 (and therefore RX0, RX1, RKEY and RIV).
 */
#define rotate_clockwise_4x4_2x(s0, s1, s2, s3, s4, s5, s6, s7) \
	zip1		RTMP0.4s, s1.4s, s0.4s;             \
	zip1		RTMP2.4s, s3.4s, s2.4s;             \
	zip2		RTMP1.4s, s1.4s, s0.4s;             \
	zip2		RTMP3.4s, s3.4s, s2.4s;             \
	zip1		RTMP4.4s, s5.4s, s4.4s;             \
	zip1		RTMP6.4s, s7.4s, s6.4s;             \
	zip2		RTMP5.4s, s5.4s, s4.4s;             \
	zip2		RTMP7.4s, s7.4s, s6.4s;             \
	zip1		s0.2d, RTMP2.2d, RTMP0.2d;          \
	zip2		s1.2d, RTMP2.2d, RTMP0.2d;          \
	zip1		s2.2d, RTMP3.2d, RTMP1.2d;          \
	zip2		s3.2d, RTMP3.2d, RTMP1.2d;          \
	zip1		s4.2d, RTMP6.2d, RTMP4.2d;          \
	zip2		s5.2d, RTMP6.2d, RTMP4.2d;          \
	zip1		s6.2d, RTMP7.2d, RTMP5.2d;          \
	zip2		s7.2d, RTMP7.2d, RTMP5.2d;

/*
 * One SM4 round on four word-sliced blocks:
 *   x  = sbox(rk ^ s1 ^ s2 ^ s3)
 *   s0 ^= x ^ rol32(x, 24) ^ rol32(x ^ rol32(x, 8) ^ rol32(x, 16), 2)
 * where rk is word 'round' (0..3) of RKEY.
 *
 * The 256-entry S-box is looked up 64 bytes at a time: tbl yields 0 for
 * an index outside 0..63 and tbx leaves the destination byte untouched,
 * so subtracting 64 from the indices between lookups walks the four
 * table quarters held in v16-v31.
 *
 * Clobbers: RX0, RTMP0-RTMP3.
 */
#define ROUND4(round, s0, s1, s2, s3)                           \
	dup		RX0.4s, RKEY.s[round];                  \
	/* rk ^ s1 ^ s2 ^ s3 */                                 \
	eor		RTMP1.16b, s2.16b, s3.16b;              \
	eor		RX0.16b, RX0.16b, s1.16b;               \
	eor		RX0.16b, RX0.16b, RTMP1.16b;            \
	                                                        \
	/* sbox, non-linear part */                             \
	movi		RTMP3.16b, #64; /* sizeof(sbox) / 4 */  \
	tbl		RTMP0.16b, {v16.16b-v19.16b}, RX0.16b;  \
	sub		RX0.16b, RX0.16b, RTMP3.16b;            \
	tbx		RTMP0.16b, {v20.16b-v23.16b}, RX0.16b;  \
	sub		RX0.16b, RX0.16b, RTMP3.16b;            \
	tbx		RTMP0.16b, {v24.16b-v27.16b}, RX0.16b;  \
	sub		RX0.16b, RX0.16b, RTMP3.16b;            \
	tbx		RTMP0.16b, {v28.16b-v31.16b}, RX0.16b;  \
	                                                        \
	/* linear part */                                       \
	shl		RTMP1.4s, RTMP0.4s, #8;                 \
	shl		RTMP2.4s, RTMP0.4s, #16;                \
	shl		RTMP3.4s, RTMP0.4s, #24;                \
	sri		RTMP1.4s, RTMP0.4s, #(32-8);            \
	sri		RTMP2.4s, RTMP0.4s, #(32-16);           \
	sri		RTMP3.4s, RTMP0.4s, #(32-24);           \
	/* RTMP1 = x ^ rol32(x, 8) ^ rol32(x, 16) */            \
	eor		RTMP1.16b, RTMP1.16b, RTMP0.16b;        \
	eor		RTMP1.16b, RTMP1.16b, RTMP2.16b;        \
	/* RTMP3 = x ^ rol32(x, 24) ^ rol32(RTMP1, 2) */        \
	eor		RTMP3.16b, RTMP3.16b, RTMP0.16b;        \
	shl		RTMP2.4s, RTMP1.4s, #2;                 \
	sri		RTMP2.4s, RTMP1.4s, #(32-2);            \
	eor		RTMP3.16b, RTMP3.16b, RTMP2.16b;        \
	/* s0 ^= RTMP3 */                                       \
	eor		s0.16b, s0.16b, RTMP3.16b;

/*
 * Run all 32 SM4 rounds (8 iterations of 4 rounds) over four word-sliced
 * blocks b0..b3 whose words have ALREADY been byte-swapped with rev32,
 * then byte-swap the result and rotate it back to one block per register.
 *
 * In:       x0 = round keys (8 x 16 bytes are read)
 * Out:      b0..b3 = four output blocks; x0 restored to its entry value
 * Clobbers: x6, condition flags, RKEY, RX0, RTMP0-RTMP3
 * Uses numeric local label 4.
 */
#define SM4_CRYPT_BLK4_BE(b0, b1, b2, b3)                   \
	mov		x6, #8;                             \
4:                                                          \
	ld1		{RKEY.4s}, [x0], #16;               \
	subs		x6, x6, #1;                         \
	                                                    \
	ROUND4(0, b0, b1, b2, b3);                          \
	ROUND4(1, b1, b2, b3, b0);                          \
	ROUND4(2, b2, b3, b0, b1);                          \
	ROUND4(3, b3, b0, b1, b2);                          \
	                                                    \
	bne		4b;                                 \
	                                                    \
	rev32		b0.16b, b0.16b;                     \
	rev32		b1.16b, b1.16b;                     \
	rev32		b2.16b, b2.16b;                     \
	rev32		b3.16b, b3.16b;                     \
	                                                    \
	rotate_clockwise_4x4(b0, b1, b2, b3);               \
	                                                    \
	/* repoint to rkey */                               \
	sub		x0, x0, #128;

/*
 * Same as SM4_CRYPT_BLK4_BE, but byte-swaps every 32-bit word of the
 * (already word-sliced) input first.
 */
#define SM4_CRYPT_BLK4(b0, b1, b2, b3)                      \
	rev32		b0.16b, b0.16b;                     \
	rev32		b1.16b, b1.16b;                     \
	rev32		b2.16b, b2.16b;                     \
	rev32		b3.16b, b3.16b;                     \
	SM4_CRYPT_BLK4_BE(b0, b1, b2, b3);

/*
 * One SM4 round on eight word-sliced blocks: the ROUND4 computation
 * applied to (s0..s3) and to (t0..t3), interleaved.
 *
 * Clobbers: RX0, RX1, RTMP0-RTMP3. RKEY is only read and RIV is untouched.
 */
#define ROUND8(round, s0, s1, s2, s3, t0, t1, t2, t3)           \
	/* rk ^ s1 ^ s2 ^ s3 */                                 \
	dup		RX0.4s, RKEY.s[round];                  \
	eor		RTMP0.16b, s2.16b, s3.16b;              \
	mov		RX1.16b, RX0.16b;                       \
	eor		RTMP1.16b, t2.16b, t3.16b;              \
	eor		RX0.16b, RX0.16b, s1.16b;               \
	eor		RX1.16b, RX1.16b, t1.16b;               \
	eor		RX0.16b, RX0.16b, RTMP0.16b;            \
	eor		RX1.16b, RX1.16b, RTMP1.16b;            \
	                                                        \
	/* sbox, non-linear part */                             \
	movi		RTMP3.16b, #64; /* sizeof(sbox) / 4 */  \
	tbl		RTMP0.16b, {v16.16b-v19.16b}, RX0.16b;  \
	tbl		RTMP1.16b, {v16.16b-v19.16b}, RX1.16b;  \
	sub		RX0.16b, RX0.16b, RTMP3.16b;            \
	sub		RX1.16b, RX1.16b, RTMP3.16b;            \
	tbx		RTMP0.16b, {v20.16b-v23.16b}, RX0.16b;  \
	tbx		RTMP1.16b, {v20.16b-v23.16b}, RX1.16b;  \
	sub		RX0.16b, RX0.16b, RTMP3.16b;            \
	sub		RX1.16b, RX1.16b, RTMP3.16b;            \
	tbx		RTMP0.16b, {v24.16b-v27.16b}, RX0.16b;  \
	tbx		RTMP1.16b, {v24.16b-v27.16b}, RX1.16b;  \
	sub		RX0.16b, RX0.16b, RTMP3.16b;            \
	sub		RX1.16b, RX1.16b, RTMP3.16b;            \
	tbx		RTMP0.16b, {v28.16b-v31.16b}, RX0.16b;  \
	tbx		RTMP1.16b, {v28.16b-v31.16b}, RX1.16b;  \
	                                                        \
	/* linear part */                                       \
	shl		RX0.4s, RTMP0.4s, #8;                   \
	shl		RX1.4s, RTMP1.4s, #8;                   \
	shl		RTMP2.4s, RTMP0.4s, #16;                \
	shl		RTMP3.4s, RTMP1.4s, #16;                \
	sri		RX0.4s, RTMP0.4s, #(32 - 8);            \
	sri		RX1.4s, RTMP1.4s, #(32 - 8);            \
	sri		RTMP2.4s, RTMP0.4s, #(32 - 16);         \
	sri		RTMP3.4s, RTMP1.4s, #(32 - 16);         \
	/* RX = x ^ rol32(x, 8) ^ rol32(x, 16) */               \
	eor		RX0.16b, RX0.16b, RTMP0.16b;            \
	eor		RX1.16b, RX1.16b, RTMP1.16b;            \
	eor		RX0.16b, RX0.16b, RTMP2.16b;            \
	eor		RX1.16b, RX1.16b, RTMP3.16b;            \
	/* RTMP0/1 ^= x ^ rol32(x, 24) ^ rol32(RX, 2) */        \
	shl		RTMP2.4s, RTMP0.4s, #24;                \
	shl		RTMP3.4s, RTMP1.4s, #24;                \
	sri		RTMP2.4s, RTMP0.4s, #(32 - 24);         \
	sri		RTMP3.4s, RTMP1.4s, #(32 - 24);         \
	eor		RTMP0.16b, RTMP0.16b, RTMP2.16b;        \
	eor		RTMP1.16b, RTMP1.16b, RTMP3.16b;        \
	shl		RTMP2.4s, RX0.4s, #2;                   \
	shl		RTMP3.4s, RX1.4s, #2;                   \
	sri		RTMP2.4s, RX0.4s, #(32 - 2);            \
	sri		RTMP3.4s, RX1.4s, #(32 - 2);            \
	eor		RTMP0.16b, RTMP0.16b, RTMP2.16b;        \
	eor		RTMP1.16b, RTMP1.16b, RTMP3.16b;        \
	/* s0/t0 ^= RTMP0/1 */                                  \
	eor		s0.16b, s0.16b, RTMP0.16b;              \
	eor		t0.16b, t0.16b, RTMP1.16b;

/*
 * Byte-swap, run all 32 rounds over eight word-sliced blocks (b0..b3 and
 * b4..b7), and byte-swap again. The result is left word-sliced: the
 * caller must still apply rotate_clockwise_4x4 to each half.
 *
 * In:       x0 = round keys (8 x 16 bytes are read)
 * Out:      x0 restored to its entry value
 * Clobbers: x6, condition flags, RKEY, RX0, RX1, RTMP0-RTMP3 (not RIV)
 * Uses numeric local label 8.
 */
#define SM4_CRYPT_BLK8_norotate(b0, b1, b2, b3, b4, b5, b6, b7) \
	rev32		b0.16b, b0.16b;                     \
	rev32		b1.16b, b1.16b;                     \
	rev32		b2.16b, b2.16b;                     \
	rev32		b3.16b, b3.16b;                     \
	rev32		b4.16b, b4.16b;                     \
	rev32		b5.16b, b5.16b;                     \
	rev32		b6.16b, b6.16b;                     \
	rev32		b7.16b, b7.16b;                     \
	                                                    \
	mov		x6, #8;                             \
8:                                                          \
	ld1		{RKEY.4s}, [x0], #16;               \
	subs		x6, x6, #1;                         \
	                                                    \
	ROUND8(0, b0, b1, b2, b3, b4, b5, b6, b7);          \
	ROUND8(1, b1, b2, b3, b0, b5, b6, b7, b4);          \
	ROUND8(2, b2, b3, b0, b1, b6, b7, b4, b5);          \
	ROUND8(3, b3, b0, b1, b2, b7, b4, b5, b6);          \
	                                                    \
	bne		8b;                                 \
	                                                    \
	rev32		b0.16b, b0.16b;                     \
	rev32		b1.16b, b1.16b;                     \
	rev32		b2.16b, b2.16b;                     \
	rev32		b3.16b, b3.16b;                     \
	rev32		b4.16b, b4.16b;                     \
	rev32		b5.16b, b5.16b;                     \
	rev32		b6.16b, b6.16b;                     \
	rev32		b7.16b, b7.16b;                     \
	                                                    \
	/* repoint to rkey */                               \
	sub		x0, x0, #128;

/*
 * SM4_CRYPT_BLK8_norotate followed by the final rotate of both halves.
 * Clobbers everything SM4_CRYPT_BLK8_norotate does plus RTMP4-RTMP7,
 * i.e. RIV is destroyed.
 */
#define SM4_CRYPT_BLK8(b0, b1, b2, b3, b4, b5, b6, b7)      \
	SM4_CRYPT_BLK8_norotate(b0, b1, b2, b3, b4, b5, b6, b7); \
	rotate_clockwise_4x4_2x(b0, b1, b2, b3, b4, b5, b6, b7);


/*
 * sm4_neon_crypt - run the SM4 block function over nblocks 16-byte blocks
 *
 * Blocks are processed eight at a time, then four, then a final group of
 * one to three. The same code serves any key schedule passed in x0.
 *
 * In:       x0 = round key array, x1 = dst, x2 = src, w3 = nblocks
 * Out:      nblocks * 16 bytes written to dst
 * Clobbers: x1-x3, x5, x6, flags, v0-v14, v16-v31
 *
 * nblocks == 0 returns immediately without touching src or dst.
 */
.align 3
SYM_FUNC_START(sm4_neon_crypt)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   w3: nblocks
	 */
	/* nothing to do: the tail path below always handles >= 1 block */
	cbz		w3, .Lcrypt_end

	SM4_PREPARE()

.Lcrypt_loop_8x:
	sub		w3, w3, #8
	tbnz		w3, #31, .Lcrypt_4x	/* fewer than 8 left */

	/* ld4 de-interleaves: v<i> = word <i> of four consecutive blocks */
	ld4		{v0.4s-v3.4s}, [x2], #64
	ld4		{v4.4s-v7.4s}, [x2], #64

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	cbz		w3, .Lcrypt_end
	b		.Lcrypt_loop_8x

.Lcrypt_4x:
	add		w3, w3, #8		/* undo the speculative -8 */
	cmp		w3, #4
	blt		.Lcrypt_tail

	sub		w3, w3, #4

	ld4		{v0.4s-v3.4s}, [x2], #64

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	st1		{v0.16b-v3.16b}, [x1], #64

	cbz		w3, .Lcrypt_end

.Lcrypt_tail:
	/* 1..3 blocks left; ld1 leaves the flags from cmp intact */
	cmp		w3, #2
	ld1		{v0.16b}, [x2], #16
	blt		.Lcrypt_tail_load_done
	ld1		{v1.16b}, [x2], #16
	beq		.Lcrypt_tail_load_done
	ld1		{v2.16b}, [x2], #16

.Lcrypt_tail_load_done:
	/*
	 * Registers that were not loaded hold stale data; their lanes are
	 * computed but never stored.
	 */
	transpose_4x4(v0, v1, v2, v3)

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	cmp		w3, #2
	st1		{v0.16b}, [x1], #16
	blt		.Lcrypt_end
	st1		{v1.16b}, [x1], #16
	beq		.Lcrypt_end
	st1		{v2.16b}, [x1], #16

.Lcrypt_end:
	ret
SYM_FUNC_END(sm4_neon_crypt)

/*
 * sm4_neon_cbc_dec - CBC decryption of nblocks 16-byte blocks
 *
 * dst[i] = blockfn(src[i]) ^ prev, where prev is the IV for the first
 * block and src[i - 1] afterwards. The last ciphertext block is written
 * back to the IV buffer.
 *
 * In:       x0 = round key array, x1 = dst, x2 = src,
 *           x3 = iv (16 bytes, read and updated), w4 = nblocks
 * Clobbers: x1, x2, x4-x6, flags, v0-v31
 *
 * nblocks == 0 returns immediately and leaves dst and iv untouched.
 */
.align 3
SYM_FUNC_START(sm4_neon_cbc_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 */
	/* nothing to do: the tail path below always handles >= 1 block */
	cbz		w4, .Lcbc_dec_ret

	SM4_PREPARE()

	ld1		{RIV.16b}, [x3]

.Lcbc_dec_loop_8x:
	sub		w4, w4, #8
	tbnz		w4, #31, .Lcbc_dec_4x	/* fewer than 8 left */

	/* x2 ends up 64 bytes into this group; rewound below */
	ld4		{v0.4s-v3.4s}, [x2], #64
	ld4		{v4.4s-v7.4s}, [x2]

	SM4_CRYPT_BLK8_norotate(v0, v1, v2, v3, v4, v5, v6, v7)

	/*
	 * Avoid overwriting the RIV register: rotate_clockwise_4x4_2x
	 * would use RTMP7, which is RIV.
	 */
	rotate_clockwise_4x4(v0, v1, v2, v3)
	rotate_clockwise_4x4(v4, v5, v6, v7)

	sub		x2, x2, #64		/* back to the group start */

	/* consume the old IV before the reload below replaces v15 */
	eor		v0.16b, v0.16b, RIV.16b

	/* reload the eight ciphertext blocks, one per register */
	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64
	ld1		{RTMP4.16b-RTMP7.16b}, [x2], #64

	eor		v1.16b, v1.16b, RTMP0.16b
	eor		v2.16b, v2.16b, RTMP1.16b
	eor		v3.16b, v3.16b, RTMP2.16b
	eor		v4.16b, v4.16b, RTMP3.16b
	eor		v5.16b, v5.16b, RTMP4.16b
	eor		v6.16b, v6.16b, RTMP5.16b
	eor		v7.16b, v7.16b, RTMP6.16b

	/* next IV = last ciphertext block (RTMP7 and RIV are both v15) */
	mov		RIV.16b, RTMP7.16b

	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	cbz		w4, .Lcbc_dec_end
	b		.Lcbc_dec_loop_8x

.Lcbc_dec_4x:
	add		w4, w4, #8		/* undo the speculative -8 */
	cmp		w4, #4
	blt		.Lcbc_dec_tail

	sub		w4, w4, #4

	/* keep the ciphertext in v0-v3 for chaining; work on v4-v7 */
	ld1		{v0.16b-v3.16b}, [x2], #64

	rev32		v4.16b, v0.16b
	rev32		v5.16b, v1.16b
	rev32		v6.16b, v2.16b
	rev32		v7.16b, v3.16b

	transpose_4x4(v4, v5, v6, v7)

	SM4_CRYPT_BLK4_BE(v4, v5, v6, v7)

	eor		v4.16b, v4.16b, RIV.16b
	eor		v5.16b, v5.16b, v0.16b
	eor		v6.16b, v6.16b, v1.16b
	eor		v7.16b, v7.16b, v2.16b

	mov		RIV.16b, v3.16b		/* next IV */

	st1		{v4.16b-v7.16b}, [x1], #64

	cbz		w4, .Lcbc_dec_end

.Lcbc_dec_tail:
	/* 1..3 blocks left; ld1 leaves the flags from cmp intact */
	cmp		w4, #2
	ld1		{v0.16b}, [x2], #16
	blt		.Lcbc_dec_tail_load_done
	ld1		{v1.16b}, [x2], #16
	beq		.Lcbc_dec_tail_load_done
	ld1		{v2.16b}, [x2], #16

.Lcbc_dec_tail_load_done:
	/* lanes fed from unloaded registers are computed but never stored */
	rev32		v4.16b, v0.16b
	rev32		v5.16b, v1.16b
	rev32		v6.16b, v2.16b

	transpose_4x4(v4, v5, v6, v7)

	SM4_CRYPT_BLK4_BE(v4, v5, v6, v7)

	/* eor/mov/st1 below do not touch the flags set here */
	cmp		w4, #2
	eor		v4.16b, v4.16b, RIV.16b
	mov		RIV.16b, v0.16b
	st1		{v4.16b}, [x1], #16
	blt		.Lcbc_dec_end

	eor		v5.16b, v5.16b, v0.16b
	mov		RIV.16b, v1.16b
	st1		{v5.16b}, [x1], #16
	beq		.Lcbc_dec_end

	eor		v6.16b, v6.16b, v1.16b
	mov		RIV.16b, v2.16b
	st1		{v6.16b}, [x1], #16

.Lcbc_dec_end:
	/* store new IV */
	st1		{RIV.16b}, [x3]

.Lcbc_dec_ret:
	ret
SYM_FUNC_END(sm4_neon_cbc_dec)

/*
 * sm4_neon_cfb_dec - CFB decryption of nblocks 16-byte blocks
 *
 * dst[i] = blockfn(prev) ^ src[i], where prev is the IV for the first
 * block and src[i - 1] afterwards. The last ciphertext block is written
 * back to the IV buffer. v0 carries prev between groups.
 *
 * In:       x0 = round key array, x1 = dst, x2 = src,
 *           x3 = iv (16 bytes, read and updated), w4 = nblocks
 * Clobbers: x1, x2, x4-x6, flags, v0-v31
 *
 * nblocks == 0 returns immediately and leaves dst and iv untouched.
 */
.align 3
SYM_FUNC_START(sm4_neon_cfb_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 */
	/* nothing to do: the tail path below always handles >= 1 block */
	cbz		w4, .Lcfb_dec_ret

	SM4_PREPARE()

	ld1		{v0.16b}, [x3]

.Lcfb_dec_loop_8x:
	sub		w4, w4, #8
	tbnz		w4, #31, .Lcfb_dec_4x	/* fewer than 8 left */

	/*
	 * Cipher inputs are IV, c0..c6: v1-v3 = c0..c2 (one per register),
	 * v4-v7 = c3..c6 already de-interleaved by ld4.
	 */
	ld1		{v1.16b-v3.16b}, [x2], #48
	ld4		{v4.4s-v7.4s}, [x2]

	transpose_4x4(v0, v1, v2, v3)

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	/* rewind to c0 and reload all eight ciphertext blocks */
	sub		x2, x2, #48
	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64
	ld1		{RTMP4.16b-RTMP7.16b}, [x2], #64

	eor		v0.16b, v0.16b, RTMP0.16b
	eor		v1.16b, v1.16b, RTMP1.16b
	eor		v2.16b, v2.16b, RTMP2.16b
	eor		v3.16b, v3.16b, RTMP3.16b
	eor		v4.16b, v4.16b, RTMP4.16b
	eor		v5.16b, v5.16b, RTMP5.16b
	eor		v6.16b, v6.16b, RTMP6.16b
	eor		v7.16b, v7.16b, RTMP7.16b

	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	mov		v0.16b, RTMP7.16b	/* next IV = c7 */

	cbz		w4, .Lcfb_dec_end
	b		.Lcfb_dec_loop_8x

.Lcfb_dec_4x:
	add		w4, w4, #8		/* undo the speculative -8 */
	cmp		w4, #4
	blt		.Lcfb_dec_tail

	sub		w4, w4, #4

	ld1		{v4.16b-v7.16b}, [x2], #64

	rev32		v0.16b, v0.16b	/* v0 is IV register */
	rev32		v1.16b, v4.16b
	rev32		v2.16b, v5.16b
	rev32		v3.16b, v6.16b

	transpose_4x4(v0, v1, v2, v3)

	SM4_CRYPT_BLK4_BE(v0, v1, v2, v3)

	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	eor		v3.16b, v3.16b, v7.16b

	st1		{v0.16b-v3.16b}, [x1], #64

	mov		v0.16b, v7.16b		/* next IV */

	cbz		w4, .Lcfb_dec_end

.Lcfb_dec_tail:
	/* 1..3 blocks left; ld1 leaves the flags from cmp intact */
	cmp		w4, #2
	ld1		{v4.16b}, [x2], #16
	blt		.Lcfb_dec_tail_load_done
	ld1		{v5.16b}, [x2], #16
	beq		.Lcfb_dec_tail_load_done
	ld1		{v6.16b}, [x2], #16

.Lcfb_dec_tail_load_done:
	/* lanes fed from unloaded registers are computed but never stored */
	rev32		v0.16b, v0.16b	/* v0 is IV register */
	rev32		v1.16b, v4.16b
	rev32		v2.16b, v5.16b

	transpose_4x4(v0, v1, v2, v3)

	SM4_CRYPT_BLK4_BE(v0, v1, v2, v3)

	/* eor/st1/mov below do not touch the flags set here */
	cmp		w4, #2
	eor		v0.16b, v0.16b, v4.16b
	st1		{v0.16b}, [x1], #16
	mov		v0.16b, v4.16b
	blt		.Lcfb_dec_end

	eor		v1.16b, v1.16b, v5.16b
	st1		{v1.16b}, [x1], #16
	mov		v0.16b, v5.16b
	beq		.Lcfb_dec_end

	eor		v2.16b, v2.16b, v6.16b
	st1		{v2.16b}, [x1], #16
	mov		v0.16b, v6.16b

.Lcfb_dec_end:
	/* store new IV */
	st1		{v0.16b}, [x3]

.Lcfb_dec_ret:
	ret
SYM_FUNC_END(sm4_neon_cfb_dec)

/*
 * sm4_neon_ctr_crypt - CTR mode over nblocks 16-byte blocks
 *
 * dst[i] = blockfn(ctr + i) ^ src[i]. The 128-bit big-endian counter is
 * kept byte-swapped in x7 (high half) : x8 (low half), incremented once
 * per block, and written back on exit.
 *
 * In:       x0 = round key array, x1 = dst, x2 = src,
 *           x3 = ctr (16 bytes, read and updated), w4 = nblocks
 * Clobbers: x1, x2, x4-x8, flags, v0-v31
 *
 * nblocks == 0 returns immediately and leaves dst and ctr untouched.
 */
.align 3
SYM_FUNC_START(sm4_neon_ctr_crypt)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nblocks
	 */
	/* nothing to do: the tail path below always handles >= 1 block */
	cbz		w4, .Lctr_crypt_ret

	SM4_PREPARE()

	ldp		x7, x8, [x3]
	rev		x7, x7
	rev		x8, x8

.Lctr_crypt_loop_8x:
	sub		w4, w4, #8
	tbnz		w4, #31, .Lctr_crypt_4x	/* fewer than 8 left */

/*
 * Materialise the current counter x7:x8 in vctr as 16 big-endian bytes,
 * then add one to x7:x8 with carry from the low to the high half.
 * Clobbers: the condition flags (adds).
 */
#define inc_le128(vctr)                         \
	mov		vctr.d[1], x8;          \
	mov		vctr.d[0], x7;          \
	adds		x8, x8, #1;             \
	rev64		vctr.16b, vctr.16b;     \
	adc		x7, x7, xzr;

	/* construct CTRs */
	inc_le128(v0)			/* +0 */
	inc_le128(v1)			/* +1 */
	inc_le128(v2)			/* +2 */
	inc_le128(v3)			/* +3 */
	inc_le128(v4)			/* +4 */
	inc_le128(v5)			/* +5 */
	inc_le128(v6)			/* +6 */
	inc_le128(v7)			/* +7 */

	transpose_4x4_2x(v0, v1, v2, v3, v4, v5, v6, v7)

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64
	ld1		{RTMP4.16b-RTMP7.16b}, [x2], #64

	eor		v0.16b, v0.16b, RTMP0.16b
	eor		v1.16b, v1.16b, RTMP1.16b
	eor		v2.16b, v2.16b, RTMP2.16b
	eor		v3.16b, v3.16b, RTMP3.16b
	eor		v4.16b, v4.16b, RTMP4.16b
	eor		v5.16b, v5.16b, RTMP5.16b
	eor		v6.16b, v6.16b, RTMP6.16b
	eor		v7.16b, v7.16b, RTMP7.16b

	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	cbz		w4, .Lctr_crypt_end
	b		.Lctr_crypt_loop_8x

.Lctr_crypt_4x:
	add		w4, w4, #8		/* undo the speculative -8 */
	cmp		w4, #4
	blt		.Lctr_crypt_tail

	sub		w4, w4, #4

	/* construct CTRs */
	inc_le128(v0)			/* +0 */
	inc_le128(v1)			/* +1 */
	inc_le128(v2)			/* +2 */
	inc_le128(v3)			/* +3 */

	ld1		{v4.16b-v7.16b}, [x2], #64

	transpose_4x4(v0, v1, v2, v3)

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	eor		v3.16b, v3.16b, v7.16b

	st1		{v0.16b-v3.16b}, [x1], #64

	cbz		w4, .Lctr_crypt_end

.Lctr_crypt_tail:
	/*
	 * 1..3 blocks left. inc_le128 overwrites the condition flags
	 * (adds), so every cmp must come after it.
	 */
	ld1		{v4.16b}, [x2], #16
	inc_le128(v0)
	cmp		w4, #2
	blt		.Lctr_crypt_tail_load_done

	ld1		{v5.16b}, [x2], #16
	inc_le128(v1)
	cmp		w4, #2
	beq		.Lctr_crypt_tail_load_done

	ld1		{v6.16b}, [x2], #16
	inc_le128(v2)

.Lctr_crypt_tail_load_done:
	/* lanes fed from unconstructed counters are computed, never stored */
	transpose_4x4(v0, v1, v2, v3)

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	/* eor/st1 below do not touch the flags set here */
	cmp		w4, #2

	eor		v0.16b, v0.16b, v4.16b
	st1		{v0.16b}, [x1], #16
	blt		.Lctr_crypt_end

	eor		v1.16b, v1.16b, v5.16b
	st1		{v1.16b}, [x1], #16
	beq		.Lctr_crypt_end

	eor		v2.16b, v2.16b, v6.16b
	st1		{v2.16b}, [x1], #16

.Lctr_crypt_end:
	/* store new CTR */
	rev		x7, x7
	rev		x8, x8
	stp		x7, x8, [x3]

.Lctr_crypt_ret:
	ret
SYM_FUNC_END(sm4_neon_ctr_crypt)