/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4 Cipher Algorithm, AES-NI/AVX optimized.
 * as specified in
 * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
 *
 * Copyright (C) 2018 Markku-Juhani O. Saarinen <mjos@iki.fi>
 * Copyright (C) 2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 * Copyright (c) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

/* Based on SM4 AES-NI work by libgcrypt and Markku-Juhani O. Saarinen at:
 *  https://github.com/mjosaarinen/sm4ni
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/frame.h>

/* RIP-relative addressing suffix: "sym rRIP" expands to "sym (%rip)". */
#define rRIP         (%rip)

/*
 * Scratch registers.
 *   RX0/RX1   - per-round S-box input/output (one per group of four blocks)
 *   MASK_4BIT - 0x0f replicated into every byte
 *   RTMP0..4  - temporaries, or cached lookup tables, depending on routine
 */
#define RX0          %xmm0
#define RX1          %xmm1
#define MASK_4BIT    %xmm2
#define RTMP0        %xmm3
#define RTMP1        %xmm4
#define RTMP2        %xmm5
#define RTMP3        %xmm6
#define RTMP4        %xmm7

/* First group of four data vectors. */
#define RA0          %xmm8
#define RA1          %xmm9
#define RA2          %xmm10
#define RA3          %xmm11

/*
 * Second group of four data vectors.  The 4-block routine has no second
 * group and reuses these registers to cache lookup tables instead.
 */
#define RB0          %xmm12
#define RB1          %xmm13
#define RB2          %xmm14
#define RB3          %xmm15

/*
 * Aliases of RX0/RX1 used by the mode wrappers outside the round function:
 * RNOT holds the 128-bit increment constant in CTR mode and doubles as a
 * scratch register for the next IV in CBC/CFB; RBSWAP holds the byte-swap
 * mask in CTR mode.  Both are overwritten by the round function.
 */
#define RNOT         %xmm0
#define RBSWAP       %xmm1


/*
 * Transpose four 32-bit words between 128-bit vectors.
 *
 * Treats x0..x3 as the rows of a 4x4 matrix of 32-bit words and replaces
 * them, in place, with its columns: afterwards x0 holds word 0 of each of
 * the four inputs, x1 holds word 1, and so on.
 *
 * t1 and t2 are clobbered and must be distinct from x0..x3.
 */
#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
	vpunpckhdq x1, x0, t2;                \
	vpunpckldq x1, x0, x0;                \
	                                      \
	vpunpckldq x3, x2, t1;                \
	vpunpckhdq x3, x2, x2;                \
	                                      \
	vpunpckhqdq t1, x0, x1;               \
	vpunpcklqdq t1, x0, x0;               \
	                                      \
	vpunpckhqdq x2, t2, x3;               \
	vpunpcklqdq x2, t2, x2;

/*
 * pre-SubByte transform.
 *
 * Applies a per-byte mapping to x, in place, using two 16-entry tables:
 * each byte is split into its low and high nibble, the low nibble indexes
 * lo_t, the high nibble indexes hi_t, and the two results are XORed.
 *
 *   x        - vector to transform (input and output)
 *   lo_t     - table indexed by the low nibble
 *   hi_t     - table indexed by the high nibble
 *   mask4bit - 0x0f in every byte
 *   tmp0     - scratch, clobbered
 */
#define transform_pre(x, lo_t, hi_t, mask4bit, tmp0) \
	vpand x, mask4bit, tmp0;                     \
	vpandn x, mask4bit, x;                       \
	vpsrld $4, x, x;                             \
	                                             \
	vpshufb tmp0, lo_t, tmp0;                    \
	vpshufb x, hi_t, x;                          \
	vpxor tmp0, x, x;

/*
 * post-SubByte transform.
 *
 * Note: x has been XOR'ed with mask4bit by the 'vaesenclast' instruction
 * (the round macros in this file pass mask4bit as its round-key operand).
 * Only the low nibble of each byte is affected by that XOR, so the
 * low-nibble table index is taken from ~x ('vpandn'), which cancels it;
 * the high nibble is used unchanged.
 *
 * As in the pre-SubByte transform, the low nibble indexes lo_t, the high
 * nibble indexes hi_t, and the two lookups are XORed.  x is updated in
 * place and tmp0 is clobbered.
 */
#define transform_post(x, lo_t, hi_t, mask4bit, tmp0) \
	vpandn mask4bit, x, tmp0;                     \
	vpsrld $4, x, x;                              \
	vpand x, mask4bit, x;                         \
	                                              \
	vpshufb tmp0, lo_t, tmp0;                     \
	vpshufb x, hi_t, x;                           \
	vpxor tmp0, x, x;


/*
 * Read-only constants.  The section is mergeable with a 16-byte entry
 * size, and every table below is exactly 16 bytes so that it can be
 * loaded with an aligned 128-bit move.
 */
.section	.rodata.cst16, "aM", @progbits, 16
.align 16

/*
 * Following four affine transform look-up tables are from work by
 * Markku-Juhani O. Saarinen, at https://github.com/mjosaarinen/sm4ni
 *
 * These allow exposing SM4 S-Box from AES SubByte.
 */

/* pre-SubByte affine transform, from SM4 field to AES field. */
.Lpre_tf_lo_s:
	.quad 0x9197E2E474720701, 0xC7C1B4B222245157
.Lpre_tf_hi_s:
	.quad 0xE240AB09EB49A200, 0xF052B91BF95BB012

/* post-SubByte affine transform, from AES field to SM4 field. */
.Lpost_tf_lo_s:
	.quad 0x5B67F2CEA19D0834, 0xEDD14478172BBE82
.Lpost_tf_hi_s:
	.quad 0xAE7201DD73AFDC00, 0x11CDBE62CC1063BF

/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03

/* Inverse shift row + Rotate left by 8 bits on 32-bit words with vpshufb */
.Linv_shift_row_rol_8:
	.byte 0x07, 0x00, 0x0d, 0x0a, 0x0b, 0x04, 0x01, 0x0e
	.byte 0x0f, 0x08, 0x05, 0x02, 0x03, 0x0c, 0x09, 0x06

/* Inverse shift row + Rotate left by 16 bits on 32-bit words with vpshufb */
.Linv_shift_row_rol_16:
	.byte 0x0a, 0x07, 0x00, 0x0d, 0x0e, 0x0b, 0x04, 0x01
	.byte 0x02, 0x0f, 0x08, 0x05, 0x06, 0x03, 0x0c, 0x09

/* Inverse shift row + Rotate left by 24 bits on 32-bit words with vpshufb */
.Linv_shift_row_rol_24:
	.byte 0x0d, 0x0a, 0x07, 0x00, 0x01, 0x0e, 0x0b, 0x04
	.byte 0x05, 0x02, 0x0f, 0x08, 0x09, 0x06, 0x03, 0x0c

/* For CTR-mode IV byteswap */
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/* For input word byte-swap */
.Lbswap32_mask:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12

.align 4
/* 4-bit mask; loaded with vbroadcastss, so only these 4 bytes are read */
.L0f0f0f0f:
	.long 0x0f0f0f0f

/*
 * 12 bytes, only for padding: together with the 4-byte mask above this
 * fills one complete 16-byte entry of the section.
 */
.Lpadding_deadbeef:
	.long 0xdeadbeef, 0xdeadbeef, 0xdeadbeef


.text

/*
 * void sm4_aesni_avx_crypt4(const u32 *rk, u8 *dst,
 *                           const u8 *src, int nblocks)
 *
 * Runs the 32 SM4 rounds over 1..4 16-byte blocks in parallel, using the
 * 32 round keys at rk in the order they are stored.
 *
 * Only nblocks blocks are read from src and written to dst.
 *
 * Clobbers: %rax, %rdi, flags, and all vector registers (zeroed on exit).
 */
SYM_FUNC_START(sm4_aesni_avx_crypt4)
	/* input:
	 *	%rdi: round key array, CTX
	 *	%rsi: dst (1..4 blocks)
	 *	%rdx: src (1..4 blocks)
	 *	%ecx: num blocks (1..4)
	 *
	 * nblocks is a 32-bit 'int', so only %ecx is examined; the upper
	 * half of %rcx is not guaranteed by the calling convention.
	 */
	FRAME_BEGIN

	/*
	 * Load the input.  Lanes beyond nblocks are filled with a copy of
	 * block 0 so that nothing past the end of src is read; their
	 * results are never stored.
	 */
	vmovdqu 0*16(%rdx), RA0;
	vmovdqa RA0, RA1;
	vmovdqa RA0, RA2;
	vmovdqa RA0, RA3;
	cmpl $2, %ecx;
	jb .Lblk4_load_input_done;
	vmovdqu 1*16(%rdx), RA1;
	/* vector moves leave the flags alone: still testing nblocks == 2 */
	je .Lblk4_load_input_done;
	vmovdqu 2*16(%rdx), RA2;
	cmpl $3, %ecx;
	je .Lblk4_load_input_done;
	vmovdqu 3*16(%rdx), RA3;

.Lblk4_load_input_done:

	/* Byte-swap each 32-bit word of the input. */
	vmovdqa .Lbswap32_mask rRIP, RTMP2;
	vpshufb RTMP2, RA0, RA0;
	vpshufb RTMP2, RA1, RA1;
	vpshufb RTMP2, RA2, RA2;
	vpshufb RTMP2, RA3, RA3;

	/*
	 * Cache the constants used by every round.  The RB registers are
	 * free in this routine and hold lookup tables; RTMP2 is reused for
	 * the rol-8 shuffle now that the byte swap is done.
	 */
	vbroadcastss .L0f0f0f0f rRIP, MASK_4BIT;
	vmovdqa .Lpre_tf_lo_s rRIP, RTMP4;
	vmovdqa .Lpre_tf_hi_s rRIP, RB0;
	vmovdqa .Lpost_tf_lo_s rRIP, RB1;
	vmovdqa .Lpost_tf_hi_s rRIP, RB2;
	vmovdqa .Linv_shift_row rRIP, RB3;
	vmovdqa .Linv_shift_row_rol_8 rRIP, RTMP2;
	vmovdqa .Linv_shift_row_rol_16 rRIP, RTMP3;
	/* After this, RAi holds 32-bit word i of each of the four blocks. */
	transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);

/*
 * One round on four blocks: s0 ^= L(sbox(s1 ^ s2 ^ s3 ^ rk[round])).
 * Clobbers RX0, RTMP0 and RTMP1.
 */
#define ROUND(round, s0, s1, s2, s3)                                \
	vbroadcastss (4*(round))(%rdi), RX0;                        \
	vpxor s1, RX0, RX0;                                         \
	vpxor s2, RX0, RX0;                                         \
	vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */                 \
	                                                            \
	/* sbox, non-linear part */                                 \
	transform_pre(RX0, RTMP4, RB0, MASK_4BIT, RTMP0);           \
	vaesenclast MASK_4BIT, RX0, RX0;                            \
	transform_post(RX0, RB1, RB2, MASK_4BIT, RTMP0);            \
	                                                            \
	/* linear part */                                           \
	vpshufb RB3, RX0, RTMP0;                                    \
	vpxor RTMP0, s0, s0; /* s0 ^ x */                           \
	vpshufb RTMP2, RX0, RTMP1;                                  \
	vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) */               \
	vpshufb RTMP3, RX0, RTMP1;                                  \
	vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) ^ rol(x,16) */   \
	vpshufb .Linv_shift_row_rol_24 rRIP, RX0, RTMP1;            \
	vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,24) */               \
	vpslld $2, RTMP0, RTMP1;                                    \
	vpsrld $30, RTMP0, RTMP0;                                   \
	vpxor RTMP0, s0, s0;                                        \
	/* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
	vpxor RTMP1, s0, s0;

	/* %rax = end of the 32-word round key array; 4 rounds per pass. */
	leaq (32*4)(%rdi), %rax;
.align 16
.Lroundloop_blk4:
	ROUND(0, RA0, RA1, RA2, RA3);
	ROUND(1, RA1, RA2, RA3, RA0);
	ROUND(2, RA2, RA3, RA0, RA1);
	ROUND(3, RA3, RA0, RA1, RA2);
	leaq (4*4)(%rdi), %rdi;
	cmpq %rax, %rdi;
	jne .Lroundloop_blk4;

#undef ROUND

	vmovdqa .Lbswap128_mask rRIP, RTMP2;

	/*
	 * Back to one block per register, then reverse all 16 bytes of
	 * each block: this reverses the order of the four words and
	 * byte-swaps each of them in a single shuffle.
	 */
	transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
	vpshufb RTMP2, RA0, RA0;
	vpshufb RTMP2, RA1, RA1;
	vpshufb RTMP2, RA2, RA2;
	vpshufb RTMP2, RA3, RA3;

	/* Store exactly nblocks blocks. */
	vmovdqu RA0, 0*16(%rsi);
	cmpl $2, %ecx;
	jb .Lblk4_store_output_done;
	vmovdqu RA1, 1*16(%rsi);
	je .Lblk4_store_output_done;
	vmovdqu RA2, 2*16(%rsi);
	cmpl $3, %ecx;
	je .Lblk4_store_output_done;
	vmovdqu RA3, 3*16(%rsi);

.Lblk4_store_output_done:
	/* Zero every vector register so no cipher state is left behind. */
	vzeroall;
	FRAME_END
	RET;
SYM_FUNC_END(sm4_aesni_avx_crypt4)

/*
 * __sm4_crypt_blk8: file-local helper that runs the 32 SM4 rounds over
 * eight blocks held in registers.  It takes no memory arguments other
 * than the round keys and uses a register-based calling convention.
 *
 * Clobbers: %rax, %rdi (advanced past the round keys), flags, RX0, RX1,
 * MASK_4BIT and RTMP0..RTMP4.  No other general-purpose register is
 * written, so callers may keep values in %rsi, %rdx and %rcx across the
 * call.
 */
SYM_FUNC_START_LOCAL(__sm4_crypt_blk8)
	/* input:
	 *	%rdi: round key array, CTX
	 *	RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel
	 *						plaintext blocks
	 * output:
	 *	RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel
	 *						ciphertext blocks
	 */
	FRAME_BEGIN

	/* Byte-swap each 32-bit word of the input. */
	vmovdqa .Lbswap32_mask rRIP, RTMP2;
	vpshufb RTMP2, RA0, RA0;
	vpshufb RTMP2, RA1, RA1;
	vpshufb RTMP2, RA2, RA2;
	vpshufb RTMP2, RA3, RA3;
	vpshufb RTMP2, RB0, RB0;
	vpshufb RTMP2, RB1, RB1;
	vpshufb RTMP2, RB2, RB2;
	vpshufb RTMP2, RB3, RB3;

	vbroadcastss .L0f0f0f0f rRIP, MASK_4BIT;
	/* After this, RAi/RBi hold word i of each block in their group. */
	transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
	transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1);

/*
 * One round on both groups of four blocks:
 *   s0 ^= L(sbox(s1 ^ s2 ^ s3 ^ rk[round]))
 *   r0 ^= L(sbox(r1 ^ r2 ^ r3 ^ rk[round]))
 * All sixteen xmm registers are in use here, so the lookup tables and
 * shuffle masks are reloaded from memory in every round rather than
 * being cached.  Clobbers RX0, RX1 and RTMP0..RTMP4.
 */
#define ROUND(round, s0, s1, s2, s3, r0, r1, r2, r3)                \
	vbroadcastss (4*(round))(%rdi), RX0;                        \
	vmovdqa .Lpre_tf_lo_s rRIP, RTMP4;                          \
	vmovdqa .Lpre_tf_hi_s rRIP, RTMP1;                          \
	vmovdqa RX0, RX1;                                           \
	vpxor s1, RX0, RX0;                                         \
	vpxor s2, RX0, RX0;                                         \
	vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */                 \
	vmovdqa .Lpost_tf_lo_s rRIP, RTMP2;                         \
	vmovdqa .Lpost_tf_hi_s rRIP, RTMP3;                         \
	vpxor r1, RX1, RX1;                                         \
	vpxor r2, RX1, RX1;                                         \
	vpxor r3, RX1, RX1; /* r1 ^ r2 ^ r3 ^ rk */                 \
                                                                    \
	/* sbox, non-linear part */                                 \
	transform_pre(RX0, RTMP4, RTMP1, MASK_4BIT, RTMP0);         \
	transform_pre(RX1, RTMP4, RTMP1, MASK_4BIT, RTMP0);         \
	vmovdqa .Linv_shift_row rRIP, RTMP4;                        \
	vaesenclast MASK_4BIT, RX0, RX0;                            \
	vaesenclast MASK_4BIT, RX1, RX1;                            \
	transform_post(RX0, RTMP2, RTMP3, MASK_4BIT, RTMP0);        \
	transform_post(RX1, RTMP2, RTMP3, MASK_4BIT, RTMP0);        \
                                                                    \
	/* linear part */                                           \
	vpshufb RTMP4, RX0, RTMP0;                                  \
	vpxor RTMP0, s0, s0; /* s0 ^ x */                           \
	vpshufb RTMP4, RX1, RTMP2;                                  \
	vmovdqa .Linv_shift_row_rol_8 rRIP, RTMP4;                  \
	vpxor RTMP2, r0, r0; /* r0 ^ x */                           \
	vpshufb RTMP4, RX0, RTMP1;                                  \
	vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) */               \
	vpshufb RTMP4, RX1, RTMP3;                                  \
	vmovdqa .Linv_shift_row_rol_16 rRIP, RTMP4;                 \
	vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) */               \
	vpshufb RTMP4, RX0, RTMP1;                                  \
	vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) ^ rol(x,16) */   \
	vpshufb RTMP4, RX1, RTMP3;                                  \
	vmovdqa .Linv_shift_row_rol_24 rRIP, RTMP4;                 \
	vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) ^ rol(x,16) */   \
	vpshufb RTMP4, RX0, RTMP1;                                  \
	vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,24) */               \
	/* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
	vpslld $2, RTMP0, RTMP1;                                    \
	vpsrld $30, RTMP0, RTMP0;                                   \
	vpxor RTMP0, s0, s0;                                        \
	vpxor RTMP1, s0, s0;                                        \
	vpshufb RTMP4, RX1, RTMP3;                                  \
	vpxor RTMP3, r0, r0; /* r0 ^ x ^ rol(x,24) */               \
	/* r0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
	vpslld $2, RTMP2, RTMP3;                                    \
	vpsrld $30, RTMP2, RTMP2;                                   \
	vpxor RTMP2, r0, r0;                                        \
	vpxor RTMP3, r0, r0;

	/* %rax = end of the 32-word round key array; 4 rounds per pass. */
	leaq (32*4)(%rdi), %rax;
.align 16
.Lroundloop_blk8:
	ROUND(0, RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3);
	ROUND(1, RA1, RA2, RA3, RA0, RB1, RB2, RB3, RB0);
	ROUND(2, RA2, RA3, RA0, RA1, RB2, RB3, RB0, RB1);
	ROUND(3, RA3, RA0, RA1, RA2, RB3, RB0, RB1, RB2);
	leaq (4*4)(%rdi), %rdi;
	cmpq %rax, %rdi;
	jne .Lroundloop_blk8;

#undef ROUND

	vmovdqa .Lbswap128_mask rRIP, RTMP2;

	/*
	 * Back to one block per register, then reverse all 16 bytes of
	 * each block (word order and byte order within each word).
	 */
	transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
	transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1);
	vpshufb RTMP2, RA0, RA0;
	vpshufb RTMP2, RA1, RA1;
	vpshufb RTMP2, RA2, RA2;
	vpshufb RTMP2, RA3, RA3;
	vpshufb RTMP2, RB0, RB0;
	vpshufb RTMP2, RB1, RB1;
	vpshufb RTMP2, RB2, RB2;
	vpshufb RTMP2, RB3, RB3;

	FRAME_END
	RET;
SYM_FUNC_END(__sm4_crypt_blk8)

/*
 * void sm4_aesni_avx_crypt8(const u32 *rk, u8 *dst,
 *                           const u8 *src, int nblocks)
 *
 * Runs the 32 SM4 rounds over 1..8 16-byte blocks.  Requests for fewer
 * than five blocks are handed to sm4_aesni_avx_crypt4 by a tail jump with
 * the arguments untouched.
 *
 * Only nblocks blocks are read from src and written to dst.
 *
 * Clobbers: %rax, %rdi, flags, and all vector registers (zeroed on exit).
 */
SYM_FUNC_START(sm4_aesni_avx_crypt8)
	/* input:
	 *	%rdi: round key array, CTX
	 *	%rsi: dst (1..8 blocks)
	 *	%rdx: src (1..8 blocks)
	 *	%ecx: num blocks (1..8)
	 *
	 * nblocks is a 32-bit 'int', so only %ecx is examined; the upper
	 * half of %rcx is not guaranteed by the calling convention.
	 */
	cmpl $5, %ecx;
	jb sm4_aesni_avx_crypt4;

	FRAME_BEGIN

	/*
	 * Blocks 0..4 are always present on this path.  Lanes beyond
	 * nblocks are filled with a copy of block 4 so that nothing past
	 * the end of src is read; their results are never stored.
	 */
	vmovdqu (0 * 16)(%rdx), RA0;
	vmovdqu (1 * 16)(%rdx), RA1;
	vmovdqu (2 * 16)(%rdx), RA2;
	vmovdqu (3 * 16)(%rdx), RA3;
	vmovdqu (4 * 16)(%rdx), RB0;
	vmovdqa RB0, RB1;
	vmovdqa RB0, RB2;
	vmovdqa RB0, RB3;
	/*
	 * Still testing the 'cmpl $5' result (nblocks == 5).  This relies
	 * on FRAME_BEGIN and the vector moves above leaving the flags
	 * untouched.
	 */
	je .Lblk8_load_input_done;
	vmovdqu (5 * 16)(%rdx), RB1;
	cmpl $7, %ecx;
	jb .Lblk8_load_input_done;
	vmovdqu (6 * 16)(%rdx), RB2;
	je .Lblk8_load_input_done;
	vmovdqu (7 * 16)(%rdx), RB3;

.Lblk8_load_input_done:
	/* The helper leaves %rsi and %rcx intact. */
	call __sm4_crypt_blk8;

	/* Store exactly nblocks blocks; the compare is consumed below. */
	cmpl $6, %ecx;
	vmovdqu RA0, (0 * 16)(%rsi);
	vmovdqu RA1, (1 * 16)(%rsi);
	vmovdqu RA2, (2 * 16)(%rsi);
	vmovdqu RA3, (3 * 16)(%rsi);
	vmovdqu RB0, (4 * 16)(%rsi);
	jb .Lblk8_store_output_done;
	vmovdqu RB1, (5 * 16)(%rsi);
	je .Lblk8_store_output_done;
	vmovdqu RB2, (6 * 16)(%rsi);
	cmpl $7, %ecx;
	je .Lblk8_store_output_done;
	vmovdqu RB3, (7 * 16)(%rsi);

.Lblk8_store_output_done:
	/* Zero every vector register so no cipher state is left behind. */
	vzeroall;
	FRAME_END
	RET;
SYM_FUNC_END(sm4_aesni_avx_crypt8)

/*
 * void sm4_aesni_avx_ctr_enc_blk8(const u32 *rk, u8 *dst,
 *                                 const u8 *src, u8 *iv)
 *
 * CTR mode over exactly eight blocks: the counter values iv, iv+1, ...,
 * iv+7 are run through the cipher and XORed with src to produce dst, and
 * iv+8 is written back to *iv.  The counter is a 128-bit big-endian
 * integer.
 *
 * Clobbers: %rax, %rdi, flags, and all vector registers (zeroed on exit).
 */
SYM_TYPED_FUNC_START(sm4_aesni_avx_ctr_enc_blk8)
	/* input:
	 *	%rdi: round key array, CTX
	 *	%rsi: dst (8 blocks)
	 *	%rdx: src (8 blocks)
	 *	%rcx: iv (big endian, 128bit)
	 */
	FRAME_BEGIN

	/* load IV and byteswap */
	vmovdqu (%rcx), RA0;

	vmovdqa .Lbswap128_mask rRIP, RBSWAP;
	vpshufb RBSWAP, RA0, RTMP0; /* be => le */

	vpcmpeqd RNOT, RNOT, RNOT;
	vpsrldq $8, RNOT, RNOT; /* low: -1, high: 0 */

/*
 * x += 1 on a 128-bit little-endian value.  minus_one must hold -1 in the
 * low qword and 0 in the high qword.  Subtracting it increments the low
 * half; tmp records whether the low half was all-ones beforehand and, if
 * so, carries one into the high half.  tmp is clobbered.
 */
#define inc_le128(x, minus_one, tmp) \
	vpcmpeqq minus_one, x, tmp;  \
	vpsubq minus_one, x, x;      \
	vpslldq $8, tmp, tmp;        \
	vpsubq tmp, x, x;

	/*
	 * construct IVs: RA0 already holds the unmodified counter; RTMP0
	 * tracks the running little-endian value.
	 */
	inc_le128(RTMP0, RNOT, RTMP2); /* +1 */
	vpshufb RBSWAP, RTMP0, RA1;
	inc_le128(RTMP0, RNOT, RTMP2); /* +2 */
	vpshufb RBSWAP, RTMP0, RA2;
	inc_le128(RTMP0, RNOT, RTMP2); /* +3 */
	vpshufb RBSWAP, RTMP0, RA3;
	inc_le128(RTMP0, RNOT, RTMP2); /* +4 */
	vpshufb RBSWAP, RTMP0, RB0;
	inc_le128(RTMP0, RNOT, RTMP2); /* +5 */
	vpshufb RBSWAP, RTMP0, RB1;
	inc_le128(RTMP0, RNOT, RTMP2); /* +6 */
	vpshufb RBSWAP, RTMP0, RB2;
	inc_le128(RTMP0, RNOT, RTMP2); /* +7 */
	vpshufb RBSWAP, RTMP0, RB3;
	inc_le128(RTMP0, RNOT, RTMP2); /* +8 */
	vpshufb RBSWAP, RTMP0, RTMP1;

	/* store new IV */
	vmovdqu RTMP1, (%rcx);

	/* RNOT/RBSWAP alias RX0/RX1 and are dead from here on. */
	call __sm4_crypt_blk8;

	/* keystream ^ src */
	vpxor (0 * 16)(%rdx), RA0, RA0;
	vpxor (1 * 16)(%rdx), RA1, RA1;
	vpxor (2 * 16)(%rdx), RA2, RA2;
	vpxor (3 * 16)(%rdx), RA3, RA3;
	vpxor (4 * 16)(%rdx), RB0, RB0;
	vpxor (5 * 16)(%rdx), RB1, RB1;
	vpxor (6 * 16)(%rdx), RB2, RB2;
	vpxor (7 * 16)(%rdx), RB3, RB3;

	vmovdqu RA0, (0 * 16)(%rsi);
	vmovdqu RA1, (1 * 16)(%rsi);
	vmovdqu RA2, (2 * 16)(%rsi);
	vmovdqu RA3, (3 * 16)(%rsi);
	vmovdqu RB0, (4 * 16)(%rsi);
	vmovdqu RB1, (5 * 16)(%rsi);
	vmovdqu RB2, (6 * 16)(%rsi);
	vmovdqu RB3, (7 * 16)(%rsi);

	/* Zero every vector register so no cipher state is left behind. */
	vzeroall;
	FRAME_END
	RET;
SYM_FUNC_END(sm4_aesni_avx_ctr_enc_blk8)

/*
 * void sm4_aesni_avx_cbc_dec_blk8(const u32 *rk, u8 *dst,
 *                                 const u8 *src, u8 *iv)
 *
 * CBC decryption of exactly eight blocks: each src block is run through
 * the cipher with the round keys at rk, then XORed with the preceding src
 * block (the IV for block 0).  The last src block is written back to *iv
 * for chaining into the next call.
 *
 * Every read of src happens before the first write to dst, so dst == src
 * (in-place operation) sees unmodified input.
 *
 * Clobbers: %rax, %rdi, flags, and all vector registers (zeroed on exit).
 */
SYM_TYPED_FUNC_START(sm4_aesni_avx_cbc_dec_blk8)
	/* input:
	 *	%rdi: round key array, CTX
	 *	%rsi: dst (8 blocks)
	 *	%rdx: src (8 blocks)
	 *	%rcx: iv
	 */
	FRAME_BEGIN

	vmovdqu (0 * 16)(%rdx), RA0;
	vmovdqu (1 * 16)(%rdx), RA1;
	vmovdqu (2 * 16)(%rdx), RA2;
	vmovdqu (3 * 16)(%rdx), RA3;
	vmovdqu (4 * 16)(%rdx), RB0;
	vmovdqu (5 * 16)(%rdx), RB1;
	vmovdqu (6 * 16)(%rdx), RB2;
	vmovdqu (7 * 16)(%rdx), RB3;

	call __sm4_crypt_blk8;

	/*
	 * RNOT is used purely as a scratch register here: it keeps the
	 * last input block, which becomes the next IV, until the old IV
	 * has been consumed.
	 */
	vmovdqu (7 * 16)(%rdx), RNOT;
	vpxor (%rcx), RA0, RA0;
	vpxor (0 * 16)(%rdx), RA1, RA1;
	vpxor (1 * 16)(%rdx), RA2, RA2;
	vpxor (2 * 16)(%rdx), RA3, RA3;
	vpxor (3 * 16)(%rdx), RB0, RB0;
	vpxor (4 * 16)(%rdx), RB1, RB1;
	vpxor (5 * 16)(%rdx), RB2, RB2;
	vpxor (6 * 16)(%rdx), RB3, RB3;
	vmovdqu RNOT, (%rcx); /* store new IV */

	vmovdqu RA0, (0 * 16)(%rsi);
	vmovdqu RA1, (1 * 16)(%rsi);
	vmovdqu RA2, (2 * 16)(%rsi);
	vmovdqu RA3, (3 * 16)(%rsi);
	vmovdqu RB0, (4 * 16)(%rsi);
	vmovdqu RB1, (5 * 16)(%rsi);
	vmovdqu RB2, (6 * 16)(%rsi);
	vmovdqu RB3, (7 * 16)(%rsi);

	/* Zero every vector register so no cipher state is left behind. */
	vzeroall;
	FRAME_END
	RET;
SYM_FUNC_END(sm4_aesni_avx_cbc_dec_blk8)

/*
 * void sm4_aesni_avx_cfb_dec_blk8(const u32 *rk, u8 *dst,
 *                                 const u8 *src, u8 *iv)
 *
 * CFB decryption of exactly eight blocks: the IV followed by src blocks
 * 0..6 are run through the cipher with the round keys at rk, and the
 * results are XORed with src blocks 0..7 to produce dst.  The last src
 * block is written back to *iv for chaining into the next call.
 *
 * Every read of src happens before the first write to dst, so dst == src
 * (in-place operation) sees unmodified input.
 *
 * Clobbers: %rax, %rdi, flags, and all vector registers (zeroed on exit).
 */
SYM_TYPED_FUNC_START(sm4_aesni_avx_cfb_dec_blk8)
	/* input:
	 *	%rdi: round key array, CTX
	 *	%rsi: dst (8 blocks)
	 *	%rdx: src (8 blocks)
	 *	%rcx: iv
	 */
	FRAME_BEGIN

	/* Load input */
	vmovdqu (%rcx), RA0;
	vmovdqu 0 * 16(%rdx), RA1;
	vmovdqu 1 * 16(%rdx), RA2;
	vmovdqu 2 * 16(%rdx), RA3;
	vmovdqu 3 * 16(%rdx), RB0;
	vmovdqu 4 * 16(%rdx), RB1;
	vmovdqu 5 * 16(%rdx), RB2;
	vmovdqu 6 * 16(%rdx), RB3;

	/*
	 * Update IV: the old IV is already in RA0, so *iv can be
	 * overwritten with the last input block right away.  RNOT is used
	 * purely as a scratch register.
	 */
	vmovdqu 7 * 16(%rdx), RNOT;
	vmovdqu RNOT, (%rcx);

	call __sm4_crypt_blk8;

	/* cipher output ^ src */
	vpxor (0 * 16)(%rdx), RA0, RA0;
	vpxor (1 * 16)(%rdx), RA1, RA1;
	vpxor (2 * 16)(%rdx), RA2, RA2;
	vpxor (3 * 16)(%rdx), RA3, RA3;
	vpxor (4 * 16)(%rdx), RB0, RB0;
	vpxor (5 * 16)(%rdx), RB1, RB1;
	vpxor (6 * 16)(%rdx), RB2, RB2;
	vpxor (7 * 16)(%rdx), RB3, RB3;

	vmovdqu RA0, (0 * 16)(%rsi);
	vmovdqu RA1, (1 * 16)(%rsi);
	vmovdqu RA2, (2 * 16)(%rsi);
	vmovdqu RA3, (3 * 16)(%rsi);
	vmovdqu RB0, (4 * 16)(%rsi);
	vmovdqu RB1, (5 * 16)(%rsi);
	vmovdqu RB2, (6 * 16)(%rsi);
	vmovdqu RB3, (7 * 16)(%rsi);

	/* Zero every vector register so no cipher state is left behind. */
	vzeroall;
	FRAME_END
	RET;
SYM_FUNC_END(sm4_aesni_avx_cfb_dec_blk8)