162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-or-later
262306a36Sopenharmony_ci/*
362306a36Sopenharmony_ci * SM4 Cipher Algorithm, AES-NI/AVX2 optimized.
462306a36Sopenharmony_ci * as specified in
562306a36Sopenharmony_ci * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
662306a36Sopenharmony_ci *
762306a36Sopenharmony_ci * Copyright (C) 2018 Markku-Juhani O. Saarinen <mjos@iki.fi>
862306a36Sopenharmony_ci * Copyright (C) 2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
962306a36Sopenharmony_ci * Copyright (c) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
1062306a36Sopenharmony_ci */
1162306a36Sopenharmony_ci
1262306a36Sopenharmony_ci/* Based on SM4 AES-NI work by libgcrypt and Markku-Juhani O. Saarinen at:
1362306a36Sopenharmony_ci *  https://github.com/mjosaarinen/sm4ni
1462306a36Sopenharmony_ci */
1562306a36Sopenharmony_ci
1662306a36Sopenharmony_ci#include <linux/linkage.h>
1762306a36Sopenharmony_ci#include <linux/cfi_types.h>
1862306a36Sopenharmony_ci#include <asm/frame.h>
1962306a36Sopenharmony_ci
/* Appended to a symbol name to form a RIP-relative memory operand,
 * e.g. ".Lbswap32_mask rRIP" expands to ".Lbswap32_mask (%rip)". */
2062306a36Sopenharmony_ci#define rRIP         (%rip)
2162306a36Sopenharmony_ci
2262306a36Sopenharmony_ci/* vector registers */
/* RX0/RX1 hold the per-round working value of the two block groups,
 * MASK_4BIT holds the nibble mask, RTMP0-RTMP4 are scratch. */
2362306a36Sopenharmony_ci#define RX0          %ymm0
2462306a36Sopenharmony_ci#define RX1          %ymm1
2562306a36Sopenharmony_ci#define MASK_4BIT    %ymm2
2662306a36Sopenharmony_ci#define RTMP0        %ymm3
2762306a36Sopenharmony_ci#define RTMP1        %ymm4
2862306a36Sopenharmony_ci#define RTMP2        %ymm5
2962306a36Sopenharmony_ci#define RTMP3        %ymm6
3062306a36Sopenharmony_ci#define RTMP4        %ymm7
3162306a36Sopenharmony_ci
/* First group of block registers (two 16-byte blocks per register). */
3262306a36Sopenharmony_ci#define RA0          %ymm8
3362306a36Sopenharmony_ci#define RA1          %ymm9
3462306a36Sopenharmony_ci#define RA2          %ymm10
3562306a36Sopenharmony_ci#define RA3          %ymm11
3662306a36Sopenharmony_ci
/* Second group of block registers. */
3762306a36Sopenharmony_ci#define RB0          %ymm12
3862306a36Sopenharmony_ci#define RB1          %ymm13
3962306a36Sopenharmony_ci#define RB2          %ymm14
4062306a36Sopenharmony_ci#define RB3          %ymm15
4162306a36Sopenharmony_ci
/* Caution: RNOT is the same register as RX0 (%ymm0) and RBSWAP the same
 * register as RX1 (%ymm1); a value held under one name is destroyed by
 * any write through the other (e.g. by the ROUND macro). */
4262306a36Sopenharmony_ci#define RNOT         %ymm0
4362306a36Sopenharmony_ci#define RBSWAP       %ymm1
4462306a36Sopenharmony_ci
/* 128-bit (xmm) names for the low halves of the registers above. */
4562306a36Sopenharmony_ci#define RX0x         %xmm0
4662306a36Sopenharmony_ci#define RX1x         %xmm1
4762306a36Sopenharmony_ci#define MASK_4BITx   %xmm2
4862306a36Sopenharmony_ci
4962306a36Sopenharmony_ci#define RNOTx        %xmm0
5062306a36Sopenharmony_ci#define RBSWAPx      %xmm1
5162306a36Sopenharmony_ci
5262306a36Sopenharmony_ci#define RTMP0x       %xmm3
5362306a36Sopenharmony_ci#define RTMP1x       %xmm4
5462306a36Sopenharmony_ci#define RTMP2x       %xmm5
5562306a36Sopenharmony_ci#define RTMP3x       %xmm6
5662306a36Sopenharmony_ci#define RTMP4x       %xmm7
5762306a36Sopenharmony_ci
5862306a36Sopenharmony_ci
5962306a36Sopenharmony_ci/* helper macros */
6062306a36Sopenharmony_ci
6162306a36Sopenharmony_ci/* Transpose four 32-bit words between 128-bit vector lanes. */
/*
 * x0..x3 (in/out): the four rows of a 4x4 matrix of 32-bit words; on exit
 *                  they hold the four columns.
 * t1, t2:          scratch registers, overwritten.
 *
 * The vpunpck* instructions work on each 128-bit lane separately, so when
 * used with ymm registers the low-lane and high-lane matrices are
 * transposed independently of each other.
 */
6262306a36Sopenharmony_ci#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
6362306a36Sopenharmony_ci	vpunpckhdq x1, x0, t2;                \
6462306a36Sopenharmony_ci	vpunpckldq x1, x0, x0;                \
6562306a36Sopenharmony_ci	                                      \
6662306a36Sopenharmony_ci	vpunpckldq x3, x2, t1;                \
6762306a36Sopenharmony_ci	vpunpckhdq x3, x2, x2;                \
6862306a36Sopenharmony_ci	                                      \
6962306a36Sopenharmony_ci	vpunpckhqdq t1, x0, x1;               \
7062306a36Sopenharmony_ci	vpunpcklqdq t1, x0, x0;               \
7162306a36Sopenharmony_ci	                                      \
7262306a36Sopenharmony_ci	vpunpckhqdq x2, t2, x3;               \
7362306a36Sopenharmony_ci	vpunpcklqdq x2, t2, x2;
7462306a36Sopenharmony_ci
7562306a36Sopenharmony_ci/* pre-SubByte transform. */
/*
 * Per byte: x = lo_t[x & 0x0f] ^ hi_t[x >> 4], done with two vpshufb
 * table look-ups.
 *
 * x (in/out):  bytes to transform.
 * lo_t, hi_t:  16-entry byte tables indexed by the low / high nibble.
 * mask4bit:    expected to hold 0x0f in every byte.
 * tmp0:        scratch register, overwritten.
 *
 * The high nibbles are isolated with vpandn before the 32-bit shift so
 * that no bits from a neighbouring byte end up in the index.
 */
7662306a36Sopenharmony_ci#define transform_pre(x, lo_t, hi_t, mask4bit, tmp0) \
7762306a36Sopenharmony_ci	vpand x, mask4bit, tmp0;                     \
7862306a36Sopenharmony_ci	vpandn x, mask4bit, x;                       \
7962306a36Sopenharmony_ci	vpsrld $4, x, x;                             \
8062306a36Sopenharmony_ci	                                             \
8162306a36Sopenharmony_ci	vpshufb tmp0, lo_t, tmp0;                    \
8262306a36Sopenharmony_ci	vpshufb x, hi_t, x;                          \
8362306a36Sopenharmony_ci	vpxor tmp0, x, x;
8462306a36Sopenharmony_ci
8562306a36Sopenharmony_ci/* post-SubByte transform. Note: x has been XOR'ed with mask4bit by
8662306a36Sopenharmony_ci * 'vaesenclast' instruction. */
/*
 * Per byte: x = lo_t[~x & 0x0f] ^ hi_t[(x >> 4) & 0x0f].
 *
 * The low-nibble index is taken from the complement of x (vpandn), which
 * cancels the 0x0f that was XOR'ed into every byte when mask4bit was used
 * as the vaesenclast round key. The high nibble is not affected by that
 * XOR and is used as is.
 *
 * x (in/out):  bytes to transform.
 * lo_t, hi_t:  16-entry byte tables indexed by the low / high nibble.
 * mask4bit:    expected to hold 0x0f in every byte.
 * tmp0:        scratch register, overwritten.
 */
8762306a36Sopenharmony_ci#define transform_post(x, lo_t, hi_t, mask4bit, tmp0) \
8862306a36Sopenharmony_ci	vpandn mask4bit, x, tmp0;                     \
8962306a36Sopenharmony_ci	vpsrld $4, x, x;                              \
9062306a36Sopenharmony_ci	vpand x, mask4bit, x;                         \
9162306a36Sopenharmony_ci	                                              \
9262306a36Sopenharmony_ci	vpshufb tmp0, lo_t, tmp0;                     \
9362306a36Sopenharmony_ci	vpshufb x, hi_t, x;                           \
9462306a36Sopenharmony_ci	vpxor tmp0, x, x;
9562306a36Sopenharmony_ci
9662306a36Sopenharmony_ci
/*
 * Constant tables. The section is declared mergeable ("M") with a 16-byte
 * entity size and every table below is 16 bytes long. The code loads the
 * tables with vbroadcasti128, which places the same 16 bytes in both
 * 128-bit lanes of a ymm register.
 */
9762306a36Sopenharmony_ci.section	.rodata.cst16, "aM", @progbits, 16
9862306a36Sopenharmony_ci.align 16
9962306a36Sopenharmony_ci
10062306a36Sopenharmony_ci/*
10162306a36Sopenharmony_ci * Following four affine transform look-up tables are from work by
10262306a36Sopenharmony_ci * Markku-Juhani O. Saarinen, at https://github.com/mjosaarinen/sm4ni
10362306a36Sopenharmony_ci *
10462306a36Sopenharmony_ci * These allow exposing SM4 S-Box from AES SubByte.
10562306a36Sopenharmony_ci */
10662306a36Sopenharmony_ci
10762306a36Sopenharmony_ci/* pre-SubByte affine transform, from SM4 field to AES field. */
10862306a36Sopenharmony_ci.Lpre_tf_lo_s:
10962306a36Sopenharmony_ci	.quad 0x9197E2E474720701, 0xC7C1B4B222245157
11062306a36Sopenharmony_ci.Lpre_tf_hi_s:
11162306a36Sopenharmony_ci	.quad 0xE240AB09EB49A200, 0xF052B91BF95BB012
11262306a36Sopenharmony_ci
11362306a36Sopenharmony_ci/* post-SubByte affine transform, from AES field to SM4 field. */
11462306a36Sopenharmony_ci.Lpost_tf_lo_s:
11562306a36Sopenharmony_ci	.quad 0x5B67F2CEA19D0834, 0xEDD14478172BBE82
11662306a36Sopenharmony_ci.Lpost_tf_hi_s:
11762306a36Sopenharmony_ci	.quad 0xAE7201DD73AFDC00, 0x11CDBE62CC1063BF
11862306a36Sopenharmony_ci
11962306a36Sopenharmony_ci/* For isolating SubBytes from AESENCLAST, inverse shift row */
12062306a36Sopenharmony_ci.Linv_shift_row:
12162306a36Sopenharmony_ci	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
12262306a36Sopenharmony_ci	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
12362306a36Sopenharmony_ci
12462306a36Sopenharmony_ci/* Inverse shift row + Rotate left by 8 bits on 32-bit words with vpshufb */
12562306a36Sopenharmony_ci.Linv_shift_row_rol_8:
12662306a36Sopenharmony_ci	.byte 0x07, 0x00, 0x0d, 0x0a, 0x0b, 0x04, 0x01, 0x0e
12762306a36Sopenharmony_ci	.byte 0x0f, 0x08, 0x05, 0x02, 0x03, 0x0c, 0x09, 0x06
12862306a36Sopenharmony_ci
12962306a36Sopenharmony_ci/* Inverse shift row + Rotate left by 16 bits on 32-bit words with vpshufb */
13062306a36Sopenharmony_ci.Linv_shift_row_rol_16:
13162306a36Sopenharmony_ci	.byte 0x0a, 0x07, 0x00, 0x0d, 0x0e, 0x0b, 0x04, 0x01
13262306a36Sopenharmony_ci	.byte 0x02, 0x0f, 0x08, 0x05, 0x06, 0x03, 0x0c, 0x09
13362306a36Sopenharmony_ci
13462306a36Sopenharmony_ci/* Inverse shift row + Rotate left by 24 bits on 32-bit words with vpshufb */
13562306a36Sopenharmony_ci.Linv_shift_row_rol_24:
13662306a36Sopenharmony_ci	.byte 0x0d, 0x0a, 0x07, 0x00, 0x01, 0x0e, 0x0b, 0x04
13762306a36Sopenharmony_ci	.byte 0x05, 0x02, 0x0f, 0x08, 0x09, 0x06, 0x03, 0x0c
13862306a36Sopenharmony_ci
13962306a36Sopenharmony_ci/* For CTR-mode IV byteswap */
/* Also used by __sm4_crypt_blk16 to reverse the 16 bytes of every output
 * block after the final transpose. */
14062306a36Sopenharmony_ci.Lbswap128_mask:
14162306a36Sopenharmony_ci	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
14262306a36Sopenharmony_ci
14362306a36Sopenharmony_ci/* For input word byte-swap */
14462306a36Sopenharmony_ci.Lbswap32_mask:
14562306a36Sopenharmony_ci	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
14662306a36Sopenharmony_ci
14762306a36Sopenharmony_ci.align 4
14862306a36Sopenharmony_ci/* 4-bit mask */
/* Loaded with vpbroadcastd, i.e. replicated into every 32-bit word. */
14962306a36Sopenharmony_ci.L0f0f0f0f:
15062306a36Sopenharmony_ci	.long 0x0f0f0f0f
15162306a36Sopenharmony_ci
15262306a36Sopenharmony_ci/* 12 bytes, only for padding */
/* NOTE(review): presumably needed so that the 4-byte mask above fills a
 * whole 16-byte entity of this mergeable section -- confirm. */
15362306a36Sopenharmony_ci.Lpadding_deadbeef:
15462306a36Sopenharmony_ci	.long 0xdeadbeef, 0xdeadbeef, 0xdeadbeef
15562306a36Sopenharmony_ci
15662306a36Sopenharmony_ci.text
/*
 * __sm4_crypt_blk16: apply the 32 SM4 rounds to sixteen blocks held in
 * RA0..RA3 and RB0..RB3 (two 16-byte blocks per ymm register), using the
 * 32 round-key words found at (%rdi).
 *
 * Clobbers: %rax, %rdi (left pointing 32*4 bytes past its entry value),
 *           %ymm0-%ymm7 (RX0, RX1, MASK_4BIT, RTMP0-RTMP4) and the flags.
 *           Because RNOT/RBSWAP are %ymm0/%ymm1, callers cannot keep
 *           values in them across this call.
 */
15762306a36Sopenharmony_ciSYM_FUNC_START_LOCAL(__sm4_crypt_blk16)
15862306a36Sopenharmony_ci	/* input:
15962306a36Sopenharmony_ci	 *	%rdi: round key array, CTX
16062306a36Sopenharmony_ci	 *	RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel
16162306a36Sopenharmony_ci	 *						plaintext blocks
16262306a36Sopenharmony_ci	 * output:
16362306a36Sopenharmony_ci	 *	RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel
16462306a36Sopenharmony_ci	 * 						ciphertext blocks
16562306a36Sopenharmony_ci	 */
16662306a36Sopenharmony_ci	FRAME_BEGIN
16762306a36Sopenharmony_ci
	/* Byte-swap every 32-bit word of the input blocks. */
16862306a36Sopenharmony_ci	vbroadcasti128 .Lbswap32_mask rRIP, RTMP2;
16962306a36Sopenharmony_ci	vpshufb RTMP2, RA0, RA0;
17062306a36Sopenharmony_ci	vpshufb RTMP2, RA1, RA1;
17162306a36Sopenharmony_ci	vpshufb RTMP2, RA2, RA2;
17262306a36Sopenharmony_ci	vpshufb RTMP2, RA3, RA3;
17362306a36Sopenharmony_ci	vpshufb RTMP2, RB0, RB0;
17462306a36Sopenharmony_ci	vpshufb RTMP2, RB1, RB1;
17562306a36Sopenharmony_ci	vpshufb RTMP2, RB2, RB2;
17662306a36Sopenharmony_ci	vpshufb RTMP2, RB3, RB3;
17762306a36Sopenharmony_ci
	/*
	 * MASK_4BIT = 0x0f in every byte; it stays live for all rounds.
	 * After the transposes RA0/RB0 hold word 0 of their eight blocks,
	 * RA1/RB1 word 1, and so on, so one round can be computed for
	 * eight blocks per register group with plain vertical operations.
	 */
17862306a36Sopenharmony_ci	vpbroadcastd .L0f0f0f0f rRIP, MASK_4BIT;
17962306a36Sopenharmony_ci	transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
18062306a36Sopenharmony_ci	transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1);
18162306a36Sopenharmony_ci
/*
 * ROUND(round, s0..s3, r0..r3): one SM4 round for both register groups.
 *
 *   x  = sbox(s1 ^ s2 ^ s3 ^ rk[round])
 *   s0 = s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24)
 *
 * and the same for r0..r3, interleaved with the s computation. rk[round]
 * is the 32-bit word at 4*round(%rdi), broadcast to all words.
 *
 * The S-box is evaluated as: pre affine transform, vaesenclast with
 * MASK_4BIT as the round key, post affine transform. vaesenclast is used
 * on xmm registers only, so each ymm value is split with vextracti128 and
 * rejoined with vinserti128. The byte permutation that vaesenclast
 * applies is undone by the .Linv_shift_row* vpshufb masks, which at the
 * same time provide the rotations by 8, 16 and 24 bits; the rotation by 2
 * is built from vpslld $2 / vpsrld $30.
 *
 * Clobbers RX0, RX1 and RTMP0-RTMP4; expects MASK_4BIT to be loaded.
 */
18262306a36Sopenharmony_ci#define ROUND(round, s0, s1, s2, s3, r0, r1, r2, r3)                \
18362306a36Sopenharmony_ci	vpbroadcastd (4*(round))(%rdi), RX0;                        \
18462306a36Sopenharmony_ci	vbroadcasti128 .Lpre_tf_lo_s rRIP, RTMP4;                   \
18562306a36Sopenharmony_ci	vbroadcasti128 .Lpre_tf_hi_s rRIP, RTMP1;                   \
18662306a36Sopenharmony_ci	vmovdqa RX0, RX1;                                           \
18762306a36Sopenharmony_ci	vpxor s1, RX0, RX0;                                         \
18862306a36Sopenharmony_ci	vpxor s2, RX0, RX0;                                         \
18962306a36Sopenharmony_ci	vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */                 \
19062306a36Sopenharmony_ci	vbroadcasti128 .Lpost_tf_lo_s rRIP, RTMP2;                  \
19162306a36Sopenharmony_ci	vbroadcasti128 .Lpost_tf_hi_s rRIP, RTMP3;                  \
19262306a36Sopenharmony_ci	vpxor r1, RX1, RX1;                                         \
19362306a36Sopenharmony_ci	vpxor r2, RX1, RX1;                                         \
19462306a36Sopenharmony_ci	vpxor r3, RX1, RX1; /* r1 ^ r2 ^ r3 ^ rk */                 \
19562306a36Sopenharmony_ci	                                                            \
19662306a36Sopenharmony_ci	/* sbox, non-linear part */                                 \
19762306a36Sopenharmony_ci	transform_pre(RX0, RTMP4, RTMP1, MASK_4BIT, RTMP0);         \
19862306a36Sopenharmony_ci	transform_pre(RX1, RTMP4, RTMP1, MASK_4BIT, RTMP0);         \
19962306a36Sopenharmony_ci	vextracti128 $1, RX0, RTMP4x;                               \
20062306a36Sopenharmony_ci	vextracti128 $1, RX1, RTMP0x;                               \
20162306a36Sopenharmony_ci	vaesenclast MASK_4BITx, RX0x, RX0x;                         \
20262306a36Sopenharmony_ci	vaesenclast MASK_4BITx, RTMP4x, RTMP4x;                     \
20362306a36Sopenharmony_ci	vaesenclast MASK_4BITx, RX1x, RX1x;                         \
20462306a36Sopenharmony_ci	vaesenclast MASK_4BITx, RTMP0x, RTMP0x;                     \
20562306a36Sopenharmony_ci	vinserti128 $1, RTMP4x, RX0, RX0;                           \
20662306a36Sopenharmony_ci	vbroadcasti128 .Linv_shift_row rRIP, RTMP4;                 \
20762306a36Sopenharmony_ci	vinserti128 $1, RTMP0x, RX1, RX1;                           \
20862306a36Sopenharmony_ci	transform_post(RX0, RTMP2, RTMP3, MASK_4BIT, RTMP0);        \
20962306a36Sopenharmony_ci	transform_post(RX1, RTMP2, RTMP3, MASK_4BIT, RTMP0);        \
21062306a36Sopenharmony_ci	                                                            \
21162306a36Sopenharmony_ci	/* linear part */                                           \
21262306a36Sopenharmony_ci	vpshufb RTMP4, RX0, RTMP0;                                  \
21362306a36Sopenharmony_ci	vpxor RTMP0, s0, s0; /* s0 ^ x */                           \
21462306a36Sopenharmony_ci	vpshufb RTMP4, RX1, RTMP2;                                  \
21562306a36Sopenharmony_ci	vbroadcasti128 .Linv_shift_row_rol_8 rRIP, RTMP4;           \
21662306a36Sopenharmony_ci	vpxor RTMP2, r0, r0; /* r0 ^ x */                           \
21762306a36Sopenharmony_ci	vpshufb RTMP4, RX0, RTMP1;                                  \
21862306a36Sopenharmony_ci	vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) */               \
21962306a36Sopenharmony_ci	vpshufb RTMP4, RX1, RTMP3;                                  \
22062306a36Sopenharmony_ci	vbroadcasti128 .Linv_shift_row_rol_16 rRIP, RTMP4;          \
22162306a36Sopenharmony_ci	vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) */               \
22262306a36Sopenharmony_ci	vpshufb RTMP4, RX0, RTMP1;                                  \
22362306a36Sopenharmony_ci	vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) ^ rol(x,16) */   \
22462306a36Sopenharmony_ci	vpshufb RTMP4, RX1, RTMP3;                                  \
22562306a36Sopenharmony_ci	vbroadcasti128 .Linv_shift_row_rol_24 rRIP, RTMP4;          \
22662306a36Sopenharmony_ci	vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) ^ rol(x,16) */   \
22762306a36Sopenharmony_ci	vpshufb RTMP4, RX0, RTMP1;                                  \
22862306a36Sopenharmony_ci	vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,24) */               \
22962306a36Sopenharmony_ci	vpslld $2, RTMP0, RTMP1;                                    \
23062306a36Sopenharmony_ci	vpsrld $30, RTMP0, RTMP0;                                   \
23162306a36Sopenharmony_ci	vpxor RTMP0, s0, s0;                                        \
23262306a36Sopenharmony_ci	/* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
23362306a36Sopenharmony_ci	vpxor RTMP1, s0, s0;                                        \
23462306a36Sopenharmony_ci	vpshufb RTMP4, RX1, RTMP3;                                  \
23562306a36Sopenharmony_ci	vpxor RTMP3, r0, r0; /* r0 ^ x ^ rol(x,24) */               \
23662306a36Sopenharmony_ci	vpslld $2, RTMP2, RTMP3;                                    \
23762306a36Sopenharmony_ci	vpsrld $30, RTMP2, RTMP2;                                   \
23862306a36Sopenharmony_ci	vpxor RTMP2, r0, r0;                                        \
23962306a36Sopenharmony_ci	/* r0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
24062306a36Sopenharmony_ci	vpxor RTMP3, r0, r0;
24162306a36Sopenharmony_ci
	/*
	 * %rax = end of the 32 round-key words. Each loop iteration runs
	 * four rounds and advances %rdi by four keys, so the loop body is
	 * executed eight times.
	 * NOTE(review): the label says "blk8" although this routine handles
	 * sixteen blocks; presumably carried over from an 8-block variant.
	 */
24262306a36Sopenharmony_ci	leaq (32*4)(%rdi), %rax;
24362306a36Sopenharmony_ci.align 16
24462306a36Sopenharmony_ci.Lroundloop_blk8:
24562306a36Sopenharmony_ci	ROUND(0, RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3);
24662306a36Sopenharmony_ci	ROUND(1, RA1, RA2, RA3, RA0, RB1, RB2, RB3, RB0);
24762306a36Sopenharmony_ci	ROUND(2, RA2, RA3, RA0, RA1, RB2, RB3, RB0, RB1);
24862306a36Sopenharmony_ci	ROUND(3, RA3, RA0, RA1, RA2, RB3, RB0, RB1, RB2);
24962306a36Sopenharmony_ci	leaq (4*4)(%rdi), %rdi;
25062306a36Sopenharmony_ci	cmpq %rax, %rdi;
25162306a36Sopenharmony_ci	jne .Lroundloop_blk8;
25262306a36Sopenharmony_ci
25362306a36Sopenharmony_ci#undef ROUND
25462306a36Sopenharmony_ci
	/*
	 * Transpose back to one block per 128-bit lane, then reverse all
	 * 16 bytes of every block.
	 */
25562306a36Sopenharmony_ci	vbroadcasti128 .Lbswap128_mask rRIP, RTMP2;
25662306a36Sopenharmony_ci
25762306a36Sopenharmony_ci	transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
25862306a36Sopenharmony_ci	transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1);
25962306a36Sopenharmony_ci	vpshufb RTMP2, RA0, RA0;
26062306a36Sopenharmony_ci	vpshufb RTMP2, RA1, RA1;
26162306a36Sopenharmony_ci	vpshufb RTMP2, RA2, RA2;
26262306a36Sopenharmony_ci	vpshufb RTMP2, RA3, RA3;
26362306a36Sopenharmony_ci	vpshufb RTMP2, RB0, RB0;
26462306a36Sopenharmony_ci	vpshufb RTMP2, RB1, RB1;
26562306a36Sopenharmony_ci	vpshufb RTMP2, RB2, RB2;
26662306a36Sopenharmony_ci	vpshufb RTMP2, RB3, RB3;
26762306a36Sopenharmony_ci
26862306a36Sopenharmony_ci	FRAME_END
26962306a36Sopenharmony_ci	RET;
27062306a36Sopenharmony_ciSYM_FUNC_END(__sm4_crypt_blk16)
27162306a36Sopenharmony_ci
/*
 * inc_le128(x, minus_one, tmp): add 1 to the 128-bit little-endian integer
 * held in each 128-bit lane of x.
 *
 * x (in/out):  counter value(s).
 * minus_one:   must hold -1 in the low qword and 0 in the high qword of
 *              each lane.
 * tmp:         scratch register, overwritten.
 *
 * vpcmpeqq sets the low qword of tmp to all-ones when the low qword of x
 * is about to wrap; subtracting minus_one adds 1 to the low qword only;
 * vpslldq $8 moves that wrap flag into the high qword (dropping the
 * compare result of the high qword), and subtracting it adds the carry.
 */
27262306a36Sopenharmony_ci#define inc_le128(x, minus_one, tmp) \
27362306a36Sopenharmony_ci	vpcmpeqq minus_one, x, tmp;  \
27462306a36Sopenharmony_ci	vpsubq minus_one, x, x;      \
27562306a36Sopenharmony_ci	vpslldq $8, tmp, tmp;        \
27662306a36Sopenharmony_ci	vpsubq tmp, x, x;
27762306a36Sopenharmony_ci
27862306a36Sopenharmony_ci/*
27962306a36Sopenharmony_ci * void sm4_aesni_avx2_ctr_enc_blk16(const u32 *rk, u8 *dst,
28062306a36Sopenharmony_ci *                                   const u8 *src, u8 *iv)
28162306a36Sopenharmony_ci */
/*
 * Builds sixteen consecutive counter blocks starting at *iv, runs them
 * through __sm4_crypt_blk16, XORs the result with the sixteen blocks at
 * src and stores that to dst. The counter value +16 is written back to
 * *iv before the block transform is called.
 *
 * src is read only after the keystream has been computed, and every read
 * of src precedes the first write to dst.
 *
 * Clobbers %rax, %rdi and the flags; all vector registers are zeroed by
 * vzeroall before returning.
 */
28262306a36Sopenharmony_ciSYM_TYPED_FUNC_START(sm4_aesni_avx2_ctr_enc_blk16)
28362306a36Sopenharmony_ci	/* input:
28462306a36Sopenharmony_ci	 *	%rdi: round key array, CTX
28562306a36Sopenharmony_ci	 *	%rsi: dst (16 blocks)
28662306a36Sopenharmony_ci	 *	%rdx: src (16 blocks)
28762306a36Sopenharmony_ci	 *	%rcx: iv (big endian, 128bit)
28862306a36Sopenharmony_ci	 */
28962306a36Sopenharmony_ci	FRAME_BEGIN
29062306a36Sopenharmony_ci
	/* %rax = low 64 bits of the counter, converted to host byte order;
	 * only used for the overflow check below. */
29162306a36Sopenharmony_ci	movq 8(%rcx), %rax;
29262306a36Sopenharmony_ci	bswapq %rax;
29362306a36Sopenharmony_ci
29462306a36Sopenharmony_ci	vzeroupper;
29562306a36Sopenharmony_ci
	/* RTMP3 = 16-byte reversal mask, RNOT = -1 in the low qword of each
	 * lane, RTMP2 = -2 in the low qword of each lane. */
29662306a36Sopenharmony_ci	vbroadcasti128 .Lbswap128_mask rRIP, RTMP3;
29762306a36Sopenharmony_ci	vpcmpeqd RNOT, RNOT, RNOT;
29862306a36Sopenharmony_ci	vpsrldq $8, RNOT, RNOT;   /* ab: -1:0 ; cd: -1:0 */
29962306a36Sopenharmony_ci	vpaddq RNOT, RNOT, RTMP2; /* ab: -2:0 ; cd: -2:0 */
30062306a36Sopenharmony_ci
30162306a36Sopenharmony_ci	/* load IV and byteswap */
	/* RTMP0 ends up as the little-endian counter pair: low lane = +0,
	 * high lane = +1. In the "+a ; +b" comments below the high lane is
	 * listed first. */
30262306a36Sopenharmony_ci	vmovdqu (%rcx), RTMP4x;
30362306a36Sopenharmony_ci	vpshufb RTMP3x, RTMP4x, RTMP4x;
30462306a36Sopenharmony_ci	vmovdqa RTMP4x, RTMP0x;
30562306a36Sopenharmony_ci	inc_le128(RTMP4x, RNOTx, RTMP1x);
30662306a36Sopenharmony_ci	vinserti128 $1, RTMP4x, RTMP0, RTMP0;
30762306a36Sopenharmony_ci	vpshufb RTMP3, RTMP0, RA0; /* +1 ; +0 */
30862306a36Sopenharmony_ci
30962306a36Sopenharmony_ci	/* check need for handling 64-bit overflow and carry */
	/* The slow path is taken when adding 16 to the low qword would wrap
	 * it (unsigned compare). */
31062306a36Sopenharmony_ci	cmpq $(0xffffffffffffffff - 16), %rax;
31162306a36Sopenharmony_ci	ja .Lhandle_ctr_carry;
31262306a36Sopenharmony_ci
31362306a36Sopenharmony_ci	/* construct IVs */
	/* Fast path: subtracting RTMP2 adds 2 to the low qword of both lanes;
	 * no carry into the high qword can occur for the values used. */
31462306a36Sopenharmony_ci	vpsubq RTMP2, RTMP0, RTMP0; /* +3 ; +2 */
31562306a36Sopenharmony_ci	vpshufb RTMP3, RTMP0, RA1;
31662306a36Sopenharmony_ci	vpsubq RTMP2, RTMP0, RTMP0; /* +5 ; +4 */
31762306a36Sopenharmony_ci	vpshufb RTMP3, RTMP0, RA2;
31862306a36Sopenharmony_ci	vpsubq RTMP2, RTMP0, RTMP0; /* +7 ; +6 */
31962306a36Sopenharmony_ci	vpshufb RTMP3, RTMP0, RA3;
32062306a36Sopenharmony_ci	vpsubq RTMP2, RTMP0, RTMP0; /* +9 ; +8 */
32162306a36Sopenharmony_ci	vpshufb RTMP3, RTMP0, RB0;
32262306a36Sopenharmony_ci	vpsubq RTMP2, RTMP0, RTMP0; /* +11 ; +10 */
32362306a36Sopenharmony_ci	vpshufb RTMP3, RTMP0, RB1;
32462306a36Sopenharmony_ci	vpsubq RTMP2, RTMP0, RTMP0; /* +13 ; +12 */
32562306a36Sopenharmony_ci	vpshufb RTMP3, RTMP0, RB2;
32662306a36Sopenharmony_ci	vpsubq RTMP2, RTMP0, RTMP0; /* +15 ; +14 */
32762306a36Sopenharmony_ci	vpshufb RTMP3, RTMP0, RB3;
	/* Only the low lane (+16) of this last step is used as the new IV. */
32862306a36Sopenharmony_ci	vpsubq RTMP2, RTMP0, RTMP0; /* +16 */
32962306a36Sopenharmony_ci	vpshufb RTMP3x, RTMP0x, RTMP0x;
33062306a36Sopenharmony_ci
33162306a36Sopenharmony_ci	jmp .Lctr_carry_done;
33262306a36Sopenharmony_ci
33362306a36Sopenharmony_ci.Lhandle_ctr_carry:
33462306a36Sopenharmony_ci	/* construct IVs */
	/* Slow path: each inc_le128 adds 1 with full 128-bit carry to both
	 * lanes, so two of them advance the counter pair by 2. */
33562306a36Sopenharmony_ci	inc_le128(RTMP0, RNOT, RTMP1);
33662306a36Sopenharmony_ci	inc_le128(RTMP0, RNOT, RTMP1);
33762306a36Sopenharmony_ci	vpshufb RTMP3, RTMP0, RA1; /* +3 ; +2 */
33862306a36Sopenharmony_ci	inc_le128(RTMP0, RNOT, RTMP1);
33962306a36Sopenharmony_ci	inc_le128(RTMP0, RNOT, RTMP1);
34062306a36Sopenharmony_ci	vpshufb RTMP3, RTMP0, RA2; /* +5 ; +4 */
34162306a36Sopenharmony_ci	inc_le128(RTMP0, RNOT, RTMP1);
34262306a36Sopenharmony_ci	inc_le128(RTMP0, RNOT, RTMP1);
34362306a36Sopenharmony_ci	vpshufb RTMP3, RTMP0, RA3; /* +7 ; +6 */
34462306a36Sopenharmony_ci	inc_le128(RTMP0, RNOT, RTMP1);
34562306a36Sopenharmony_ci	inc_le128(RTMP0, RNOT, RTMP1);
34662306a36Sopenharmony_ci	vpshufb RTMP3, RTMP0, RB0; /* +9 ; +8 */
34762306a36Sopenharmony_ci	inc_le128(RTMP0, RNOT, RTMP1);
34862306a36Sopenharmony_ci	inc_le128(RTMP0, RNOT, RTMP1);
34962306a36Sopenharmony_ci	vpshufb RTMP3, RTMP0, RB1; /* +11 ; +10 */
35062306a36Sopenharmony_ci	inc_le128(RTMP0, RNOT, RTMP1);
35162306a36Sopenharmony_ci	inc_le128(RTMP0, RNOT, RTMP1);
35262306a36Sopenharmony_ci	vpshufb RTMP3, RTMP0, RB2; /* +13 ; +12 */
35362306a36Sopenharmony_ci	inc_le128(RTMP0, RNOT, RTMP1);
35462306a36Sopenharmony_ci	inc_le128(RTMP0, RNOT, RTMP1);
35562306a36Sopenharmony_ci	vpshufb RTMP3, RTMP0, RB3; /* +15 ; +14 */
	/* One more increment leaves +16 in the high lane, which is
	 * extracted as the new IV. */
35662306a36Sopenharmony_ci	inc_le128(RTMP0, RNOT, RTMP1);
35762306a36Sopenharmony_ci	vextracti128 $1, RTMP0, RTMP0x;
35862306a36Sopenharmony_ci	vpshufb RTMP3x, RTMP0x, RTMP0x; /* +16 */
35962306a36Sopenharmony_ci
36062306a36Sopenharmony_ci.align 4
36162306a36Sopenharmony_ci.Lctr_carry_done:
36262306a36Sopenharmony_ci	/* store new IV */
36362306a36Sopenharmony_ci	vmovdqu RTMP0x, (%rcx);
36462306a36Sopenharmony_ci
	/* Run the block transform over the sixteen counter blocks. */
36562306a36Sopenharmony_ci	call __sm4_crypt_blk16;
36662306a36Sopenharmony_ci
	/* dst = src ^ transformed counter blocks */
36762306a36Sopenharmony_ci	vpxor (0 * 32)(%rdx), RA0, RA0;
36862306a36Sopenharmony_ci	vpxor (1 * 32)(%rdx), RA1, RA1;
36962306a36Sopenharmony_ci	vpxor (2 * 32)(%rdx), RA2, RA2;
37062306a36Sopenharmony_ci	vpxor (3 * 32)(%rdx), RA3, RA3;
37162306a36Sopenharmony_ci	vpxor (4 * 32)(%rdx), RB0, RB0;
37262306a36Sopenharmony_ci	vpxor (5 * 32)(%rdx), RB1, RB1;
37362306a36Sopenharmony_ci	vpxor (6 * 32)(%rdx), RB2, RB2;
37462306a36Sopenharmony_ci	vpxor (7 * 32)(%rdx), RB3, RB3;
37562306a36Sopenharmony_ci
37662306a36Sopenharmony_ci	vmovdqu RA0, (0 * 32)(%rsi);
37762306a36Sopenharmony_ci	vmovdqu RA1, (1 * 32)(%rsi);
37862306a36Sopenharmony_ci	vmovdqu RA2, (2 * 32)(%rsi);
37962306a36Sopenharmony_ci	vmovdqu RA3, (3 * 32)(%rsi);
38062306a36Sopenharmony_ci	vmovdqu RB0, (4 * 32)(%rsi);
38162306a36Sopenharmony_ci	vmovdqu RB1, (5 * 32)(%rsi);
38262306a36Sopenharmony_ci	vmovdqu RB2, (6 * 32)(%rsi);
38362306a36Sopenharmony_ci	vmovdqu RB3, (7 * 32)(%rsi);
38462306a36Sopenharmony_ci
	/* Zero every vector register before returning. */
38562306a36Sopenharmony_ci	vzeroall;
38662306a36Sopenharmony_ci	FRAME_END
38762306a36Sopenharmony_ci	RET;
38862306a36Sopenharmony_ciSYM_FUNC_END(sm4_aesni_avx2_ctr_enc_blk16)
38962306a36Sopenharmony_ci
39062306a36Sopenharmony_ci/*
39162306a36Sopenharmony_ci * void sm4_aesni_avx2_cbc_dec_blk16(const u32 *rk, u8 *dst,
39262306a36Sopenharmony_ci *                                   const u8 *src, u8 *iv)
39362306a36Sopenharmony_ci */
/*
 * Runs the sixteen blocks at src through __sm4_crypt_blk16 with the given
 * round keys, then XORs output block i with source block i-1 (with *iv
 * for block 0) and stores the result to dst. Source block 15 becomes the
 * new *iv.
 * NOTE(review): the caller presumably passes round keys in decryption
 * order; nothing in this routine selects the direction -- confirm.
 *
 * Every read of src and *iv precedes the first write to dst.
 *
 * Clobbers %rax, %rdi and the flags; all vector registers are zeroed by
 * vzeroall before returning.
 */
39462306a36Sopenharmony_ciSYM_TYPED_FUNC_START(sm4_aesni_avx2_cbc_dec_blk16)
39562306a36Sopenharmony_ci	/* input:
39662306a36Sopenharmony_ci	 *	%rdi: round key array, CTX
39762306a36Sopenharmony_ci	 *	%rsi: dst (16 blocks)
39862306a36Sopenharmony_ci	 *	%rdx: src (16 blocks)
39962306a36Sopenharmony_ci	 *	%rcx: iv
40062306a36Sopenharmony_ci	 */
40162306a36Sopenharmony_ci	FRAME_BEGIN
40262306a36Sopenharmony_ci
40362306a36Sopenharmony_ci	vzeroupper;
40462306a36Sopenharmony_ci
	/* Load source blocks 0..15, two per register. */
40562306a36Sopenharmony_ci	vmovdqu (0 * 32)(%rdx), RA0;
40662306a36Sopenharmony_ci	vmovdqu (1 * 32)(%rdx), RA1;
40762306a36Sopenharmony_ci	vmovdqu (2 * 32)(%rdx), RA2;
40862306a36Sopenharmony_ci	vmovdqu (3 * 32)(%rdx), RA3;
40962306a36Sopenharmony_ci	vmovdqu (4 * 32)(%rdx), RB0;
41062306a36Sopenharmony_ci	vmovdqu (5 * 32)(%rdx), RB1;
41162306a36Sopenharmony_ci	vmovdqu (6 * 32)(%rdx), RB2;
41262306a36Sopenharmony_ci	vmovdqu (7 * 32)(%rdx), RB3;
41362306a36Sopenharmony_ci
41462306a36Sopenharmony_ci	call __sm4_crypt_blk16;
41562306a36Sopenharmony_ci
	/*
	 * RNOT is used as a plain temporary here: low lane = IV, high lane =
	 * source block 0. The remaining "previous block" vectors are read
	 * straight from src at a 16-byte offset (blocks 1..14).
	 */
41662306a36Sopenharmony_ci	vmovdqu (%rcx), RNOTx;
41762306a36Sopenharmony_ci	vinserti128 $1, (%rdx), RNOT, RNOT;
41862306a36Sopenharmony_ci	vpxor RNOT, RA0, RA0;
41962306a36Sopenharmony_ci	vpxor (0 * 32 + 16)(%rdx), RA1, RA1;
42062306a36Sopenharmony_ci	vpxor (1 * 32 + 16)(%rdx), RA2, RA2;
42162306a36Sopenharmony_ci	vpxor (2 * 32 + 16)(%rdx), RA3, RA3;
42262306a36Sopenharmony_ci	vpxor (3 * 32 + 16)(%rdx), RB0, RB0;
42362306a36Sopenharmony_ci	vpxor (4 * 32 + 16)(%rdx), RB1, RB1;
42462306a36Sopenharmony_ci	vpxor (5 * 32 + 16)(%rdx), RB2, RB2;
42562306a36Sopenharmony_ci	vpxor (6 * 32 + 16)(%rdx), RB3, RB3;
	/* Source block 15 is fetched before anything is written to dst. */
42662306a36Sopenharmony_ci	vmovdqu (7 * 32 + 16)(%rdx), RNOTx;
42762306a36Sopenharmony_ci	vmovdqu RNOTx, (%rcx); /* store new IV */
42862306a36Sopenharmony_ci
42962306a36Sopenharmony_ci	vmovdqu RA0, (0 * 32)(%rsi);
43062306a36Sopenharmony_ci	vmovdqu RA1, (1 * 32)(%rsi);
43162306a36Sopenharmony_ci	vmovdqu RA2, (2 * 32)(%rsi);
43262306a36Sopenharmony_ci	vmovdqu RA3, (3 * 32)(%rsi);
43362306a36Sopenharmony_ci	vmovdqu RB0, (4 * 32)(%rsi);
43462306a36Sopenharmony_ci	vmovdqu RB1, (5 * 32)(%rsi);
43562306a36Sopenharmony_ci	vmovdqu RB2, (6 * 32)(%rsi);
43662306a36Sopenharmony_ci	vmovdqu RB3, (7 * 32)(%rsi);
43762306a36Sopenharmony_ci
	/* Zero every vector register before returning. */
43862306a36Sopenharmony_ci	vzeroall;
43962306a36Sopenharmony_ci	FRAME_END
44062306a36Sopenharmony_ci	RET;
44162306a36Sopenharmony_ciSYM_FUNC_END(sm4_aesni_avx2_cbc_dec_blk16)
44262306a36Sopenharmony_ci
44362306a36Sopenharmony_ci/*
44462306a36Sopenharmony_ci * void sm4_aesni_avx2_cfb_dec_blk16(const u32 *rk, u8 *dst,
44562306a36Sopenharmony_ci *                                   const u8 *src, u8 *iv)
44662306a36Sopenharmony_ci */
/*
 * Runs the sequence [*iv, source block 0, ..., source block 14] through
 * __sm4_crypt_blk16 and XORs the result with source blocks 0..15 to form
 * dst. Source block 15 is stored as the new *iv before the block
 * transform is called.
 *
 * Every read of src precedes the first write to dst.
 *
 * Clobbers %rax, %rdi and the flags; all vector registers are zeroed by
 * vzeroall before returning.
 */
44762306a36Sopenharmony_ciSYM_TYPED_FUNC_START(sm4_aesni_avx2_cfb_dec_blk16)
44862306a36Sopenharmony_ci	/* input:
44962306a36Sopenharmony_ci	 *	%rdi: round key array, CTX
45062306a36Sopenharmony_ci	 *	%rsi: dst (16 blocks)
45162306a36Sopenharmony_ci	 *	%rdx: src (16 blocks)
45262306a36Sopenharmony_ci	 *	%rcx: iv
45362306a36Sopenharmony_ci	 */
45462306a36Sopenharmony_ci	FRAME_BEGIN
45562306a36Sopenharmony_ci
45662306a36Sopenharmony_ci	vzeroupper;
45762306a36Sopenharmony_ci
45862306a36Sopenharmony_ci	/* Load input */
	/* RA0 = IV (low lane) and source block 0 (high lane); the other
	 * registers take source blocks 1..14 from a 16-byte offset. RNOT is
	 * only a temporary here. */
45962306a36Sopenharmony_ci	vmovdqu (%rcx), RNOTx;
46062306a36Sopenharmony_ci	vinserti128 $1, (%rdx), RNOT, RA0;
46162306a36Sopenharmony_ci	vmovdqu (0 * 32 + 16)(%rdx), RA1;
46262306a36Sopenharmony_ci	vmovdqu (1 * 32 + 16)(%rdx), RA2;
46362306a36Sopenharmony_ci	vmovdqu (2 * 32 + 16)(%rdx), RA3;
46462306a36Sopenharmony_ci	vmovdqu (3 * 32 + 16)(%rdx), RB0;
46562306a36Sopenharmony_ci	vmovdqu (4 * 32 + 16)(%rdx), RB1;
46662306a36Sopenharmony_ci	vmovdqu (5 * 32 + 16)(%rdx), RB2;
46762306a36Sopenharmony_ci	vmovdqu (6 * 32 + 16)(%rdx), RB3;
46862306a36Sopenharmony_ci
46962306a36Sopenharmony_ci	/* Update IV */
	/* New IV = source block 15. */
47062306a36Sopenharmony_ci	vmovdqu (7 * 32 + 16)(%rdx), RNOTx;
47162306a36Sopenharmony_ci	vmovdqu RNOTx, (%rcx);
47262306a36Sopenharmony_ci
47362306a36Sopenharmony_ci	call __sm4_crypt_blk16;
47462306a36Sopenharmony_ci
	/* dst = src ^ transformed [IV, block 0 .. block 14] */
47562306a36Sopenharmony_ci	vpxor (0 * 32)(%rdx), RA0, RA0;
47662306a36Sopenharmony_ci	vpxor (1 * 32)(%rdx), RA1, RA1;
47762306a36Sopenharmony_ci	vpxor (2 * 32)(%rdx), RA2, RA2;
47862306a36Sopenharmony_ci	vpxor (3 * 32)(%rdx), RA3, RA3;
47962306a36Sopenharmony_ci	vpxor (4 * 32)(%rdx), RB0, RB0;
48062306a36Sopenharmony_ci	vpxor (5 * 32)(%rdx), RB1, RB1;
48162306a36Sopenharmony_ci	vpxor (6 * 32)(%rdx), RB2, RB2;
48262306a36Sopenharmony_ci	vpxor (7 * 32)(%rdx), RB3, RB3;
48362306a36Sopenharmony_ci
48462306a36Sopenharmony_ci	vmovdqu RA0, (0 * 32)(%rsi);
48562306a36Sopenharmony_ci	vmovdqu RA1, (1 * 32)(%rsi);
48662306a36Sopenharmony_ci	vmovdqu RA2, (2 * 32)(%rsi);
48762306a36Sopenharmony_ci	vmovdqu RA3, (3 * 32)(%rsi);
48862306a36Sopenharmony_ci	vmovdqu RB0, (4 * 32)(%rsi);
48962306a36Sopenharmony_ci	vmovdqu RB1, (5 * 32)(%rsi);
49062306a36Sopenharmony_ci	vmovdqu RB2, (6 * 32)(%rsi);
49162306a36Sopenharmony_ci	vmovdqu RB3, (7 * 32)(%rsi);
49262306a36Sopenharmony_ci
	/* Zero every vector register before returning. */
49362306a36Sopenharmony_ci	vzeroall;
49462306a36Sopenharmony_ci	FRAME_END
49562306a36Sopenharmony_ci	RET;
49662306a36Sopenharmony_ciSYM_FUNC_END(sm4_aesni_avx2_cfb_dec_blk16)
497