162306a36Sopenharmony_ci/*
262306a36Sopenharmony_ci * x86_64/AVX/AES-NI assembler implementation of Camellia
362306a36Sopenharmony_ci *
462306a36Sopenharmony_ci * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
562306a36Sopenharmony_ci *
662306a36Sopenharmony_ci * This program is free software; you can redistribute it and/or modify
762306a36Sopenharmony_ci * it under the terms of the GNU General Public License as published by
862306a36Sopenharmony_ci * the Free Software Foundation; either version 2 of the License, or
962306a36Sopenharmony_ci * (at your option) any later version.
1062306a36Sopenharmony_ci *
1162306a36Sopenharmony_ci */
1262306a36Sopenharmony_ci
1362306a36Sopenharmony_ci/*
1462306a36Sopenharmony_ci * Version licensed under 2-clause BSD License is available at:
1562306a36Sopenharmony_ci *	http://koti.mbnet.fi/axh/crypto/camellia-BSD-1.2.0-aesni1.tar.xz
1662306a36Sopenharmony_ci */
1762306a36Sopenharmony_ci
1862306a36Sopenharmony_ci#include <linux/linkage.h>
1962306a36Sopenharmony_ci#include <asm/frame.h>
2062306a36Sopenharmony_ci
/*
 * Byte offsets into struct camellia_ctx as used by this file: the subkey
 * table starts at offset 0 and key_length sits directly after a table of
 * CAMELLIA_TABLE_BYTE_LEN bytes.
 * NOTE(review): the C definition of the struct is not visible here; these
 * offsets must be kept in sync with it.
 */
2162306a36Sopenharmony_ci#define CAMELLIA_TABLE_BYTE_LEN 272
2262306a36Sopenharmony_ci
2362306a36Sopenharmony_ci/* struct camellia_ctx: */
2462306a36Sopenharmony_ci#define key_table 0
2562306a36Sopenharmony_ci#define key_length CAMELLIA_TABLE_BYTE_LEN
2662306a36Sopenharmony_ci
2762306a36Sopenharmony_ci/* register macros */
/* CTX is the base register for the key_table(CTX) subkey addressing. */
2862306a36Sopenharmony_ci#define CTX %rdi
2962306a36Sopenharmony_ci
3062306a36Sopenharmony_ci/**********************************************************************
3162306a36Sopenharmony_ci  16-way camellia
3262306a36Sopenharmony_ci **********************************************************************/
/*
 * filter_8bit: byte-wise 8-bit -> 8-bit mapping split into two 4-bit vpshufb
 * table lookups whose results are XORed together.
 *
 * IN:
 *   x:        vector of input bytes (overwritten with the result)
 *   lo_t:     16-entry table indexed by (byte & mask4bit)
 *   hi_t:     16-entry table indexed by (byte & ~mask4bit) >> 4
 *   mask4bit: nibble mask supplied by the caller; roundsm16 broadcasts
 *             .L0f0f0f0f into it (presumably 0x0f in every byte - the
 *             constant is defined outside this view)
 *   tmp0:     scratch register, clobbered
 * OUT:
 *   x: lo_t[x & mask4bit] ^ hi_t[(x & ~mask4bit) >> 4], per byte
 */
3362306a36Sopenharmony_ci#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
3462306a36Sopenharmony_ci	vpand x, mask4bit, tmp0; \
3562306a36Sopenharmony_ci	vpandn x, mask4bit, x; \
	/* \
	 * 32-bit shift used on bytes: relies on vpandn having cleared the \
	 * bits that would otherwise cross a byte boundary (true for a \
	 * 0x0f mask). \
	 */ \
3662306a36Sopenharmony_ci	vpsrld $4, x, x; \
3762306a36Sopenharmony_ci	\
3862306a36Sopenharmony_ci	vpshufb tmp0, lo_t, tmp0; \
3962306a36Sopenharmony_ci	vpshufb x, hi_t, x; \
4062306a36Sopenharmony_ci	vpxor tmp0, x, x;
4162306a36Sopenharmony_ci
4262306a36Sopenharmony_ci/*
 * roundsm16: one round over the byte-sliced state. The S-function is built
 * from vaesenclast plus nibble-table pre/post filters, followed by the
 * P-function; finally the subkey bytes and the state stored at mem_cd are
 * XORed in.
 *
4362306a36Sopenharmony_ci * IN:
4462306a36Sopenharmony_ci *   x0..x7: byte-sliced AB state
4562306a36Sopenharmony_ci *   mem_cd: register pointer storing CD state
4662306a36Sopenharmony_ci *   key: index for key material
 *        (operand read with vmovq, i.e. 8 subkey bytes; the two
 *        instantiations in this file pass (%r9))
 *   t0..t7: scratch vector registers, all clobbered
4762306a36Sopenharmony_ci * OUT:
4862306a36Sopenharmony_ci *   x0..x7: new byte-sliced CD state
4962306a36Sopenharmony_ci */
5062306a36Sopenharmony_ci#define roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
5162306a36Sopenharmony_ci		  t7, mem_cd, key) \
5262306a36Sopenharmony_ci	/* \
5362306a36Sopenharmony_ci	 * S-function with AES subbytes \
5462306a36Sopenharmony_ci	 */ \
	/* t4 = inverse-shift-rows shuffle, t7 = nibble mask for filter_8bit */ \
5562306a36Sopenharmony_ci	vmovdqa .Linv_shift_row(%rip), t4; \
5662306a36Sopenharmony_ci	vbroadcastss .L0f0f0f0f(%rip), t7; \
5762306a36Sopenharmony_ci	vmovdqa .Lpre_tf_lo_s1(%rip), t0; \
5862306a36Sopenharmony_ci	vmovdqa .Lpre_tf_hi_s1(%rip), t1; \
5962306a36Sopenharmony_ci	\
6062306a36Sopenharmony_ci	/* AES inverse shift rows */ \
6162306a36Sopenharmony_ci	vpshufb t4, x0, x0; \
6262306a36Sopenharmony_ci	vpshufb t4, x7, x7; \
6362306a36Sopenharmony_ci	vpshufb t4, x1, x1; \
6462306a36Sopenharmony_ci	vpshufb t4, x4, x4; \
6562306a36Sopenharmony_ci	vpshufb t4, x2, x2; \
6662306a36Sopenharmony_ci	vpshufb t4, x5, x5; \
6762306a36Sopenharmony_ci	vpshufb t4, x3, x3; \
6862306a36Sopenharmony_ci	vpshufb t4, x6, x6; \
6962306a36Sopenharmony_ci	\
7062306a36Sopenharmony_ci	/* prefilter sboxes 1, 2 and 3 */ \
	/* (the sbox 4 tables are loaded into t2/t3 here, ahead of their use) */ \
7162306a36Sopenharmony_ci	vmovdqa .Lpre_tf_lo_s4(%rip), t2; \
7262306a36Sopenharmony_ci	vmovdqa .Lpre_tf_hi_s4(%rip), t3; \
7362306a36Sopenharmony_ci	filter_8bit(x0, t0, t1, t7, t6); \
7462306a36Sopenharmony_ci	filter_8bit(x7, t0, t1, t7, t6); \
7562306a36Sopenharmony_ci	filter_8bit(x1, t0, t1, t7, t6); \
7662306a36Sopenharmony_ci	filter_8bit(x4, t0, t1, t7, t6); \
7762306a36Sopenharmony_ci	filter_8bit(x2, t0, t1, t7, t6); \
7862306a36Sopenharmony_ci	filter_8bit(x5, t0, t1, t7, t6); \
7962306a36Sopenharmony_ci	\
8062306a36Sopenharmony_ci	/* prefilter sbox 4 */ \
	/* t4 = 0: all-zero round key for the vaesenclast instructions below */ \
8162306a36Sopenharmony_ci	vpxor t4, t4, t4; \
8262306a36Sopenharmony_ci	filter_8bit(x3, t2, t3, t7, t6); \
8362306a36Sopenharmony_ci	filter_8bit(x6, t2, t3, t7, t6); \
8462306a36Sopenharmony_ci	\
8562306a36Sopenharmony_ci	/* AES subbytes + AES shift rows */ \
8662306a36Sopenharmony_ci	vmovdqa .Lpost_tf_lo_s1(%rip), t0; \
8762306a36Sopenharmony_ci	vmovdqa .Lpost_tf_hi_s1(%rip), t1; \
8862306a36Sopenharmony_ci	vaesenclast t4, x0, x0; \
8962306a36Sopenharmony_ci	vaesenclast t4, x7, x7; \
9062306a36Sopenharmony_ci	vaesenclast t4, x1, x1; \
9162306a36Sopenharmony_ci	vaesenclast t4, x4, x4; \
9262306a36Sopenharmony_ci	vaesenclast t4, x2, x2; \
9362306a36Sopenharmony_ci	vaesenclast t4, x5, x5; \
9462306a36Sopenharmony_ci	vaesenclast t4, x3, x3; \
9562306a36Sopenharmony_ci	vaesenclast t4, x6, x6; \
9662306a36Sopenharmony_ci	\
9762306a36Sopenharmony_ci	/* postfilter sboxes 1 and 4 */ \
9862306a36Sopenharmony_ci	vmovdqa .Lpost_tf_lo_s3(%rip), t2; \
9962306a36Sopenharmony_ci	vmovdqa .Lpost_tf_hi_s3(%rip), t3; \
10062306a36Sopenharmony_ci	filter_8bit(x0, t0, t1, t7, t6); \
10162306a36Sopenharmony_ci	filter_8bit(x7, t0, t1, t7, t6); \
10262306a36Sopenharmony_ci	filter_8bit(x3, t0, t1, t7, t6); \
10362306a36Sopenharmony_ci	filter_8bit(x6, t0, t1, t7, t6); \
10462306a36Sopenharmony_ci	\
10562306a36Sopenharmony_ci	/* postfilter sbox 3 */ \
10662306a36Sopenharmony_ci	vmovdqa .Lpost_tf_lo_s2(%rip), t4; \
10762306a36Sopenharmony_ci	vmovdqa .Lpost_tf_hi_s2(%rip), t5; \
10862306a36Sopenharmony_ci	filter_8bit(x2, t2, t3, t7, t6); \
10962306a36Sopenharmony_ci	filter_8bit(x5, t2, t3, t7, t6); \
11062306a36Sopenharmony_ci	\
	/* \
	 * t6 = 0 serves as the vpshufb index that replicates byte 0 into \
	 * every lane; t0 = the 8 subkey bytes. \
	 */ \
11162306a36Sopenharmony_ci	vpxor t6, t6, t6; \
11262306a36Sopenharmony_ci	vmovq key, t0; \
11362306a36Sopenharmony_ci	\
11462306a36Sopenharmony_ci	/* postfilter sbox 2 */ \
	/* t2 is the scratch register here because t6 now holds the zero index */ \
11562306a36Sopenharmony_ci	filter_8bit(x1, t4, t5, t7, t2); \
11662306a36Sopenharmony_ci	filter_8bit(x4, t4, t5, t7, t2); \
11762306a36Sopenharmony_ci	\
	/* \
	 * Split the subkey: t0..t4 = key bytes 0..4 and t7 = key byte 7, \
	 * each replicated across the vector; t5 keeps bytes 5..7 shifted \
	 * down to position 0 and is split further below. \
	 */ \
11862306a36Sopenharmony_ci	vpsrldq $5, t0, t5; \
11962306a36Sopenharmony_ci	vpsrldq $1, t0, t1; \
12062306a36Sopenharmony_ci	vpsrldq $2, t0, t2; \
12162306a36Sopenharmony_ci	vpsrldq $3, t0, t3; \
12262306a36Sopenharmony_ci	vpsrldq $4, t0, t4; \
12362306a36Sopenharmony_ci	vpshufb t6, t0, t0; \
12462306a36Sopenharmony_ci	vpshufb t6, t1, t1; \
12562306a36Sopenharmony_ci	vpshufb t6, t2, t2; \
12662306a36Sopenharmony_ci	vpshufb t6, t3, t3; \
12762306a36Sopenharmony_ci	vpshufb t6, t4, t4; \
12862306a36Sopenharmony_ci	vpsrldq $2, t5, t7; \
12962306a36Sopenharmony_ci	vpshufb t6, t7, t7; \
13062306a36Sopenharmony_ci	\
13162306a36Sopenharmony_ci	/* \
13262306a36Sopenharmony_ci	 * P-function \
13362306a36Sopenharmony_ci	 */ \
13462306a36Sopenharmony_ci	vpxor x5, x0, x0; \
13562306a36Sopenharmony_ci	vpxor x6, x1, x1; \
13662306a36Sopenharmony_ci	vpxor x7, x2, x2; \
13762306a36Sopenharmony_ci	vpxor x4, x3, x3; \
13862306a36Sopenharmony_ci	\
13962306a36Sopenharmony_ci	vpxor x2, x4, x4; \
14062306a36Sopenharmony_ci	vpxor x3, x5, x5; \
14162306a36Sopenharmony_ci	vpxor x0, x6, x6; \
14262306a36Sopenharmony_ci	vpxor x1, x7, x7; \
14362306a36Sopenharmony_ci	\
14462306a36Sopenharmony_ci	vpxor x7, x0, x0; \
14562306a36Sopenharmony_ci	vpxor x4, x1, x1; \
14662306a36Sopenharmony_ci	vpxor x5, x2, x2; \
14762306a36Sopenharmony_ci	vpxor x6, x3, x3; \
14862306a36Sopenharmony_ci	\
14962306a36Sopenharmony_ci	vpxor x3, x4, x4; \
15062306a36Sopenharmony_ci	vpxor x0, x5, x5; \
15162306a36Sopenharmony_ci	vpxor x1, x6, x6; \
15262306a36Sopenharmony_ci	vpxor x2, x7, x7; /* note: high and low parts swapped */ \
15362306a36Sopenharmony_ci	\
15462306a36Sopenharmony_ci	/* \
15562306a36Sopenharmony_ci	 * Add key material and result to CD (x becomes new CD) \
15662306a36Sopenharmony_ci	 */ \
15762306a36Sopenharmony_ci	\
15862306a36Sopenharmony_ci	vpxor t3, x4, x4; \
15962306a36Sopenharmony_ci	vpxor 0 * 16(mem_cd), x4, x4; \
16062306a36Sopenharmony_ci	\
16162306a36Sopenharmony_ci	vpxor t2, x5, x5; \
16262306a36Sopenharmony_ci	vpxor 1 * 16(mem_cd), x5, x5; \
16362306a36Sopenharmony_ci	\
	/* \
	 * Finish the key split now that t3 is free: t5 = key byte 5, \
	 * t6 = key byte 6. The zero index in t6 is overwritten by its own \
	 * last use. \
	 */ \
16462306a36Sopenharmony_ci	vpsrldq $1, t5, t3; \
16562306a36Sopenharmony_ci	vpshufb t6, t5, t5; \
16662306a36Sopenharmony_ci	vpshufb t6, t3, t6; \
16762306a36Sopenharmony_ci	\
16862306a36Sopenharmony_ci	vpxor t1, x6, x6; \
16962306a36Sopenharmony_ci	vpxor 2 * 16(mem_cd), x6, x6; \
17062306a36Sopenharmony_ci	\
17162306a36Sopenharmony_ci	vpxor t0, x7, x7; \
17262306a36Sopenharmony_ci	vpxor 3 * 16(mem_cd), x7, x7; \
17362306a36Sopenharmony_ci	\
17462306a36Sopenharmony_ci	vpxor t7, x0, x0; \
17562306a36Sopenharmony_ci	vpxor 4 * 16(mem_cd), x0, x0; \
17662306a36Sopenharmony_ci	\
17762306a36Sopenharmony_ci	vpxor t6, x1, x1; \
17862306a36Sopenharmony_ci	vpxor 5 * 16(mem_cd), x1, x1; \
17962306a36Sopenharmony_ci	\
18062306a36Sopenharmony_ci	vpxor t5, x2, x2; \
18162306a36Sopenharmony_ci	vpxor 6 * 16(mem_cd), x2, x2; \
18262306a36Sopenharmony_ci	\
18362306a36Sopenharmony_ci	vpxor t4, x3, x3; \
18462306a36Sopenharmony_ci	vpxor 7 * 16(mem_cd), x3, x3;
18562306a36Sopenharmony_ci
18662306a36Sopenharmony_ci/*
18762306a36Sopenharmony_ci * Size optimization... with inlined roundsm16, binary would be over 5 times
18862306a36Sopenharmony_ci * larger and would only be 0.5% faster (on sandy-bridge).
18962306a36Sopenharmony_ci */
/*
 * Out-of-line instance of roundsm16 with a fixed register assignment:
 *   state x0..x7   = %xmm0..%xmm7
 *   scratch t0..t7 = %xmm8..%xmm15 (clobbered)
 *   mem_cd         = %rcx
 *   key            = 8 bytes at (%r9)
 * two_roundsm16 points %r9 at the subkey and calls this for the first round
 * of each pair.
 */
19062306a36Sopenharmony_ci.align 8
19162306a36Sopenharmony_ciSYM_FUNC_START_LOCAL(roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
19262306a36Sopenharmony_ci	roundsm16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
19362306a36Sopenharmony_ci		  %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
19462306a36Sopenharmony_ci		  %rcx, (%r9));
19562306a36Sopenharmony_ci	RET;
19662306a36Sopenharmony_ciSYM_FUNC_END(roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
19762306a36Sopenharmony_ci
/*
 * Second out-of-line instance of roundsm16, with the register halves swapped
 * relative to the _cd variant:
 *   state x0..x7   = %xmm4..%xmm7, %xmm0..%xmm3
 *   scratch t0..t7 = %xmm12..%xmm15, %xmm8..%xmm11 (clobbered)
 *   memory state   = %rax (passed in the macro's mem_cd slot)
 *   key            = 8 bytes at (%r9)
 * two_roundsm16 calls this for the second round of each pair.
 */
19862306a36Sopenharmony_ci.align 8
19962306a36Sopenharmony_ciSYM_FUNC_START_LOCAL(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
20062306a36Sopenharmony_ci	roundsm16(%xmm4, %xmm5, %xmm6, %xmm7, %xmm0, %xmm1, %xmm2, %xmm3,
20162306a36Sopenharmony_ci		  %xmm12, %xmm13, %xmm14, %xmm15, %xmm8, %xmm9, %xmm10, %xmm11,
20262306a36Sopenharmony_ci		  %rax, (%r9));
20362306a36Sopenharmony_ci	RET;
20462306a36Sopenharmony_ciSYM_FUNC_END(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
20562306a36Sopenharmony_ci
20662306a36Sopenharmony_ci/*
 * two_roundsm16: run two consecutive rounds through the out-of-line
 * roundsm16 helpers.
 *
20762306a36Sopenharmony_ci * IN/OUT:
20862306a36Sopenharmony_ci *  x0..x7: byte-sliced AB state preloaded
20962306a36Sopenharmony_ci *  mem_ab: byte-sliced AB state in memory
21062306a36Sopenharmony_ci *  mem_cd: byte-sliced CD state in memory
 * IN:
 *  i: index of the 8-byte key_table entry used by the first round
 *  dir: added to i to select the key_table entry for the second round
 *  store_ab: macro invoked last as store_ab(x0..x7, mem_ab)
 *
 * Clobbers %r9 (subkey pointer for the helpers). y0..y7 are not referenced
 * by this body; the helpers hard-code %xmm0..%xmm15, %rcx (first round) and
 * %rax (second round), so the x/y and mem_* arguments are expected to name
 * those same registers.
 * NOTE(review): the call sites are outside this view - confirm.
21162306a36Sopenharmony_ci */
21262306a36Sopenharmony_ci#define two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
21362306a36Sopenharmony_ci		      y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
21462306a36Sopenharmony_ci	leaq (key_table + (i) * 8)(CTX), %r9; \
21562306a36Sopenharmony_ci	call roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
21662306a36Sopenharmony_ci	\
	/* write the first round's result to mem_cd: x4..x7 first, then x0..x3 */ \
21762306a36Sopenharmony_ci	vmovdqu x4, 0 * 16(mem_cd); \
21862306a36Sopenharmony_ci	vmovdqu x5, 1 * 16(mem_cd); \
21962306a36Sopenharmony_ci	vmovdqu x6, 2 * 16(mem_cd); \
22062306a36Sopenharmony_ci	vmovdqu x7, 3 * 16(mem_cd); \
22162306a36Sopenharmony_ci	vmovdqu x0, 4 * 16(mem_cd); \
22262306a36Sopenharmony_ci	vmovdqu x1, 5 * 16(mem_cd); \
22362306a36Sopenharmony_ci	vmovdqu x2, 6 * 16(mem_cd); \
22462306a36Sopenharmony_ci	vmovdqu x3, 7 * 16(mem_cd); \
22562306a36Sopenharmony_ci	\
22662306a36Sopenharmony_ci	leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \
22762306a36Sopenharmony_ci	call roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
22862306a36Sopenharmony_ci	\
22962306a36Sopenharmony_ci	store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);
23062306a36Sopenharmony_ci
/*
 * dummy_store: no-op with the same parameter list as store_ab_state; passed
 * as two_roundsm16's store_ab argument when the AB state should stay in
 * registers only.
 */
23162306a36Sopenharmony_ci#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */
23262306a36Sopenharmony_ci
/*
 * store_ab_state: write x0..x7 to eight consecutive 16-byte slots at mem_ab
 * using unaligned stores. Registers are left unchanged.
 */
23362306a36Sopenharmony_ci#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
23462306a36Sopenharmony_ci	/* Store new AB state */ \
23562306a36Sopenharmony_ci	vmovdqu x0, 0 * 16(mem_ab); \
23662306a36Sopenharmony_ci	vmovdqu x1, 1 * 16(mem_ab); \
23762306a36Sopenharmony_ci	vmovdqu x2, 2 * 16(mem_ab); \
23862306a36Sopenharmony_ci	vmovdqu x3, 3 * 16(mem_ab); \
23962306a36Sopenharmony_ci	vmovdqu x4, 4 * 16(mem_ab); \
24062306a36Sopenharmony_ci	vmovdqu x5, 5 * 16(mem_ab); \
24162306a36Sopenharmony_ci	vmovdqu x6, 6 * 16(mem_ab); \
24262306a36Sopenharmony_ci	vmovdqu x7, 7 * 16(mem_ab);
24362306a36Sopenharmony_ci
/*
 * enc_rounds16: six rounds in ascending subkey order, as three
 * two_roundsm16 pairs using key_table entries (i)+2,(i)+3 / (i)+4,(i)+5 /
 * (i)+6,(i)+7. The first two pairs write the AB state back to mem_ab; the
 * last pair uses dummy_store, so the final AB state is left in registers
 * only.
 */
24462306a36Sopenharmony_ci#define enc_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
24562306a36Sopenharmony_ci		      y6, y7, mem_ab, mem_cd, i) \
24662306a36Sopenharmony_ci	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
24762306a36Sopenharmony_ci		      y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
24862306a36Sopenharmony_ci	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
24962306a36Sopenharmony_ci		      y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
25062306a36Sopenharmony_ci	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
25162306a36Sopenharmony_ci		      y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);
25262306a36Sopenharmony_ci
/*
 * dec_rounds16: mirror of enc_rounds16 for the reverse direction: six rounds
 * in descending subkey order, as three two_roundsm16 pairs using key_table
 * entries (i)+7,(i)+6 / (i)+5,(i)+4 / (i)+3,(i)+2 (dir = -1). The last pair
 * uses dummy_store, so the final AB state is left in registers only.
 */
25362306a36Sopenharmony_ci#define dec_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
25462306a36Sopenharmony_ci		      y6, y7, mem_ab, mem_cd, i) \
25562306a36Sopenharmony_ci	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
25662306a36Sopenharmony_ci		      y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
25762306a36Sopenharmony_ci	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
25862306a36Sopenharmony_ci		      y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
25962306a36Sopenharmony_ci	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
26062306a36Sopenharmony_ci		      y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);
26162306a36Sopenharmony_ci
26262306a36Sopenharmony_ci/*
 * rol32_1_16: rotate byte-sliced 32-bit integers left by one bit.
 *
26362306a36Sopenharmony_ci * IN:
26462306a36Sopenharmony_ci *  v0..3: byte-sliced 32-bit integers
 *         (the carry chain runs v0 -> v1 -> v2 -> v3 -> v0, so v0 carries
 *         the least and v3 the most significant byte)
 *  zero: register that must contain all zeroes
26562306a36Sopenharmony_ci * OUT:
26662306a36Sopenharmony_ci *  v0..3: (IN <<< 1)
 *
 * t0..t2 are scratch and clobbered.
26762306a36Sopenharmony_ci */
26862306a36Sopenharmony_ci#define rol32_1_16(v0, v1, v2, v3, t0, t1, t2, zero) \
	/* \
	 * Per byte: (0 > v) signed is 0xff exactly when the top bit of v is \
	 * set; vpabsb turns that into 0x01, the carry-out of the doubling \
	 * done by vpaddb. \
	 */ \
26962306a36Sopenharmony_ci	vpcmpgtb v0, zero, t0; \
27062306a36Sopenharmony_ci	vpaddb v0, v0, v0; \
27162306a36Sopenharmony_ci	vpabsb t0, t0; \
27262306a36Sopenharmony_ci	\
27362306a36Sopenharmony_ci	vpcmpgtb v1, zero, t1; \
27462306a36Sopenharmony_ci	vpaddb v1, v1, v1; \
27562306a36Sopenharmony_ci	vpabsb t1, t1; \
27662306a36Sopenharmony_ci	\
27762306a36Sopenharmony_ci	vpcmpgtb v2, zero, t2; \
27862306a36Sopenharmony_ci	vpaddb v2, v2, v2; \
27962306a36Sopenharmony_ci	vpabsb t2, t2; \
28062306a36Sopenharmony_ci	\
	/* v0's carry goes into v1 now so that t0 can be reused for v3 */ \
28162306a36Sopenharmony_ci	vpor t0, v1, v1; \
28262306a36Sopenharmony_ci	\
28362306a36Sopenharmony_ci	vpcmpgtb v3, zero, t0; \
28462306a36Sopenharmony_ci	vpaddb v3, v3, v3; \
28562306a36Sopenharmony_ci	vpabsb t0, t0; \
28662306a36Sopenharmony_ci	\
28762306a36Sopenharmony_ci	vpor t1, v2, v2; \
28862306a36Sopenharmony_ci	vpor t2, v3, v3; \
	/* the bit shifted out of v3 wraps around into v0 */ \
28962306a36Sopenharmony_ci	vpor t0, v0, v0;
29062306a36Sopenharmony_ci
29162306a36Sopenharmony_ci/*
 * fls16: the four AND/OR + rotate + XOR mixing steps spelled out in the
 * pseudo-code comments inside the body, applied to both state halves.
 * In that pseudo-code: ll = l0..l3, lr = l4..l7, rl = 0..3 * 16(r),
 * rr = 4..7 * 16(r).
 *
29262306a36Sopenharmony_ci * IN:
29362306a36Sopenharmony_ci *   r: byte-sliced AB state in memory
29462306a36Sopenharmony_ci *   l: byte-sliced CD state in memory
 *   l0..l7: registers already holding the state that is written back to l
 *   kll, klr, krl, krr: 32-bit key operands, each read with vmovd
 *   t0..t3, tt0..tt3: scratch registers, clobbered (tt0 is zeroed and
 *                     kept as the byte-broadcast index / zero operand)
29562306a36Sopenharmony_ci * OUT:
29662306a36Sopenharmony_ci *   l0..l7: new byte-sliced CD state
 *           (each updated register is also stored to its slot in l)
 *   r: all eight 16-byte slots updated in place in memory
29762306a36Sopenharmony_ci */
29862306a36Sopenharmony_ci#define fls16(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
29962306a36Sopenharmony_ci	      tt1, tt2, tt3, kll, klr, krl, krr) \
30062306a36Sopenharmony_ci	/* \
30162306a36Sopenharmony_ci	 * t0 = kll; \
30262306a36Sopenharmony_ci	 * t0 &= ll; \
30362306a36Sopenharmony_ci	 * lr ^= rol32(t0, 1); \
30462306a36Sopenharmony_ci	 */ \
	/* \
	 * Key split used four times below: with the zero index in tt0, \
	 * t3/t2/t1/t0 receive key bytes 0/1/2/3 replicated across the \
	 * vector. \
	 */ \
30562306a36Sopenharmony_ci	vpxor tt0, tt0, tt0; \
30662306a36Sopenharmony_ci	vmovd kll, t0; \
30762306a36Sopenharmony_ci	vpshufb tt0, t0, t3; \
30862306a36Sopenharmony_ci	vpsrldq $1, t0, t0; \
30962306a36Sopenharmony_ci	vpshufb tt0, t0, t2; \
31062306a36Sopenharmony_ci	vpsrldq $1, t0, t0; \
31162306a36Sopenharmony_ci	vpshufb tt0, t0, t1; \
31262306a36Sopenharmony_ci	vpsrldq $1, t0, t0; \
31362306a36Sopenharmony_ci	vpshufb tt0, t0, t0; \
31462306a36Sopenharmony_ci	\
31562306a36Sopenharmony_ci	vpand l0, t0, t0; \
31662306a36Sopenharmony_ci	vpand l1, t1, t1; \
31762306a36Sopenharmony_ci	vpand l2, t2, t2; \
31862306a36Sopenharmony_ci	vpand l3, t3, t3; \
31962306a36Sopenharmony_ci	\
32062306a36Sopenharmony_ci	rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
32162306a36Sopenharmony_ci	\
32262306a36Sopenharmony_ci	vpxor l4, t0, l4; \
32362306a36Sopenharmony_ci	vmovdqu l4, 4 * 16(l); \
32462306a36Sopenharmony_ci	vpxor l5, t1, l5; \
32562306a36Sopenharmony_ci	vmovdqu l5, 5 * 16(l); \
32662306a36Sopenharmony_ci	vpxor l6, t2, l6; \
32762306a36Sopenharmony_ci	vmovdqu l6, 6 * 16(l); \
32862306a36Sopenharmony_ci	vpxor l7, t3, l7; \
32962306a36Sopenharmony_ci	vmovdqu l7, 7 * 16(l); \
33062306a36Sopenharmony_ci	\
33162306a36Sopenharmony_ci	/* \
33262306a36Sopenharmony_ci	 * t2 = krr; \
33362306a36Sopenharmony_ci	 * t2 |= rr; \
33462306a36Sopenharmony_ci	 * rl ^= t2; \
33562306a36Sopenharmony_ci	 */ \
33662306a36Sopenharmony_ci	\
33762306a36Sopenharmony_ci	vmovd krr, t0; \
33862306a36Sopenharmony_ci	vpshufb tt0, t0, t3; \
33962306a36Sopenharmony_ci	vpsrldq $1, t0, t0; \
34062306a36Sopenharmony_ci	vpshufb tt0, t0, t2; \
34162306a36Sopenharmony_ci	vpsrldq $1, t0, t0; \
34262306a36Sopenharmony_ci	vpshufb tt0, t0, t1; \
34362306a36Sopenharmony_ci	vpsrldq $1, t0, t0; \
34462306a36Sopenharmony_ci	vpshufb tt0, t0, t0; \
34562306a36Sopenharmony_ci	\
34662306a36Sopenharmony_ci	vpor 4 * 16(r), t0, t0; \
34762306a36Sopenharmony_ci	vpor 5 * 16(r), t1, t1; \
34862306a36Sopenharmony_ci	vpor 6 * 16(r), t2, t2; \
34962306a36Sopenharmony_ci	vpor 7 * 16(r), t3, t3; \
35062306a36Sopenharmony_ci	\
35162306a36Sopenharmony_ci	vpxor 0 * 16(r), t0, t0; \
35262306a36Sopenharmony_ci	vpxor 1 * 16(r), t1, t1; \
35362306a36Sopenharmony_ci	vpxor 2 * 16(r), t2, t2; \
35462306a36Sopenharmony_ci	vpxor 3 * 16(r), t3, t3; \
35562306a36Sopenharmony_ci	vmovdqu t0, 0 * 16(r); \
35662306a36Sopenharmony_ci	vmovdqu t1, 1 * 16(r); \
35762306a36Sopenharmony_ci	vmovdqu t2, 2 * 16(r); \
35862306a36Sopenharmony_ci	vmovdqu t3, 3 * 16(r); \
35962306a36Sopenharmony_ci	\
36062306a36Sopenharmony_ci	/* \
36162306a36Sopenharmony_ci	 * t2 = krl; \
36262306a36Sopenharmony_ci	 * t2 &= rl; \
36362306a36Sopenharmony_ci	 * rr ^= rol32(t2, 1); \
36462306a36Sopenharmony_ci	 */ \
	/* (rl is re-read from memory, i.e. the value stored just above) */ \
36562306a36Sopenharmony_ci	vmovd krl, t0; \
36662306a36Sopenharmony_ci	vpshufb tt0, t0, t3; \
36762306a36Sopenharmony_ci	vpsrldq $1, t0, t0; \
36862306a36Sopenharmony_ci	vpshufb tt0, t0, t2; \
36962306a36Sopenharmony_ci	vpsrldq $1, t0, t0; \
37062306a36Sopenharmony_ci	vpshufb tt0, t0, t1; \
37162306a36Sopenharmony_ci	vpsrldq $1, t0, t0; \
37262306a36Sopenharmony_ci	vpshufb tt0, t0, t0; \
37362306a36Sopenharmony_ci	\
37462306a36Sopenharmony_ci	vpand 0 * 16(r), t0, t0; \
37562306a36Sopenharmony_ci	vpand 1 * 16(r), t1, t1; \
37662306a36Sopenharmony_ci	vpand 2 * 16(r), t2, t2; \
37762306a36Sopenharmony_ci	vpand 3 * 16(r), t3, t3; \
37862306a36Sopenharmony_ci	\
37962306a36Sopenharmony_ci	rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
38062306a36Sopenharmony_ci	\
38162306a36Sopenharmony_ci	vpxor 4 * 16(r), t0, t0; \
38262306a36Sopenharmony_ci	vpxor 5 * 16(r), t1, t1; \
38362306a36Sopenharmony_ci	vpxor 6 * 16(r), t2, t2; \
38462306a36Sopenharmony_ci	vpxor 7 * 16(r), t3, t3; \
38562306a36Sopenharmony_ci	vmovdqu t0, 4 * 16(r); \
38662306a36Sopenharmony_ci	vmovdqu t1, 5 * 16(r); \
38762306a36Sopenharmony_ci	vmovdqu t2, 6 * 16(r); \
38862306a36Sopenharmony_ci	vmovdqu t3, 7 * 16(r); \
38962306a36Sopenharmony_ci	\
39062306a36Sopenharmony_ci	/* \
39162306a36Sopenharmony_ci	 * t0 = klr; \
39262306a36Sopenharmony_ci	 * t0 |= lr; \
39362306a36Sopenharmony_ci	 * ll ^= t0; \
39462306a36Sopenharmony_ci	 */ \
39562306a36Sopenharmony_ci	\
	/* (lr here is l4..l7 as updated by the first step) */ \
39662306a36Sopenharmony_ci	vmovd klr, t0; \
39762306a36Sopenharmony_ci	vpshufb tt0, t0, t3; \
39862306a36Sopenharmony_ci	vpsrldq $1, t0, t0; \
39962306a36Sopenharmony_ci	vpshufb tt0, t0, t2; \
40062306a36Sopenharmony_ci	vpsrldq $1, t0, t0; \
40162306a36Sopenharmony_ci	vpshufb tt0, t0, t1; \
40262306a36Sopenharmony_ci	vpsrldq $1, t0, t0; \
40362306a36Sopenharmony_ci	vpshufb tt0, t0, t0; \
40462306a36Sopenharmony_ci	\
40562306a36Sopenharmony_ci	vpor l4, t0, t0; \
40662306a36Sopenharmony_ci	vpor l5, t1, t1; \
40762306a36Sopenharmony_ci	vpor l6, t2, t2; \
40862306a36Sopenharmony_ci	vpor l7, t3, t3; \
40962306a36Sopenharmony_ci	\
41062306a36Sopenharmony_ci	vpxor l0, t0, l0; \
41162306a36Sopenharmony_ci	vmovdqu l0, 0 * 16(l); \
41262306a36Sopenharmony_ci	vpxor l1, t1, l1; \
41362306a36Sopenharmony_ci	vmovdqu l1, 1 * 16(l); \
41462306a36Sopenharmony_ci	vpxor l2, t2, l2; \
41562306a36Sopenharmony_ci	vmovdqu l2, 2 * 16(l); \
41662306a36Sopenharmony_ci	vpxor l3, t3, l3; \
41762306a36Sopenharmony_ci	vmovdqu l3, 3 * 16(l);
41862306a36Sopenharmony_ci
/*
 * transpose_4x4: in-place transpose of a 4x4 matrix of 32-bit elements whose
 * rows are x0..x3. Afterwards xN holds element N of each original row, in
 * row order.
 *
 * t1, t2: scratch registers, clobbered.
 */
41962306a36Sopenharmony_ci#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
	/* interleave dwords of row pairs (x0,x1) and (x2,x3) */ \
42062306a36Sopenharmony_ci	vpunpckhdq x1, x0, t2; \
42162306a36Sopenharmony_ci	vpunpckldq x1, x0, x0; \
42262306a36Sopenharmony_ci	\
42362306a36Sopenharmony_ci	vpunpckldq x3, x2, t1; \
42462306a36Sopenharmony_ci	vpunpckhdq x3, x2, x2; \
42562306a36Sopenharmony_ci	\
	/* combine the 64-bit halves into the final columns */ \
42662306a36Sopenharmony_ci	vpunpckhqdq t1, x0, x1; \
42762306a36Sopenharmony_ci	vpunpcklqdq t1, x0, x0; \
42862306a36Sopenharmony_ci	\
42962306a36Sopenharmony_ci	vpunpckhqdq x2, t2, x3; \
43062306a36Sopenharmony_ci	vpunpcklqdq x2, t2, x2;
43162306a36Sopenharmony_ci
/*
 * byteslice_16x16b: rearrange sixteen 16-byte vectors between block layout
 * and byte-sliced layout, using 4x4 dword transposes on either side of a
 * per-vector byte shuffle (.Lshufb_16x16b).
 *
 * IN/OUT:
 *   a0..d3: the sixteen vectors, transformed in place
 *   st0, st1: two 16-byte memory operands used as spill slots; every
 *             register argument holds live data, so the transpose_4x4
 *             temporaries are freed by spilling two vectors at a time.
 *             Their previous contents are overwritten.
 */
43262306a36Sopenharmony_ci#define byteslice_16x16b(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, \
43362306a36Sopenharmony_ci			 b3, c3, d3, st0, st1) \
	/* spill d2/d3 and use them as scratch for the a* and b* transposes */ \
43462306a36Sopenharmony_ci	vmovdqu d2, st0; \
43562306a36Sopenharmony_ci	vmovdqu d3, st1; \
43662306a36Sopenharmony_ci	transpose_4x4(a0, a1, a2, a3, d2, d3); \
43762306a36Sopenharmony_ci	transpose_4x4(b0, b1, b2, b3, d2, d3); \
43862306a36Sopenharmony_ci	vmovdqu st0, d2; \
43962306a36Sopenharmony_ci	vmovdqu st1, d3; \
44062306a36Sopenharmony_ci	\
	/* spill a0/a1 and use them as scratch for the c* and d* transposes */ \
44162306a36Sopenharmony_ci	vmovdqu a0, st0; \
44262306a36Sopenharmony_ci	vmovdqu a1, st1; \
44362306a36Sopenharmony_ci	transpose_4x4(c0, c1, c2, c3, a0, a1); \
44462306a36Sopenharmony_ci	transpose_4x4(d0, d1, d2, d3, a0, a1); \
44562306a36Sopenharmony_ci	\
	/* \
	 * a0 carries the shuffle mask while the real a0 is still in st0; \
	 * that vector is shuffled last, through d3, once d3 has been \
	 * parked in st1. \
	 */ \
44662306a36Sopenharmony_ci	vmovdqu .Lshufb_16x16b(%rip), a0; \
44762306a36Sopenharmony_ci	vmovdqu st1, a1; \
44862306a36Sopenharmony_ci	vpshufb a0, a2, a2; \
44962306a36Sopenharmony_ci	vpshufb a0, a3, a3; \
45062306a36Sopenharmony_ci	vpshufb a0, b0, b0; \
45162306a36Sopenharmony_ci	vpshufb a0, b1, b1; \
45262306a36Sopenharmony_ci	vpshufb a0, b2, b2; \
45362306a36Sopenharmony_ci	vpshufb a0, b3, b3; \
45462306a36Sopenharmony_ci	vpshufb a0, a1, a1; \
45562306a36Sopenharmony_ci	vpshufb a0, c0, c0; \
45662306a36Sopenharmony_ci	vpshufb a0, c1, c1; \
45762306a36Sopenharmony_ci	vpshufb a0, c2, c2; \
45862306a36Sopenharmony_ci	vpshufb a0, c3, c3; \
45962306a36Sopenharmony_ci	vpshufb a0, d0, d0; \
46062306a36Sopenharmony_ci	vpshufb a0, d1, d1; \
46162306a36Sopenharmony_ci	vpshufb a0, d2, d2; \
46262306a36Sopenharmony_ci	vpshufb a0, d3, d3; \
46362306a36Sopenharmony_ci	vmovdqu d3, st1; \
46462306a36Sopenharmony_ci	vmovdqu st0, d3; \
46562306a36Sopenharmony_ci	vpshufb a0, d3, a0; \
46662306a36Sopenharmony_ci	vmovdqu d2, st0; \
46762306a36Sopenharmony_ci	\
	/* second transpose pass; d2/d3 live in st0/st1 during this half */ \
46862306a36Sopenharmony_ci	transpose_4x4(a0, b0, c0, d0, d2, d3); \
46962306a36Sopenharmony_ci	transpose_4x4(a1, b1, c1, d1, d2, d3); \
47062306a36Sopenharmony_ci	vmovdqu st0, d2; \
47162306a36Sopenharmony_ci	vmovdqu st1, d3; \
47262306a36Sopenharmony_ci	\
47362306a36Sopenharmony_ci	vmovdqu b0, st0; \
47462306a36Sopenharmony_ci	vmovdqu b1, st1; \
47562306a36Sopenharmony_ci	transpose_4x4(a2, b2, c2, d2, b0, b1); \
47662306a36Sopenharmony_ci	transpose_4x4(a3, b3, c3, d3, b0, b1); \
47762306a36Sopenharmony_ci	vmovdqu st0, b0; \
47862306a36Sopenharmony_ci	vmovdqu st1, b1; \
47962306a36Sopenharmony_ci	/* does not adjust output bytes inside vectors */
48062306a36Sopenharmony_ci
48162306a36Sopenharmony_ci/* load blocks to registers and apply pre-whitening */
/*
 * IN:
 *   rio: pointer to sixteen consecutive 16-byte blocks
 *   key: 64-bit whitening key operand, read with vmovq
 * OUT:
 *   y7..y0, x7..x0: blocks 0..15 in that order (block 0 in y7, block 15 in
 *                   x0), each XORed with the byte-swapped key
 *
 * x0 holds the shuffled key until the very last XOR, which is why the
 * block destined for x0 is loaded last.
 */
48262306a36Sopenharmony_ci#define inpack16_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
48362306a36Sopenharmony_ci		     y6, y7, rio, key) \
48462306a36Sopenharmony_ci	vmovq key, x0; \
48562306a36Sopenharmony_ci	vpshufb .Lpack_bswap(%rip), x0, x0; \
48662306a36Sopenharmony_ci	\
48762306a36Sopenharmony_ci	vpxor 0 * 16(rio), x0, y7; \
48862306a36Sopenharmony_ci	vpxor 1 * 16(rio), x0, y6; \
48962306a36Sopenharmony_ci	vpxor 2 * 16(rio), x0, y5; \
49062306a36Sopenharmony_ci	vpxor 3 * 16(rio), x0, y4; \
49162306a36Sopenharmony_ci	vpxor 4 * 16(rio), x0, y3; \
49262306a36Sopenharmony_ci	vpxor 5 * 16(rio), x0, y2; \
49362306a36Sopenharmony_ci	vpxor 6 * 16(rio), x0, y1; \
49462306a36Sopenharmony_ci	vpxor 7 * 16(rio), x0, y0; \
49562306a36Sopenharmony_ci	vpxor 8 * 16(rio), x0, x7; \
49662306a36Sopenharmony_ci	vpxor 9 * 16(rio), x0, x6; \
49762306a36Sopenharmony_ci	vpxor 10 * 16(rio), x0, x5; \
49862306a36Sopenharmony_ci	vpxor 11 * 16(rio), x0, x4; \
49962306a36Sopenharmony_ci	vpxor 12 * 16(rio), x0, x3; \
50062306a36Sopenharmony_ci	vpxor 13 * 16(rio), x0, x2; \
50162306a36Sopenharmony_ci	vpxor 14 * 16(rio), x0, x1; \
50262306a36Sopenharmony_ci	vpxor 15 * 16(rio), x0, x0;
50362306a36Sopenharmony_ci
50462306a36Sopenharmony_ci/* byteslice pre-whitened blocks and store to temporary memory */
/*
 * IN:
 *   x0..x7, y0..y7: the sixteen pre-whitened blocks
 *   mem_ab, mem_cd: pointers to two 8 * 16-byte buffers
 * OUT:
 *   x0..x7 stored at mem_ab, y0..y7 stored at mem_cd (registers keep the
 *   same byte-sliced values)
 *
 * The first 16 bytes of each buffer double as the byteslice_16x16b spill
 * slots; they are overwritten by the final stores.
 */
50562306a36Sopenharmony_ci#define inpack16_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
50662306a36Sopenharmony_ci		      y6, y7, mem_ab, mem_cd) \
50762306a36Sopenharmony_ci	byteslice_16x16b(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
50862306a36Sopenharmony_ci			 y5, y6, y7, (mem_ab), (mem_cd)); \
50962306a36Sopenharmony_ci	\
51062306a36Sopenharmony_ci	vmovdqu x0, 0 * 16(mem_ab); \
51162306a36Sopenharmony_ci	vmovdqu x1, 1 * 16(mem_ab); \
51262306a36Sopenharmony_ci	vmovdqu x2, 2 * 16(mem_ab); \
51362306a36Sopenharmony_ci	vmovdqu x3, 3 * 16(mem_ab); \
51462306a36Sopenharmony_ci	vmovdqu x4, 4 * 16(mem_ab); \
51562306a36Sopenharmony_ci	vmovdqu x5, 5 * 16(mem_ab); \
51662306a36Sopenharmony_ci	vmovdqu x6, 6 * 16(mem_ab); \
51762306a36Sopenharmony_ci	vmovdqu x7, 7 * 16(mem_ab); \
51862306a36Sopenharmony_ci	vmovdqu y0, 0 * 16(mem_cd); \
51962306a36Sopenharmony_ci	vmovdqu y1, 1 * 16(mem_cd); \
52062306a36Sopenharmony_ci	vmovdqu y2, 2 * 16(mem_cd); \
52162306a36Sopenharmony_ci	vmovdqu y3, 3 * 16(mem_cd); \
52262306a36Sopenharmony_ci	vmovdqu y4, 4 * 16(mem_cd); \
52362306a36Sopenharmony_ci	vmovdqu y5, 5 * 16(mem_cd); \
52462306a36Sopenharmony_ci	vmovdqu y6, 6 * 16(mem_cd); \
52562306a36Sopenharmony_ci	vmovdqu y7, 7 * 16(mem_cd);
52662306a36Sopenharmony_ci
52762306a36Sopenharmony_ci/* de-byteslice, apply post-whitening and store blocks */
/*
 * IN:
 *   x0..x7, y0..y7: byte-sliced state
 *   key: 64-bit whitening key operand, read with vmovq
 *   stack_tmp0, stack_tmp1: two 16-byte memory scratch slots (clobbered)
 * OUT:
 *   x0..x7, y0..y7: de-byte-sliced blocks XORed with the byte-swapped key
 *
 * This macro leaves the result in registers; it contains no store to the
 * destination buffer (see write_output).
 */
52862306a36Sopenharmony_ci#define outunpack16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
52962306a36Sopenharmony_ci		    y5, y6, y7, key, stack_tmp0, stack_tmp1) \
53062306a36Sopenharmony_ci	byteslice_16x16b(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, y3, \
53162306a36Sopenharmony_ci			 y7, x3, x7, stack_tmp0, stack_tmp1); \
53262306a36Sopenharmony_ci	\
	/* park x0 so that it can hold the key; it is folded back in last */ \
53362306a36Sopenharmony_ci	vmovdqu x0, stack_tmp0; \
53462306a36Sopenharmony_ci	\
53562306a36Sopenharmony_ci	vmovq key, x0; \
53662306a36Sopenharmony_ci	vpshufb .Lpack_bswap(%rip), x0, x0; \
53762306a36Sopenharmony_ci	\
53862306a36Sopenharmony_ci	vpxor x0, y7, y7; \
53962306a36Sopenharmony_ci	vpxor x0, y6, y6; \
54062306a36Sopenharmony_ci	vpxor x0, y5, y5; \
54162306a36Sopenharmony_ci	vpxor x0, y4, y4; \
54262306a36Sopenharmony_ci	vpxor x0, y3, y3; \
54362306a36Sopenharmony_ci	vpxor x0, y2, y2; \
54462306a36Sopenharmony_ci	vpxor x0, y1, y1; \
54562306a36Sopenharmony_ci	vpxor x0, y0, y0; \
54662306a36Sopenharmony_ci	vpxor x0, x7, x7; \
54762306a36Sopenharmony_ci	vpxor x0, x6, x6; \
54862306a36Sopenharmony_ci	vpxor x0, x5, x5; \
54962306a36Sopenharmony_ci	vpxor x0, x4, x4; \
55062306a36Sopenharmony_ci	vpxor x0, x3, x3; \
55162306a36Sopenharmony_ci	vpxor x0, x2, x2; \
55262306a36Sopenharmony_ci	vpxor x0, x1, x1; \
55362306a36Sopenharmony_ci	vpxor stack_tmp0, x0, x0;
55462306a36Sopenharmony_ci
/*
 * write_output: store x0..x7 followed by y0..y7 as sixteen consecutive
 * 16-byte blocks starting at rio (unaligned stores; registers unchanged).
 */
55562306a36Sopenharmony_ci#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
55662306a36Sopenharmony_ci		     y6, y7, rio) \
55762306a36Sopenharmony_ci	vmovdqu x0, 0 * 16(rio); \
55862306a36Sopenharmony_ci	vmovdqu x1, 1 * 16(rio); \
55962306a36Sopenharmony_ci	vmovdqu x2, 2 * 16(rio); \
56062306a36Sopenharmony_ci	vmovdqu x3, 3 * 16(rio); \
56162306a36Sopenharmony_ci	vmovdqu x4, 4 * 16(rio); \
56262306a36Sopenharmony_ci	vmovdqu x5, 5 * 16(rio); \
56362306a36Sopenharmony_ci	vmovdqu x6, 6 * 16(rio); \
56462306a36Sopenharmony_ci	vmovdqu x7, 7 * 16(rio); \
56562306a36Sopenharmony_ci	vmovdqu y0, 8 * 16(rio); \
56662306a36Sopenharmony_ci	vmovdqu y1, 9 * 16(rio); \
56762306a36Sopenharmony_ci	vmovdqu y2, 10 * 16(rio); \
56862306a36Sopenharmony_ci	vmovdqu y3, 11 * 16(rio); \
56962306a36Sopenharmony_ci	vmovdqu y4, 12 * 16(rio); \
57062306a36Sopenharmony_ci	vmovdqu y5, 13 * 16(rio); \
57162306a36Sopenharmony_ci	vmovdqu y6, 14 * 16(rio); \
57262306a36Sopenharmony_ci	vmovdqu y7, 15 * 16(rio);
57362306a36Sopenharmony_ci
57462306a36Sopenharmony_ci
57562306a36Sopenharmony_ci/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
57662306a36Sopenharmony_ci.section	.rodata.cst16, "aM", @progbits, 16
57762306a36Sopenharmony_ci.align 16
57862306a36Sopenharmony_ci
/* SHUFB_BYTES(idx): the four byte indices idx, 4 + idx, 8 + idx, 12 + idx. */
57962306a36Sopenharmony_ci#define SHUFB_BYTES(idx) \
58062306a36Sopenharmony_ci	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
58162306a36Sopenharmony_ci
/*
 * vpshufb mask 0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15: gathers byte k of
 * every dword into dword k, i.e. a 4x4 byte transpose inside one vector.
 * Used by byteslice_16x16b.
 */
58262306a36Sopenharmony_ci.Lshufb_16x16b:
58362306a36Sopenharmony_ci	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);
58462306a36Sopenharmony_ci
/*
 * vpshufb mask that reverses the byte order inside each of the two low
 * dwords and zeroes the upper 8 bytes (index bytes of 0x80 select zero).
 * Applied to the 64-bit whitening key in inpack16_pre and outunpack16.
 */
58562306a36Sopenharmony_ci.Lpack_bswap:
58662306a36Sopenharmony_ci	.long 0x00010203
58762306a36Sopenharmony_ci	.long 0x04050607
58862306a36Sopenharmony_ci	.long 0x80808080
58962306a36Sopenharmony_ci	.long 0x80808080
59062306a36Sopenharmony_ci
59162306a36Sopenharmony_ci/*
59262306a36Sopenharmony_ci * pre-SubByte transform
59362306a36Sopenharmony_ci *
59462306a36Sopenharmony_ci * pre-lookup for sbox1, sbox2, sbox3:
59562306a36Sopenharmony_ci *   swap_bitendianness(
59662306a36Sopenharmony_ci *       isom_map_camellia_to_aes(
59762306a36Sopenharmony_ci *           camellia_f(
59862306a36Sopenharmony_ci *               swap_bitendianness(in)
59962306a36Sopenharmony_ci *           )
60062306a36Sopenharmony_ci *       )
60162306a36Sopenharmony_ci *   )
60262306a36Sopenharmony_ci *
60362306a36Sopenharmony_ci * (note: '⊕ 0xc5' inside camellia_f())
 *
 * Stored as a pair of 16-entry vpshufb tables for filter_8bit: the _lo
 * table is indexed by the low nibble of each byte, the _hi table by the
 * high nibble, and the two lookups are XORed.
60462306a36Sopenharmony_ci */
60562306a36Sopenharmony_ci.Lpre_tf_lo_s1:
60662306a36Sopenharmony_ci	.byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
60762306a36Sopenharmony_ci	.byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
60862306a36Sopenharmony_ci.Lpre_tf_hi_s1:
60962306a36Sopenharmony_ci	.byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
61062306a36Sopenharmony_ci	.byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23
61162306a36Sopenharmony_ci
61262306a36Sopenharmony_ci/*
61362306a36Sopenharmony_ci * pre-SubByte transform
61462306a36Sopenharmony_ci *
61562306a36Sopenharmony_ci * pre-lookup for sbox4:
61662306a36Sopenharmony_ci *   swap_bitendianness(
61762306a36Sopenharmony_ci *       isom_map_camellia_to_aes(
61862306a36Sopenharmony_ci *           camellia_f(
61962306a36Sopenharmony_ci *               swap_bitendianness(in <<< 1)
62062306a36Sopenharmony_ci *           )
62162306a36Sopenharmony_ci *       )
62262306a36Sopenharmony_ci *   )
62362306a36Sopenharmony_ci *
62462306a36Sopenharmony_ci * (note: '⊕ 0xc5' inside camellia_f())
 *
 * Stored as a _lo/_hi pair of 16-entry vpshufb tables for filter_8bit
 * (low-nibble and high-nibble lookups, XORed together).
62562306a36Sopenharmony_ci */
62662306a36Sopenharmony_ci.Lpre_tf_lo_s4:
62762306a36Sopenharmony_ci	.byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
62862306a36Sopenharmony_ci	.byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
62962306a36Sopenharmony_ci.Lpre_tf_hi_s4:
63062306a36Sopenharmony_ci	.byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
63162306a36Sopenharmony_ci	.byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf
63262306a36Sopenharmony_ci
63362306a36Sopenharmony_ci/*
63462306a36Sopenharmony_ci * post-SubByte transform
63562306a36Sopenharmony_ci *
63662306a36Sopenharmony_ci * post-lookup for sbox1, sbox4:
63762306a36Sopenharmony_ci *  swap_bitendianness(
63862306a36Sopenharmony_ci *      camellia_h(
63962306a36Sopenharmony_ci *          isom_map_aes_to_camellia(
64062306a36Sopenharmony_ci *              swap_bitendianness(
64162306a36Sopenharmony_ci *                  aes_inverse_affine_transform(in)
64262306a36Sopenharmony_ci *              )
64362306a36Sopenharmony_ci *          )
64462306a36Sopenharmony_ci *      )
64562306a36Sopenharmony_ci *  )
64662306a36Sopenharmony_ci *
64762306a36Sopenharmony_ci * (note: '⊕ 0x6e' inside camellia_h())
64862306a36Sopenharmony_ci */
64962306a36Sopenharmony_ci.Lpost_tf_lo_s1:
65062306a36Sopenharmony_ci	.byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
65162306a36Sopenharmony_ci	.byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
65262306a36Sopenharmony_ci.Lpost_tf_hi_s1:
65362306a36Sopenharmony_ci	.byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
65462306a36Sopenharmony_ci	.byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c
65562306a36Sopenharmony_ci
65662306a36Sopenharmony_ci/*
65762306a36Sopenharmony_ci * post-SubByte transform
65862306a36Sopenharmony_ci *
65962306a36Sopenharmony_ci * post-lookup for sbox2:
66062306a36Sopenharmony_ci *  swap_bitendianness(
66162306a36Sopenharmony_ci *      camellia_h(
66262306a36Sopenharmony_ci *          isom_map_aes_to_camellia(
66362306a36Sopenharmony_ci *              swap_bitendianness(
66462306a36Sopenharmony_ci *                  aes_inverse_affine_transform(in)
66562306a36Sopenharmony_ci *              )
66662306a36Sopenharmony_ci *          )
66762306a36Sopenharmony_ci *      )
66862306a36Sopenharmony_ci *  ) <<< 1
66962306a36Sopenharmony_ci *
67062306a36Sopenharmony_ci * (note: '⊕ 0x6e' inside camellia_h())
67162306a36Sopenharmony_ci */
67262306a36Sopenharmony_ci.Lpost_tf_lo_s2:
67362306a36Sopenharmony_ci	.byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
67462306a36Sopenharmony_ci	.byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
67562306a36Sopenharmony_ci.Lpost_tf_hi_s2:
67662306a36Sopenharmony_ci	.byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
67762306a36Sopenharmony_ci	.byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18
67862306a36Sopenharmony_ci
67962306a36Sopenharmony_ci/*
68062306a36Sopenharmony_ci * post-SubByte transform
68162306a36Sopenharmony_ci *
68262306a36Sopenharmony_ci * post-lookup for sbox3:
68362306a36Sopenharmony_ci *  swap_bitendianness(
68462306a36Sopenharmony_ci *      camellia_h(
68562306a36Sopenharmony_ci *          isom_map_aes_to_camellia(
68662306a36Sopenharmony_ci *              swap_bitendianness(
68762306a36Sopenharmony_ci *                  aes_inverse_affine_transform(in)
68862306a36Sopenharmony_ci *              )
68962306a36Sopenharmony_ci *          )
69062306a36Sopenharmony_ci *      )
69162306a36Sopenharmony_ci *  ) >>> 1
69262306a36Sopenharmony_ci *
69362306a36Sopenharmony_ci * (note: '⊕ 0x6e' inside camellia_h())
69462306a36Sopenharmony_ci */
69562306a36Sopenharmony_ci.Lpost_tf_lo_s3:
69662306a36Sopenharmony_ci	.byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
69762306a36Sopenharmony_ci	.byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
69862306a36Sopenharmony_ci.Lpost_tf_hi_s3:
69962306a36Sopenharmony_ci	.byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
70062306a36Sopenharmony_ci	.byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06
70162306a36Sopenharmony_ci
70262306a36Sopenharmony_ci/* For isolating SubBytes from AESENCLAST, inverse shift row */
/*
 * 16 byte indices; entry i is (13 * i) mod 16:
 *   0, 13, 10, 7, 4, 1, 14, 11, 8, 5, 2, 15, 12, 9, 6, 3
 */
70362306a36Sopenharmony_ci.Linv_shift_row:
70462306a36Sopenharmony_ci	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
70562306a36Sopenharmony_ci	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
70662306a36Sopenharmony_ci
70762306a36Sopenharmony_ci/* 4-bit mask */
/*
 * A single dword whose four bytes are all 0x0f, placed in its own mergeable
 * ("aM") read-only section with a 4-byte entity size and 4-byte alignment.
 * Presumably broadcast and passed as the mask4bit operand of filter_8bit(),
 * which uses such a mask to split bytes into low and high nibbles - the
 * loading code is outside this part of the file.
 */
70862306a36Sopenharmony_ci.section	.rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
70962306a36Sopenharmony_ci.align 4
71062306a36Sopenharmony_ci.L0f0f0f0f:
71162306a36Sopenharmony_ci	.long 0x0f0f0f0f
71262306a36Sopenharmony_ci
71362306a36Sopenharmony_ci.text
71462306a36Sopenharmony_ci
71562306a36Sopenharmony_ciSYM_FUNC_START_LOCAL(__camellia_enc_blk16)
71662306a36Sopenharmony_ci	/* input:
71762306a36Sopenharmony_ci	 *	%rdi: ctx, CTX
71862306a36Sopenharmony_ci	 *	%rax: temporary storage, 256 bytes
71962306a36Sopenharmony_ci	 *	%xmm0..%xmm15: 16 plaintext blocks
72062306a36Sopenharmony_ci	 * output:
72162306a36Sopenharmony_ci	 *	%xmm0..%xmm15: 16 encrypted blocks, order swapped:
72262306a36Sopenharmony_ci	 *       7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
	 *       (the order in which the callers below hand the registers
	 *       to write_output)
	 * clobbers:
	 *	%rcx, %r8 and the flags are written directly by this function;
	 *	the macros it expands (defined outside this part of the file)
	 *	may clobber more.
	 *
	 * Flow: three enc_rounds16 groups (last argument 0, 8, 16) separated
	 * by fls16 steps keyed from key_table entries 8 and 16; when
	 * key_length != 16 an additional fls16 (entry 24) and enc_rounds16
	 * group (24) run before the output step.
72362306a36Sopenharmony_ci	 */
72462306a36Sopenharmony_ci	FRAME_BEGIN
72562306a36Sopenharmony_ci
	/* %rcx = %rax + 8 * 16: upper 128 bytes of the temporary storage */
72662306a36Sopenharmony_ci	leaq 8 * 16(%rax), %rcx;
72762306a36Sopenharmony_ci
72862306a36Sopenharmony_ci	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
72962306a36Sopenharmony_ci		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
73062306a36Sopenharmony_ci		      %xmm15, %rax, %rcx);
73162306a36Sopenharmony_ci
73262306a36Sopenharmony_ci	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
73362306a36Sopenharmony_ci		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
73462306a36Sopenharmony_ci		     %xmm15, %rax, %rcx, 0);
73562306a36Sopenharmony_ci
	/* fls16 operands: the four 32-bit words at key_table + 8 * 8 */
73662306a36Sopenharmony_ci	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
73762306a36Sopenharmony_ci	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
73862306a36Sopenharmony_ci	      %xmm15,
73962306a36Sopenharmony_ci	      ((key_table + (8) * 8) + 0)(CTX),
74062306a36Sopenharmony_ci	      ((key_table + (8) * 8) + 4)(CTX),
74162306a36Sopenharmony_ci	      ((key_table + (8) * 8) + 8)(CTX),
74262306a36Sopenharmony_ci	      ((key_table + (8) * 8) + 12)(CTX));
74362306a36Sopenharmony_ci
74462306a36Sopenharmony_ci	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
74562306a36Sopenharmony_ci		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
74662306a36Sopenharmony_ci		     %xmm15, %rax, %rcx, 8);
74762306a36Sopenharmony_ci
	/* fls16 operands: the four 32-bit words at key_table + 16 * 8 */
74862306a36Sopenharmony_ci	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
74962306a36Sopenharmony_ci	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
75062306a36Sopenharmony_ci	      %xmm15,
75162306a36Sopenharmony_ci	      ((key_table + (16) * 8) + 0)(CTX),
75262306a36Sopenharmony_ci	      ((key_table + (16) * 8) + 4)(CTX),
75362306a36Sopenharmony_ci	      ((key_table + (16) * 8) + 8)(CTX),
75462306a36Sopenharmony_ci	      ((key_table + (16) * 8) + 12)(CTX));
75562306a36Sopenharmony_ci
75662306a36Sopenharmony_ci	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
75762306a36Sopenharmony_ci		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
75862306a36Sopenharmony_ci		     %xmm15, %rax, %rcx, 16);
75962306a36Sopenharmony_ci
	/*
	 * %r8 indexes (in 8-byte units) the key_table entry handed to
	 * outunpack16 at .Lenc_done: 24 when key_length == 16, otherwise
	 * .Lenc_max32 overrides it with 32 after running the extra rounds.
	 */
76062306a36Sopenharmony_ci	movl $24, %r8d;
76162306a36Sopenharmony_ci	cmpl $16, key_length(CTX);
76262306a36Sopenharmony_ci	jne .Lenc_max32;
76362306a36Sopenharmony_ci
76462306a36Sopenharmony_ci.Lenc_done:
76562306a36Sopenharmony_ci	/* load CD for output */
	/* (eight 16-byte slots at %rcx are reloaded into %xmm8..%xmm15) */
76662306a36Sopenharmony_ci	vmovdqu 0 * 16(%rcx), %xmm8;
76762306a36Sopenharmony_ci	vmovdqu 1 * 16(%rcx), %xmm9;
76862306a36Sopenharmony_ci	vmovdqu 2 * 16(%rcx), %xmm10;
76962306a36Sopenharmony_ci	vmovdqu 3 * 16(%rcx), %xmm11;
77062306a36Sopenharmony_ci	vmovdqu 4 * 16(%rcx), %xmm12;
77162306a36Sopenharmony_ci	vmovdqu 5 * 16(%rcx), %xmm13;
77262306a36Sopenharmony_ci	vmovdqu 6 * 16(%rcx), %xmm14;
77362306a36Sopenharmony_ci	vmovdqu 7 * 16(%rcx), %xmm15;
77462306a36Sopenharmony_ci
	/* key operand is key_table[%r8]; (%rax) and 16(%rax) are passed as scratch slots */
77562306a36Sopenharmony_ci	outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
77662306a36Sopenharmony_ci		    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
77762306a36Sopenharmony_ci		    %xmm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 16(%rax));
77862306a36Sopenharmony_ci
77962306a36Sopenharmony_ci	FRAME_END
78062306a36Sopenharmony_ci	RET;
78162306a36Sopenharmony_ci
78262306a36Sopenharmony_ci.align 8
78362306a36Sopenharmony_ci.Lenc_max32:
	/*
	 * key_length != 16: one more fls16 (key_table entry 24) and
	 * enc_rounds16 group (24), then finish at .Lenc_done with %r8d = 32.
	 */
78462306a36Sopenharmony_ci	movl $32, %r8d;
78562306a36Sopenharmony_ci
78662306a36Sopenharmony_ci	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
78762306a36Sopenharmony_ci	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
78862306a36Sopenharmony_ci	      %xmm15,
78962306a36Sopenharmony_ci	      ((key_table + (24) * 8) + 0)(CTX),
79062306a36Sopenharmony_ci	      ((key_table + (24) * 8) + 4)(CTX),
79162306a36Sopenharmony_ci	      ((key_table + (24) * 8) + 8)(CTX),
79262306a36Sopenharmony_ci	      ((key_table + (24) * 8) + 12)(CTX));
79362306a36Sopenharmony_ci
79462306a36Sopenharmony_ci	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
79562306a36Sopenharmony_ci		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
79662306a36Sopenharmony_ci		     %xmm15, %rax, %rcx, 24);
79762306a36Sopenharmony_ci
79862306a36Sopenharmony_ci	jmp .Lenc_done;
79962306a36Sopenharmony_ciSYM_FUNC_END(__camellia_enc_blk16)
80062306a36Sopenharmony_ci
80162306a36Sopenharmony_ciSYM_FUNC_START_LOCAL(__camellia_dec_blk16)
80262306a36Sopenharmony_ci	/* input:
80362306a36Sopenharmony_ci	 *	%rdi: ctx, CTX
80462306a36Sopenharmony_ci	 *	%rax: temporary storage, 256 bytes
80562306a36Sopenharmony_ci	 *	%r8d: 24 for 16 byte key, 32 for larger
80662306a36Sopenharmony_ci	 *	%xmm0..%xmm15: 16 encrypted blocks
80762306a36Sopenharmony_ci	 * output:
80862306a36Sopenharmony_ci	 *	%xmm0..%xmm15: 16 plaintext blocks, order swapped:
80962306a36Sopenharmony_ci	 *       7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
	 *       (the order in which the callers below hand the registers
	 *       to write_output)
	 * clobbers:
	 *	%rcx and the flags are written directly by this function; the
	 *	macros it expands (defined outside this part of the file) may
	 *	clobber more.
	 *
	 * Flow: the encryption schedule walked backwards. With %r8d == 32
	 * an extra dec_rounds16 group (24) and fls16 (key_table entry 24)
	 * run first; then dec_rounds16 groups 16, 8, 0 separated by fls16
	 * steps keyed from entries 16 and 8. Each fls16 receives its four
	 * key words in the order +8, +12, +0, +4, whereas the encryption
	 * path passes +0, +4, +8, +12.
81062306a36Sopenharmony_ci	 */
81162306a36Sopenharmony_ci	FRAME_BEGIN
81262306a36Sopenharmony_ci
	/* %rcx = %rax + 8 * 16: upper 128 bytes of the temporary storage */
81362306a36Sopenharmony_ci	leaq 8 * 16(%rax), %rcx;
81462306a36Sopenharmony_ci
81562306a36Sopenharmony_ci	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
81662306a36Sopenharmony_ci		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
81762306a36Sopenharmony_ci		      %xmm15, %rax, %rcx);
81862306a36Sopenharmony_ci
	/* larger keys take the extra round group first, then rejoin below */
81962306a36Sopenharmony_ci	cmpl $32, %r8d;
82062306a36Sopenharmony_ci	je .Ldec_max32;
82162306a36Sopenharmony_ci
82262306a36Sopenharmony_ci.Ldec_max24:
82362306a36Sopenharmony_ci	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
82462306a36Sopenharmony_ci		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
82562306a36Sopenharmony_ci		     %xmm15, %rax, %rcx, 16);
82662306a36Sopenharmony_ci
82762306a36Sopenharmony_ci	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
82862306a36Sopenharmony_ci	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
82962306a36Sopenharmony_ci	      %xmm15,
83062306a36Sopenharmony_ci	      ((key_table + (16) * 8) + 8)(CTX),
83162306a36Sopenharmony_ci	      ((key_table + (16) * 8) + 12)(CTX),
83262306a36Sopenharmony_ci	      ((key_table + (16) * 8) + 0)(CTX),
83362306a36Sopenharmony_ci	      ((key_table + (16) * 8) + 4)(CTX));
83462306a36Sopenharmony_ci
83562306a36Sopenharmony_ci	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
83662306a36Sopenharmony_ci		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
83762306a36Sopenharmony_ci		     %xmm15, %rax, %rcx, 8);
83862306a36Sopenharmony_ci
83962306a36Sopenharmony_ci	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
84062306a36Sopenharmony_ci	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
84162306a36Sopenharmony_ci	      %xmm15,
84262306a36Sopenharmony_ci	      ((key_table + (8) * 8) + 8)(CTX),
84362306a36Sopenharmony_ci	      ((key_table + (8) * 8) + 12)(CTX),
84462306a36Sopenharmony_ci	      ((key_table + (8) * 8) + 0)(CTX),
84562306a36Sopenharmony_ci	      ((key_table + (8) * 8) + 4)(CTX));
84662306a36Sopenharmony_ci
84762306a36Sopenharmony_ci	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
84862306a36Sopenharmony_ci		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
84962306a36Sopenharmony_ci		     %xmm15, %rax, %rcx, 0);
85062306a36Sopenharmony_ci
85162306a36Sopenharmony_ci	/* load CD for output */
	/* (eight 16-byte slots at %rcx are reloaded into %xmm8..%xmm15) */
85262306a36Sopenharmony_ci	vmovdqu 0 * 16(%rcx), %xmm8;
85362306a36Sopenharmony_ci	vmovdqu 1 * 16(%rcx), %xmm9;
85462306a36Sopenharmony_ci	vmovdqu 2 * 16(%rcx), %xmm10;
85562306a36Sopenharmony_ci	vmovdqu 3 * 16(%rcx), %xmm11;
85662306a36Sopenharmony_ci	vmovdqu 4 * 16(%rcx), %xmm12;
85762306a36Sopenharmony_ci	vmovdqu 5 * 16(%rcx), %xmm13;
85862306a36Sopenharmony_ci	vmovdqu 6 * 16(%rcx), %xmm14;
85962306a36Sopenharmony_ci	vmovdqu 7 * 16(%rcx), %xmm15;
86062306a36Sopenharmony_ci
	/* key operand is key_table entry 0; (%rax) and 16(%rax) are passed as scratch slots */
86162306a36Sopenharmony_ci	outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
86262306a36Sopenharmony_ci		    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
86362306a36Sopenharmony_ci		    %xmm15, (key_table)(CTX), (%rax), 1 * 16(%rax));
86462306a36Sopenharmony_ci
86562306a36Sopenharmony_ci	FRAME_END
86662306a36Sopenharmony_ci	RET;
86762306a36Sopenharmony_ci
86862306a36Sopenharmony_ci.align 8
86962306a36Sopenharmony_ci.Ldec_max32:
	/*
	 * %r8d == 32: run the additional dec_rounds16 group (24) and the
	 * fls16 keyed from key_table entry 24, then continue with the
	 * common path at .Ldec_max24.
	 */
87062306a36Sopenharmony_ci	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
87162306a36Sopenharmony_ci		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
87262306a36Sopenharmony_ci		     %xmm15, %rax, %rcx, 24);
87362306a36Sopenharmony_ci
87462306a36Sopenharmony_ci	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
87562306a36Sopenharmony_ci	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
87662306a36Sopenharmony_ci	      %xmm15,
87762306a36Sopenharmony_ci	      ((key_table + (24) * 8) + 8)(CTX),
87862306a36Sopenharmony_ci	      ((key_table + (24) * 8) + 12)(CTX),
87962306a36Sopenharmony_ci	      ((key_table + (24) * 8) + 0)(CTX),
88062306a36Sopenharmony_ci	      ((key_table + (24) * 8) + 4)(CTX));
88162306a36Sopenharmony_ci
88262306a36Sopenharmony_ci	jmp .Ldec_max24;
88362306a36Sopenharmony_ciSYM_FUNC_END(__camellia_dec_blk16)
88462306a36Sopenharmony_ci
88562306a36Sopenharmony_ciSYM_FUNC_START(camellia_ecb_enc_16way)
88662306a36Sopenharmony_ci	/* input:
88762306a36Sopenharmony_ci	 *	%rdi: ctx, CTX
88862306a36Sopenharmony_ci	 *	%rsi: dst (16 blocks)
88962306a36Sopenharmony_ci	 *	%rdx: src (16 blocks)
	 *
	 * Encrypts 16 blocks: inpack16_pre is given src and key_table
	 * entry 0, __camellia_enc_blk16 does the rounds using dst as its
	 * 256-byte temporary storage, and write_output then stores the
	 * result registers to dst.
	 * %rax is overwritten here, in addition to whatever
	 * __camellia_enc_blk16 and the macros clobber.
89062306a36Sopenharmony_ci	 */
89162306a36Sopenharmony_ci	 FRAME_BEGIN
89262306a36Sopenharmony_ci
89362306a36Sopenharmony_ci	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
89462306a36Sopenharmony_ci		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
89562306a36Sopenharmony_ci		     %xmm15, %rdx, (key_table)(CTX));
89662306a36Sopenharmony_ci
89762306a36Sopenharmony_ci	/* now dst can be used as temporary buffer (even in src == dst case) */
89862306a36Sopenharmony_ci	movq	%rsi, %rax;
89962306a36Sopenharmony_ci
90062306a36Sopenharmony_ci	call __camellia_enc_blk16;
90162306a36Sopenharmony_ci
	/* registers are passed in the swapped order 7..0, 15..8 documented by the callee */
90262306a36Sopenharmony_ci	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
90362306a36Sopenharmony_ci		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
90462306a36Sopenharmony_ci		     %xmm8, %rsi);
90562306a36Sopenharmony_ci
90662306a36Sopenharmony_ci	FRAME_END
90762306a36Sopenharmony_ci	RET;
90862306a36Sopenharmony_ciSYM_FUNC_END(camellia_ecb_enc_16way)
90962306a36Sopenharmony_ci
91062306a36Sopenharmony_ciSYM_FUNC_START(camellia_ecb_dec_16way)
91162306a36Sopenharmony_ci	/* input:
91262306a36Sopenharmony_ci	 *	%rdi: ctx, CTX
91362306a36Sopenharmony_ci	 *	%rsi: dst (16 blocks)
91462306a36Sopenharmony_ci	 *	%rdx: src (16 blocks)
	 *
	 * Decrypts 16 blocks: inpack16_pre is given src and key_table[%r8],
	 * __camellia_dec_blk16 does the rounds using dst as its 256-byte
	 * temporary storage, and write_output then stores the result
	 * registers to dst.
	 * %rax and %r8 are overwritten here, in addition to whatever
	 * __camellia_dec_blk16 and the macros clobber.
91562306a36Sopenharmony_ci	 */
91662306a36Sopenharmony_ci	 FRAME_BEGIN
91762306a36Sopenharmony_ci
	/*
	 * %r8d = 24 if key_length == 16, else 32. The two movl instructions
	 * leave the flags set by cmpl untouched, so cmovel still sees them.
	 */
91862306a36Sopenharmony_ci	cmpl $16, key_length(CTX);
91962306a36Sopenharmony_ci	movl $32, %r8d;
92062306a36Sopenharmony_ci	movl $24, %eax;
92162306a36Sopenharmony_ci	cmovel %eax, %r8d; /* max */
92262306a36Sopenharmony_ci
92362306a36Sopenharmony_ci	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
92462306a36Sopenharmony_ci		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
92562306a36Sopenharmony_ci		     %xmm15, %rdx, (key_table)(CTX, %r8, 8));
92662306a36Sopenharmony_ci
92762306a36Sopenharmony_ci	/* now dst can be used as temporary buffer (even in src == dst case) */
92862306a36Sopenharmony_ci	movq	%rsi, %rax;
92962306a36Sopenharmony_ci
93062306a36Sopenharmony_ci	call __camellia_dec_blk16;
93162306a36Sopenharmony_ci
	/* registers are passed in the swapped order 7..0, 15..8 documented by the callee */
93262306a36Sopenharmony_ci	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
93362306a36Sopenharmony_ci		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
93462306a36Sopenharmony_ci		     %xmm8, %rsi);
93562306a36Sopenharmony_ci
93662306a36Sopenharmony_ci	FRAME_END
93762306a36Sopenharmony_ci	RET;
93862306a36Sopenharmony_ciSYM_FUNC_END(camellia_ecb_dec_16way)
93962306a36Sopenharmony_ci
94062306a36Sopenharmony_ciSYM_FUNC_START(camellia_cbc_dec_16way)
94162306a36Sopenharmony_ci	/* input:
94262306a36Sopenharmony_ci	 *	%rdi: ctx, CTX
94362306a36Sopenharmony_ci	 *	%rsi: dst (16 blocks)
94462306a36Sopenharmony_ci	 *	%rdx: src (16 blocks)
	 *
	 * Decrypts 16 blocks, then XORs output blocks 1..15 with source
	 * blocks 0..14 before storing everything to dst. Output block 0
	 * (%xmm7) is stored without any XOR here - presumably the caller
	 * applies the IV to it; confirm against the C glue code.
	 * %rax and %r8 are overwritten here, in addition to whatever
	 * __camellia_dec_blk16 and the macros clobber.
94562306a36Sopenharmony_ci	 */
94662306a36Sopenharmony_ci	FRAME_BEGIN
94762306a36Sopenharmony_ci
	/*
	 * %r8d = 24 if key_length == 16, else 32. The two movl instructions
	 * leave the flags set by cmpl untouched, so cmovel still sees them.
	 */
94862306a36Sopenharmony_ci	cmpl $16, key_length(CTX);
94962306a36Sopenharmony_ci	movl $32, %r8d;
95062306a36Sopenharmony_ci	movl $24, %eax;
95162306a36Sopenharmony_ci	cmovel %eax, %r8d; /* max */
95262306a36Sopenharmony_ci
95362306a36Sopenharmony_ci	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
95462306a36Sopenharmony_ci		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
95562306a36Sopenharmony_ci		     %xmm15, %rdx, (key_table)(CTX, %r8, 8));
95662306a36Sopenharmony_ci
95762306a36Sopenharmony_ci	/*
95862306a36Sopenharmony_ci	 * dst might still be in-use (in case dst == src), so use stack for
95962306a36Sopenharmony_ci	 * temporary storage.
	 * (src is read again by the vpxor sequence after the call, so it
	 * must not be overwritten in between.)
96062306a36Sopenharmony_ci	 */
96162306a36Sopenharmony_ci	subq $(16 * 16), %rsp;
96262306a36Sopenharmony_ci	movq %rsp, %rax;
96362306a36Sopenharmony_ci
96462306a36Sopenharmony_ci	call __camellia_dec_blk16;
96562306a36Sopenharmony_ci
	/* release the 256-byte temporary buffer */
96662306a36Sopenharmony_ci	addq $(16 * 16), %rsp;
96762306a36Sopenharmony_ci
	/*
	 * Registers hold the blocks in the order 7..0, 15..8, so %xmm6 is
	 * output block 1, %xmm5 block 2, ... %xmm8 block 15; each is XORed
	 * with the source block that precedes it.
	 */
96862306a36Sopenharmony_ci	vpxor (0 * 16)(%rdx), %xmm6, %xmm6;
96962306a36Sopenharmony_ci	vpxor (1 * 16)(%rdx), %xmm5, %xmm5;
97062306a36Sopenharmony_ci	vpxor (2 * 16)(%rdx), %xmm4, %xmm4;
97162306a36Sopenharmony_ci	vpxor (3 * 16)(%rdx), %xmm3, %xmm3;
97262306a36Sopenharmony_ci	vpxor (4 * 16)(%rdx), %xmm2, %xmm2;
97362306a36Sopenharmony_ci	vpxor (5 * 16)(%rdx), %xmm1, %xmm1;
97462306a36Sopenharmony_ci	vpxor (6 * 16)(%rdx), %xmm0, %xmm0;
97562306a36Sopenharmony_ci	vpxor (7 * 16)(%rdx), %xmm15, %xmm15;
97662306a36Sopenharmony_ci	vpxor (8 * 16)(%rdx), %xmm14, %xmm14;
97762306a36Sopenharmony_ci	vpxor (9 * 16)(%rdx), %xmm13, %xmm13;
97862306a36Sopenharmony_ci	vpxor (10 * 16)(%rdx), %xmm12, %xmm12;
97962306a36Sopenharmony_ci	vpxor (11 * 16)(%rdx), %xmm11, %xmm11;
98062306a36Sopenharmony_ci	vpxor (12 * 16)(%rdx), %xmm10, %xmm10;
98162306a36Sopenharmony_ci	vpxor (13 * 16)(%rdx), %xmm9, %xmm9;
98262306a36Sopenharmony_ci	vpxor (14 * 16)(%rdx), %xmm8, %xmm8;
98362306a36Sopenharmony_ci	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
98462306a36Sopenharmony_ci		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
98562306a36Sopenharmony_ci		     %xmm8, %rsi);
98662306a36Sopenharmony_ci
98762306a36Sopenharmony_ci	FRAME_END
98862306a36Sopenharmony_ci	RET;
98962306a36Sopenharmony_ciSYM_FUNC_END(camellia_cbc_dec_16way)
990