162306a36Sopenharmony_ci/* SPDX-License-Identifier: GPL-2.0-or-later */
262306a36Sopenharmony_ci/*
362306a36Sopenharmony_ci * Camellia Cipher Algorithm (x86_64)
462306a36Sopenharmony_ci *
562306a36Sopenharmony_ci * Copyright (C) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
662306a36Sopenharmony_ci */
762306a36Sopenharmony_ci
862306a36Sopenharmony_ci#include <linux/linkage.h>
962306a36Sopenharmony_ci
1062306a36Sopenharmony_ci.file "camellia-x86_64-asm_64.S"
1162306a36Sopenharmony_ci.text
1262306a36Sopenharmony_ci
/*
 * Lookup tables defined outside this file.  xor2ror16() below indexes each
 * of them with a zero-extended byte and a scale of 8, i.e. as arrays of
 * 8-byte entries.  The short spXXXXXXXX names are local aliases for the
 * external camellia_spXXXXXXXX symbols.
 */
1362306a36Sopenharmony_ci.extern camellia_sp10011110;
1462306a36Sopenharmony_ci.extern camellia_sp22000222;
1562306a36Sopenharmony_ci.extern camellia_sp03303033;
1662306a36Sopenharmony_ci.extern camellia_sp00444404;
1762306a36Sopenharmony_ci.extern camellia_sp02220222;
1862306a36Sopenharmony_ci.extern camellia_sp30333033;
1962306a36Sopenharmony_ci.extern camellia_sp44044404;
2062306a36Sopenharmony_ci.extern camellia_sp11101110;
2162306a36Sopenharmony_ci
2262306a36Sopenharmony_ci#define sp10011110 camellia_sp10011110
2362306a36Sopenharmony_ci#define sp22000222 camellia_sp22000222
2462306a36Sopenharmony_ci#define sp03303033 camellia_sp03303033
2562306a36Sopenharmony_ci#define sp00444404 camellia_sp00444404
2662306a36Sopenharmony_ci#define sp02220222 camellia_sp02220222
2762306a36Sopenharmony_ci#define sp30333033 camellia_sp30333033
2862306a36Sopenharmony_ci#define sp44044404 camellia_sp44044404
2962306a36Sopenharmony_ci#define sp11101110 camellia_sp11101110
3062306a36Sopenharmony_ci
/*
 * Byte offsets into the context that CTX points at.  key_table starts at
 * offset 0 and is read by the round macros in 8-byte units; key_length sits
 * directly behind it at offset CAMELLIA_TABLE_BYTE_LEN (272 bytes, i.e. room
 * for 34 eight-byte entries).
 * NOTE(review): these offsets presumably have to match the C definition of
 * struct camellia_ctx, which is not visible in this file -- confirm.
 */
3162306a36Sopenharmony_ci#define CAMELLIA_TABLE_BYTE_LEN 272
3262306a36Sopenharmony_ci
3362306a36Sopenharmony_ci/* struct camellia_ctx: */
3462306a36Sopenharmony_ci#define key_table 0
3562306a36Sopenharmony_ci#define key_length CAMELLIA_TABLE_BYTE_LEN
3662306a36Sopenharmony_ci
/*
 * Register roles used throughout this file:
 *  - CTX   context pointer (%rdi).
 *  - RIO   current data pointer.  It shares %rsi with the scratch register
 *          RT0, so RIO stops being valid as soon as a round or fls macro
 *          runs; the functions keep dst in RDST and reload RIO before
 *          writing the result.
 *  - RAB0/RCD0  the two 64-bit halves of block 0; RAB1/RCD1 those of block 1
 *          (2-way code only).  xor2ror16() needs the %?l and %?h byte views
 *          of these registers, which is why the *bl / *bh names exist.
 *  - RT0..RT2   scratch.  RT1 is %r12 and RAB1 is %rbx; both are callee-saved
 *          in the System V AMD64 ABI, so the entry points save and restore
 *          them.
 *  - RXOR  the "xor" flag in the encrypt functions; the decrypt functions
 *          reuse it as a spare register.
 *  - RR12  holds the caller's %r12 while RT1 is in use.
 *  - RDST  holds the dst pointer while %rsi is used as RIO/RT0.
 */
3762306a36Sopenharmony_ci/* register macros */
3862306a36Sopenharmony_ci#define CTX %rdi
3962306a36Sopenharmony_ci#define RIO %rsi
4062306a36Sopenharmony_ci#define RIOd %esi
4162306a36Sopenharmony_ci
4262306a36Sopenharmony_ci#define RAB0 %rax
4362306a36Sopenharmony_ci#define RCD0 %rcx
4462306a36Sopenharmony_ci#define RAB1 %rbx
4562306a36Sopenharmony_ci#define RCD1 %rdx
4662306a36Sopenharmony_ci
4762306a36Sopenharmony_ci#define RAB0d %eax
4862306a36Sopenharmony_ci#define RCD0d %ecx
4962306a36Sopenharmony_ci#define RAB1d %ebx
5062306a36Sopenharmony_ci#define RCD1d %edx
5162306a36Sopenharmony_ci
5262306a36Sopenharmony_ci#define RAB0bl %al
5362306a36Sopenharmony_ci#define RCD0bl %cl
5462306a36Sopenharmony_ci#define RAB1bl %bl
5562306a36Sopenharmony_ci#define RCD1bl %dl
5662306a36Sopenharmony_ci
5762306a36Sopenharmony_ci#define RAB0bh %ah
5862306a36Sopenharmony_ci#define RCD0bh %ch
5962306a36Sopenharmony_ci#define RAB1bh %bh
6062306a36Sopenharmony_ci#define RCD1bh %dh
6162306a36Sopenharmony_ci
6262306a36Sopenharmony_ci#define RT0 %rsi
6362306a36Sopenharmony_ci#define RT1 %r12
6462306a36Sopenharmony_ci#define RT2 %r8
6562306a36Sopenharmony_ci
6662306a36Sopenharmony_ci#define RT0d %esi
6762306a36Sopenharmony_ci#define RT1d %r12d
6862306a36Sopenharmony_ci#define RT2d %r8d
6962306a36Sopenharmony_ci
7062306a36Sopenharmony_ci#define RT2bl %r8b
7162306a36Sopenharmony_ci
7262306a36Sopenharmony_ci#define RXOR %r9
7362306a36Sopenharmony_ci#define RR12 %r10
7462306a36Sopenharmony_ci#define RDST %r11
7562306a36Sopenharmony_ci
7662306a36Sopenharmony_ci#define RXORd %r9d
7762306a36Sopenharmony_ci#define RXORbl %r9b
7862306a36Sopenharmony_ci
/*
 * xor2ror16(T0, T1, tmp1, tmp2, ab, dst) - two table lookups keyed by the
 * two lowest bytes of ab, XORed into dst.
 *
 *	dst ^= T0[byte 0 of ab]		(8-byte entries)
 *	dst ^= T1[byte 1 of ab]
 *	ab   = ror64(ab, 16)		(exposes the next two bytes)
 *
 * T0/T1 are table symbols, addressed RIP-relative; because RIP-relative
 * addressing cannot take an index register, each table base is first
 * loaded with leaq.  ab must be a register macro that has "bl" and "bh"
 * byte variants; tmp1/tmp2 must have a "d" (32-bit) variant.
 * Byte 1 is read before ab is rotated; the rotate happens before the second
 * XOR only as instruction scheduling.
 * Clobbers tmp1, tmp2 and the flags.
 */
7962306a36Sopenharmony_ci#define xor2ror16(T0, T1, tmp1, tmp2, ab, dst) \
8062306a36Sopenharmony_ci	leaq T0(%rip), 			tmp1; \
8162306a36Sopenharmony_ci	movzbl ab ## bl,		tmp2 ## d; \
8262306a36Sopenharmony_ci	xorq (tmp1, tmp2, 8),		dst; \
8362306a36Sopenharmony_ci	leaq T1(%rip), 			tmp2; \
8462306a36Sopenharmony_ci	movzbl ab ## bh,		tmp1 ## d; \
8562306a36Sopenharmony_ci	rorq $16,			ab; \
8662306a36Sopenharmony_ci	xorq (tmp2, tmp1, 8),		dst;
8762306a36Sopenharmony_ci
8862306a36Sopenharmony_ci/**********************************************************************
8962306a36Sopenharmony_ci  1-way camellia
9062306a36Sopenharmony_ci **********************************************************************/
/*
 * roundsm(ab, subkey, cd) - one round on block 0.
 *
 * ab and cd are register-name stems (RAB or RCD); the macro appends "0".
 * Net effect:
 *	cd0 ^= K[subkey] ^ (eight table lookups, one per byte of ab0)
 * where K[n] is the 8-byte key_table entry at byte offset n * 8.
 *
 * Bytes 0..7 of ab0 index, in order: sp00444404, sp03303033, sp22000222,
 * sp10011110, sp11101110, sp44044404, sp30333033, sp02220222.
 * The lookups are accumulated alternately into cd0 and RT2 (which starts
 * out holding the subkey) and the two are merged by the final xorq.
 * ab0 is rotated by 16 four times, i.e. by 64 bits in total, so it holds
 * its original value again on exit.
 * Clobbers RT0 (= %rsi, hence RIO), RT1, RT2 and the flags.
 */
9162306a36Sopenharmony_ci#define roundsm(ab, subkey, cd) \
9262306a36Sopenharmony_ci	movq (key_table + ((subkey) * 2) * 4)(CTX),	RT2; \
9362306a36Sopenharmony_ci	\
9462306a36Sopenharmony_ci	xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
9562306a36Sopenharmony_ci	xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
9662306a36Sopenharmony_ci	xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
9762306a36Sopenharmony_ci	xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
9862306a36Sopenharmony_ci	\
9962306a36Sopenharmony_ci	xorq RT2,					cd ## 0;
10062306a36Sopenharmony_ci
/*
 * fls(l, r, kl, kr) - key-dependent mixing of the two halves of block 0,
 * applied between groups of rounds (presumably Camellia's FL / FL^-1
 * layer -- confirm against the cipher specification).
 *
 * l and r are register-name stems; the macro appends "0" / "0d".
 * With lo()/hi() the low/high 32 bits of a 64-bit value and K[n] the
 * 8-byte key_table entry n, the four steps are, in this order:
 *	hi(l0) ^= rol32(lo(l0) & lo(K[kl]), 1)
 *	lo(r0) ^= hi(r0) | hi(K[kr])
 *	lo(l0) ^= hi(l0) | hi(K[kl])
 *	hi(r0) ^= rol32(lo(r0) & lo(K[kr]), 1)
 * Each step operates on the values left by the previous steps.
 * The 32-bit movl/andl/roll sequence zero-extends into the full register,
 * so the following shlq $32 leaves only the upper half populated.
 * Clobbers RT0 (= %rsi, hence RIO), RT1, RT2 and the flags.
 */
10162306a36Sopenharmony_ci#define fls(l, r, kl, kr) \
10262306a36Sopenharmony_ci	movl (key_table + ((kl) * 2) * 4)(CTX),		RT0d; \
10362306a36Sopenharmony_ci	andl l ## 0d,					RT0d; \
10462306a36Sopenharmony_ci	roll $1,					RT0d; \
10562306a36Sopenharmony_ci	shlq $32,					RT0; \
10662306a36Sopenharmony_ci	xorq RT0,					l ## 0; \
10762306a36Sopenharmony_ci	movq (key_table + ((kr) * 2) * 4)(CTX),		RT1; \
10862306a36Sopenharmony_ci	orq r ## 0,					RT1; \
10962306a36Sopenharmony_ci	shrq $32,					RT1; \
11062306a36Sopenharmony_ci	xorq RT1,					r ## 0; \
11162306a36Sopenharmony_ci	\
11262306a36Sopenharmony_ci	movq (key_table + ((kl) * 2) * 4)(CTX),		RT2; \
11362306a36Sopenharmony_ci	orq l ## 0,					RT2; \
11462306a36Sopenharmony_ci	shrq $32,					RT2; \
11562306a36Sopenharmony_ci	xorq RT2,					l ## 0; \
11662306a36Sopenharmony_ci	movl (key_table + ((kr) * 2) * 4)(CTX),		RT0d; \
11762306a36Sopenharmony_ci	andl r ## 0d,					RT0d; \
11862306a36Sopenharmony_ci	roll $1,					RT0d; \
11962306a36Sopenharmony_ci	shlq $32,					RT0; \
12062306a36Sopenharmony_ci	xorq RT0,					r ## 0;
12162306a36Sopenharmony_ci
/*
 * enc_rounds(i) - six consecutive rounds on block 0 using key_table entries
 * i + 2 .. i + 7 in ascending order.  The roles of RAB and RCD swap on every
 * round: the half that was just updated becomes the lookup input of the
 * next round.
 */
12262306a36Sopenharmony_ci#define enc_rounds(i) \
12362306a36Sopenharmony_ci	roundsm(RAB, i + 2, RCD); \
12462306a36Sopenharmony_ci	roundsm(RCD, i + 3, RAB); \
12562306a36Sopenharmony_ci	roundsm(RAB, i + 4, RCD); \
12662306a36Sopenharmony_ci	roundsm(RCD, i + 5, RAB); \
12762306a36Sopenharmony_ci	roundsm(RAB, i + 6, RCD); \
12862306a36Sopenharmony_ci	roundsm(RCD, i + 7, RAB);
12962306a36Sopenharmony_ci
/*
 * enc_fls(i) - fls() step of the encrypt direction: RAB is mixed with
 * key_table entry i and RCD with entry i + 1.
 */
13062306a36Sopenharmony_ci#define enc_fls(i) \
13162306a36Sopenharmony_ci	fls(RAB, RCD, i + 0, i + 1);
13262306a36Sopenharmony_ci
/*
 * enc_inpack() - load one 16-byte block from (RIO) for encryption.
 *
 * RAB0 gets bytes 0..7 and RCD0 bytes 8..15 (4*2 = byte offset 8).  Each
 * quadword is byte-swapped and then rotated by 32 bits, which exchanges its
 * two 32-bit halves (rol and ror by 32 are the same operation on a 64-bit
 * register).  Finally key_table entry 0 is XORed into RAB0; RCD0 is not
 * keyed here.
 */
13362306a36Sopenharmony_ci#define enc_inpack() \
13462306a36Sopenharmony_ci	movq (RIO),			RAB0; \
13562306a36Sopenharmony_ci	bswapq				RAB0; \
13662306a36Sopenharmony_ci	rolq $32,			RAB0; \
13762306a36Sopenharmony_ci	movq 4*2(RIO),			RCD0; \
13862306a36Sopenharmony_ci	bswapq				RCD0; \
13962306a36Sopenharmony_ci	rorq $32,			RCD0; \
14062306a36Sopenharmony_ci	xorq key_table(CTX),		RAB0;
14162306a36Sopenharmony_ci
/*
 * enc_outunpack(op, max) - write the encrypted block 0 to (RIO).
 *
 * op  is "mov" (store) or "xor" (XOR into the bytes already at RIO); the
 *     macro appends the "q" size suffix.
 * max is a register holding the index of the final 8-byte key_table entry,
 *     which is XORed into RCD0 first.
 *
 * The halves are written in swapped order relative to enc_inpack(): RCD0
 * goes to bytes 0..7 and RAB0 to bytes 8..15, each after undoing the
 * half-swap and byte-swap applied on input.
 */
14262306a36Sopenharmony_ci#define enc_outunpack(op, max) \
14362306a36Sopenharmony_ci	xorq key_table(CTX, max, 8),	RCD0; \
14462306a36Sopenharmony_ci	rorq $32,			RCD0; \
14562306a36Sopenharmony_ci	bswapq				RCD0; \
14662306a36Sopenharmony_ci	op ## q RCD0,			(RIO); \
14762306a36Sopenharmony_ci	rolq $32,			RAB0; \
14862306a36Sopenharmony_ci	bswapq				RAB0; \
14962306a36Sopenharmony_ci	op ## q RAB0,			4*2(RIO);
15062306a36Sopenharmony_ci
/*
 * dec_rounds(i) - six consecutive rounds on block 0 using key_table entries
 * i + 7 down to i + 2, i.e. the same entries as enc_rounds(i) in reverse
 * order.
 */
15162306a36Sopenharmony_ci#define dec_rounds(i) \
15262306a36Sopenharmony_ci	roundsm(RAB, i + 7, RCD); \
15362306a36Sopenharmony_ci	roundsm(RCD, i + 6, RAB); \
15462306a36Sopenharmony_ci	roundsm(RAB, i + 5, RCD); \
15562306a36Sopenharmony_ci	roundsm(RCD, i + 4, RAB); \
15662306a36Sopenharmony_ci	roundsm(RAB, i + 3, RCD); \
15762306a36Sopenharmony_ci	roundsm(RCD, i + 2, RAB);
15862306a36Sopenharmony_ci
/*
 * dec_fls(i) - fls() step of the decrypt direction: the key entries are
 * swapped relative to enc_fls(i), so RAB is mixed with entry i + 1 and RCD
 * with entry i.
 */
15962306a36Sopenharmony_ci#define dec_fls(i) \
16062306a36Sopenharmony_ci	fls(RAB, RCD, i + 1, i + 0);
16162306a36Sopenharmony_ci
/*
 * dec_inpack(max) - load one 16-byte block from (RIO) for decryption.
 *
 * Identical to enc_inpack() except for the key: RAB0 is XORed with the
 * 8-byte key_table entry whose index is in register max instead of entry 0.
 */
16262306a36Sopenharmony_ci#define dec_inpack(max) \
16362306a36Sopenharmony_ci	movq (RIO),			RAB0; \
16462306a36Sopenharmony_ci	bswapq				RAB0; \
16562306a36Sopenharmony_ci	rolq $32,			RAB0; \
16662306a36Sopenharmony_ci	movq 4*2(RIO),			RCD0; \
16762306a36Sopenharmony_ci	bswapq				RCD0; \
16862306a36Sopenharmony_ci	rorq $32,			RCD0; \
16962306a36Sopenharmony_ci	xorq key_table(CTX, max, 8),	RAB0;
17062306a36Sopenharmony_ci
/*
 * dec_outunpack() - write the decrypted block 0 to (RIO).
 *
 * key_table entry 0 is XORed into RCD0, then RCD0 is stored to bytes 0..7
 * and RAB0 to bytes 8..15, each after undoing the half-swap and byte-swap
 * applied on input.  Unlike enc_outunpack() this always stores (movq);
 * there is no XOR-into-destination variant.
 */
17162306a36Sopenharmony_ci#define dec_outunpack() \
17262306a36Sopenharmony_ci	xorq key_table(CTX),		RCD0; \
17362306a36Sopenharmony_ci	rorq $32,			RCD0; \
17462306a36Sopenharmony_ci	bswapq				RCD0; \
17562306a36Sopenharmony_ci	movq RCD0,			(RIO); \
17662306a36Sopenharmony_ci	rolq $32,			RAB0; \
17762306a36Sopenharmony_ci	bswapq				RAB0; \
17862306a36Sopenharmony_ci	movq RAB0,			4*2(RIO);
17962306a36Sopenharmony_ci
/*
 * __camellia_enc_blk - encrypt one 16-byte block.
 *
 * Runs three groups of six rounds separated by enc_fls(), plus a fourth
 * group unless the low byte of ctx->key_length equals 16.  The result is
 * either stored to dst or XORed into the bytes already at dst, depending on
 * the low byte of the "xor" argument.
 *
 * Writes %rax, %rcx, %rsi, %r8-%r11 and the flags.  %r12 is used as scratch
 * (RT1) but restored from RR12 before each return; the body itself does not
 * push or pop anything.
 */
18062306a36Sopenharmony_ciSYM_FUNC_START(__camellia_enc_blk)
18162306a36Sopenharmony_ci	/* input:
18262306a36Sopenharmony_ci	 *	%rdi: ctx, CTX
18362306a36Sopenharmony_ci	 *	%rsi: dst
18462306a36Sopenharmony_ci	 *	%rdx: src
18562306a36Sopenharmony_ci	 *	%rcx: bool xor
18662306a36Sopenharmony_ci	 */
	/* RT1 is %r12 (callee-saved): park the caller's value in RR12 */
18762306a36Sopenharmony_ci	movq %r12, RR12;
18862306a36Sopenharmony_ci
	/*
	 * Move the arguments out of the way: %rcx becomes RCD0 and %rsi
	 * becomes RIO/RT0, so xor and dst are kept in RXOR and RDST.
	 */
18962306a36Sopenharmony_ci	movq %rcx, RXOR;
19062306a36Sopenharmony_ci	movq %rsi, RDST;
19162306a36Sopenharmony_ci	movq %rdx, RIO;
19262306a36Sopenharmony_ci
19362306a36Sopenharmony_ci	enc_inpack();
19462306a36Sopenharmony_ci
19562306a36Sopenharmony_ci	enc_rounds(0);
19662306a36Sopenharmony_ci	enc_fls(8);
19762306a36Sopenharmony_ci	enc_rounds(8);
19862306a36Sopenharmony_ci	enc_fls(16);
19962306a36Sopenharmony_ci	enc_rounds(16);
	/*
	 * "max" (index of the final key_table entry) is set only after the
	 * rounds because the round macros use RT1 as scratch.
	 */
20062306a36Sopenharmony_ci	movl $24, RT1d; /* max */
20162306a36Sopenharmony_ci
	/*
	 * NOTE(review): this compares only the low byte of key_length, while
	 * camellia_dec_blk uses cmpl on the same field -- confirm the field
	 * width; the two agree as long as the upper bytes are zero.
	 */
20262306a36Sopenharmony_ci	cmpb $16, key_length(CTX);
20362306a36Sopenharmony_ci	je .L__enc_done;
20462306a36Sopenharmony_ci
20562306a36Sopenharmony_ci	enc_fls(24);
20662306a36Sopenharmony_ci	enc_rounds(24);
20762306a36Sopenharmony_ci	movl $32, RT1d; /* max */
20862306a36Sopenharmony_ci
20962306a36Sopenharmony_ci.L__enc_done:
	/*
	 * The movq between testb and jnz does not modify the flags, so the
	 * branch still acts on the xor flag.  RIO now points at dst.
	 */
21062306a36Sopenharmony_ci	testb RXORbl, RXORbl;
21162306a36Sopenharmony_ci	movq RDST, RIO;
21262306a36Sopenharmony_ci
21362306a36Sopenharmony_ci	jnz .L__enc_xor;
21462306a36Sopenharmony_ci
	/* xor == 0: overwrite dst with the ciphertext */
21562306a36Sopenharmony_ci	enc_outunpack(mov, RT1);
21662306a36Sopenharmony_ci
21762306a36Sopenharmony_ci	movq RR12, %r12;
21862306a36Sopenharmony_ci	RET;
21962306a36Sopenharmony_ci
22062306a36Sopenharmony_ci.L__enc_xor:
	/* xor != 0: XOR the ciphertext into the bytes already at dst */
22162306a36Sopenharmony_ci	enc_outunpack(xor, RT1);
22262306a36Sopenharmony_ci
22362306a36Sopenharmony_ci	movq RR12, %r12;
22462306a36Sopenharmony_ci	RET;
22562306a36Sopenharmony_ciSYM_FUNC_END(__camellia_enc_blk)
22662306a36Sopenharmony_ci
/*
 * camellia_dec_blk - decrypt one 16-byte block from src into dst.
 *
 * Mirrors __camellia_enc_blk with the key_table entries applied in reverse
 * order: for key_length == 16 three groups of six rounds are run, otherwise
 * an additional leading group (entries 24 + 2 .. 24 + 7) is run first.
 *
 * Writes %rax, %rcx, %rsi, %r8-%r11 and the flags.  %r12 is used as scratch
 * (RT1) but restored from RR12 before returning; the body itself does not
 * push or pop anything.
 */
22762306a36Sopenharmony_ciSYM_FUNC_START(camellia_dec_blk)
22862306a36Sopenharmony_ci	/* input:
22962306a36Sopenharmony_ci	 *	%rdi: ctx, CTX
23062306a36Sopenharmony_ci	 *	%rsi: dst
23162306a36Sopenharmony_ci	 *	%rdx: src
23262306a36Sopenharmony_ci	 */
	/*
	 * max = (key_length == 16) ? 24 : 32.  The two movl instructions do
	 * not modify the flags, so cmovel still sees the result of cmpl.
	 * RXOR is free here (there is no xor argument) and only carries the
	 * constant 24.
	 */
23362306a36Sopenharmony_ci	cmpl $16, key_length(CTX);
23462306a36Sopenharmony_ci	movl $32, RT2d;
23562306a36Sopenharmony_ci	movl $24, RXORd;
23662306a36Sopenharmony_ci	cmovel RXORd, RT2d; /* max */
23762306a36Sopenharmony_ci
	/* RT1 is %r12 (callee-saved); %rsi is about to become RIO/RT0 */
23862306a36Sopenharmony_ci	movq %r12, RR12;
23962306a36Sopenharmony_ci	movq %rsi, RDST;
24062306a36Sopenharmony_ci	movq %rdx, RIO;
24162306a36Sopenharmony_ci
24262306a36Sopenharmony_ci	dec_inpack(RT2);
24362306a36Sopenharmony_ci
	/* max has to be tested here: the round macros overwrite RT2 */
24462306a36Sopenharmony_ci	cmpb $24, RT2bl;
24562306a36Sopenharmony_ci	je .L__dec_rounds16;
24662306a36Sopenharmony_ci
24762306a36Sopenharmony_ci	dec_rounds(24);
24862306a36Sopenharmony_ci	dec_fls(24);
24962306a36Sopenharmony_ci
25062306a36Sopenharmony_ci.L__dec_rounds16:
25162306a36Sopenharmony_ci	dec_rounds(16);
25262306a36Sopenharmony_ci	dec_fls(16);
25362306a36Sopenharmony_ci	dec_rounds(8);
25462306a36Sopenharmony_ci	dec_fls(8);
25562306a36Sopenharmony_ci	dec_rounds(0);
25662306a36Sopenharmony_ci
	/* %rsi was used as RT0 by the rounds: point RIO at dst again */
25762306a36Sopenharmony_ci	movq RDST, RIO;
25862306a36Sopenharmony_ci
25962306a36Sopenharmony_ci	dec_outunpack();
26062306a36Sopenharmony_ci
26162306a36Sopenharmony_ci	movq RR12, %r12;
26262306a36Sopenharmony_ci	RET;
26362306a36Sopenharmony_ciSYM_FUNC_END(camellia_dec_blk)
26462306a36Sopenharmony_ci
26562306a36Sopenharmony_ci/**********************************************************************
26662306a36Sopenharmony_ci  2-way camellia
26762306a36Sopenharmony_ci **********************************************************************/
/*
 * roundsm2(ab, subkey, cd) - one round on two blocks at once.
 *
 * ab and cd are register-name stems (RAB or RCD); "0" selects block 0 and
 * "1" block 1.  For each block N the net effect matches roundsm():
 *	cdN ^= K[subkey] ^ (eight table lookups, one per byte of abN)
 *
 * The subkey is loaded once into RT2.  Block 1 receives it immediately
 * (xorq RT2, cd1) and then accumulates all eight of its lookups directly
 * in cd1.  Block 0 accumulates alternately into cd0 and RT2, as in
 * roundsm(); the merging "xorq RT2, cd0" is placed inside the block 1
 * sequence, which is safe because the block 1 lookups only use RT0/RT1.
 * Both ab0 and ab1 are rotated by 4 * 16 = 64 bits and therefore unchanged
 * on exit.
 * Clobbers RT0 (= %rsi, hence RIO), RT1, RT2 and the flags.
 */
26862306a36Sopenharmony_ci#define roundsm2(ab, subkey, cd) \
26962306a36Sopenharmony_ci	movq (key_table + ((subkey) * 2) * 4)(CTX),	RT2; \
27062306a36Sopenharmony_ci	xorq RT2,					cd ## 1; \
27162306a36Sopenharmony_ci	\
27262306a36Sopenharmony_ci	xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
27362306a36Sopenharmony_ci	xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
27462306a36Sopenharmony_ci	xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
27562306a36Sopenharmony_ci	xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
27662306a36Sopenharmony_ci	\
27762306a36Sopenharmony_ci		xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 1, cd ## 1); \
27862306a36Sopenharmony_ci		xorq RT2,					cd ## 0; \
27962306a36Sopenharmony_ci		xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 1, cd ## 1); \
28062306a36Sopenharmony_ci		xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 1, cd ## 1); \
28162306a36Sopenharmony_ci		xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 1, cd ## 1);
28262306a36Sopenharmony_ci
/*
 * fls2(l, r, kl, kr) - the fls() mixing step applied to two blocks.
 *
 * l and r are register-name stems; "0" selects block 0 and "1" block 1.
 * With lo()/hi() the low/high 32 bits of a 64-bit value and K[n] the
 * 8-byte key_table entry n, each block N undergoes, in this order:
 *	hi(lN) ^= rol32(lo(lN) & lo(K[kl]), 1)
 *	lo(rN) ^= hi(rN) | hi(K[kr])
 *	lo(lN) ^= hi(lN) | hi(K[kl])
 *	hi(rN) ^= rol32(lo(rN) & lo(K[kr]), 1)
 * The instruction stream is interleaved: first two steps for block 0, first
 * two steps for block 1 (indented), last two steps for block 0, last two
 * steps for block 1 (indented).  The scratch registers RT0/RT1/RT2 are used
 * in rotation so that consecutive steps write different temporaries.
 * Clobbers RT0 (= %rsi, hence RIO), RT1, RT2 and the flags.
 */
28362306a36Sopenharmony_ci#define fls2(l, r, kl, kr) \
28462306a36Sopenharmony_ci	movl (key_table + ((kl) * 2) * 4)(CTX),		RT0d; \
28562306a36Sopenharmony_ci	andl l ## 0d,					RT0d; \
28662306a36Sopenharmony_ci	roll $1,					RT0d; \
28762306a36Sopenharmony_ci	shlq $32,					RT0; \
28862306a36Sopenharmony_ci	xorq RT0,					l ## 0; \
28962306a36Sopenharmony_ci	movq (key_table + ((kr) * 2) * 4)(CTX),		RT1; \
29062306a36Sopenharmony_ci	orq r ## 0,					RT1; \
29162306a36Sopenharmony_ci	shrq $32,					RT1; \
29262306a36Sopenharmony_ci	xorq RT1,					r ## 0; \
29362306a36Sopenharmony_ci	\
29462306a36Sopenharmony_ci		movl (key_table + ((kl) * 2) * 4)(CTX),		RT2d; \
29562306a36Sopenharmony_ci		andl l ## 1d,					RT2d; \
29662306a36Sopenharmony_ci		roll $1,					RT2d; \
29762306a36Sopenharmony_ci		shlq $32,					RT2; \
29862306a36Sopenharmony_ci		xorq RT2,					l ## 1; \
29962306a36Sopenharmony_ci		movq (key_table + ((kr) * 2) * 4)(CTX),		RT0; \
30062306a36Sopenharmony_ci		orq r ## 1,					RT0; \
30162306a36Sopenharmony_ci		shrq $32,					RT0; \
30262306a36Sopenharmony_ci		xorq RT0,					r ## 1; \
30362306a36Sopenharmony_ci	\
30462306a36Sopenharmony_ci	movq (key_table + ((kl) * 2) * 4)(CTX),		RT1; \
30562306a36Sopenharmony_ci	orq l ## 0,					RT1; \
30662306a36Sopenharmony_ci	shrq $32,					RT1; \
30762306a36Sopenharmony_ci	xorq RT1,					l ## 0; \
30862306a36Sopenharmony_ci	movl (key_table + ((kr) * 2) * 4)(CTX),		RT2d; \
30962306a36Sopenharmony_ci	andl r ## 0d,					RT2d; \
31062306a36Sopenharmony_ci	roll $1,					RT2d; \
31162306a36Sopenharmony_ci	shlq $32,					RT2; \
31262306a36Sopenharmony_ci	xorq RT2,					r ## 0; \
31362306a36Sopenharmony_ci	\
31462306a36Sopenharmony_ci		movq (key_table + ((kl) * 2) * 4)(CTX),		RT0; \
31562306a36Sopenharmony_ci		orq l ## 1,					RT0; \
31662306a36Sopenharmony_ci		shrq $32,					RT0; \
31762306a36Sopenharmony_ci		xorq RT0,					l ## 1; \
31862306a36Sopenharmony_ci		movl (key_table + ((kr) * 2) * 4)(CTX),		RT1d; \
31962306a36Sopenharmony_ci		andl r ## 1d,					RT1d; \
32062306a36Sopenharmony_ci		roll $1,					RT1d; \
32162306a36Sopenharmony_ci		shlq $32,					RT1; \
32262306a36Sopenharmony_ci		xorq RT1,					r ## 1;
32362306a36Sopenharmony_ci
/*
 * enc_rounds2(i) - six consecutive two-block rounds using key_table entries
 * i + 2 .. i + 7 in ascending order, with the RAB/RCD roles swapping on
 * every round.
 */
32462306a36Sopenharmony_ci#define enc_rounds2(i) \
32562306a36Sopenharmony_ci	roundsm2(RAB, i + 2, RCD); \
32662306a36Sopenharmony_ci	roundsm2(RCD, i + 3, RAB); \
32762306a36Sopenharmony_ci	roundsm2(RAB, i + 4, RCD); \
32862306a36Sopenharmony_ci	roundsm2(RCD, i + 5, RAB); \
32962306a36Sopenharmony_ci	roundsm2(RAB, i + 6, RCD); \
33062306a36Sopenharmony_ci	roundsm2(RCD, i + 7, RAB);
33162306a36Sopenharmony_ci
/*
 * enc_fls2(i) - two-block fls2() step of the encrypt direction: RAB is
 * mixed with key_table entry i and RCD with entry i + 1.
 */
33262306a36Sopenharmony_ci#define enc_fls2(i) \
33362306a36Sopenharmony_ci	fls2(RAB, RCD, i + 0, i + 1);
33462306a36Sopenharmony_ci
/*
 * enc_inpack2() - load two consecutive 16-byte blocks from (RIO) for
 * encryption.
 *
 * Byte offsets 0 and 8 (4*2) fill RAB0/RCD0, offsets 16 (8*2) and 24 (12*2)
 * fill RAB1/RCD1.  Every quadword is byte-swapped and rotated by 32 bits
 * (rol and ror by 32 are equivalent on a 64-bit register).  key_table entry
 * 0 is XORed into RAB0 and RAB1; the RCD halves are not keyed here.
 */
33562306a36Sopenharmony_ci#define enc_inpack2() \
33662306a36Sopenharmony_ci	movq (RIO),			RAB0; \
33762306a36Sopenharmony_ci	bswapq				RAB0; \
33862306a36Sopenharmony_ci	rorq $32,			RAB0; \
33962306a36Sopenharmony_ci	movq 4*2(RIO),			RCD0; \
34062306a36Sopenharmony_ci	bswapq				RCD0; \
34162306a36Sopenharmony_ci	rolq $32,			RCD0; \
34262306a36Sopenharmony_ci	xorq key_table(CTX),		RAB0; \
34362306a36Sopenharmony_ci	\
34462306a36Sopenharmony_ci		movq 8*2(RIO),			RAB1; \
34562306a36Sopenharmony_ci		bswapq				RAB1; \
34662306a36Sopenharmony_ci		rorq $32,			RAB1; \
34762306a36Sopenharmony_ci		movq 12*2(RIO),			RCD1; \
34862306a36Sopenharmony_ci		bswapq				RCD1; \
34962306a36Sopenharmony_ci		rolq $32,			RCD1; \
35062306a36Sopenharmony_ci		xorq key_table(CTX),		RAB1;
35162306a36Sopenharmony_ci
/*
 * enc_outunpack2(op, max) - write two encrypted blocks to (RIO).
 *
 * op  is "mov" (store) or "xor" (XOR into the bytes already at RIO); the
 *     macro appends the "q" size suffix.
 * max is a register holding the index of the final 8-byte key_table entry,
 *     which is XORed into RCD0 and RCD1 first.
 *
 * Output layout: RCD0 -> bytes 0..7, RAB0 -> bytes 8..15, RCD1 -> bytes
 * 16..23, RAB1 -> bytes 24..31, each after undoing the half-swap and
 * byte-swap applied on input.
 */
35262306a36Sopenharmony_ci#define enc_outunpack2(op, max) \
35362306a36Sopenharmony_ci	xorq key_table(CTX, max, 8),	RCD0; \
35462306a36Sopenharmony_ci	rolq $32,			RCD0; \
35562306a36Sopenharmony_ci	bswapq				RCD0; \
35662306a36Sopenharmony_ci	op ## q RCD0,			(RIO); \
35762306a36Sopenharmony_ci	rorq $32,			RAB0; \
35862306a36Sopenharmony_ci	bswapq				RAB0; \
35962306a36Sopenharmony_ci	op ## q RAB0,			4*2(RIO); \
36062306a36Sopenharmony_ci	\
36162306a36Sopenharmony_ci		xorq key_table(CTX, max, 8),	RCD1; \
36262306a36Sopenharmony_ci		rolq $32,			RCD1; \
36362306a36Sopenharmony_ci		bswapq				RCD1; \
36462306a36Sopenharmony_ci		op ## q RCD1,			8*2(RIO); \
36562306a36Sopenharmony_ci		rorq $32,			RAB1; \
36662306a36Sopenharmony_ci		bswapq				RAB1; \
36762306a36Sopenharmony_ci		op ## q RAB1,			12*2(RIO);
36862306a36Sopenharmony_ci
/*
 * dec_rounds2(i) - six consecutive two-block rounds using key_table entries
 * i + 7 down to i + 2, i.e. the entries of enc_rounds2(i) in reverse order.
 */
36962306a36Sopenharmony_ci#define dec_rounds2(i) \
37062306a36Sopenharmony_ci	roundsm2(RAB, i + 7, RCD); \
37162306a36Sopenharmony_ci	roundsm2(RCD, i + 6, RAB); \
37262306a36Sopenharmony_ci	roundsm2(RAB, i + 5, RCD); \
37362306a36Sopenharmony_ci	roundsm2(RCD, i + 4, RAB); \
37462306a36Sopenharmony_ci	roundsm2(RAB, i + 3, RCD); \
37562306a36Sopenharmony_ci	roundsm2(RCD, i + 2, RAB);
37662306a36Sopenharmony_ci
/*
 * dec_fls2(i) - two-block fls2() step of the decrypt direction: the key
 * entries are swapped relative to enc_fls2(i), so RAB is mixed with entry
 * i + 1 and RCD with entry i.
 */
37762306a36Sopenharmony_ci#define dec_fls2(i) \
37862306a36Sopenharmony_ci	fls2(RAB, RCD, i + 1, i + 0);
37962306a36Sopenharmony_ci
/*
 * dec_inpack2(max) - load two consecutive 16-byte blocks from (RIO) for
 * decryption.
 *
 * Identical to enc_inpack2() except for the key: RAB0 and RAB1 are XORed
 * with the 8-byte key_table entry whose index is in register max instead
 * of entry 0.
 */
38062306a36Sopenharmony_ci#define dec_inpack2(max) \
38162306a36Sopenharmony_ci	movq (RIO),			RAB0; \
38262306a36Sopenharmony_ci	bswapq				RAB0; \
38362306a36Sopenharmony_ci	rorq $32,			RAB0; \
38462306a36Sopenharmony_ci	movq 4*2(RIO),			RCD0; \
38562306a36Sopenharmony_ci	bswapq				RCD0; \
38662306a36Sopenharmony_ci	rolq $32,			RCD0; \
38762306a36Sopenharmony_ci	xorq key_table(CTX, max, 8),	RAB0; \
38862306a36Sopenharmony_ci	\
38962306a36Sopenharmony_ci		movq 8*2(RIO),			RAB1; \
39062306a36Sopenharmony_ci		bswapq				RAB1; \
39162306a36Sopenharmony_ci		rorq $32,			RAB1; \
39262306a36Sopenharmony_ci		movq 12*2(RIO),			RCD1; \
39362306a36Sopenharmony_ci		bswapq				RCD1; \
39462306a36Sopenharmony_ci		rolq $32,			RCD1; \
39562306a36Sopenharmony_ci		xorq key_table(CTX, max, 8),	RAB1;
39662306a36Sopenharmony_ci
/*
 * dec_outunpack2() - write two decrypted blocks to (RIO).
 *
 * key_table entry 0 is XORed into RCD0 and RCD1.  Output layout: RCD0 ->
 * bytes 0..7, RAB0 -> bytes 8..15, RCD1 -> bytes 16..23, RAB1 -> bytes
 * 24..31, each after undoing the half-swap and byte-swap applied on input.
 * The result is always stored (movq); there is no XOR variant.
 */
39762306a36Sopenharmony_ci#define dec_outunpack2() \
39862306a36Sopenharmony_ci	xorq key_table(CTX),		RCD0; \
39962306a36Sopenharmony_ci	rolq $32,			RCD0; \
40062306a36Sopenharmony_ci	bswapq				RCD0; \
40162306a36Sopenharmony_ci	movq RCD0,			(RIO); \
40262306a36Sopenharmony_ci	rorq $32,			RAB0; \
40362306a36Sopenharmony_ci	bswapq				RAB0; \
40462306a36Sopenharmony_ci	movq RAB0,			4*2(RIO); \
40562306a36Sopenharmony_ci	\
40662306a36Sopenharmony_ci		xorq key_table(CTX),		RCD1; \
40762306a36Sopenharmony_ci		rolq $32,			RCD1; \
40862306a36Sopenharmony_ci		bswapq				RCD1; \
40962306a36Sopenharmony_ci		movq RCD1,			8*2(RIO); \
41062306a36Sopenharmony_ci		rorq $32,			RAB1; \
41162306a36Sopenharmony_ci		bswapq				RAB1; \
41262306a36Sopenharmony_ci		movq RAB1,			12*2(RIO);
41362306a36Sopenharmony_ci
/*
 * __camellia_enc_blk_2way - encrypt two consecutive 16-byte blocks.
 *
 * Reads 32 bytes from src and writes 32 bytes at dst.  Same round structure
 * as __camellia_enc_blk, but every step works on both blocks.  The result
 * is either stored to dst or XORed into the bytes already at dst, depending
 * on the low byte of the "xor" argument.
 *
 * Writes %rax, %rcx, %rdx, %rsi, %r8-%r11 and the flags.  %rbx (RAB1) is
 * saved on the stack and %r12 (RT1) in RR12; both are restored before each
 * return.
 */
41462306a36Sopenharmony_ciSYM_FUNC_START(__camellia_enc_blk_2way)
41562306a36Sopenharmony_ci	/* input:
41662306a36Sopenharmony_ci	 *	%rdi: ctx, CTX
41762306a36Sopenharmony_ci	 *	%rsi: dst
41862306a36Sopenharmony_ci	 *	%rdx: src
41962306a36Sopenharmony_ci	 *	%rcx: bool xor
42062306a36Sopenharmony_ci	 */
	/* RAB1 is %rbx (callee-saved) */
42162306a36Sopenharmony_ci	pushq %rbx;
42262306a36Sopenharmony_ci
	/*
	 * %r12 is RT1; %rcx, %rsi and %rdx become RCD0, RIO/RT0 and RCD1, so
	 * xor and dst are kept in RXOR and RDST and src moves to RIO.
	 */
42362306a36Sopenharmony_ci	movq %r12, RR12;
42462306a36Sopenharmony_ci	movq %rcx, RXOR;
42562306a36Sopenharmony_ci	movq %rsi, RDST;
42662306a36Sopenharmony_ci	movq %rdx, RIO;
42762306a36Sopenharmony_ci
42862306a36Sopenharmony_ci	enc_inpack2();
42962306a36Sopenharmony_ci
43062306a36Sopenharmony_ci	enc_rounds2(0);
43162306a36Sopenharmony_ci	enc_fls2(8);
43262306a36Sopenharmony_ci	enc_rounds2(8);
43362306a36Sopenharmony_ci	enc_fls2(16);
43462306a36Sopenharmony_ci	enc_rounds2(16);
	/*
	 * "max" (index of the final key_table entry) is set only after the
	 * rounds because the round macros use RT2 as scratch.
	 */
43562306a36Sopenharmony_ci	movl $24, RT2d; /* max */
43662306a36Sopenharmony_ci
	/*
	 * NOTE(review): this compares only the low byte of key_length, while
	 * the decrypt functions use cmpl on the same field -- confirm the
	 * field width; the two agree as long as the upper bytes are zero.
	 */
43762306a36Sopenharmony_ci	cmpb $16, key_length(CTX);
43862306a36Sopenharmony_ci	je .L__enc2_done;
43962306a36Sopenharmony_ci
44062306a36Sopenharmony_ci	enc_fls2(24);
44162306a36Sopenharmony_ci	enc_rounds2(24);
44262306a36Sopenharmony_ci	movl $32, RT2d; /* max */
44362306a36Sopenharmony_ci
44462306a36Sopenharmony_ci.L__enc2_done:
	/*
	 * The movq between test and jnz does not modify the flags, so the
	 * branch still acts on the xor flag.  RIO now points at dst.
	 * NOTE(review): __camellia_enc_blk spells this "testb"; with byte
	 * register operands the unsuffixed form presumably assembles to the
	 * same instruction -- confirm before unifying the spelling.
	 */
44562306a36Sopenharmony_ci	test RXORbl, RXORbl;
44662306a36Sopenharmony_ci	movq RDST, RIO;
44762306a36Sopenharmony_ci	jnz .L__enc2_xor;
44862306a36Sopenharmony_ci
	/* xor == 0: overwrite dst with the ciphertext */
44962306a36Sopenharmony_ci	enc_outunpack2(mov, RT2);
45062306a36Sopenharmony_ci
45162306a36Sopenharmony_ci	movq RR12, %r12;
45262306a36Sopenharmony_ci	popq %rbx;
45362306a36Sopenharmony_ci	RET;
45462306a36Sopenharmony_ci
45562306a36Sopenharmony_ci.L__enc2_xor:
	/* xor != 0: XOR the ciphertext into the bytes already at dst */
45662306a36Sopenharmony_ci	enc_outunpack2(xor, RT2);
45762306a36Sopenharmony_ci
45862306a36Sopenharmony_ci	movq RR12, %r12;
45962306a36Sopenharmony_ci	popq %rbx;
46062306a36Sopenharmony_ci	RET;
46162306a36Sopenharmony_ciSYM_FUNC_END(__camellia_enc_blk_2way)
46262306a36Sopenharmony_ci
/*
 * camellia_dec_blk_2way - decrypt two consecutive 16-byte blocks.
 *
 * Reads 32 bytes from src and writes 32 bytes at dst.  Same structure as
 * camellia_dec_blk, but every step works on both blocks.
 *
 * Writes %rax, %rcx, %rdx, %rsi, %r8-%r11 and the flags.  The callee-saved
 * %rbx (RAB1) and %r12 (RT1) are kept in RXOR and RR12 instead of on the
 * stack and restored before returning; the body does not push or pop
 * anything.
 */
46362306a36Sopenharmony_ciSYM_FUNC_START(camellia_dec_blk_2way)
46462306a36Sopenharmony_ci	/* input:
46562306a36Sopenharmony_ci	 *	%rdi: ctx, CTX
46662306a36Sopenharmony_ci	 *	%rsi: dst
46762306a36Sopenharmony_ci	 *	%rdx: src
46862306a36Sopenharmony_ci	 */
	/*
	 * max = (key_length == 16) ? 24 : 32.  The two movl instructions do
	 * not modify the flags, so cmovel still sees the result of cmpl.
	 * RXOR briefly carries the constant 24 before it is reused below.
	 */
46962306a36Sopenharmony_ci	cmpl $16, key_length(CTX);
47062306a36Sopenharmony_ci	movl $32, RT2d;
47162306a36Sopenharmony_ci	movl $24, RXORd;
47262306a36Sopenharmony_ci	cmovel RXORd, RT2d; /* max */
47362306a36Sopenharmony_ci
	/* there is no xor argument here, so RXOR can hold the caller's %rbx */
47462306a36Sopenharmony_ci	movq %rbx, RXOR;
47562306a36Sopenharmony_ci	movq %r12, RR12;
47662306a36Sopenharmony_ci	movq %rsi, RDST;
47762306a36Sopenharmony_ci	movq %rdx, RIO;
47862306a36Sopenharmony_ci
47962306a36Sopenharmony_ci	dec_inpack2(RT2);
48062306a36Sopenharmony_ci
	/* max has to be tested here: the round macros overwrite RT2 */
48162306a36Sopenharmony_ci	cmpb $24, RT2bl;
48262306a36Sopenharmony_ci	je .L__dec2_rounds16;
48362306a36Sopenharmony_ci
48462306a36Sopenharmony_ci	dec_rounds2(24);
48562306a36Sopenharmony_ci	dec_fls2(24);
48662306a36Sopenharmony_ci
48762306a36Sopenharmony_ci.L__dec2_rounds16:
48862306a36Sopenharmony_ci	dec_rounds2(16);
48962306a36Sopenharmony_ci	dec_fls2(16);
49062306a36Sopenharmony_ci	dec_rounds2(8);
49162306a36Sopenharmony_ci	dec_fls2(8);
49262306a36Sopenharmony_ci	dec_rounds2(0);
49362306a36Sopenharmony_ci
	/* %rsi was used as RT0 by the rounds: point RIO at dst again */
49462306a36Sopenharmony_ci	movq RDST, RIO;
49562306a36Sopenharmony_ci
49662306a36Sopenharmony_ci	dec_outunpack2();
49762306a36Sopenharmony_ci
49862306a36Sopenharmony_ci	movq RR12, %r12;
49962306a36Sopenharmony_ci	movq RXOR, %rbx;
50062306a36Sopenharmony_ci	RET;
50162306a36Sopenharmony_ciSYM_FUNC_END(camellia_dec_blk_2way)
502