/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Camellia Cipher Algorithm (x86_64)
 *
 * Copyright (C) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 */

/*
 * SYM_FUNC_START(), SYM_FUNC_END() and RET used below are not defined in
 * this file; they are expected to be provided by this header.
 */
#include <linux/linkage.h>

.file "camellia-x86_64-asm_64.S"
.text

/*
 * Lookup tables consumed by xor2ror16().  They are defined outside this
 * file.  Every access below indexes them with a scale of 8, i.e. as arrays
 * of 64-bit entries selected by a single byte of cipher state.
 */
.extern camellia_sp10011110;
.extern camellia_sp22000222;
.extern camellia_sp03303033;
.extern camellia_sp00444404;
.extern camellia_sp02220222;
.extern camellia_sp30333033;
.extern camellia_sp44044404;
.extern camellia_sp11101110;

/* Short aliases so the round macros can name the tables compactly. */
#define sp10011110 camellia_sp10011110
#define sp22000222 camellia_sp22000222
#define sp03303033 camellia_sp03303033
#define sp00444404 camellia_sp00444404
#define sp02220222 camellia_sp02220222
#define sp30333033 camellia_sp30333033
#define sp44044404 camellia_sp44044404
#define sp11101110 camellia_sp11101110

/*
 * Size in bytes of the subkey table at the start of the context
 * (272 bytes = 34 slots of 8 bytes; the macros below address slots as
 * key_table + slot * 8).
 */
#define CAMELLIA_TABLE_BYTE_LEN 272

/*
 * struct camellia_ctx: byte offsets of the fields used here.
 * NOTE(review): these must match the C-side structure definition, which is
 * not visible in this file - confirm when changing either side.
 */
#define key_table 0
#define key_length CAMELLIA_TABLE_BYTE_LEN

/*
 * register macros
 *
 * Suffix convention: no suffix = 64-bit register, d = its low 32 bits,
 * bl = bits 0..7, bh = bits 8..15.
 */

/* Context pointer and current input/output pointer. */
#define CTX %rdi
#define RIO %rsi
#define RIOd %esi

/*
 * Cipher state.  RAB0/RCD0 hold the two 64-bit halves of block 0;
 * RAB1/RCD1 hold block 1 and are only touched by the 2-way macros.
 * The state lives in %rax/%rcx/%rbx/%rdx because xor2ror16() needs the
 * bh (high-byte) form of these registers.  %rbx is callee-saved in the
 * System V AMD64 ABI, so the 2-way functions save and restore it.
 */
#define RAB0 %rax
#define RCD0 %rcx
#define RAB1 %rbx
#define RCD1 %rdx

#define RAB0d %eax
#define RCD0d %ecx
#define RAB1d %ebx
#define RCD1d %edx

#define RAB0bl %al
#define RCD0bl %cl
#define RAB1bl %bl
#define RCD1bl %dl

#define RAB0bh %ah
#define RCD0bh %ch
#define RAB1bh %bh
#define RCD1bh %dh

/*
 * Scratch registers.  RT0 is the same register as RIO (%rsi): once a
 * round macro has run, RIO no longer holds a pointer and must be reloaded
 * (the functions below do so from RDST).  RT1 is %r12, which is
 * callee-saved; the functions park the caller value in RR12.
 */
#define RT0 %rsi
#define RT1 %r12
#define RT2 %r8

#define RT0d %esi
#define RT1d %r12d
#define RT2d %r8d

#define RT2bl %r8b

/*
 * Caller-saved registers used to park values across the round macros:
 * RXOR - the "xor" flag in the encrypt functions (the decrypt functions
 *        reuse it as a temporary / as the save slot for %rbx),
 * RR12 - the caller value of %r12,
 * RDST - the destination pointer.
 */
#define RXOR %r9
#define RR12 %r10
#define RDST %r11

#define RXORd %r9d
#define RXORbl %r9b

/*
 * xor2ror16(T0, T1, tmp1, tmp2, ab, dst) - consume the two lowest bytes
 * of ab as table indices:
 *	dst ^= T0[byte 0 of ab] ^ T1[byte 1 of ab]	(64-bit entries)
 * and then rotate ab right by 16 bits so that the next two bytes move
 * into the bl/bh positions.  Four invocations on the same register walk
 * all eight bytes and leave it rotated by 64 bits, i.e. unchanged.
 *
 * ab must be a register with bl and bh forms (see the register macros).
 * tmp1 receives the bh byte; a high-byte register cannot be encoded in an
 * instruction that needs a REX prefix, so tmp1 must not be %r8-%r15
 * (callers pass RT0 = %rsi).  tmp2 receives the bl byte and may be any
 * register.  The tables are addressed absolutely (no base register).
 *
 * Clobbers tmp1, tmp2 and flags.
 */
#define xor2ror16(T0, T1, tmp1, tmp2, ab, dst) \
	movzbl ab ## bl,		tmp2 ## d; \
	movzbl ab ## bh,		tmp1 ## d; \
	rorq $16,			ab; \
	xorq T0(, tmp2, 8),		dst; \
	xorq T1(, tmp1, 8),		dst;

/**********************************************************************
  1-way camellia
 **********************************************************************/
/*
 * roundsm(ab, subkey, cd) - one cipher round on block 0:
 *	cd ## 0 ^= key_table[subkey] ^ (XOR of eight table lookups, one
 *	           per byte of ab ## 0)
 *
 * The subkey is loaded into RT2, and the lookups are accumulated
 * alternately into cd ## 0 and RT2; the two accumulators are merged by the
 * final xorq.  ab ## 0 is rotated by 4 * 16 = 64 bits in total, so it has
 * its original value again when the macro ends.
 *
 * subkey is a slot index; slots are 8 bytes wide ((subkey) * 2 * 4).
 * Clobbers RT0 (which is also RIO), RT1, RT2 and flags.
 */
#define roundsm(ab, subkey, cd) \
	movq (key_table + ((subkey) * 2) * 4)(CTX),	RT2; \
	\
	xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
	xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
	\
	xorq RT2,					cd ## 0;

/*
 * fls(l, r, kl, kr) - key-dependent mixing layer applied between groups
 * of rounds (presumably the Camellia FL / FL^-1 functions - confirm
 * against the algorithm specification).
 *
 * With kl/kr being the 64-bit subkeys in slots kl and kr, and lo32/hi32
 * the low/high halves of a 64-bit value, the macro performs on l ## 0 and
 * r ## 0, in this order (each step sees the results of the previous one):
 *	l.hi32 ^= rol32(l.lo32 & kl.lo32, 1)
 *	r.lo32 ^= r.hi32 | kr.hi32
 *	l.lo32 ^= l.hi32 | kl.hi32
 *	r.hi32 ^= rol32(r.lo32 & kr.lo32, 1)
 *
 * The 32-bit movl/andl/roll instructions zero the upper half of RT0, so
 * the following shlq $32 leaves only the rotated value in the high half.
 *
 * Clobbers RT0 (which is also RIO), RT1, RT2 and flags.
 */
#define fls(l, r, kl, kr) \
	movl (key_table + ((kl) * 2) * 4)(CTX),		RT0d; \
	andl l ## 0d,					RT0d; \
	roll $1,					RT0d; \
	shlq $32,					RT0; \
	xorq RT0,					l ## 0; \
	movq (key_table + ((kr) * 2) * 4)(CTX),		RT1; \
	orq r ## 0,					RT1; \
	shrq $32,					RT1; \
	xorq RT1,					r ## 0; \
	\
	movq (key_table + ((kl) * 2) * 4)(CTX),		RT2; \
	orq l ## 0,					RT2; \
	shrq $32,					RT2; \
	xorq RT2,					l ## 0; \
	movl (key_table + ((kr) * 2) * 4)(CTX),		RT0d; \
	andl r ## 0d,					RT0d; \
	roll $1,					RT0d; \
	shlq $32,					RT0; \
	xorq RT0,					r ## 0;

/*
 * enc_rounds(i) - six consecutive rounds for encryption, using subkey
 * slots i + 2 .. i + 7 in ascending order.  The roles of the two state
 * halves alternate: RAB drives an update of RCD, then RCD drives an update
 * of RAB, and so on.
 */
#define enc_rounds(i) \
	roundsm(RAB, i + 2, RCD); \
	roundsm(RCD, i + 3, RAB); \
	roundsm(RAB, i + 4, RCD); \
	roundsm(RCD, i + 5, RAB); \
	roundsm(RAB, i + 6, RCD); \
	roundsm(RCD, i + 7, RAB);

/*
 * enc_fls(i) - fls() layer for encryption: subkey slot i is applied to
 * RAB and slot i + 1 to RCD.
 */
#define enc_fls(i) \
	fls(RAB, RCD, i + 0, i + 1);

/*
 * enc_inpack() - load one 16-byte block from (RIO) for encryption.
 *
 * Bytes 0..7 go to RAB0 and bytes 8..15 (offset 4*2) to RCD0.  Each
 * quadword is byte-swapped and then rotated by 32 bits, which exchanges
 * its two 32-bit halves (rolq $32 and rorq $32 are the same operation).
 * Finally the subkey in slot 0 of key_table is XORed into RAB0.
 *
 * Clobbers flags.
 */
#define enc_inpack() \
	movq (RIO),			RAB0; \
	bswapq				RAB0; \
	rolq $32,			RAB0; \
	movq 4*2(RIO),			RCD0; \
	bswapq				RCD0; \
	rorq $32,			RCD0; \
	xorq key_table(CTX),		RAB0;

/*
 * enc_outunpack(op, max) - write the encrypted block to (RIO).
 *
 * op  - instruction stem applied with a q suffix to the destination: the
 *       callers in this file pass mov (store) or xor (combine with the
 *       bytes already at the destination).
 * max - 64-bit register holding the slot index of the final subkey, which
 *       is XORed into RCD0 first.
 *
 * The halves are written in swapped order: RCD0 to bytes 0..7 and RAB0 to
 * bytes 8..15, each after undoing the 32-bit rotation and byte swap done
 * by enc_inpack().  Clobbers RAB0, RCD0 and flags.
 */
#define enc_outunpack(op, max) \
	xorq key_table(CTX, max, 8),	RCD0; \
	rorq $32,			RCD0; \
	bswapq				RCD0; \
	op ## q RCD0,			(RIO); \
	rolq $32,			RAB0; \
	bswapq				RAB0; \
	op ## q RAB0,			4*2(RIO);

/*
 * dec_rounds(i) - six consecutive rounds for decryption: the same slots
 * as enc_rounds(i), but walked in descending order (i + 7 down to i + 2).
 */
#define dec_rounds(i) \
	roundsm(RAB, i + 7, RCD); \
	roundsm(RCD, i + 6, RAB); \
	roundsm(RAB, i + 5, RCD); \
	roundsm(RCD, i + 4, RAB); \
	roundsm(RAB, i + 3, RCD); \
	roundsm(RCD, i + 2, RAB);

/*
 * dec_fls(i) - fls() layer for decryption: compared with enc_fls(i) the
 * two subkey slots are exchanged (slot i + 1 for RAB, slot i for RCD).
 */
#define dec_fls(i) \
	fls(RAB, RCD, i + 1, i + 0);

/*
 * dec_inpack(max) - load one 16-byte block from (RIO) for decryption.
 *
 * Identical to enc_inpack() except for the initial subkey: the slot whose
 * index is held in the 64-bit register max is XORed into RAB0 instead of
 * slot 0.  Clobbers flags.
 */
#define dec_inpack(max) \
	movq (RIO),			RAB0; \
	bswapq				RAB0; \
	rolq $32,			RAB0; \
	movq 4*2(RIO),			RCD0; \
	bswapq				RCD0; \
	rorq $32,			RCD0; \
	xorq key_table(CTX, max, 8),	RAB0;

/*
 * dec_outunpack() - write the decrypted block to (RIO).
 *
 * The subkey in slot 0 is XORed into RCD0, then RCD0 is stored to bytes
 * 0..7 and RAB0 to bytes 8..15 (halves swapped), each after undoing the
 * 32-bit rotation and byte swap.  Unlike enc_outunpack() the result is
 * always stored with movq.  Clobbers RAB0, RCD0 and flags.
 */
#define dec_outunpack() \
	xorq key_table(CTX),		RCD0; \
	rorq $32,			RCD0; \
	bswapq				RCD0; \
	movq RCD0,			(RIO); \
	rolq $32,			RAB0; \
	bswapq				RAB0; \
	movq RAB0,			4*2(RIO);

SYM_FUNC_START(__camellia_enc_blk)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool xor
	 *
	 * Encrypts the 16-byte block at src.  The result is stored to dst
	 * when xor is zero and XORed into the 16 bytes already at dst
	 * otherwise.
	 *
	 * Modifies %rax, %rcx, %rsi, %r8-%r11 and flags.  %r12 is used as
	 * scratch (RT1) but restored from RR12 before returning.
	 */
	movq %r12, RR12;	/* RT1 is %r12 (callee-saved) */

	movq %rcx, RXOR;	/* %rcx is about to become RCD0 */
	movq %rsi, RDST;	/* %rsi doubles as RIO and RT0 */
	movq %rdx, RIO;		/* read the input block from src */

	enc_inpack();

	enc_rounds(0);
	enc_fls(8);
	enc_rounds(8);
	enc_fls(16);
	enc_rounds(16);
	/* Set only now: the round macros above use RT1 as scratch. */
	movl $24, RT1d; /* max */

	/*
	 * key_length == 16 selects the short schedule (final subkey in
	 * slot 24); any other value runs one more fls/rounds group and
	 * ends in slot 32.  The full 32-bit field is compared, as in the
	 * decrypt functions.
	 */
	cmpl $16, key_length(CTX);
	je .L__enc_done;

	enc_fls(24);
	enc_rounds(24);
	movl $32, RT1d; /* max */

.L__enc_done:
	/* Only the low byte of the bool argument is examined. */
	testb RXORbl, RXORbl;
	movq RDST, RIO;		/* movq leaves the testb flags intact */

	jnz .L__enc_xor;

	enc_outunpack(mov, RT1);

	movq RR12, %r12;
	RET;

.L__enc_xor:
	enc_outunpack(xor, RT1);

	movq RR12, %r12;
	RET;
SYM_FUNC_END(__camellia_enc_blk)

SYM_FUNC_START(camellia_dec_blk)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *
	 * Decrypts the 16-byte block at src and stores the result to dst.
	 *
	 * Modifies %rax, %rcx, %rsi, %r8-%r11 and flags.  %r12 is used as
	 * scratch (RT1) but restored from RR12 before returning.
	 */

	/*
	 * max = (key_length == 16) ? 24 : 32.  The two movl instructions do
	 * not change flags, so cmovel still sees the result of cmpl.  RXOR
	 * is free to serve as a temporary because there is no xor argument.
	 */
	cmpl $16, key_length(CTX);
	movl $32, RT2d;
	movl $24, RXORd;
	cmovel RXORd, RT2d; /* max */

	movq %r12, RR12;	/* RT1 is %r12 (callee-saved) */
	movq %rsi, RDST;	/* %rsi doubles as RIO and RT0 */
	movq %rdx, RIO;		/* read the input block from src */

	dec_inpack(RT2);

	/*
	 * RT2 must be tested here: the round macros overwrite it.  With
	 * max == 24 the extra group for slots 24..31 is skipped.
	 */
	cmpb $24, RT2bl;
	je .L__dec_rounds16;

	dec_rounds(24);
	dec_fls(24);

.L__dec_rounds16:
	dec_rounds(16);
	dec_fls(16);
	dec_rounds(8);
	dec_fls(8);
	dec_rounds(0);

	movq RDST, RIO;		/* RIO was clobbered through RT0 */

	dec_outunpack();

	movq RR12, %r12;
	RET;
SYM_FUNC_END(camellia_dec_blk)

/**********************************************************************
  2-way camellia
 **********************************************************************/
/*
 * roundsm2(ab, subkey, cd) - one cipher round on two blocks at once.
 * For each block n in {0, 1}:
 *	cd ## n ^= key_table[subkey] ^ (XOR of eight table lookups, one
 *	           per byte of ab ## n)
 *
 * The subkey is loaded into RT2 and XORed into cd ## 1 immediately.
 * Block 0 then proceeds as in roundsm(), accumulating partly in RT2;
 * block 1 accumulates all of its lookups directly in cd ## 1.  The merge
 * of RT2 into cd ## 0 is placed after the first block-1 lookup pair; the
 * block-1 lookups do not touch RT2, so its position does not affect the
 * result.  Both ab registers end up rotated by 64 bits, i.e. unchanged.
 *
 * Clobbers RT0 (which is also RIO), RT1, RT2 and flags.
 */
#define roundsm2(ab, subkey, cd) \
	movq (key_table + ((subkey) * 2) * 4)(CTX),	RT2; \
	xorq RT2,					cd ## 1; \
	\
	xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
	xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
	\
		xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 1, cd ## 1); \
		xorq RT2,					cd ## 0; \
		xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 1, cd ## 1); \
		xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 1, cd ## 1); \
		xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 1, cd ## 1);

/*
 * fls2(l, r, kl, kr) - the fls() mixing layer applied to two blocks.
 *
 * With kl/kr being the 64-bit subkeys in slots kl and kr, each block
 * n in {0, 1} undergoes, in this order:
 *	l.hi32 ^= rol32(l.lo32 & kl.lo32, 1)
 *	r.lo32 ^= r.hi32 | kr.hi32
 *	l.lo32 ^= l.hi32 | kl.hi32
 *	r.hi32 ^= rol32(r.lo32 & kr.lo32, 1)
 *
 * The work is emitted as four groups - steps 1-2 of block 0, steps 1-2 of
 * block 1 (indented), steps 3-4 of block 0, steps 3-4 of block 1
 * (indented) - with the temporaries rotating through RT0/RT1/RT2 from
 * group to group.
 *
 * Clobbers RT0 (which is also RIO), RT1, RT2 and flags.
 */
#define fls2(l, r, kl, kr) \
	movl (key_table + ((kl) * 2) * 4)(CTX),		RT0d; \
	andl l ## 0d,					RT0d; \
	roll $1,					RT0d; \
	shlq $32,					RT0; \
	xorq RT0,					l ## 0; \
	movq (key_table + ((kr) * 2) * 4)(CTX),		RT1; \
	orq r ## 0,					RT1; \
	shrq $32,					RT1; \
	xorq RT1,					r ## 0; \
	\
		movl (key_table + ((kl) * 2) * 4)(CTX),		RT2d; \
		andl l ## 1d,					RT2d; \
		roll $1,					RT2d; \
		shlq $32,					RT2; \
		xorq RT2,					l ## 1; \
		movq (key_table + ((kr) * 2) * 4)(CTX),		RT0; \
		orq r ## 1,					RT0; \
		shrq $32,					RT0; \
		xorq RT0,					r ## 1; \
	\
	movq (key_table + ((kl) * 2) * 4)(CTX),		RT1; \
	orq l ## 0,					RT1; \
	shrq $32,					RT1; \
	xorq RT1,					l ## 0; \
	movl (key_table + ((kr) * 2) * 4)(CTX),		RT2d; \
	andl r ## 0d,					RT2d; \
	roll $1,					RT2d; \
	shlq $32,					RT2; \
	xorq RT2,					r ## 0; \
	\
		movq (key_table + ((kl) * 2) * 4)(CTX),		RT0; \
		orq l ## 1,					RT0; \
		shrq $32,					RT0; \
		xorq RT0,					l ## 1; \
		movl (key_table + ((kr) * 2) * 4)(CTX),		RT1d; \
		andl r ## 1d,					RT1d; \
		roll $1,					RT1d; \
		shlq $32,					RT1; \
		xorq RT1,					r ## 1;

/*
 * enc_rounds2(i) - six consecutive 2-way rounds for encryption, using
 * subkey slots i + 2 .. i + 7 in ascending order and alternating the
 * roles of the RAB and RCD halves.
 */
#define enc_rounds2(i) \
	roundsm2(RAB, i + 2, RCD); \
	roundsm2(RCD, i + 3, RAB); \
	roundsm2(RAB, i + 4, RCD); \
	roundsm2(RCD, i + 5, RAB); \
	roundsm2(RAB, i + 6, RCD); \
	roundsm2(RCD, i + 7, RAB);

/*
 * enc_fls2(i) - 2-way fls2() layer for encryption: subkey slot i is
 * applied to the RAB halves and slot i + 1 to the RCD halves.
 */
#define enc_fls2(i) \
	fls2(RAB, RCD, i + 0, i + 1);

/*
 * enc_inpack2() - load two consecutive 16-byte blocks from (RIO) for
 * encryption.
 *
 * Byte offsets 0 and 8 (4*2) feed RAB0/RCD0, offsets 16 (8*2) and
 * 24 (12*2) feed RAB1/RCD1.  Each quadword is byte-swapped and rotated by
 * 32 bits, which exchanges its 32-bit halves (the direction of a 32-bit
 * rotate of a 64-bit register makes no difference).  The subkey in slot 0
 * is XORed into both RAB0 and RAB1.
 *
 * Clobbers flags.
 */
#define enc_inpack2() \
	movq (RIO),			RAB0; \
	bswapq				RAB0; \
	rorq $32,			RAB0; \
	movq 4*2(RIO),			RCD0; \
	bswapq				RCD0; \
	rolq $32,			RCD0; \
	xorq key_table(CTX),		RAB0; \
	\
		movq 8*2(RIO),			RAB1; \
		bswapq				RAB1; \
		rorq $32,			RAB1; \
		movq 12*2(RIO),			RCD1; \
		bswapq				RCD1; \
		rolq $32,			RCD1; \
		xorq key_table(CTX),		RAB1;

/*
 * enc_outunpack2(op, max) - write two encrypted blocks to (RIO).
 *
 * op  - instruction stem applied with a q suffix to the destination: the
 *       callers in this file pass mov (store) or xor (combine with the
 *       bytes already at the destination).
 * max - 64-bit register holding the slot index of the final subkey, which
 *       is XORed into RCD0 and RCD1.
 *
 * For each block the halves are written in swapped order (RCD first, then
 * RAB) after undoing the 32-bit rotation and byte swap: block 0 goes to
 * byte offsets 0 and 8, block 1 to offsets 16 and 24.
 * Clobbers RAB0, RCD0, RAB1, RCD1 and flags.
 */
#define enc_outunpack2(op, max) \
	xorq key_table(CTX, max, 8),	RCD0; \
	rolq $32,			RCD0; \
	bswapq				RCD0; \
	op ## q RCD0,			(RIO); \
	rorq $32,			RAB0; \
	bswapq				RAB0; \
	op ## q RAB0,			4*2(RIO); \
	\
		xorq key_table(CTX, max, 8),	RCD1; \
		rolq $32,			RCD1; \
		bswapq				RCD1; \
		op ## q RCD1,			8*2(RIO); \
		rorq $32,			RAB1; \
		bswapq				RAB1; \
		op ## q RAB1,			12*2(RIO);

/*
 * dec_rounds2(i) - six consecutive 2-way rounds for decryption: the same
 * slots as enc_rounds2(i), walked in descending order (i + 7 down to
 * i + 2).
 */
#define dec_rounds2(i) \
	roundsm2(RAB, i + 7, RCD); \
	roundsm2(RCD, i + 6, RAB); \
	roundsm2(RAB, i + 5, RCD); \
	roundsm2(RCD, i + 4, RAB); \
	roundsm2(RAB, i + 3, RCD); \
	roundsm2(RCD, i + 2, RAB);

/*
 * dec_fls2(i) - 2-way fls2() layer for decryption: compared with
 * enc_fls2(i) the two subkey slots are exchanged (slot i + 1 for the RAB
 * halves, slot i for the RCD halves).
 */
#define dec_fls2(i) \
	fls2(RAB, RCD, i + 1, i + 0);

/*
 * dec_inpack2(max) - load two consecutive 16-byte blocks from (RIO) for
 * decryption.
 *
 * Identical to enc_inpack2() except for the initial subkey: the slot whose
 * index is held in the 64-bit register max is XORed into RAB0 and RAB1
 * instead of slot 0.  Clobbers flags.
 */
#define dec_inpack2(max) \
	movq (RIO),			RAB0; \
	bswapq				RAB0; \
	rorq $32,			RAB0; \
	movq 4*2(RIO),			RCD0; \
	bswapq				RCD0; \
	rolq $32,			RCD0; \
	xorq key_table(CTX, max, 8),	RAB0; \
	\
		movq 8*2(RIO),			RAB1; \
		bswapq				RAB1; \
		rorq $32,			RAB1; \
		movq 12*2(RIO),			RCD1; \
		bswapq				RCD1; \
		rolq $32,			RCD1; \
		xorq key_table(CTX, max, 8),	RAB1;

/*
 * dec_outunpack2() - write two decrypted blocks to (RIO).
 *
 * The subkey in slot 0 is XORed into RCD0 and RCD1.  For each block the
 * halves are then stored in swapped order (RCD first, then RAB) after
 * undoing the 32-bit rotation and byte swap: block 0 goes to byte offsets
 * 0 and 8, block 1 to offsets 16 and 24.  The result is always stored
 * with movq.  Clobbers RAB0, RCD0, RAB1, RCD1 and flags.
 */
#define dec_outunpack2() \
	xorq key_table(CTX),		RCD0; \
	rolq $32,			RCD0; \
	bswapq				RCD0; \
	movq RCD0,			(RIO); \
	rorq $32,			RAB0; \
	bswapq				RAB0; \
	movq RAB0,			4*2(RIO); \
	\
		xorq key_table(CTX),		RCD1; \
		rolq $32,			RCD1; \
		bswapq				RCD1; \
		movq RCD1,			8*2(RIO); \
		rorq $32,			RAB1; \
		bswapq				RAB1; \
		movq RAB1,			12*2(RIO);

SYM_FUNC_START(__camellia_enc_blk_2way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool xor
	 *
	 * Encrypts the two consecutive 16-byte blocks at src (32 bytes).
	 * The results are stored to dst when xor is zero and XORed into the
	 * 32 bytes already at dst otherwise.
	 *
	 * Modifies %rax, %rcx, %rdx, %rsi, %r8-%r11 and flags.  %rbx (RAB1)
	 * and %r12 (RT1) are used but restored before returning.
	 */
	pushq %rbx;		/* RAB1 is %rbx (callee-saved) */

	movq %r12, RR12;	/* RT1 is %r12 (callee-saved) */
	movq %rcx, RXOR;	/* %rcx is about to become RCD0 */
	movq %rsi, RDST;	/* %rsi doubles as RIO and RT0 */
	movq %rdx, RIO;		/* src; %rdx itself becomes RCD1 */

	enc_inpack2();

	enc_rounds2(0);
	enc_fls2(8);
	enc_rounds2(8);
	enc_fls2(16);
	enc_rounds2(16);
	/* Set only now: the round macros above use RT2 as scratch. */
	movl $24, RT2d; /* max */

	/*
	 * key_length == 16 selects the short schedule (final subkey in
	 * slot 24); any other value runs one more fls/rounds group and
	 * ends in slot 32.  The full 32-bit field is compared, as in the
	 * decrypt functions.
	 */
	cmpl $16, key_length(CTX);
	je .L__enc2_done;

	enc_fls2(24);
	enc_rounds2(24);
	movl $32, RT2d; /* max */

.L__enc2_done:
	/* Only the low byte of the bool argument is examined. */
	testb RXORbl, RXORbl;
	movq RDST, RIO;		/* movq leaves the testb flags intact */
	jnz .L__enc2_xor;

	enc_outunpack2(mov, RT2);

	movq RR12, %r12;
	popq %rbx;
	RET;

.L__enc2_xor:
	enc_outunpack2(xor, RT2);

	movq RR12, %r12;
	popq %rbx;
	RET;
SYM_FUNC_END(__camellia_enc_blk_2way)

SYM_FUNC_START(camellia_dec_blk_2way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *
	 * Decrypts the two consecutive 16-byte blocks at src (32 bytes) and
	 * stores the results to dst.
	 *
	 * Modifies %rax, %rcx, %rdx, %rsi, %r8-%r11 and flags.  %rbx (RAB1)
	 * and %r12 (RT1) are used but restored before returning.
	 */

	/*
	 * max = (key_length == 16) ? 24 : 32.  The two movl instructions do
	 * not change flags, so cmovel still sees the result of cmpl.
	 */
	cmpl $16, key_length(CTX);
	movl $32, RT2d;
	movl $24, RXORd;
	cmovel RXORd, RT2d; /* max */

	/*
	 * There is no xor argument here, so once the constant above has
	 * been consumed RXOR serves as the save slot for %rbx (no stack
	 * push is needed).
	 */
	movq %rbx, RXOR;
	movq %r12, RR12;	/* RT1 is %r12 (callee-saved) */
	movq %rsi, RDST;	/* %rsi doubles as RIO and RT0 */
	movq %rdx, RIO;		/* src; %rdx itself becomes RCD1 */

	dec_inpack2(RT2);

	/*
	 * RT2 must be tested here: the round macros overwrite it.  With
	 * max == 24 the extra group for slots 24..31 is skipped.
	 */
	cmpb $24, RT2bl;
	je .L__dec2_rounds16;

	dec_rounds2(24);
	dec_fls2(24);

.L__dec2_rounds16:
	dec_rounds2(16);
	dec_fls2(16);
	dec_rounds2(8);
	dec_fls2(8);
	dec_rounds2(0);

	movq RDST, RIO;		/* RIO was clobbered through RT0 */

	dec_outunpack2();

	movq RR12, %r12;
	movq RXOR, %rbx;
	RET;
SYM_FUNC_END(camellia_dec_blk_2way)