/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Twofish Cipher 8-way parallel algorithm (AVX/x86_64)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 */

/*
 * NOTE(review): the SYM_FUNC_*, FRAME_BEGIN/FRAME_END and RET macros as well
 * as the load_*_8way / store_*_8way helpers used further down are not defined
 * in this file; presumably they come from the three includes below - confirm.
 */
#include <linux/linkage.h>
#include <asm/frame.h>
#include "glue_helper-asm-avx.S"

.file "twofish-avx-x86_64-asm_64.S"

/*
 * 16-byte constant passed to load_ctr_8way by twofish_ctr_8way.
 * The descending indices 15..0 look like a byte-reversal shuffle mask;
 * the actual use is inside the helper macro - confirm there.
 */
.section	.rodata.cst16.bswap128_mask, "aM", @progbits, 16
.align 16
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/*
 * 16-byte constant passed to load_xts_8way by the two XTS entry points.
 * Bytes: 0x87 at offset 0 and 0x01 at offset 8, everything else zero.
 * NOTE(review): presumably the mask for the GF(2^128) tweak doubling
 * (0x87 reduction term / carry into the high quadword) - confirm in the
 * helper macro.
 */
.section	.rodata.cst16.xts_gf128mul_and_shl1_mask, "aM", @progbits, 16
.align 16
.Lxts_gf128mul_and_shl1_mask:
	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0

.text

/*
 * structure of crypto context
 *
 * Byte offsets relative to CTX:
 *   s0..s3  bases of the four lookup tables used by lookup_32bit; each is
 *           indexed by one byte with a scale of 4, i.e. 256 * 4 = 1024 bytes,
 *           which matches the spacing below.
 *   w       two 16-byte vectors: w(CTX) and (w+4*4)(CTX) are loaded as the
 *           whitening values by the 8-block routines.
 *   k       32-bit round subkeys, read as (k + 4*i)(CTX) by the round macros.
 */
#define s0	0
#define s1	1024
#define s2	2048
#define s3	3072
#define w	4096
#define k	4128

/**********************************************************************
  8-way AVX twofish
 **********************************************************************/

/* Pointer to the crypto context (first argument of every entry point). */
#define CTX %rdi

/*
 * Block state. Registers suffixed 1 carry the first group of four blocks,
 * registers suffixed 2 the second group. The round macros paste the suffix
 * onto RA/RB/RC/RD with ##.
 */
#define RA1 %xmm0
#define RB1 %xmm1
#define RC1 %xmm2
#define RD1 %xmm3

#define RA2 %xmm4
#define RB2 %xmm5
#define RC2 %xmm6
#define RD2 %xmm7

/* Outputs of round_head_2 for the first group ... */
#define RX0 %xmm8
#define RY0 %xmm9

/* ... and for the second group. */
#define RX1 %xmm10
#define RY1 %xmm11

/*
 * Round subkeys broadcast to all lanes. RK1 also carries the whitening
 * vector, and RK2 doubles as a transpose temporary in the blk8 routines.
 */
#define RK1 %xmm12
#define RK2 %xmm13

/* Scratch: RT is used by the round tails, RR by rotate_1l. */
#define RT %xmm14
#define RR %xmm15

/*
 * Table indices for lookup_32bit.
 * RID1 is %r13 (saved/restored by the blk8 routines).
 * RID2 is %rsi, the dst argument of the public entry points, which is why
 * those copy %rsi to %r11 before calling the cipher core.
 */
#define RID1  %r13
#define RID1d %r13d
#define RID2  %rsi
#define RID2d %esi

/*
 * G-function inputs. Only rax/rbx/rcx/rdx qualify because lookup_32bit
 * addresses both the low byte (bl) and the second byte (bh) of the source.
 * RGI1 is %rdx (src argument) and RGI2 is %rcx (iv argument) of the public
 * entry points, so both are destroyed by the rounds.
 */
#define RGI1   %rdx
#define RGI1bl %dl
#define RGI1bh %dh
#define RGI2   %rcx
#define RGI2bl %cl
#define RGI2bh %ch

#define RGI3   %rax
#define RGI3bl %al
#define RGI3bh %ah
#define RGI4   %rbx
#define RGI4bl %bl
#define RGI4bh %bh

/* G-function results / scratch (see macro G). */
#define RGS1  %r8
#define RGS1d %r8d
#define RGS2  %r9
#define RGS2d %r9d
#define RGS3  %r10
#define RGS3d %r10d


/*
 * lookup_32bit(t0, t1, t2, t3, src, dst, interleave_op, il_reg)
 *
 * With b0..b3 the four low bytes of src on entry (b0 least significant):
 *
 *	dst##d = t0[b0] ^ t1[b1] ^ t2[b2] ^ t3[b3]
 *
 * where tN[i] is the 32-bit word at CTX + tN + 4*i. Writing dst##d clears
 * the upper half of the 64-bit dst register.
 *
 * src must be one of RGI1..RGI4 (the bl/bh suffixes are pasted on) and is
 * shifted right by 16 as a side effect. interleave_op(il_reg) is expanded
 * between the last index extraction and the last two table reads so that the
 * caller can slot an independent instruction in there (shr_next or dummy).
 *
 * Clobbers: src, RID1, RID2, flags.
 */
#define lookup_32bit(t0, t1, t2, t3, src, dst, interleave_op, il_reg) \
	movzbl		src ## bl,        RID1d;     \
	movzbl		src ## bh,        RID2d;     \
	shrq $16,	src;                         \
	movl		t0(CTX, RID1, 4), dst ## d;  \
	movl		t1(CTX, RID2, 4), RID2d;     \
	movzbl		src ## bl,        RID1d;     \
	xorl		RID2d,            dst ## d;  \
	movzbl		src ## bh,        RID2d;     \
	interleave_op(il_reg);			     \
	xorl		t2(CTX, RID1, 4), dst ## d;  \
	xorl		t3(CTX, RID2, 4), dst ## d;

/* No-op placeholder for macro parameters that expect an operation. */
#define dummy(d) /* do nothing */

/*
 * Interleave operation for lookup_32bit: drop the 16 bits that are still in
 * the low part of reg so that the next 32-bit word becomes the low word.
 */
#define shr_next(reg) \
	shrq $16,	reg;

/*
 * G(gi1, gi2, x, t0, t1, t2, t3)
 *
 * gi1 and gi2 each hold two 32-bit words (word 0 in the low half). Every
 * word is run through lookup_32bit with tables t0..t3 and the results are
 * packed as:
 *
 *	RGS2 = lookup(gi1 word 1) << 32 | lookup(gi1 word 0)
 *	RGS3 = lookup(gi2 word 1) << 32 | lookup(gi2 word 0)
 *
 * The first lookup on each input passes shr_next, so the source has been
 * shifted by 32 in total and word 1 is in place for the second lookup.
 * The x parameter is not referenced by the macro body.
 *
 * Clobbers: gi1, gi2 (shifted out), RGS1, RID1, RID2, flags.
 */
#define G(gi1, gi2, x, t0, t1, t2, t3) \
	lookup_32bit(t0, t1, t2, t3, ##gi1, RGS1, shr_next, ##gi1);  \
	lookup_32bit(t0, t1, t2, t3, ##gi2, RGS3, shr_next, ##gi2);  \
	\
	lookup_32bit(t0, t1, t2, t3, ##gi1, RGS2, dummy, none);      \
	shlq $32,	RGS2;                                        \
	orq		RGS1, RGS2;                                  \
	lookup_32bit(t0, t1, t2, t3, ##gi2, RGS1, dummy, none);      \
	shlq $32,	RGS1;                                        \
	orq		RGS1, RGS3;

/*
 * round_head_2(a, b, x1, y1, x2, y2)
 *
 * Runs the table-lookup part of a round for both groups of blocks:
 *
 *	x1 = G(a##1) with tables s0, s1, s2, s3
 *	y1 = G(b##1) with tables s1, s2, s3, s0
 *	x2 = G(a##2) with tables s0, s1, s2, s3
 *	y2 = G(b##2) with tables s1, s2, s3, s0
 *
 * Precondition: RGI1/RGI2 already hold the low/high quadword of a##1
 * (see preload_rgi); this macro never loads a##1 itself. The loads of the
 * next input are issued right after each G, before the previous result is
 * moved into its vector register.
 *
 * Clobbers: RGI1..RGI4, RGS1..RGS3, RID1, RID2, flags.
 */
#define round_head_2(a, b, x1, y1, x2, y2) \
	vmovq		b ## 1, RGI3;           \
	vpextrq $1,	b ## 1, RGI4;           \
	\
	G(RGI1, RGI2, x1, s0, s1, s2, s3);      \
	vmovq		a ## 2, RGI1;           \
	vpextrq $1,	a ## 2, RGI2;           \
	vmovq		RGS2, x1;               \
	vpinsrq $1,	RGS3, x1, x1;           \
	\
	G(RGI3, RGI4, y1, s1, s2, s3, s0);      \
	vmovq		b ## 2, RGI3;           \
	vpextrq $1,	b ## 2, RGI4;           \
	vmovq		RGS2, y1;               \
	vpinsrq $1,	RGS3, y1, y1;           \
	\
	G(RGI1, RGI2, x2, s0, s1, s2, s3);      \
	vmovq		RGS2, x2;               \
	vpinsrq $1,	RGS3, x2, x2;           \
	\
	G(RGI3, RGI4, y2, s1, s2, s3, s0);      \
	vmovq		RGS2, y2;               \
	vpinsrq $1,	RGS3, y2, y2;

/*
 * encround_tail(a, b, c, d, x, y, prerotate)
 *
 * Vector part of an encryption round, per 32-bit lane (x0/y0 are the values
 * of x/y on entry):
 *
 *	x  = x0 + y0
 *	RT = x + RK1
 *	prerotate(b)            - prepares b for the following round
 *	c ^= RT
 *	y  = x0 + 2*y0 + RK2
 *	c  = c rotated right by 1
 *	d ^= y                  - d is expected to be rotated left by 1 already
 *
 * The a parameter is not referenced. Clobbers x, y, RT and whatever
 * prerotate clobbers.
 *
 * The last body line deliberately has no trailing backslash, so the macro
 * does not depend on a blank line following it.
 */
#define encround_tail(a, b, c, d, x, y, prerotate) \
	vpaddd			x, y,   x; \
	vpaddd			x, RK1, RT;\
	prerotate(b);			   \
	vpxor			RT, c,  c; \
	vpaddd			y, x,   y; \
	vpaddd			y, RK2, y; \
	vpsrld $1,		c, RT;     \
	vpslld $(32 - 1),	c, c;      \
	vpor			c, RT,  c; \
	vpxor			d, y,   d;

/*
 * decround_tail(a, b, c, d, x, y, prerotate)
 *
 * Vector part of a decryption round, per 32-bit lane (x0/y0 are the values
 * of x/y on entry):
 *
 *	x  = x0 + y0
 *	RT = x + RK1
 *	prerotate(a)            - prepares a for the following round
 *	c ^= RT                 - c is expected to be rotated left by 1 already
 *	y  = x0 + 2*y0 + RK2
 *	d ^= y
 *	d  = d rotated right by 1 (y is reused as the temporary)
 *
 * The b parameter is not referenced. Clobbers x, y, RT and whatever
 * prerotate clobbers.
 *
 * The last body line deliberately has no trailing backslash, so the macro
 * does not depend on a blank line following it.
 */
#define decround_tail(a, b, c, d, x, y, prerotate) \
	vpaddd			x, y,   x; \
	vpaddd			x, RK1, RT;\
	prerotate(a);			   \
	vpxor			RT, c,  c; \
	vpaddd			y, x,   y; \
	vpaddd			y, RK2, y; \
	vpxor			d, y,   d; \
	vpsrld $1,		d, y;      \
	vpslld $(32 - 1),	d, d;      \
	vpor			d, y,   d;

/* Rotate every 32-bit lane of x left by one bit. Clobbers RR. */
#define rotate_1l(x) \
	vpslld $1,		x, RR;     \
	vpsrld $(32 - 1),	x, x;      \
	vpor			x, RR,  x;

/*
 * Copy the low quadword of c into RGI1 and the high quadword into RGI2,
 * which is the state round_head_2 expects for its first G invocation.
 */
#define preload_rgi(c) \
	vmovq			c, RGI1; \
	vpextrq $1,		c, RGI2;

/*
 * encrypt_round(n, a, b, c, d, preload, prerotate)
 *
 * One encryption round over both groups of blocks.
 *   n          round index; subkey words 2n and 2n+1 (32 bits each, relative
 *              to k) are broadcast into RK1 and RK2
 *   a, b, c, d register-name stems (RA/RB/RC/RD); the group suffix 1 or 2 is
 *              pasted on here
 *   preload    applied to c##1 between the two tails; preload_rgi feeds the
 *              next round's first G input, dummy skips it
 *   prerotate  forwarded to encround_tail
 */
#define encrypt_round(n, a, b, c, d, preload, prerotate) \
	vbroadcastss (k+4*(2*(n)))(CTX),   RK1;                  \
	vbroadcastss (k+4*(2*(n)+1))(CTX), RK2;                  \
	round_head_2(a, b, RX0, RY0, RX1, RY1);                  \
	encround_tail(a ## 1, b ## 1, c ## 1, d ## 1, RX0, RY0, prerotate); \
	preload(c ## 1);                                         \
	encround_tail(a ## 2, b ## 2, c ## 2, d ## 2, RX1, RY1, prerotate);

/*
 * decrypt_round(n, a, b, c, d, preload, prerotate)
 *
 * One decryption round over both groups of blocks. Same structure and
 * parameters as encrypt_round, but the vector part is decround_tail.
 */
#define decrypt_round(n, a, b, c, d, preload, prerotate) \
	vbroadcastss (k+4*(2*(n)))(CTX),   RK1;                  \
	vbroadcastss (k+4*(2*(n)+1))(CTX), RK2;                  \
	round_head_2(a, b, RX0, RY0, RX1, RY1);                  \
	decround_tail(a ## 1, b ## 1, c ## 1, d ## 1, RX0, RY0, prerotate); \
	preload(c ## 1);                                         \
	decround_tail(a ## 2, b ## 2, c ## 2, d ## 2, RX1, RY1, prerotate);

/*
 * encrypt_cycle(n): rounds 2n and 2n+1. The second round swaps the roles of
 * the (RA, RB) and (RC, RD) pairs. The argument is parenthesised so that an
 * expression can be passed safely.
 */
#define encrypt_cycle(n) \
	encrypt_round((2*(n)), RA, RB, RC, RD, preload_rgi, rotate_1l); \
	encrypt_round(((2*(n)) + 1), RC, RD, RA, RB, preload_rgi, rotate_1l);

/*
 * Final cycle: identical to encrypt_cycle except that the second round
 * passes dummy for both hooks, so nothing is preloaded or pre-rotated for a
 * round that does not follow.
 */
#define encrypt_cycle_last(n) \
	encrypt_round((2*(n)), RA, RB, RC, RD, preload_rgi, rotate_1l); \
	encrypt_round(((2*(n)) + 1), RC, RD, RA, RB, dummy, dummy);

/*
 * decrypt_cycle(n): rounds 2n+1 and 2n, in that order (the reverse of
 * encrypt_cycle). The argument is parenthesised so that an expression can
 * be passed safely.
 */
#define decrypt_cycle(n) \
	decrypt_round(((2*(n)) + 1), RC, RD, RA, RB, preload_rgi, rotate_1l); \
	decrypt_round((2*(n)), RA, RB, RC, RD, preload_rgi, rotate_1l);

/*
 * Final cycle: identical to decrypt_cycle except that the second round
 * passes dummy for both hooks, so nothing is preloaded or pre-rotated for a
 * round that does not follow.
 */
#define decrypt_cycle_last(n) \
	decrypt_round(((2*(n)) + 1), RC, RD, RA, RB, preload_rgi, rotate_1l); \
	decrypt_round((2*(n)), RA, RB, RC, RD, dummy, dummy);

/*
 * transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
 *
 * Treats x0..x3 as the rows of a 4x4 matrix of 32-bit words and transposes
 * it in place: afterwards xN holds word N of each of the four original rows.
 * t0, t1 and t2 are scratch registers and are overwritten.
 */
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	vpunpckldq		x1, x0, t0; \
	vpunpckhdq		x1, x0, t2; \
	vpunpckldq		x3, x2, t1; \
	vpunpckhdq		x3, x2, x3; \
	\
	vpunpcklqdq		t1, t0, x0; \
	vpunpckhqdq		t1, t0, x1; \
	vpunpcklqdq		x3, t2, x2; \
	vpunpckhqdq		x3, t2, x3;

/*
 * inpack_blocks(x0, x1, x2, x3, wkey, t0, t1, t2)
 *
 * XORs each of x0..x3 with wkey and then transposes the four registers as a
 * 4x4 matrix of 32-bit words. t0..t2 are scratch for the transpose.
 */
#define inpack_blocks(x0, x1, x2, x3, wkey, t0, t1, t2) \
	vpxor		x0, wkey, x0; \
	vpxor		x1, wkey, x1; \
	vpxor		x2, wkey, x2; \
	vpxor		x3, wkey, x3; \
	\
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

/*
 * outunpack_blocks(x0, x1, x2, x3, wkey, t0, t1, t2)
 *
 * Inverse ordering of inpack_blocks: transposes x0..x3 as a 4x4 matrix of
 * 32-bit words first and then XORs each register with wkey. t0..t2 are
 * scratch for the transpose. (transpose_4x4 already ends in ';', so no
 * separator is needed after its expansion.)
 */
#define outunpack_blocks(x0, x1, x2, x3, wkey, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	vpxor		x0, wkey, x0; \
	vpxor		x1, wkey, x1; \
	vpxor		x2, wkey, x2; \
	vpxor		x3, wkey, x3;

.align 8
SYM_FUNC_START_LOCAL(__twofish_enc_blk8)
	/* input:
	 *	%rdi: ctx, CTX
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
	 * output:
	 *	RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2: encrypted blocks
	 * preserved:
	 *	%r13, %rbx, %rcx (pushed/popped below), %rdi
	 * clobbered:
	 *	%rax, %rdx, %rsi, %r8, %r9, %r10, flags,
	 *	RX0, RY0, RX1, RY1, RK1, RK2, RT, RR
	 */

	/* first whitening vector, applied by inpack_blocks */
	vmovdqu w(CTX), RK1;

	/* the round macros use these as RID1, RGI4 and RGI2 respectively */
	pushq %r13;
	pushq %rbx;
	pushq %rcx;

	inpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
	/* round 0 expects RA1 in RGI1/RGI2 and RD pre-rotated left by one */
	preload_rgi(RA1);
	rotate_1l(RD1);
	inpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
	rotate_1l(RD2);

	/* 8 cycles of 2 rounds each */
	encrypt_cycle(0);
	encrypt_cycle(1);
	encrypt_cycle(2);
	encrypt_cycle(3);
	encrypt_cycle(4);
	encrypt_cycle(5);
	encrypt_cycle(6);
	encrypt_cycle_last(7);

	/* second whitening vector, applied by outunpack_blocks */
	vmovdqu (w+4*4)(CTX), RK1;

	popq %rcx;
	popq %rbx;
	popq %r13;

	outunpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
	outunpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);

	RET;
SYM_FUNC_END(__twofish_enc_blk8)

.align 8
SYM_FUNC_START_LOCAL(__twofish_dec_blk8)
	/* input:
	 *	%rdi: ctx, CTX
	 *	RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2: encrypted blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
	 * preserved:
	 *	%r13, %rbx (pushed/popped below), %rdi
	 * clobbered:
	 *	%rax, %rcx, %rdx, %rsi, %r8, %r9, %r10, flags,
	 *	RX0, RY0, RX1, RY1, RK1, RK2, RT, RR
	 *	(unlike __twofish_enc_blk8, %rcx is NOT saved here)
	 */

	/* second whitening vector, applied by inpack_blocks */
	vmovdqu (w+4*4)(CTX), RK1;

	/* the round macros use these as RID1 and RGI4 respectively */
	pushq %r13;
	pushq %rbx;

	inpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
	/* first round expects RC1 in RGI1/RGI2 and RA pre-rotated left by one */
	preload_rgi(RC1);
	rotate_1l(RA1);
	inpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
	rotate_1l(RA2);

	/* 8 cycles of 2 rounds each, highest cycle first */
	decrypt_cycle(7);
	decrypt_cycle(6);
	decrypt_cycle(5);
	decrypt_cycle(4);
	decrypt_cycle(3);
	decrypt_cycle(2);
	decrypt_cycle(1);
	decrypt_cycle_last(0);

	/* first whitening vector, applied by outunpack_blocks */
	vmovdqu (w)(CTX), RK1;

	popq %rbx;
	popq %r13;

	outunpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
	outunpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);

	RET;
SYM_FUNC_END(__twofish_dec_blk8)

SYM_FUNC_START(twofish_ecb_enc_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *
	 * Runs __twofish_enc_blk8 on the eight registers filled by load_8way
	 * from src and hands the result registers to store_8way with dst.
	 */
	FRAME_BEGIN

	/* %rsi doubles as RID2 inside the cipher core; keep dst in %r11 */
	movq %rsi, %r11;

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __twofish_enc_blk8;

	/* the core returns its output in C, D, A, B order */
	store_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);

	FRAME_END
	RET;
SYM_FUNC_END(twofish_ecb_enc_8way)

SYM_FUNC_START(twofish_ecb_dec_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *
	 * Runs __twofish_dec_blk8 on the eight registers filled by load_8way
	 * from src and hands the result registers to store_8way with dst.
	 */
	FRAME_BEGIN

	/* %rsi doubles as RID2 inside the cipher core; keep dst in %r11 */
	movq %rsi, %r11;

	/* the decryption core takes its input in C, D, A, B order */
	load_8way(%rdx, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);

	call __twofish_dec_blk8;

	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	FRAME_END
	RET;
SYM_FUNC_END(twofish_ecb_dec_8way)

SYM_FUNC_START(twofish_cbc_dec_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *
	 * Runs __twofish_dec_blk8 on the registers filled by load_8way from
	 * src, then passes both src and dst to store_cbc_8way.
	 * NOTE(review): store_cbc_8way presumably performs the CBC chaining
	 * XOR against the ciphertext still readable at src - confirm in
	 * glue_helper-asm-avx.S.
	 */
	FRAME_BEGIN

	/* %r12 is used to carry src across the call; restore it on exit */
	pushq %r12;

	/*
	 * %rsi (RID2) and %rdx (RGI1) are both overwritten by the cipher
	 * core, so keep dst in %r11 and src in %r12.
	 */
	movq %rsi, %r11;
	movq %rdx, %r12;

	load_8way(%rdx, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);

	call __twofish_dec_blk8;

	store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r12;

	FRAME_END
	RET;
SYM_FUNC_END(twofish_cbc_dec_8way)

SYM_FUNC_START(twofish_ctr_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (little endian, 128bit)
	 *
	 * Fills the block registers via load_ctr_8way (from iv, not from
	 * src), encrypts them with __twofish_enc_blk8 and passes src and dst
	 * to store_ctr_8way.
	 * NOTE(review): store_ctr_8way presumably XORs the encrypted counter
	 * blocks with the data at src - confirm in glue_helper-asm-avx.S.
	 */
	FRAME_BEGIN

	/* %r12 is used to carry src across the call; restore it on exit */
	pushq %r12;

	/*
	 * %rsi (RID2) and %rdx (RGI1) are both overwritten by the cipher
	 * core, so keep dst in %r11 and src in %r12.
	 */
	movq %rsi, %r11;
	movq %rdx, %r12;

	/* RX0, RX1 and RY0 are handed over as temporaries */
	load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
		      RD2, RX0, RX1, RY0);

	call __twofish_enc_blk8;

	/* the core returns its output in C, D, A, B order */
	store_ctr_8way(%r12, %r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);

	popq %r12;

	FRAME_END
	RET;
SYM_FUNC_END(twofish_ctr_8way)

SYM_FUNC_START(twofish_xts_enc_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 *
	 * XTS encryption of the blocks prepared by load_xts_8way; the tweak
	 * values are parked in dst by the load helper and consumed again by
	 * the store helper (see the inline notes below).
	 */
	FRAME_BEGIN

	/* %rsi doubles as RID2 inside the cipher core; keep dst in %r11 */
	movq %rsi, %r11;

	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
		      RX0, RX1, RY0, .Lxts_gf128mul_and_shl1_mask);

	call __twofish_enc_blk8;

	/* dst <= regs xor IVs(in dst) */
	store_xts_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);

	FRAME_END
	RET;
SYM_FUNC_END(twofish_xts_enc_8way)

SYM_FUNC_START(twofish_xts_dec_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 *
	 * XTS decryption of the blocks prepared by load_xts_8way; the tweak
	 * values are parked in dst by the load helper and consumed again by
	 * the store helper (see the inline notes below).
	 */
	FRAME_BEGIN

	/* %rsi doubles as RID2 inside the cipher core; keep dst in %r11 */
	movq %rsi, %r11;

	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
	load_xts_8way(%rcx, %rdx, %rsi, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2,
		      RX0, RX1, RY0, .Lxts_gf128mul_and_shl1_mask);

	call __twofish_dec_blk8;

	/* dst <= regs xor IVs(in dst) */
	store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	FRAME_END
	RET;
SYM_FUNC_END(twofish_xts_dec_8way)