/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Cast6 Cipher 8-way parallel algorithm (AVX/x86_64)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include "glue_helper-asm-avx.S"

.file "cast6-avx-x86_64-asm_64.S"

/*
 * S-box tables defined outside this file.  They are indexed below with a
 * scale of 4 and read with 32-bit loads, i.e. as arrays of 32-bit entries.
 */
.extern cast_s1
.extern cast_s2
.extern cast_s3
.extern cast_s4

/*
 * Structure of the crypto context (byte offsets from CTX).
 *
 *   km: masking keys, read as one 32-bit word per round by get_round_keys().
 *       48 rounds * 4 bytes = 12*4*4 bytes, which is why kr starts there.
 *   kr: rotation keys, one byte per round, loaded 16 bytes (= 16 rounds)
 *       at a time by preload_rkr().
 */
#define km	0
#define kr	(12*4*4)

/* s-boxes: short aliases for the external tables */
#define s1	cast_s1
#define s2	cast_s2
#define s3	cast_s3
#define s4	cast_s4

/**********************************************************************
  8-way AVX cast6
 **********************************************************************/

/*
 * Context pointer.  It lives in %r15 rather than %rdi because %rdi and %rsi
 * are used as s-box index scratch registers (RID1/RID2) during the rounds.
 */
#define CTX %r15

/* First group of four blocks, one 32-bit word (A, B, C, D) per register. */
#define RA1 %xmm0
#define RB1 %xmm1
#define RC1 %xmm2
#define RD1 %xmm3

/* Second group of four blocks. */
#define RA2 %xmm4
#define RB2 %xmm5
#define RC2 %xmm6
#define RD2 %xmm7

/* F-function working value; also the F result for the first group. */
#define RX  %xmm8

/*
 * Per-round key material (see get_round_keys() and preload_rkr()):
 *   RKM  - current masking key broadcast to all four lanes
 *   RKR  - rotation key bytes not yet consumed, next one in the lowest byte
 *   RKRF - left-rotate count of the current round (RKR & R1ST)
 *   RKRR - matching right-shift count (R32 - RKRF)
 *   R32  - constant 32 in the low lane
 *   R1ST - constant 0x1f in the low lane, selects the low 5 bits of RKR
 */
#define RKM  %xmm9
#define RKR  %xmm10
#define RKRF %xmm11
#define RKRR %xmm12
#define R32  %xmm13
#define R1ST %xmm14

/* Scratch register; also the F result for the second group. */
#define RTMP %xmm15

/* S-box index scratch registers (clobbered by every lookup_32bit()). */
#define RID1  %rdi
#define RID1d %edi
#define RID2  %rsi
#define RID2d %esi

/*
 * General purpose copies of the vector halves being looked up.  Only
 * registers with addressable high-byte forms (%dh, %ch, %ah, %bh) are used
 * because lookup_32bit() reads both the bl and bh sub-registers.
 */
#define RGI1   %rdx
#define RGI1bl %dl
#define RGI1bh %dh
#define RGI2   %rcx
#define RGI2bl %cl
#define RGI2bh %ch

#define RGI3   %rax
#define RGI3bl %al
#define RGI3bh %ah
#define RGI4   %rbx
#define RGI4bl %bl
#define RGI4bh %bh

/* Accumulators for the s-box lookup results. */
#define RFS1  %r8
#define RFS1d %r8d
#define RFS2  %r9
#define RFS2d %r9d
#define RFS3  %r10
#define RFS3d %r10d


/*
 * lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg)
 *
 * Combine four s-box entries selected by the low 32 bits of 'src' into the
 * 32-bit register 'dst ## d':
 *
 *	dst  =    s1[bits 15..8  of src]
 *	dst op1=  s2[bits  7..0  of src]
 *	dst op2=  s3[bits 31..24 of src]
 *	dst op3=  s4[bits 23..16 of src]
 *
 * src and dst must be passed as alias names (RGIx / RFSx) because the bl, bh
 * and d suffixes are pasted onto them.
 *
 * interleave_op(il_reg) is expanded after the last byte has been extracted;
 * callers pass shr_next to move the next 32-bit word of 'src' into place, or
 * dummy to do nothing.
 *
 * Clobbers: RID1, RID2, flags.  'src' is shifted right by 16 (by 32 in total
 * when interleave_op is shr_next on the same register).
 */
#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
	movzbl		src ## bh,     RID1d;    \
	movzbl		src ## bl,     RID2d;    \
	shrq $16,	src;                     \
	movl		s1(, RID1, 4), dst ## d; \
	op1		s2(, RID2, 4), dst ## d; \
	movzbl		src ## bh,     RID1d;    \
	movzbl		src ## bl,     RID2d;    \
	interleave_op(il_reg);			 \
	op2		s3(, RID1, 4), dst ## d; \
	op3		s4(, RID2, 4), dst ## d;

/* No-op placeholder for macro parameters that expect an operation. */
#define dummy(d) /* do nothing */

/* Shift 'reg' right by 16 bits; used as interleave_op in lookup_32bit(). */
#define shr_next(reg) \
	shrq $16,	reg;

/*
 * F_head(a, x, gi1, gi2, op0)
 *
 * First half of the round function for four blocks:
 *
 *	x   = RKM op0 a			(per 32-bit lane)
 *	x   = rotate-left(x, RKRF)	(built from shift-left by RKRF,
 *					 shift-right by RKRR, and OR)
 *	gi1 = low  64 bits of x
 *	gi2 = high 64 bits of x
 *
 * Expects RKM, RKRF and RKRR to have been set by get_round_keys().
 * Clobbers: x, RTMP, gi1, gi2.
 */
#define F_head(a, x, gi1, gi2, op0) \
	op0	a,	RKM,  x;                 \
	vpslld	RKRF,	x,    RTMP;              \
	vpsrld	RKRR,	x,    x;                 \
	vpor	RTMP,	x,    x;                 \
	\
	vmovq		x,    gi1;               \
	vpextrq $1,	x,    gi2;

/*
 * F_tail(a, x, gi1, gi2, op1, op2, op3)
 *
 * Second half of the round function for four blocks: run lookup_32bit() on
 * each of the four 32-bit words held in gi1 (words 0, 1) and gi2 (words 2, 3)
 * and reassemble the four results into vector register x in the same lane
 * order.  The parameter 'a' is not used.
 *
 * The first two lookups pass shr_next so that, together with the shift done
 * inside lookup_32bit(), the upper word of gi1/gi2 ends up in the low 32 bits
 * for the second pair of lookups.
 *
 * The leading '##' on gi1/gi2 keeps the alias name (e.g. RGI1) from being
 * macro-expanded before it reaches lookup_32bit(), which has to paste bl/bh
 * onto the name (RGI1 ## bh -> RGI1bh -> %dh).
 * NOTE(review): this relies on the preprocessor's lenient handling of '##'
 * in assembler mode; keep these tokens exactly as they are.
 *
 * Clobbers: gi1, gi2, RFS1, RFS2, RFS3, RID1, RID2, flags.
 */
#define F_tail(a, x, gi1, gi2, op1, op2, op3) \
	lookup_32bit(##gi1, RFS1, op1, op2, op3, shr_next, ##gi1); \
	lookup_32bit(##gi2, RFS3, op1, op2, op3, shr_next, ##gi2); \
	\
	lookup_32bit(##gi1, RFS2, op1, op2, op3, dummy, none);     \
	shlq $32,	RFS2;                                      \
	orq		RFS1, RFS2;                                \
	lookup_32bit(##gi2, RFS1, op1, op2, op3, dummy, none);     \
	shlq $32,	RFS1;                                      \
	orq		RFS1, RFS3;                                \
	\
	vmovq		RFS2, x;                                   \
	vpinsrq $1,	RFS3, x, x;

/*
 * F_2(a1, b1, a2, b2, op0, op1, op2, op3)
 *
 * One round for both groups of four blocks:
 *
 *	a1 ^= F(b1)
 *	a2 ^= F(b2)
 *
 * where F is F_head() with op0 followed by F_tail() with op1..op3.
 *
 * Both F_head() expansions use RX as the vector scratch register.  This is
 * safe because each one copies its result to general purpose registers
 * (RGI1/RGI2, then RGI3/RGI4) before RX is overwritten.  The tail results
 * land in RX (group 1) and RTMP (group 2).
 */
#define F_2(a1, b1, a2, b2, op0, op1, op2, op3) \
	F_head(b1, RX, RGI1, RGI2, op0);              \
	F_head(b2, RX, RGI3, RGI4, op0);              \
	\
	F_tail(b1, RX, RGI1, RGI2, op1, op2, op3);    \
	F_tail(b2, RTMP, RGI3, RGI4, op1, op2, op3);  \
	\
	vpxor		a1, RX,   a1;                 \
	vpxor		a2, RTMP, a2;

/*
 * The three round function variants.  They differ only in how the masking
 * key is combined with the input (op0) and how the four s-box values are
 * combined (op1, op2, op3):
 *
 *	F1: key + in,  ((s1 ^ s2) - s3) + s4
 *	F2: key ^ in,  ((s1 - s2) + s3) ^ s4
 *	F3: key - in,  ((s1 + s2) ^ s3) - s4
 */
#define F1_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpaddd, xorl, subl, addl)
#define F2_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpxor, subl, addl, xorl)
#define F3_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpsubd, addl, xorl, subl)

/*
 * qop(in, out, f): out ^= F<f>(in) for both block groups.  'in' and 'out'
 * are register name stems (RA, RB, RC, RD); the group suffix 1/2 is pasted
 * on here.
 */
#define qop(in, out, f) \
	F ## f ## _2(out ## 1, in ## 1, out ## 2, in ## 2);

/*
 * get_round_keys(nn)
 *
 * Set up the key registers for one round:
 *
 *	RKM  = masking key word nn from the context, broadcast to all lanes
 *	RKRF = RKR & R1ST	(low 5 bits of the lowest byte of RKR)
 *	RKRR = R32 - RKRF	(complementary right-shift count)
 *
 * and then shift RKR right by one byte so the next round's rotation key
 * becomes the lowest byte.  RKR must have been loaded by preload_rkr(), in
 * the order in which the rounds will consume it.
 */
#define get_round_keys(nn) \
	vbroadcastss	(km+(4*(nn)))(CTX), RKM;        \
	vpand		R1ST,               RKR,  RKRF; \
	vpsubq		RKRF,               R32,  RKRR; \
	vpsrldq $1,	RKR,                RKR;

/*
 * Q(n): four rounds using masking keys 4n+0 .. 4n+3 in ascending order:
 *
 *	C ^= F1(D);  B ^= F2(C);  A ^= F3(B);  D ^= F1(A)
 *
 * Consumes four rotation key bytes from RKR.
 */
#define Q(n) \
	get_round_keys(4*(n)+0); \
	qop(RD, RC, 1);        \
	\
	get_round_keys(4*(n)+1); \
	qop(RC, RB, 2);        \
	\
	get_round_keys(4*(n)+2); \
	qop(RB, RA, 3);        \
	\
	get_round_keys(4*(n)+3); \
	qop(RA, RD, 1);

/*
 * QBAR(n): the same four rounds as Q(n) applied in the opposite order, using
 * masking keys 4n+3 down to 4n+0:
 *
 *	D ^= F1(A);  A ^= F3(B);  B ^= F2(C);  C ^= F1(D)
 *
 * Consumes four rotation key bytes from RKR, so RKR must hold them in this
 * reversed order (see the .Lrkr_* shuffle masks).
 */
#define QBAR(n) \
	get_round_keys(4*(n)+3); \
	qop(RA, RD, 1);        \
	\
	get_round_keys(4*(n)+2); \
	qop(RB, RA, 3);        \
	\
	get_round_keys(4*(n)+1); \
	qop(RC, RB, 2);        \
	\
	get_round_keys(4*(n)+0); \
	qop(RD, RC, 1);

/* Permute the bytes of RKR according to the 16-byte table at 'mask'. */
#define shuffle(mask) \
	vpshufb		mask,            RKR, RKR;

/*
 * preload_rkr(n, do_mask, mask)
 *
 * Load the n-th group of 16 rotation keys from the context into RKR, with
 * 16 XORed into every byte, then expand do_mask(mask).  Callers pass either
 * shuffle with a .Lrkr_* table, to put the bytes into the order in which the
 * following Q()/QBAR() sequence consumes them, or dummy to keep the order.
 */
#define preload_rkr(n, do_mask, mask) \
	vbroadcastss	.L16_mask,                RKR;      \
	/* add 16-bit rotation to key rotations (mod 32) */ \
	vpxor		(kr+(n)*16)(CTX),         RKR, RKR; \
	do_mask(mask);

/*
 * transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
 *
 * Transpose the 4x4 matrix of 32-bit words held in x0..x3 in place: after
 * the macro, register xi holds word i of each of the four input registers.
 * t0, t1 and t2 are clobbered as temporaries.
 */
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	vpunpckldq		x1, x0, t0; \
	vpunpckhdq		x1, x0, t2; \
	vpunpckldq		x3, x2, t1; \
	vpunpckhdq		x3, x2, x3; \
	\
	vpunpcklqdq		t1, t0, x0; \
	vpunpckhqdq		t1, t0, x1; \
	vpunpcklqdq		x3, t2, x2; \
	vpunpckhqdq		x3, t2, x3;

/*
 * inpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask)
 *
 * Convert four blocks from one-block-per-register to one-word-per-register
 * form: byte-shuffle each register with rmask, then transpose.  The callers
 * in this file pass .Lbswap_mask (loaded into a register) as rmask, which
 * reverses the bytes of every 32-bit word.  t0..t2 are clobbered.
 *
 * The expansion has no trailing ';' -- the call site supplies it.
 */
#define inpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
	vpshufb rmask, x0,	x0; \
	vpshufb rmask, x1,	x1; \
	vpshufb rmask, x2,	x2; \
	vpshufb rmask, x3,	x3; \
	\
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

/*
 * outunpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask)
 *
 * Inverse of inpack_blocks(): transpose back to one block per register, then
 * byte-shuffle each register with rmask.  t0..t2 are clobbered.
 */
#define outunpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	vpshufb rmask,		x0, x0;       \
	vpshufb rmask,		x1, x1;       \
	vpshufb rmask,		x2, x2;       \
	vpshufb rmask,		x3, x3;

/* 16-byte constants; every entry is exactly 16 bytes and so stays aligned. */
.section	.rodata.cst16, "aM", @progbits, 16
.align 16
/* Passed to load_xts_8way() by the XTS entry points. */
.Lxts_gf128mul_and_shl1_mask:
	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
/* vpshufb table: reverse the bytes within each 32-bit word. */
.Lbswap_mask:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
/* vpshufb table: reverse all 16 bytes; passed to load_ctr_8way(). */
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
/*
 * Rotation key orderings for preload_rkr().  get_round_keys() consumes RKR
 * from the lowest byte upwards, so each table lists the source byte indexes
 * in the order in which the named Q/QBAR sequence uses them.  Q(n) uses
 * bytes 4n..4n+3 ascending, QBAR(n) uses them descending.
 */
.Lrkr_enc_Q_Q_QBAR_QBAR:
	.byte 0, 1, 2, 3, 4, 5, 6, 7, 11, 10, 9, 8, 15, 14, 13, 12
.Lrkr_enc_QBAR_QBAR_QBAR_QBAR:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.Lrkr_dec_Q_Q_Q_Q:
	.byte 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
.Lrkr_dec_Q_Q_QBAR_QBAR:
	.byte 12, 13, 14, 15, 8, 9, 10, 11, 7, 6, 5, 4, 3, 2, 1, 0
.Lrkr_dec_QBAR_QBAR_QBAR_QBAR:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/* Byte value 16 in every position; XORed into the rotation keys. */
.section	.rodata.cst4.L16_mask, "aM", @progbits, 4
.align 4
.L16_mask:
	.byte 16, 16, 16, 16

/* 32-bit constant 32; loaded into R32 to derive the right-shift count. */
.section	.rodata.cst4.L32_mask, "aM", @progbits, 4
.align 4
.L32_mask:
	.byte 32, 0, 0, 0

/* 32-bit constant 0x1f; loaded into R1ST to isolate one rotation key. */
.section	.rodata.cst4.first_mask, "aM", @progbits, 4
.align 4
.Lfirst_mask:
	.byte 0x1f, 0, 0, 0

.text

.align 8
SYM_FUNC_START_LOCAL(__cast6_enc_blk8)
	/* input:
	 *	%rdi: ctx
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 * clobbers:
	 *	%rdi, %rsi (RID1/RID2), %rdx, %rcx, %rax (RGI1-3),
	 *	%r8-%r10 (RFS1-3), %xmm8-%xmm15, flags
	 * preserved:
	 *	%rbx (RGI4) and %r15 (CTX) are saved and restored here
	 */

	pushq %r15;
	pushq %rbx;

	movq %rdi, CTX;

	/* RKM doubles as the byte-swap mask until the first round key load */
	vmovdqa .Lbswap_mask, RKM;
	vmovd .Lfirst_mask, R1ST;
	vmovd .L32_mask, R32;

	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	/* rounds 0..15: rotation keys are consumed in stored order */
	preload_rkr(0, dummy, none);
	Q(0);
	Q(1);
	Q(2);
	Q(3);
	/* rounds 16..31 */
	preload_rkr(1, shuffle, .Lrkr_enc_Q_Q_QBAR_QBAR);
	Q(4);
	Q(5);
	QBAR(6);
	QBAR(7);
	/* rounds 32..47 */
	preload_rkr(2, shuffle, .Lrkr_enc_QBAR_QBAR_QBAR_QBAR);
	QBAR(8);
	QBAR(9);
	QBAR(10);
	QBAR(11);

	popq %rbx;
	popq %r15;

	/* RKM was overwritten by the round keys; reload the byte-swap mask */
	vmovdqa .Lbswap_mask, RKM;

	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	RET;
SYM_FUNC_END(__cast6_enc_blk8)

.align 8
SYM_FUNC_START_LOCAL(__cast6_dec_blk8)
	/* input:
	 *	%rdi: ctx
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
	 * clobbers:
	 *	%rdi, %rsi (RID1/RID2), %rdx, %rcx, %rax (RGI1-3),
	 *	%r8-%r10 (RFS1-3), %xmm8-%xmm15, flags
	 * preserved:
	 *	%rbx (RGI4) and %r15 (CTX) are saved and restored here
	 */

	pushq %r15;
	pushq %rbx;

	movq %rdi, CTX;

	/* RKM doubles as the byte-swap mask until the first round key load */
	vmovdqa .Lbswap_mask, RKM;
	vmovd .Lfirst_mask, R1ST;
	vmovd .L32_mask, R32;

	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	/*
	 * Same quad-rounds as encryption, walked from 11 down to 0 with Q and
	 * QBAR exchanged.  Each rotation key group is shuffled into the order
	 * in which the quad-rounds below consume it.
	 */
	preload_rkr(2, shuffle, .Lrkr_dec_Q_Q_Q_Q);
	Q(11);
	Q(10);
	Q(9);
	Q(8);
	preload_rkr(1, shuffle, .Lrkr_dec_Q_Q_QBAR_QBAR);
	Q(7);
	Q(6);
	QBAR(5);
	QBAR(4);
	preload_rkr(0, shuffle, .Lrkr_dec_QBAR_QBAR_QBAR_QBAR);
	QBAR(3);
	QBAR(2);
	QBAR(1);
	QBAR(0);

	popq %rbx;
	popq %r15;

	/* RKM was overwritten by the round keys; reload the byte-swap mask */
	vmovdqa .Lbswap_mask, RKM;
	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	RET;
SYM_FUNC_END(__cast6_dec_blk8)

/* ECB-encrypt eight blocks from src into dst. */
SYM_FUNC_START(cast6_ecb_enc_8way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN
	pushq %r15;

	movq %rdi, CTX;
	/* %rsi is scratch (RID2) inside the block helper; keep dst in %r11 */
	movq %rsi, %r11;

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_enc_blk8;

	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r15;
	FRAME_END
	RET;
SYM_FUNC_END(cast6_ecb_enc_8way)

/* ECB-decrypt eight blocks from src into dst. */
SYM_FUNC_START(cast6_ecb_dec_8way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN
	pushq %r15;

	movq %rdi, CTX;
	/* %rsi is scratch (RID2) inside the block helper; keep dst in %r11 */
	movq %rsi, %r11;

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_dec_blk8;

	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r15;
	FRAME_END
	RET;
SYM_FUNC_END(cast6_ecb_dec_8way)

/* CBC-decrypt eight blocks from src into dst. */
SYM_FUNC_START(cast6_cbc_dec_8way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN
	pushq %r12;
	pushq %r15;

	movq %rdi, CTX;
	/*
	 * %rsi and %rdx are scratch (RID2, RGI1) inside the block helper, so
	 * dst and src are kept in %r11 and %r12 for the store step.
	 */
	movq %rsi, %r11;
	movq %rdx, %r12;

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_dec_blk8;

	/* NOTE(review): presumably applies the CBC chaining XOR using src */
	store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r15;
	popq %r12;
	FRAME_END
	RET;
SYM_FUNC_END(cast6_cbc_dec_8way)

/* CTR mode: encrypt eight counter blocks and combine them with src. */
SYM_FUNC_START(cast6_ctr_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (little endian, 128bit)
	 */
	FRAME_BEGIN
	pushq %r12;
	pushq %r15;

	movq %rdi, CTX;
	/*
	 * %rsi and %rdx are scratch (RID2, RGI1) inside the block helper, so
	 * dst and src are kept in %r11 and %r12 for the store step.
	 */
	movq %rsi, %r11;
	movq %rdx, %r12;

	/* RX, RKR and RKM are free here and serve as temporaries */
	load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
		      RD2, RX, RKR, RKM);

	call __cast6_enc_blk8;

	store_ctr_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r15;
	popq %r12;
	FRAME_END
	RET;
SYM_FUNC_END(cast6_ctr_8way)

/* XTS-encrypt eight blocks from src into dst. */
SYM_FUNC_START(cast6_xts_enc_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */
	FRAME_BEGIN
	pushq %r15;

	movq %rdi, CTX;
	/* %rsi is scratch (RID2) inside the block helper; keep dst in %r11 */
	movq %rsi, %r11;

	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
		      RX, RKR, RKM, .Lxts_gf128mul_and_shl1_mask);

	call __cast6_enc_blk8;

	/* dst <= regs xor IVs(in dst) */
	store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r15;
	FRAME_END
	RET;
SYM_FUNC_END(cast6_xts_enc_8way)

/* XTS-decrypt eight blocks from src into dst. */
SYM_FUNC_START(cast6_xts_dec_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */
	FRAME_BEGIN
	pushq %r15;

	movq %rdi, CTX;
	/* %rsi is scratch (RID2) inside the block helper; keep dst in %r11 */
	movq %rsi, %r11;

	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
		      RX, RKR, RKM, .Lxts_gf128mul_and_shl1_mask);

	call __cast6_dec_blk8;

	/* dst <= regs xor IVs(in dst) */
	store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r15;
	FRAME_END
	RET;
SYM_FUNC_END(cast6_xts_dec_8way)